# RAG提取文中的氨基酸肽序列

In [1]:
import pandas as pd
import json

#获取文章的pmcid-pmid键值对
dic={'pmcid':[],'pmid':[]}
with open('/mnt/nas/handu/data/anti_inflammation_pubmed/full_text_pmcid_pmid_map.json','r') as f:
    js_list=json.load(f)
    for unit in js_list:
        dic['pmcid'].append(unit['pmcid'])
        dic['pmid'].append(unit['pmid'])

df_patent=pd.DataFrame(dic)

In [2]:
#获取review标签
with open('/mnt/nas/handu/data/anti_inflammation_pubmed/review_pmid_list.txt','r') as f:
    review_list=f.read().split('\n')
df_patent['is_review']=df_patent['pmid'].apply(lambda x: 1*(x in review_list))

In [3]:
df_patent

Unnamed: 0,pmcid,pmid,is_review
0,PMC6544583,31193882,0
1,PMC10409771,37553374,0
2,PMC4760456,26660549,0
3,PMC7429751,32795323,0
4,PMC8017164,33815353,0
...,...,...,...
2737,PMC9014016,35444957,0
2738,PMC4484568,26175842,0
2739,PMC4281220,25551573,0
2740,PMC4671727,26642048,0


## 商用大模型API加载

In [4]:
import os

os.environ["http_proxy"] = "http://192.168.1.254:7890"
os.environ["https_proxy"] = "http://192.168.1.254:7890"

In [5]:
from llama_index.llms.perplexity import Perplexity

pplx_api_key = "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"

llm = Perplexity(
    api_key=pplx_api_key, model="mixtral-8x7b-instruct", temperature=0.0,
    api_base="https://api.perplexity.ai",
)

In [6]:
from llama_index import SimpleDirectoryReader
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    set_global_service_context,
)

from llama_index.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)

from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage import StorageContext
from llama_index.node_parser import get_leaf_nodes, get_root_nodes
from llama_index import VectorStoreIndex
from llama_index.retrievers.auto_merging_retriever import AutoMergingRetriever
from llama_index.query_engine import RetrieverQueryEngine

#载入模型以及embedding功能
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(
    model_name="/home/cymei/bge-small-en",device='cuda:3'
)

service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model #"local:/home/cymei/bge-small-en"
)
set_global_service_context(service_context)





## 混合索引引擎

In [7]:
# import QueryBundle
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
)
from llama_index.schema import BaseNode

from typing import List

In [8]:
from nltk import word_tokenize 

class CustomRetriever(BaseRetriever):
    """自定义的混合索引类：相似度索引+关键词索引"""

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        vector_retriever_large: VectorIndexRetriever,
        max_num_keyword_nodes=3,
        keywords=[]
    ) -> None:
        """Init params."""

        self._vector_retriever = vector_retriever #头部相似度节点
        self._vector_retriever_large = vector_retriever_large #更大范围的头部相似度节点(用于关键词搜索)

        self.keywords=keywords #索引依据的关键词
        self.max_num_keyword_nodes=max_num_keyword_nodes #设置最大关键词节点数量
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes given query."""

        #利用两个不同的参数的retriever进行retrieve
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        vector_nodes_large = self._vector_retriever_large.retrieve(query_bundle)

        #确保集合中的节点id对应的相似度由大到小排列
        vector_ids = {n.node_id for n in sorted(vector_nodes,key=lambda node: node.score,reverse=True)} 
        # vector_ids_large = {n.node_id for n in vector_nodes_large}

        #对于更大范围的相似度索引结果，取出其中含有关键词的节点
        keyword_ids = []
        for n in sorted(vector_nodes_large,key=lambda node: node.score,reverse=True):
            for k in self.keywords:
                if(k in word_tokenize(n.get_content())):
                    #判断关键词是否在文章片段的分词结果中
                    keyword_ids.append(n.node_id)
                    break

        combined_dict = {n.node_id: n for n in vector_nodes}
        combined_dict.update({n.node_id: n for n in vector_nodes_large if n.node_id in keyword_ids })

        #合并两组节点
        if(keyword_ids==[]):
            #不含有关键词的情况下照常进行相似度索引
            retrieve_ids = vector_ids
        else:
            keyword_ids_top=set(keyword_ids[:self.max_num_keyword_nodes]) #取相似度最高的几个关键词节点
            vector_ids_unique=vector_ids-keyword_ids_top  #top相似度集合中独有的节点
            retrieve_ids=keyword_ids_top #关键词集合和top相似度集合共有的节点+关键词集合中独有的节点
            add_num=len(vector_ids)-len(keyword_ids_top)
            retrieve_ids=set(list(vector_ids_unique)[:add_num]).union(retrieve_ids) #额外添加部分top相似度集合中独有的节点

        retrieve_nodes = [combined_dict[rid] for rid in retrieve_ids]
        return retrieve_nodes

In [9]:
class LLM_summary:
    def __init__(self,documents=[],keywords=[]):
        self.keywords=keywords
        #载入专利文本
        self.documents=documents
    
    def hierarchy_node_index(self,chunk_list=[1024, 512, 256],similarity_top_k=6,max_num_keyword_nodes=3):
        '''
        获取分层节点，并建立对应索引
        设置每一层的token数量、选入prompt的节点数量以及最多含有关键词的节点数量
        '''
        node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_list)
        #获取分段后的每个节点
        self.nodes = node_parser.get_nodes_from_documents(self.documents)

        #建立各个节点的存储
        docstore = SimpleDocumentStore()
        # insert nodes into docstore
        docstore.add_documents(self.nodes)
        # define storage context (will include vector store by default too)
        storage_context = StorageContext.from_defaults(docstore=docstore)
        self.leaf_nodes = get_leaf_nodes(self.nodes)
    
        # 利用叶子节点计算相似度，并关联到对应的父节点
        base_index = VectorStoreIndex(
            self.leaf_nodes,
            storage_context=storage_context,
            #service_context=self.service_context,
        )

        #用于寻找和问题最相近的头部文本片段
        base_retriever = base_index.as_retriever(similarity_top_k=similarity_top_k)
        #用于从更大范围中寻找含有关键词的文本片段
        large_retriever = base_index.as_retriever(similarity_top_k=len(self.leaf_nodes))
        #构建混合索引
        custom_retriever = CustomRetriever(base_retriever, large_retriever,max_num_keyword_nodes=max_num_keyword_nodes,keywords=self.keywords)

        retriever = AutoMergingRetriever(custom_retriever, storage_context, verbose=True)

        return retriever 

## 批量文章挖掘seq

In [10]:
from llama_index.llms import ChatMessage

#设置system_prompt
system_str=  """You are a biologist and patent expert. 
    You will be provided with some contents from a patent and will be asked to answer specific questions related to the patent. 
    Please answer the question only using the provided contents and do not make up the answer with prior knowledge."""

In [11]:
import json
import os
import re
from llama_index.schema import Document
#os.chdir('/home/cymei/nlp_task/anti_inflammation')

#专利文本载入函数。由于此次任务是寻找seq序列，和之前seq序列判断任务不同，暂时不考虑文本预处理
def load_doc(file_name):
    with open(file_name,'r') as f:
        js_file=json.load(f)
        js_text=js_file['text'] #只使用专利文件中的text内容

    return Document(text=js_text)

In [12]:
import os
from IPython.display import Markdown, display
import time

os.environ["http_proxy"] = "http://192.168.1.254:7890"
os.environ["https_proxy"] = "http://192.168.1.254:7890"

#从review和非review样本中随机各抽取5个进行尝试
df_test=pd.concat([df_patent[df_patent['is_review']==0].sample(5),df_patent[df_patent['is_review']==1].sample(5)]) 

#记录模型回答以及支持观点的段落
df_test['model_response']=''
df_test['supportive_context']=''

patent_list=df_test['pmcid'].unique()   #df_seq['pmcid'].unique()  PMC2408570  PMC6554247  PMC9786789  PMC6544583

for patent in patent_list:

    file_loc='/mnt/nas/handu/data/anti_inflammation_pubmed/pmc_parse/{}.json'.format(patent)
    doc_text=load_doc(file_loc)
    
    #使用llama-index建立查询引擎
    summarier=LLM_summary([doc_text])
    #分别从claim和des中进行检索
    # summarier_claim=LLM_summary([doc_dic['claim']])
    # summarier_des=LLM_summary([doc_dic['description']])

    #设置检索关键词
    summarier.keywords=['peptide','anti-inflammatory'] 

        # summarier_claim.keywords=[row['seq_no']]
        # summarier_des.keywords=[row['seq_no']]
    retriever=summarier.hierarchy_node_index(similarity_top_k=10,max_num_keyword_nodes=4)
        # retriever_claim=summarier_claim.hierarchy_node_index(similarity_top_k=2,max_num_keyword_nodes=2)
        # retriever_des=summarier_des.hierarchy_node_index(similarity_top_k=8,max_num_keyword_nodes=3)

    #query中的问题
    #目前该prompt可以引导模型在部分文字中抓取一些seq，但模型无法判断是完整的肽序列还是片段
    question="""Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the ability of anti-inflammation line by line.
        Please focuse on the given context and don't use prior knowledge. Please return the top 3 most supportive context number.
            """
        #Make sure the target sequence numbers are specific about anti-inflammation and don't make any guesses.
        #Please focuse on the given context and don't use prior knowledge. Please return the most supportive context number.

    #retriever寻找目标节点
    response_nodes=retriever.retrieve(question)  

        #分别从claim和des中进行检索，确保两部分内容都会被用上
        # response_nodes_claim=retriever_claim.retrieve(question) 
        # response_nodes_des=retriever_des.retrieve(question) 
        # response_nodes=response_nodes_claim + response_nodes_des
    print(question)
    context_str="""Context is :
            {}
            ---------------------
            Given the provided context, please answer the query:
            {}
            """.format('\n'.join(['Context{}:\n'.format(index) + node.text for index,node in enumerate(response_nodes)]),question)  
    messages_dict = [
            {"role": "system", "content": system_str},
            {"role": "user", "content": context_str},
        ]
    messages = [ChatMessage(**msg) for msg in messages_dict]
    response = llm.chat(messages)

    display(Markdown(f"<b>{response.raw['usage']}</b>"))
    print('--------------------------------------------------------------------')
    display(Markdown(f"<b>{response}</b>"))

    #根据模型回答找出支持的文本片段
    response_ans=response.message.content
    context_list=re.findall('Context[0-9]+',response_ans)
    context_nums=set([int(re.sub('Context','',context)) for context in context_list])
    condition=(df_test['pmcid']==patent)

    df_test.loc[condition,'model_response']=response_ans
    df_test.loc[condition,'supportive_context']='\n'.join(['Context{}:\n'.format(index) + response_nodes[index].text for index in context_nums if(index<len(response_nodes))])
    time.sleep(30) #设置间隔防止api调用频率过高

> Merging 1 nodes into parent node.
> Parent node id: b2425a31-5b0b-4363-855b-a2032fc3987f.
> Parent node text: These results indicate that a slight drug association exists when the peptides are within the pol...

> Merging 2 nodes into parent node.
> Parent node id: 4f42d184-b618-482e-bd03-b5a963d7756e.
> Parent node text: Figure S4                  MALDI-TOF spectra of (A) Rho-POD and (B) Rho-HIV-Tat peptides.        ...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the ability of anti-inflammation line by line.
        Please focuse on the given context and don't use prior knowledge. Please return the top 3 most supportive context number.
            


<b>{'prompt_tokens': 2492, 'completion_tokens': 374, 'total_tokens': 2866}</b>

--------------------------------------------------------------------


<b>assistant: After analyzing the given contexts, I have found the following peptide sequences:

1. POD (CGGG[ARKKAAKA]4)
2. HIV-Tat (CGGGGYGRKKRRQRRR)

Now, I will explain the reason for selecting these two sequences and why I believe they have the ability for anti-inflammation.

Context0 states that a "slight drug association exists when the peptides are within the polymer matrix" and different characteristics were observed depending on the preparation method. This implies that the peptides used in the study have some kind of therapeutic effect, possibly including anti-inflammatory properties.

In Context1, an anti-inflammatory efficacy assay is described, where inflammation is induced by topical administration of sodium arachidonate (SA) and the treatment protocol consists of instilling formulations after 30 minutes. The formulations include NPs with peptides, suggesting that the peptides might have an anti-inflammatory effect.

Context3 mentions that anti-inflammatory efficacy assays were performed to determine whether the novel formulations (PLGA-NPs-PEG-peptide and PLGA-PEG-peptide NPs) could form the basis of a suitable strategy for ocular delivery of NSAIDs, further implying that the peptides might have anti-inflammatory properties.

Based on the above analysis, I believe that both POD and HIV-Tat peptides have the ability for anti-inflammation.

Top 3 most supportive context numbers: 1, 3, and 0.</b>

> Merging 2 nodes into parent node.
> Parent node id: 74a05a2f-1beb-4fc2-95de-3f8245aece59.
> Parent node text: 1C,D,G,H). Further quantification of the area infiltrated with lymphocytes in the whole SMG showe...

> Merging 2 nodes into parent node.
> Parent node id: 6f56f46f-8d6a-45d5-bc66-81216ece52af.
> Parent node text: Our results indicated that the pro-inflammatory cytokines TNF-α, IL-6, IFN-β and IFN-γ were signi...

> Merging 2 nodes into parent node.
> Parent node id: 48ae593a-fa9c-4e93-b545-bed834f7692c.
> Parent node text: 1C,D,G,H). Further quantification of the area infiltrated with lymphocytes in the whole SMG showe...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the abi

<b>{'prompt_tokens': 3192, 'completion_tokens': 275, 'total_tokens': 3467}</b>

--------------------------------------------------------------------


<b>assistant: Based on the provided context, there are no peptide sequences, either partial or full, mentioned in the study. Therefore, I cannot list any peptide sequences with anti-inflammatory properties. The study does mention several molecules, including cytokines and lipid mediators, that play a role in inflammation and its resolution, but these are not peptide sequences.

The most supportive context numbers for this question are:

1. Context0: This context does not contain any peptide sequences, but it provides background information on the study, which investigates the effects of ALX/FPR2 deletion on salivary gland inflammation and function.
2. Context2: This context describes the upregulation of pro-inflammatory cytokines in ALX/FPR2-deficient mice treated with LPS, but it does not mention any peptide sequences.
3. Context6: This context provides an overview of the role of inflammation in salivary gland dysfunction and the potential therapeutic applications of resolvins, but it does not mention any peptide sequences.

In summary, there are no peptide sequences with anti-inflammatory properties mentioned in the provided context.</b>

> Merging 1 nodes into parent node.
> Parent node id: 6746e6a9-13bc-4602-af70-6a842acca983.
> Parent node text: 

> Merging 1 nodes into parent node.
> Parent node id: 95ea28b7-08b0-4bfb-a7c7-20de9471de2b.
> Parent node text: 

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the ability of anti-inflammation line by line.
        Please focuse on the given context and don't use prior knowledge. Please return the top 3 most supportive context number.
            


<b>{'prompt_tokens': 209, 'completion_tokens': 330, 'total_tokens': 539}</b>

--------------------------------------------------------------------


<b>assistant: Without the provided contents, I am unable to answer this question directly. However, I will provide a general approach for identifying peptide sequences with anti-inflammatory functions based on the given context.

1. Identify peptide sequences: From the patent contents, locate all mentions of peptide sequences. Exclude any sequences that are not peptides, such as DNA or RNA sequences. Write down the unique peptide sequences found in the context.
2. Analyze anti-inflammatory function: Carefully read the patent contents to determine which of the identified peptides have anti-inflammatory functions. Look for phrases such as "anti-inflammatory," "reduces inflammation," or "inhibits inflammatory response" in relation to the peptide sequences.
3. Explain the reason: Based on the patent contents, provide a rationale for why the identified peptides have anti-inflammatory functions. This may include descriptions of experiments, in vitro or in vivo studies, or other evidence provided in the patent.
4. List peptide sequences: Create a list of the peptide sequences with anti-inflammatory functions, writing each sequence on a separate line.
5. Top 3 most supportive context numbers: Identify the top 3 most supportive context numbers (patent paragraphs or sentences) that directly link the peptide sequences to their anti-inflammatory functions.

Please provide the patent contents for a specific analysis.</b>

> Merging 2 nodes into parent node.
> Parent node id: 03667e5c-3ae1-4bd5-8fda-6e86be74edf6.
> Parent node text: 4).                    Ibuprofen, but not aspirin, slightly increases the expression of MHC molec...

> Merging 2 nodes into parent node.
> Parent node id: 9f02800e-3105-47ea-9d95-b08c4ea0d953.
> Parent node text: INTRODUCTION      Nonsteroidal anti-inflammatory drugs (NSAIDs) inhibit cyclooxygenase (COX), the...

> Merging 1 nodes into parent node.
> Parent node id: 30095eca-f71f-422f-a393-a38a4887d212.
> Parent node text: In all of the experiments described in the present study, DCs were exposed to the drugs for 18 h,...

> Merging 1 nodes into parent node.
> Parent node id: f7912ff8-2758-49dc-a625-f9b8c3318998.
> Parent node text: In all of the experiments described in the present study, DCs were exposed to the drugs for 18 h,...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence 

<b>{'prompt_tokens': 3015, 'completion_tokens': 281, 'total_tokens': 3296}</b>

--------------------------------------------------------------------


<b>assistant: After reviewing the provided contexts, I cannot find any specific peptide sequences listed in the text. Therefore, I am unable to provide the exact sequences of the peptides with anti-inflammatory functions. However, I can analyze the given contexts and explain the reason why the peptides have anti-inflammatory properties.

The patent is related to the use of certain drugs, aspirin and ibuprofen, which have anti-inflammatory properties. These drugs inhibit the MHC-restricted presentation of exogenous antigens, such as ovalbumin (OVA), in dendritic cells (DCs). The MHC-restricted presentation is a process by which DCs present antigens to T cells, activating the immune response. By inhibiting this process, aspirin and ibuprofen suppress the activation of T cells and the subsequent immune response, leading to their anti-inflammatory effects.

Based on the given contexts, the top 3 most supportive context numbers are Context2, Context5, and Context7. These contexts directly discuss the anti-inflammatory properties of aspirin and ibuprofen and their effects on the MHC-restricted presentation of exogenous antigens in DCs.</b>

> Merging 2 nodes into parent node.
> Parent node id: 8a0381a9-9a2c-44a5-ae31-b3aa0f02fa1c.
> Parent node text: Notably, four peptides with 14-mer length more strongly inhibited NO secretion in RAW 264.7 cells...

> Merging 3 nodes into parent node.
> Parent node id: 8a0bc435-7cda-455e-84da-e5b52a43126d.
> Parent node text: AMPs have a high affinity for LPS, mainly via electrostatic interactions, because gram-negative b...

> Merging 1 nodes into parent node.
> Parent node id: c1fbccab-1ed2-47a4-b149-6bccf81f5496.
> Parent node text: 3.10. Statistical Data Analysis        The mean values of at least four independent determination...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the abi

<b>{'prompt_tokens': 2822, 'completion_tokens': 98, 'total_tokens': 2920}</b>

--------------------------------------------------------------------


<b>assistant: Based on the provided context, here are the sequences of the peptides mentioned:

Context0 and Context2:
WIK-10, WIR-10, WIK-14, and WIKE-14
No specific sequences were provided.

Context3:
10 designed peptides, six peptides with 10 amino acids (10-mer) and four peptides with 1</b>

> Merging 2 nodes into parent node.
> Parent node id: 6ce4fcca-71f9-4963-9c85-dc2248e3ec1e.
> Parent node text: Hormones in the body, such as oxytocin ghrelin, alpha MSH, ACTH and hCG, have a significant role ...

> Merging 2 nodes into parent node.
> Parent node id: 6ac6b024-0bbc-4fcc-b9d7-c92c6fea6f75.
> Parent node text: A recent study showed that bronchodilation caused by GLP-1 analog Exendin-4 was inhibited by GLP-...

> Merging 1 nodes into parent node.
> Parent node id: 29900ad1-80a6-4993-9632-9e6b4cd45e86.
> Parent node text: Exendin-4, a GLP-1R agonist, was shown in another study to lower hepatic production of the inflam...

> Merging 1 nodes into parent node.
> Parent node id: c85201de-e475-467c-ac29-55bf4afb451c.
> Parent node text: Hormones in the body, such as oxytocin ghrelin, alpha MSH, ACTH and hCG, have a significant role ...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence 

<b>{'prompt_tokens': 2564, 'completion_tokens': 597, 'total_tokens': 3161}</b>

--------------------------------------------------------------------


<b>assistant: Based on the provided context, there are several peptides mentioned that have anti-inflammatory properties. However, the exact peptide sequences are not provided in the context. Here are the peptides mentioned in the context that have anti-inflammatory properties:

1. GLP-1 (Glucagon-like peptide-1)
2. Liraglutide (a GLP-1 analog)
3. Exendin-4 (a GLP-1R agonist)

Analysis:
GLP-1 is a peptide hormone that is produced in the intestine and in multiple other sites. It has been shown to have anti-inflammatory properties in various models, including polymicrobial sepsis, where it reduces oxidative stress and inflammation. Liraglutide, a GLP-1 analog, has also been shown to have anti-inflammatory effects on endothelial cells, reducing the activation of NF-kB, inhibiting TNF-alpha, and increasing nitric oxide production. Additionally, liraglutide has benefits in the treatment of acute lung injury, increasing surfactant protein A (SPA) expression in type 2 pneumocytes, decreasing the concentration of neutrophils and pro-inflammatory cytokines in the bronchoalveolar lavage fluid, and reducing the levels of TNF-α, IL-1β, IL-6, and the severity of lung injury in mouse models. Exendin-4, a GLP-1R agonist, has been shown to lower hepatic production of the inflammatory markers TNF-, IL-1, and IL-6, as well as macrophage markers cluster of differentiation 68 (CD68), and F4/80 in mice fed a western-type (high fat) diet.

Therefore, based on the given context, GLP-1, liraglutide, and exendin-4 have anti-inflammatory properties.

Top 3 most supportive context numbers:
Context1: GLP-1 analog liraglutide improves vascular function in polymicrobial sepsis by reduction of oxidative stress and inflammation.
Context2: Liraglutide dramatically lowers the production of TNF-A IL1B and raises CCL5 in PBMCs.
Context4: GLP-1 and its agonists have opened new avenues for treatment of inflammatory diseases to mitigate organ dysfunction, septicemia, and post-sepsis syndrome.</b>

> Merging 1 nodes into parent node.
> Parent node id: 9ab5ce55-e814-4027-9cd5-a6fa1524fde0.
> Parent node text: [63]When administered once in a week assistes more patients in reaching the majority of ADA-recom...

> Merging 1 nodes into parent node.
> Parent node id: 27c1b86f-6a49-4bab-a34f-e863ad582d44.
> Parent node text: The actual evaluation of cardiovascular benefits will be achieved with the completion of the Lira...

> Merging 2 nodes into parent node.
> Parent node id: 18c9d3fa-9471-4605-9b64-47b284eff6bd.
> Parent node text: They assumed that improvements in endothelial function with GLP-1R agonists might be limited to t...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the abi

<b>{'prompt_tokens': 2824, 'completion_tokens': 451, 'total_tokens': 3275}</b>

--------------------------------------------------------------------


<b>assistant: Based on the given context, there are several peptides and related agonists mentioned, but the exact peptide sequences are not provided in the context. However, I can list the names of the peptides and agonists that have been associated with anti-inflammatory effects, according to the context:

1. GLP-1 (glucagon-like peptide-1)
2. Exenatide (a GLP-1 receptor agonist)
3. Liraglutide (a GLP-1 receptor agonist)

The anti-inflammatory effects of these peptides and agonists are described in the context as follows:

Context0:
Reduces inflammatory markers: MCP-1 and TNF-α in response to lipopolysaccharide in cultured peritoneal macrophages harvested from mice.
Reduces monocyte adhesion to aortic endothelial cells and atherosclerotic lesion size in nondiabetic C57BL/6 and ApoE−/− mice.

Context1:
GLP-1-mediated improvements in blood flow were considerably attenuated after a 2-month period of better glycemic control.

Context4:
Continuous infusion of exendin-4 for 4 wk. in C57BL/6 mice reduced neointimal formation after endothelial denudation of the femoral artery.
Nagashima et al. reported that continuous GLP-1 infusion reduced foam cell formation and the development of atherosclerotic lesions in ApoE−/− mice.
Recently, the same group demonstrated similar effects after infusion of liraglutide in apoE−/− mice.

The top 3 most supportive context numbers for the anti-inflammatory effects of the peptides are Context0, Context1, and Context4.</b>

> Merging 2 nodes into parent node.
> Parent node id: b5169ff9-7838-49a4-ae64-fef1ee86a2f7.
> Parent node text: The treatment acts by upregulating the anti-inflammatory cytokine conversation while at same time...

> Merging 2 nodes into parent node.
> Parent node id: 606c5400-d51a-4faa-9c00-ea46ee3bcca9.
> Parent node text: The tolerance that these cells impose can be overridden by infections and other challenges that d...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the ability of anti-inflammation line by line.
        Please focuse on the given context and don't use prior knowledge. Please return the top 3 most supportive context number.
            


<b>{'prompt_tokens': 3033, 'completion_tokens': 509, 'total_tokens': 3542}</b>

--------------------------------------------------------------------


<b>assistant: After analyzing the provided context, I have found several peptides that have the ability of anti-inflammation. Here are the peptides sequences and the reasons:

1. J-LEAPS: This peptide binds to precursors of mouse (bone marrow) or human (blood monocytes) DCs and promotes their maturation to produce IL-12p70 and present the attached peptide to elicit antigen-specific protective responses, including the production of IFN-γ. This Th1 cytokine conversation and involvement of both CD4 and CD8 T cells allowed the designing of both anti-viral and anti-tumor vaccines (Context4).
2. CEL-2000: This peptide attaches a collagen peptide to the J-LEAPS ICBL and upregulated IL-12p70, IFN-γ, and also IL-10 while downregulating TNF-α, IL-6, and IL-17A to stop the progression of ongoing disease in the CIA model (Context4).
3. CEL-4000: This peptide attaches the DerG LEAPS ICBL to peptides from proteoglycan and elicited antigen-specific responses to stop the progression of disease by acting on memory T cells, including those in the spleen, to increase the expression of anti-inflammatory IL-4, IL-10, and TGF-β cytokines and Treg numbers while decreasing the expression of pro-inflammatory IFN-γ and IL-17 in the PGIA/GIA models (Context4).

The top 3 most supportive context numbers are Context4, Context2, and Context1. Context4 provides the most detailed information about the anti-inflammatory effects of the peptides. Context2 mentions the Group I therapies that focus on the cytokine-secreting, antigen-specific T cells, i.e., therapeutic vaccines, which is related to the peptides. Context1 introduces the LEAPS approach that derives benefits by addressing the underlying disease cause and not just being palliative or treating the symptoms of the disease, which is the general idea of the patent.</b>

> Merging 2 nodes into parent node.
> Parent node id: d5b9a160-f4d1-4e80-a111-54158fe29793.
> Parent node text: Unlike the LEADER and SCALE trials, in the LIRA-RENAL trial liraglutide was found to be ineffecti...

> Merging 2 nodes into parent node.
> Parent node id: 7e9b3ec1-f316-477e-8a1b-02d496d083e7.
> Parent node text: Emerging data support the efficacy of GLP-1RAs, in reducing the incidence of major adverse cardio...

> Merging 2 nodes into parent node.
> Parent node id: c8110247-6427-4acd-8af6-08891c6a8be0.
> Parent node text: Again, the primary endpoint was the incidence of MACEs. After 2.1 years of follow-up, the semaglu...

> Merging 2 nodes into parent node.
> Parent node id: 9df5e33f-0127-4a19-ba81-7477eafff69a.
> Parent node text: Unlike the LEADER and SCALE trials, in the LIRA-RENAL trial liraglutide was found to be ineffecti...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence 

<b>{'prompt_tokens': 3019, 'completion_tokens': 285, 'total_tokens': 3304}</b>

--------------------------------------------------------------------


<b>assistant: Based on the given context, there are several peptides mentioned that are relevant to anti-inflammatory functions. Here are the peptides sequences and the reasons why they have the ability of anti-inflammation:

1. GLP-1 (glucagon-like peptide-1)
	* Sequence: not explicitly mentioned in the context, but it is a well-known 30-amino acid peptide hormone derived from the preproglucagon gene.
	* Anti-inflammatory function: GLP-1 has been shown to exert broad anti-inflammatory actions and reduce biomarkers of systemic inflammation in human subjects with type 2 diabetes and people with obesity (Context1). Preclinical studies also suggest that GLP-1RAs reduce cytokine production and attenuate pulmonary inflammation (Context1).
2. Exenatide
	* Sequence: Exenatide is a 39-amino acid peptide with the sequence: HGEGTFTSDLSKQMEEEAVRLFIEWLKNGGPDFCRDKLSTRKQQKVDDTTKGRKRYTMTSQTQRLNR (Context4).
	</b>

> Merging 2 nodes into parent node.
> Parent node id: 453c3828-e61d-4fce-a6b5-f6daf8ca5884.
> Parent node text: Interestingly, the anti-inflammatory effects rely mostly on suppressive action, while pro-resolvi...

> Merging 2 nodes into parent node.
> Parent node id: 4ef9681a-038b-4ed9-9b7a-e402e66a3b93.
> Parent node text: ANXA1 was reported to also be associated with the early stage of AD in patients and in animal mod...

> Merging 2 nodes into parent node.
> Parent node id: c420e97f-f2a7-4106-bc5e-0cf7ef437b54.
> Parent node text: ANXA1 was reported to also be associated with the early stage of AD in patients and in animal mod...

Given content describing the functions and applications of the patent.
        Please find out all the peptides sequence firstly(motif sequence is not included). The target sequence should only consist of different kinds of peptide.
        Finally, plesae do analysis and explain the reason first, 
        then list all peptides sequences that have the abi

<b>{'prompt_tokens': 2900, 'completion_tokens': 544, 'total_tokens': 3444}</b>

--------------------------------------------------------------------


<b>assistant: After reviewing the provided context, I have found several peptides that have the ability of anti-inflammation. Here are the analysis and the list of peptides:

Analysis:
According to Context0, ANXA1 peptides, such as Ac2–26 and Ac9–25, have the ability to fully activate FPR1 and induce pro-inflammatory response at high concentration. However, there is no mention of their anti-inflammatory effects. Therefore, we cannot include ANXA1 peptides in the list of anti-inflammatory peptides.

According to Context2, LXA4 has been shown to promote the release of anti-inflammatory factors and to exhibit the ability to reduce Aβ and phosphorylated tau levels. LXA4 binds to FPR2 receptor and activates many intracellular signaling pathways, leading to an increase in macrophages life span, inhibition of pro-inflammatory cytokines synthesis, and reduction of Aβ and phosphorylated tau levels. Therefore, LXA4 has the ability of anti-inflammation.

According to Context4, SPMs, including LXA4 and RvD1, activate cascades that induce remodeling within sites damaged by inflammatory processes. They promote several processes crucial for resolution of inflammation, such as neutrophil extravasation blockade, promotion non-phlogistic monocyte recruitment, suppression of proinflammatory mediators while potentiating anti-inflammatory cytokines release and macrophage phagocytosis and efferocytosis, altering macrophages phenotype and instructing cells to favor repair. Therefore, SPMs have the ability of anti-inflammation.

Based on the above analysis, I have found two peptides with the ability of anti-inflammation: LXA4 and RvD1.

List of peptides with anti-inflammatory effects:

1. LXA4
2. RvD1

Top 3 most supportive context numbers: Context2, Context4, Context5 (although Context5 does not directly mention LXA4 or RvD1, it supports the role of SPMs in anti-inflammation, which includes LXA4 and RvD1).</b>

In [13]:
df_test

Unnamed: 0,pmcid,pmid,is_review,model_response,supportive_context
1606,PMC4315550,25670897,0,"After analyzing the given contexts, I have fou...",Context0:\nThese results indicate that a sligh...
1911,PMC4827125,27064029,0,"Based on the provided context, there are no pe...",Context0:\nThe goal of this study was to deter...
257,PMC3232377,22059987,0,"Without the provided contents, I am unable to ...",
2024,PMC2902675,20631879,0,"After reviewing the provided contexts, I canno...",Context2:\nIn all of the experiments described...
425,PMC9686674,36421317,0,"Based on the provided context, here are the se...",Context0:\nIt was observed that WIKE-14 can ex...
2320,PMC10230051,37266425,1,"Based on the provided context, there are sever...",Context1:\nActivation of the GLP-1 pathways de...
793,PMC4216654,25338737,1,"Based on the given context, there are several ...",Context0:\n[42]Inhibts glucagon release by a m...
752,PMC8772713,35052724,1,"After analyzing the provided context, I have f...",Context1:\nThe LEAPS approach derives benefits...
1206,PMC9394722,36003669,1,"Based on the given context, there are several ...",Context1:\nThe so-called ‘Cytokine Storm’ that...
225,PMC8413167,34105114,1,"After reviewing the provided context, I have f...",Context0:\nIt seems that the dual properties m...


In [45]:
df_test.to_csv('./10patents_example.csv',index=False)

In [27]:
from llama_index.response.notebook_utils import display_source_node
for node in response_nodes:
    display_source_node(node, source_length=20000)
    print("--------")

**Node ID:** bc7b9bc5-089c-492d-b968-96c72a8e391f<br>**Similarity:** 0.8580143326929651<br>**Text:** In fact, a number of current TCM prescriptions use epimedium as the main ingredient or monarch such as the Xianling Gubao capsule, Er-Xian Decoction and Xian-Zhen Decoction. Other ingredients such as psoralea and drynaria rhizome serve as ministers or assistants to the monarch ingredient and are also included in these formulations.38      In this study, we present the development of a minimal TCM formula primarily containing two ingredients, epimedium and drynaria. Epimedium was chosen as the main or monarch ingredient partially because of its TCM warming properties but also because of its in vitro biological activities. Drynaria was chosen as a minister ingredient to the monarch to further strengthen the kidney and to support CRE activation and possible osteogenic activities. The other ingredients such as danshen were considered as assistants for invigorating the spleen and replenishment of qi as well as their anti-inflammatory activity.<br>

--------


**Node ID:** 67fb29a4-ade9-4616-95dd-e561678a8abb<br>**Similarity:** 0.8572586696340335<br>**Text:** LTD and E2 was from Bayer Pharmaceutical Co. Ltd. (Guangzhou, CN).        Vectors pGL4.29[luc2P/CRE/Hygro], pGL4.26 [luc2/minP/Hygro], pGL4.17 [luc2/Neo], pGL4.35 [luc2P/9XGAL4UAS/Hygro], pBIND-ERα-LBD (residues 299-595), and Fugene 6 and Fugene HD were purchased from Promega Inc. (Madison, WI). Vector pNF-κβ-Luc was from Clontech Laboratories (Mountain View, CA). Vectors pCMV6-XL4 ER β (Cat# SC119216), and pCMV6-NEO were purchased from Origene Technologies (Rockville, MD). Vector pCR BluntII TOPO ER α (Cat# MHS6278-211691051) was from Dharmacon (Lafayette, CO).<br>

--------


**Node ID:** 04a36885-efdb-4092-aa83-bf8bb589b0b1<br>**Similarity:** 0.8518454004028271<br>**Text:** 4                    3.3        Anti-inflammatory activity determination        Pretreatment of A549 cells containing the NF-κβ activation reporter construct with either danshen, epimedium, or milkvetch extracts (40 μg/mL) inhibited IL-1β (100 pg/mL) activation of NF-κβ (Fig. 5). The other extracts did not demonstrate appreciable anti-inflammatory activity. The anti-inflammatory activity did not correlate to CRE activation. Drynaria did not inhibit NF-κβ activation by IL-1β despite activating CRE. Bay 11-7082 (12.5 μM) was used as a control inhibitor of IL-1β activated NF-κβ.34Fig. 5Effect of selected TCM plant extracts on the inhibition of NF-κβ activation by IL-1β. Plant extracts were resuspended in DMSO and added at a final concentration of 40 μg/mL to A549 cells stably transfected with an NF-κβ responsive luciferase reporter construct. Cells were stimulated with IL-1β (100 pg/mL). Control treatments were IL-1β only and Bay11 (B; 12.5 μM). Cells were treated for 6 h. Extract abbreviations were as follows: An, Angelica; As, Asiatic Cornelian Cherry; D, Danshen; Dr, Drynaria; DS, Dodder Seed; E, Epimedium, Eu, Eucommia; GP, Glossy Privet; IB, Indian Bread; M, Milkvetch; MC, Medicinal Cyathula; P, Psoralea: R, Rehmannia; TA, Two-toothed Achyranthes; WB, Wolfberry root bark; and Y, Common Yan Rhizome. Data are represented as mean ± SD of relative luminescence units (RLUs (n = 3); #p < 0.0001 compared to untreated (Unt) controls).Fig. 5                    3.4        Activation of bone specific genes Runx2 and Bmp-4        For subsequent evaluations, epimedium and drynaria were selected as companion ingredients and tested in additional bone related assays. Epimedium was selected as the primary (or monarch) ingredient for four reasons: 1) it activates the full-length ERα and ERβ (Fig.<br>

--------


**Node ID:** e8623f07-fa6c-4e4c-a55f-44681f23d28a<br>**Similarity:** 0.850741723090024<br>**Text:** The thermo-cycling conditions were as follows: 95○C for 30 s; 40 cycles of 58○C for 5 s; and 95○C for 5 s. Each column represents the mean response ± SD of 4 separate individual experiments (n = 4; compared to untreated (Unt) control group; *p < 0.05, **p < 0.01, and #p < 0.0001).Fig. 6                    3.5        Inhibition of osteoclast development        The E:Dr combination was further evaluated in an osteoclast development model while testing the addition of a third ingredient. Danshen was selected as a possible third ingredient because of its robust anti-inflammatory activity (Fig. 5). Angelica and Wolfberry Bark were also selected as possible third ingredients to be tested because of their modest anti-inflammatory activity and their lack of activity on isolated ER LBDs. Psorelea and milkvetch were avoided as the third ingredients because of their minimal activity on isolated ERβ LBD (Fig. 2).        All treatments inhibited the differentiation of pre-osteoclast to osteoclasts containing the E:Dr combo alone or in combination with a third ingredient in a dose dependent manner as measured by the development of TRAP concentration, an indicator of osteoclast development (Fig. 7). At 13 and 40 μg/ml the group containing D as the third ingredient appeared to outperform the groups containing An or WB as a third ingredient which also had less anti-inflammatory activity than D. Further evaluation of these ingredient combinations were completed in the ovariectomized rat model to determine if inhibition of TRAP activity was a good indicator of in vivo activity.Fig. 7Effect of epimedium/drynaria combinations with and without a third plant extract on the development of osteoclasts as determined by alkaline phosphatase levels.<br>

--------


**Node ID:** 01df1321-1c78-44d8-9e88-ae32647e5499<br>**Similarity:** 0.8472513373429086<br>**Text:** 2.6        Response element reporter assays        Stable cell lines for the CRE and NF-κβ reporter assays were created by transfecting HEK293T/17 and A549 cells with vectors pGL4.29 CRE and pGL4.17 NF-κβ Luc2, respectively. Hygromycin (1200 μg/mL) and neomycin (400 μg/mL) supplemented DMEM and F12K media, respectively, were used to select monoclonal cell lines which were tested for assay responsiveness and selected for further development. The resulting stable cell lines were maintained in their respective growth media supplemented with 600 μg/mL hygromycin or 200 μg/mL neomycin.        The CRE activation assay was described previously.22 Briefly, ∼15,000 CRE reporter transfectant cells were plated per well in white walled 96 well poly-d-lysine coated plates (Corning™ Biocoat™). After 24 h, the media was changed to DMEM media lacking FBS but containing 0.5% BSA and incubated for another 24 h. Test ingredients were diluted in DMEM containing 0.5% BSA and added to the cells then incubated 6 h. Luciferase was quantified by activity assay described below.        To test for the inhibition of NF-κβ activation, ∼15,000 NF-κβ Luc2 reporter transfectant cells were plated per well in white walled 96 well plates in F12K growth media. After 24 h, the media was changed from F12K growth media to OptiPRO SFM supplemented with 4 mM l-glutamine, 0.5% BSA and incubated for another 24 h. Test ingredients were diluted in OptiPRO SFM/l-glutamine containing 0.5% BSA and added to the cells. After 20 min, NF-κβ activating compound, IL-1β, was added to each well at a concentration of 100 pg/mL and the treatments incubated for another 6 h. Luciferase was quantified by activity assay described below.                    2.7        Ligand binding domain reporter assays        Cell lines containing two different vectors were needed to evaluate activation of the individual LBD of ERα and/or ERβ.<br>

--------


**Node ID:** f3f9b180-bc05-4dd6-88bd-8bf795a7a45b<br>**Similarity:** 0.8411315442913609<br>**Text:** 3); but 2) it does not behave like estrogen because it does not activate either ERα LBD or ERβ LBD (Fig. 2); 3) it is a strong inhibitor of NF-κβ activation by IL-1β (Fig. 5); and 4) it activates CRE a possible bone building pathway. Drynaria was selected as a companion ingredient to epimedium because of its capability to activate CRE, a known downstream target of PTH, a bone building treatment.        The epimedium and drynaria (E:Dr; 1:1) combination was tested in a bone-precursor, CRE-sensitive cell line, MC3T3-E1.35 Bone morphogenic signals in MC3T3-E1 cells,36 Runx2 (Runt-Related Transcription Factor 2) and Bmp4 (bone morphogenic peptide 4) were measured following treatment with different doses of a 1:1 combination of epimedium and drynaria. Forskolin, a known CRE activator, was used as a positive control.<br>

--------


**Node ID:** f523df7e-1940-4280-a504-826da26492a7<br>**Similarity:** 0.840144826928874<br>**Text:** There is little information on the development of a simple formulation using two or three TCM ingredients on bone mineral density (BMD) improvement and their biological activities on bone specific targets. Medicines containing fewer extracts would be more suitable for maintaining integrity and quality of the individual components and would also help in the standardization of TCM.      In this study, we selected 16 plant extracts used in TCM to improve bone health. These plants were evaluated for bone building potential by examining their ability to activate CRE, as well as their ability to block osteoclast development, and inflammation through NF-κβ inhibition.<br>

--------


**Node ID:** 9a02eaa0-5f93-4579-a089-3ac3136e5230<br>**Similarity:** 0.8399784484138657<br>**Text:** Epimedium and drynaria activated these targets despite the presence of estrogenic materials, phenol-red and FBS.49 We suggest that epimedium and drynaria may be upregulating these genes through their activation of CRE. Fsk, a adenylate cyclase activator and epimedium have previously been shown to upregulate CRE-related responses.44 Drynaria has not been shown previously to activate CRE.      Osteoclast development was inhibited by the epimedium and drynaria combination in a dose-dependent manner and the addition of a third ingredient, either danshen, angelica or wolfberry augmented the inhibition at selected doses. Previously, epimedium alone has been shown to inhibit NF-κβ activation as well as to stimulate p38 MAPK to inhibit NF-κβ during osteoclast differentiation.45, 46, 47, 48 The NF-κβ inhibition by epimedium does not appear to be the result of either CRE activation and/or ER activation50, 51, 52 because other extracts that activate these pathways (drynaria and psoralea, respectively) do not inhibit NF-κβ activation.<br>

--------


**Node ID:** 2ebcb2ed-9bae-4a77-a7a7-3be231763a1f<br>**Similarity:** 0.8328065251198418<br>**Text:** The ERβ-LBD (residues 227-530) was PCR amplified from full-length human ERβ (SC119216, OriGene Technologies (Rockville, MD)) using forward and reverse primers 5′-CGATCGTTCGCCTTGTGCGGAGACAGAGAAGTGCC-3′ and 5′-TCTAGATTACTGAGACTGTGGGTTCTGGGAGCCCTC-3′, incorporating PvuI and XbaI sites at the 5′ and 3′ ends, respectively. The ERβ-LBD fragment was then ligated into the pBIND3 PvuI/XbaI site forming a Gal4- ERβ -LBD fusion protein.                            2.5        Cell culture and stable cell lines        Chinese hamster ovary (CHO-K1), MC3T3-E1 subclone 14, HEK293T/17, SK-N-SH, and A549 were purchased from ATCC (Manasas, VA) and grown in in DMEM or αMEM or DMEM or EMEM or F12K media, respectively.<br>

--------
