In [102]:
pip install ollama


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


RAG Application using Ollama and Langchain

In [103]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_ollama import ChatOllama

In [104]:
from langchain.document_loaders import PyPDFLoader

# Load PDF files
loader_1 = PyPDFLoader("./Context.pdf")
#loader_2 = PyPDFLoader("./Herbs that synergize with Carboplatin and Cisplatin V3.pdf")
#loader_3 = PyPDFLoader("./McKinney Surgery Pre and Post Op protocol.pdf")

# Load the documents
raw_documents_1 = loader_1.load()
#raw_documents_2 = loader_2.load()
#raw_documents_3 = loader_3.load()

# Combine all documents into one list
all_raw_documents = raw_documents_1


In [105]:
#raw_documents= all_raw_documents
#raw_documents= ""
raw_documents = TextLoader("./context.txt").load()

In [106]:
raw_documents

[Document(metadata={'source': './context.txt'}, page_content="ï»¿* P-value: The p-value is a measure that helps scientists determine whether their findings are significant or just due to chance. A low p-value (usually less than 0.05) suggests that the results are likely not random and are worth paying attention to.\n* Confidence Interval: A confidence interval gives a range of values within which the true value of something (like an average or effect) is expected to lie. For example, if a study says the average height is 170 cm with a 95% confidence interval of 160-180 cm, it means we can be 95% sure the true average height is between 160 and 180 cm.\n* Odds Ratio: The odds ratio is a way to compare whether the odds of a certain event happening are the same for two groups. For example, if you're comparing the odds of getting a disease in people who smoke versus those who don't, an odds ratio of 2 would mean smokers are twice as likely to get the disease.\n* Correlation Coefficient: The

In [107]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=20)
documents = text_splitter.split_documents(raw_documents)

In [108]:
len(documents)

28

In [109]:
print(documents[0])
print(documents[1])

page_content='ï»¿* P-value: The p-value is a measure that helps scientists determine whether their findings are significant or just due to chance. A low p-value (usually less than 0.05) suggests that the results are likely not random and are worth paying attention to.' metadata={'source': './context.txt'}
page_content='* Confidence Interval: A confidence interval gives a range of values within which the true value of something (like an average or effect) is expected to lie. For example, if a study says the average height is 170 cm with a 95% confidence interval of 160-180 cm, it means we can be 95% sure the true' metadata={'source': './context.txt'}


In [110]:
from langchain_ollama import OllamaEmbeddings

In [111]:
oembed = OllamaEmbeddings(base_url="http://localhost:11434", model="nomic-embed-text")

In [112]:
db = Chroma.from_documents(documents, embedding=oembed)

In [113]:
query = "Summarize the article about Carboplatin usage in cancer treatment."
docs = db.similarity_search(query)

In [114]:
len(docs)

4

In [115]:
print(docs[3].page_content)

* Intention-to-Treat Analysis (ITT): A method where all participants in a clinical trial are analyzed according to the group they were originally assigned, regardless of whether they completed the treatment. It helps provide a realistic view of the treatment's effectiveness in real-world


In [116]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [155]:
template = """Please summarize the following medical article in simple language suitable for patients:

{article_text}

Thank you."""
prompt = ChatPromptTemplate.from_template(template)

In [156]:
model = ChatOllama(
    model="llama3.1:latest",
    temperature=0
)

In [157]:
retriever = db.as_retriever()

In [158]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

In [159]:
import requests
from bs4 import BeautifulSoup

def extract_pmc_article_xml_test(pmc_id):
    xml_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/?report=xml&format=text"
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(xml_url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')

    paragraphs = soup.find_all('p')
    text = "\n\n".join(p.get_text(strip=True) for p in paragraphs)

    return text.strip()

# Usage example:
pmc_id = '10046228'
article_text = extract_pmc_article_xml_test(pmc_id)
print(article_text[:3000])  # preview first 3000 chars



An official website of the United States government

Official websites use .govA.govwebsite belongs to an official
                            government organization in the United States.

Secure .gov websites use HTTPSAlock(LockLocked padlock icon) orhttps://means you've safely
                                connected to the .gov website. Share sensitive
                                information only on official, secure websites.

Primary site navigation

Logged in as:

Correspondence:ismail_onco@yahoo.fr(I.E.);gilles.freyer@univ-lyon1.fr(G.F.)

Received 2023 Feb 19; Revised 2023 Mar 7; Accepted 2023 Mar 11; Collection date 2023 Mar.

Licensee MDPI, Basel, Switzerland. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (https://creativecommons.org/licenses/by/4.0/).

Since the advent of trastuzumab in HER2-positive metastatic breast cancer management, the natural history of this disease continues to

In [160]:
import requests
from bs4 import BeautifulSoup

def extract_pmc_article_xml(pmc_url):
    pmc_id = pmc_url.strip('/').split('/')[-1].replace('PMC', '')
    xml_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/?report=xml&format=text"
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    response = requests.get(xml_url, headers=headers)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')

    paragraphs = soup.find_all('p')
    if not paragraphs:
        raise ValueError("Could not extract content from XML format.")

    text = "\n\n".join(p.get_text(strip=True) for p in paragraphs)
    return text.strip()


In [161]:
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Use RunnableLambda to integrate your Python function
url_to_text = RunnableLambda(lambda url: {"article_text": extract_pmc_article_xml(url)})

In [162]:
chain = (
    RunnablePassthrough()
    | url_to_text
    | prompt
    | model
    | StrOutputParser()
)

In [163]:
summary = chain.invoke("https://pmc.ncbi.nlm.nih.gov/articles/PMC10046228/")

In [164]:
print(summary)

This appears to be a research article on HER2-positive breast cancer, specifically discussing the latest developments in treatment options and therapies. Here's a summary:

**Key Findings:**

1. **New treatments:** The article discusses several new anti-HER2 drugs, including T-DXd (trastuzumab-duparlimab), SYD985 (trastuzumab-duocarmazine), and tucatinib.
2. **Double anti-HER2 blockade:** The double anti-HER2 blockade remains the "gold standard" in initial management of metastatic HER2 breast cancer.
3. **T-DXd superiority:** T-DXd has shown superiority over T-DM1 (trastuzumab-emtansine) in second-line treatment.
4. **Tyrosine kinase inhibitor tucatinib:** Tucatinib has shown benefit even on brain lesions and may become a new option for first-line treatment.

**Open Questions:**

1. **Biomarkers:** The article highlights the need to identify biomarkers that can select responder patients.
2. **Therapeutic combinations and sequences:** The optimal therapeutic combinations and sequences r