In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
MODEL = "gpt-3.5-turbo"
#MODEL = "mixtral:8x7b"
MODEL = "llama2"

In [2]:
from langchain_community.llms import Ollama
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings

if MODEL.startswith("gpt"):
    model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings()
else:
    model = Ollama(model=MODEL)
    embeddings = OllamaEmbeddings(model=MODEL)

model.invoke("Tell me a joke")

'\nWhy was the math book sad? Because it had too many problems! :D'

In [3]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser 
chain.invoke("Tell me a joke")

"Sure, here's one:\n\nWhy don't scientists trust atoms?\nBecause they make up everything!\n\nI hope that brought a smile to your face! Do you want to hear another one?"

In [4]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data\hpo_term_info.pdf")
pages = loader.load_and_split()
pages

  loader = PyPDFLoader("data\hpo_term_info.pdf")


[Document(page_content='HPO Term Information\n \nId: HP:0002650\nName: Scoliosis\nIndex: 2650\nComment:\nDefinition: "The presence of an abnormal lateral curvature of the spine."\n[https://orcid.org/0000-0002-0736-9199]\nSynonym: [\'Cobb angle greater than ten degrees\']\nXref: [\'Fyler:4160\', \'MSH:D013121\', \'SNOMEDCT_US:111266001\', \'SNOMEDCT_US:64217002\',\n\'UMLS:C0037932\', \'UMLS:C0700208\']\nAlt Id: [\'HP:0002770\', \'HP:0003303\', \'HP:0003317\', \'HP:0003415\']\nIs Obsolete: No\nReplaced By:\nConsider: []\nParents: HP:0010674 | Abnormal curvature of the vertebral column\nChildren: HP:0100884 | Compensatory scoliosis, HP:0002944 | Thoracolumbar scoliosis, HP:0002751\n| Kyphoscoliosis, HP:0002943 | Thoracic scoliosis, HP:0008458 | Progressive congenital scoliosis\nGenes: Gene ID: 5830, Gene Name: PEX5, Gene ID: 6804, Gene Name: STX1A, Gene ID: 6655,\nGene Name: SOS2, Gene ID: 51574, Gene Name: LARP7, Gene ID: 23522, Gene Name: KAT6B,\nGene ID: 2812, Gene Name: GP1BB, Gene ID

In [5]:
from langchain.prompts import PromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
prompt.format(context="Here is some context", question="Here is a question")

'\nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Here is some context\n\nQuestion: Here is a question\n'

In [6]:
chain = prompt | model | parser

In [7]:
chain.input_schema.schema()


{'title': 'PromptInput',
 'type': 'object',
 'properties': {'context': {'title': 'Context', 'type': 'string'},
  'question': {'title': 'Question', 'type': 'string'}}}

In [8]:
!pip install docarray




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [9]:
from langchain_community.vectorstores import DocArrayInMemorySearch

vectorstore = DocArrayInMemorySearch.from_documents(
    pages,
    embedding=embeddings)



In [10]:
retriever = vectorstore.as_retriever()
retriever.invoke("scoliosis")

[Document(page_content='syndrome, Seckel syndrome, Acromesomelic dysplasia, Hunter-Thompson type, 45,X/46,XY mixed\ngonadal dysgenesis, Lissencephaly type 1 due to doublecortin gene mutation, Spinocerebellar ataxia\ntype 42, TBCK-related intellectual disability syndrome, Laing early-onset distal myopathy, Oliver\nsyndrome, MAGEL2-related Prader-Willi-like syndrome, GM1 gangliosidosis, Focal dermal hypoplasia,\nHereditary neuropathy with liability to pressure palsies, 13q12.3 microdeletion syndrome, Aicardi\nsyndrome, Desbuquois syndrome, Trichodermodysplasia-dental alterations syndrome,\nCongenital-onset Steinert myotonic dystrophy, Trisomy 5p, Homocystinuria due to cystathionine\nbeta-synthase deficiency, IVIC syndrome, Joubert syndrome, Wrinkly skin syndrome,\nCryptorchidism-arachnodactyly-intellectual disability syndrome, ALG6-CDG, Marshall-Smith syndrome,\nMaffucci syndrome, Nail-patella syndrome, Femoral-facial syndrome, Skeletal dysplasia-epilepsy-short\nstature syndrome, Alpha-s

In [11]:
from operator import itemgetter

chain = (
    {
        "context": itemgetter("question") | retriever,
        "question": itemgetter("question"),
    }
    | prompt
    | model
    | parser
)

In [12]:
questions = [
    "What is the purpose of thid pdf?",
    "How many genes are there?",
    "How many terms",
   
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

Question: What is the purpose of thid pdf?
Answer: The purpose of this PDF document appears to be a compilation of gene names and IDs along with their corresponding Gene Ontology (GO) terms. The document contains a list of genes associated with various human diseases, along with their Gene ID, Gene Name, and GO terms.

The GO terms provide information about the biological process, molecular function, and cellular location of each gene. This information can be useful for researchers interested in understanding the role of specific genes in different diseases and how they relate to other genes in the same pathway or network.

Overall, the purpose of this PDF document seems to be a resource for researchers to identify and explore genes associated with specific diseases and understand their functional roles in the context of those diseases.

Question: How many genes are there?
Answer: There are 40 genes listed in the output from the API call.

Question: How many terms
Answer: To count the 

In [14]:
# Example test question and expected result
question = "What are the symptoms of Scoliosis?"
result = chain.invoke({"context": "Some context from the PDF", "question": question})
print(f"RAG model result: {result}")


RAG model result: Scoliosis is a medical condition where there is an abnormal curvature of the spine. The symptoms of scoliosis can vary depending on the severity of the curvature, but they may include:

1. Uneven shoulders or hips: One shoulder or hip may appear higher or lower than the other.
2. Difficulty standing up straight: People with scoliosis may have a hard time maintaining their balance and posture when standing or walking.
3. Back pain: The curvature of the spine can put pressure on the vertebrae, discs, and muscles, leading to back pain.
4. Breathing difficulties: In severe cases of scoliosis, the curvature of the spine can compress the lungs and ribcage, making it difficult to breathe.
5. Headaches: The abnormal curvature of the spine can also lead to headaches and neck pain due to the altered alignment of the vertebrae and discs.
6. Numbness or tingling: People with scoliosis may experience numbness or tingling in their arms and legs due to the compression of nerves by t

In [13]:
chain.batch([{"question": q} for q in questions])
for s in chain.stream({"question": "What is the purpose of the course?"}):
    print(s, end="", flush=True)

The purpose of the course is to provide students with a comprehensive understanding of the Human Protein Atlas (HPA), which is a comprehensive dataset of protein expression levels across different human tissues and cells. The course covers various aspects of the HPA, including its history, data collection methods, quality control procedures, and analysis tools.

The course also focuses on practical applications of the HPA, such as identifying disease-specific protein expression patterns, understanding the molecular mechanisms underlying human diseases, and developing new diagnostic and therapeutic strategies. Additionally, the course covers the ethical and regulatory considerations involved in the use of human tissue samples for protein analysis.

Overall, the purpose of the course is to equip students with the knowledge and skills necessary to utilize the HPA for advancing our understanding of human biology and disease, and to develop new diagnostic and therapeutic strategies.