In [1]:
import os
import logging
import sys
import pandas as pd
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate
from IPython.display import Markdown, display
from llama_index.llms.ollama import Ollama

### Local Embeddings

In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
Settings.embed_model = HuggingFaceEmbedding(model_name="w601sxs/b1ade-embed-kd")

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: w601sxs/b1ade-embed-kd
Load pretrained SentenceTransformer: w601sxs/b1ade-embed-kd
No sentence-transformers model found with name w601sxs/b1ade-embed-kd. Creating a new one with MEAN pooling.




INFO:sentence_transformers.SentenceTransformer:2 prompts are loaded, with the keys: ['query', 'text']
2 prompts are loaded, with the keys: ['query', 'text']


In [42]:
from llama_index.llms.ollama import Ollama

ollama = Ollama(model="llama3", request_timeout=120.0, 
             temperature=0.5)

response = ollama.complete("What is the capital of France?")
print(response)

INFO:httpx:HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/generate "HTTP/1.1 200 OK"
The capital of France is Paris.


In [28]:
# import openai
# import os
# from llama_index.llms.openai import OpenAI
# 
# openai.api_key = os.environ["OPENAI_API_KEY"]
# gpt_llm = OpenAI(model="gpt-4")
# response = gpt_llm.complete("What is the capital of France?")
# print(response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
The capital of France is Paris.


### Load Data
The Llama paper

In [3]:
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "../data/llama2.pdf"

--2024-05-27 06:25:14--  https://arxiv.org/pdf/2307.09288.pdf
Resolving arxiv.org (arxiv.org)... 151.101.67.42, 151.101.195.42, 151.101.3.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2307.09288 [following]
--2024-05-27 06:25:14--  http://arxiv.org/pdf/2307.09288
Connecting to arxiv.org (arxiv.org)|151.101.67.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13661300 (13M) [application/pdf]
Saving to: ‘../data/llama2.pdf’


2024-05-27 06:25:17 (5.83 MB/s) - ‘../data/llama2.pdf’ saved [13661300/13661300]



In [9]:
from pathlib import Path
from llama_index.readers.file import PyMuPDFReader
from llama_index.core import VectorStoreIndex

loader = PyMuPDFReader()
documents = loader.load(file_path="../data/llama2.pdf")
index = VectorStoreIndex.from_documents(documents)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
query_str = "What are the potential risks associated with the use of Llama 2 as mentioned in the context?"
query_engine = index.as_query_engine(similarity_top_k=2, llm=llm)
vector_retriever = index.as_retriever(similary_top_k=2)

In [24]:
response = query_engine.query(query_str)
print(str(response))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
According to the provided context, the potential risks associated with the use of Llama 2 include generating misinformation, retrieving information about topics like bioterrorism or cybercrime, and the model being used for nefarious purposes. Additionally, the paper mentions that the open release of LLMs, including Llama 2, carries potential risks with use.


In [48]:
# define prompt viewing function
def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

In [36]:
response = query_engine.query(query_str)
print(str(response))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
Based on the provided context, I don't know the specific potential risks associated with the use of Llama 2. However, it is mentioned that not everyone who uses AI models has good intentions, and conversational AI agents like Llama 2 could potentially be used for nefarious purposes such as generating misinformation or retrieving information about topics like bioterrorism or cybercrime.


In [11]:
enrichr_csv_path = "/mnt/hdd_2/abdu/gtex/brown_preadipocytes_irx3_enrichr.csv"
enrichr_df = pd.read_csv(enrichr_csv_path)
enrichr_df.head()

Unnamed: 0,ID,Term,Adjusted P-value
0,GO:0002181,Cytoplasmic Translation,3.4016990000000003e-69
1,GO:0009059,Macromolecule Biosynthetic Process,2.501583e-60
2,GO:0006412,Translation,3.5506229999999997e-56
3,GO:0043043,Peptide Biosynthetic Process,5.14309e-56
4,GO:0010467,Gene Expression,6.660641e-47


In [52]:
import pickle

go_map = {}

with open(f"/mnt/hdd_2/abdu/llm_exp/go.obo") as fp:
    lines = fp.readlines()
    print(f"Len lines: {len(lines)}")
    k = 0
    while k < len(lines): 
        line = lines[k]
        if line.strip() == "[Term]":
            go_id = (":".join(lines[k+1].strip().split(":")[1:])).strip()
            go_name = lines[k+2].strip().split(":")[-1]
            go_desc = lines[k+4].strip().split(":")[1].split("[")[0].strip()
            go_desc  = go_desc.replace('"', '').replace("'", "").strip()
            go_map[go_id] = {"name": go_name, "desc": go_desc}
        
        k += 1
        
pickle.dump(go_map, open("/mnt/hdd_2/abdu/llm_exp/go_map.pkl", "wb"))

Len lines: 618673


In [12]:
import pickle
go_map = pickle.load(open("/mnt/hdd_2/abdu/llm_exp/go_map.pkl", "rb"))
desc = []

for _, row in enrichr_df.iterrows():
    go_id = row["ID"]
    go_name = row["Term"]
    try:
        go_desc = go_map[go_id]["desc"]
        desc.append(go_desc)
    except KeyError:
        print(f"Couldn't find term {go_id}, {go_name}")
        desc.append("NA")

enrichr_df["Desc"] = desc
enrichr_df.head()

Couldn't find term SSU-rRNA, 5.8S rRNA, LSU-rRNA, Maturation Of SSU-rRNA From Tricistronic rRNA Transcript 


Unnamed: 0,ID,Term,Adjusted P-value,Desc
0,GO:0002181,Cytoplasmic Translation,3.4016990000000003e-69,The chemical reactions and pathways resulting ...
1,GO:0009059,Macromolecule Biosynthetic Process,2.501583e-60,GO
2,GO:0006412,Translation,3.5506229999999997e-56,GO
3,GO:0043043,Peptide Biosynthetic Process,5.14309e-56,The chemical reactions and pathways resulting ...
4,GO:0010467,Gene Expression,6.660641e-47,The process in which a genes sequence is conve...


In [13]:
import tempfile
tmp_file = tempfile.NamedTemporaryFile("w+")
df = enrichr_df.copy()
df.drop(columns=["ID", "Adjusted P-value"], inplace=True)
df.to_csv(tmp_file, index=False)
df.head()

Unnamed: 0,Term,Desc
0,Cytoplasmic Translation,The chemical reactions and pathways resulting ...
1,Macromolecule Biosynthetic Process,GO
2,Translation,GO
3,Peptide Biosynthetic Process,The chemical reactions and pathways resulting ...
4,Gene Expression,The process in which a genes sequence is conve...


In [None]:
from llama_index.readers.file import CSVReader
from pathlib import Path
reader = CSVReader(concat_rows=False)
docs = reader.load_data(file=Path(tmp_file.name))
index = VectorStoreIndex.from_documents(docs)

In [58]:
from llama_index.core.program import LLMTextCompletionProgram
from pydantic import BaseModel
from typing import List

class GoTerm(BaseModel):
    name: str
    reason: str
    
class GoTerms(BaseModel):
    variant: str
    phenotype: str
    go_terms: List[GoTerm]
    
# define prompt viewing function
def display_prompt_dict(prompts_dict):
    for k, p in prompts_dict.items():
        text_md = f"**Prompt Key**: {k}<br>" f"**Text:** <br>"
        display(Markdown(text_md))
        print(p.get_template())
        display(Markdown("<br><br>"))

qa_prompt_tmpl_str = """\
Context information is below.
---------------------
<s>[INST] <<SYS>>
You are an AI assistant helping biologists understand the mechanism of action of genomic mutation and how it brings about a phenotype .You should explain each of your response as detailed as possible. Don't write introductions or conclusions.
<</SYS>>
---------------------
Query: A biologist is studying the causal relationship between SNP {variant} and {phenotype}. Select 10 most relevant GO terms that are most likely to explain {phenotype}.Your response should be a valid json that includes two  fields 'Name' for name of the term and 'Reason' for the reason of your answer. Don't include introduction and conclusion remarks.
Answer: \
"""

# query_str = "A biologist is studying the causal relationship between SNP rs1421085 and obesity. Select 10 most relevant GO terms that are most likely to explain obesity.Your response should be a valid json that includes two  fields 'Name' for name of the term and 'Reason' for the reason of your answer. Don't include introduction and conclusion remarks."

qa_prompt_tmpl = PromptTemplate(
    qa_prompt_tmpl_str
)

query_engine = index.as_query_engine(similarity_top_k=10, llm=ollama, 
                                     text_qa_template=qa_prompt_tmpl)

program = LLMTextCompletionProgram.from_defaults(
    output_cls=GoTerms,
    prompt_template_str=qa_prompt_tmpl_str,
    llm=ollama,
    verbose=True
)
# vector_retriever = index.as_retriever(similary_top_k=10)
# prompt = qa_prompt_tmpl.format(query_str=query_str)
# query_engine.update_prompts(qa_prompt_tmpl)

In [59]:
output = program(variant="rs1421085", phenotype="obesity")
print(output)

INFO:httpx:HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"


ValidationError: 1 validation error for GoTerms
__root__
  Extra data: line 4 column 4 (char 204) [type=value_error.jsondecode, input_value='{\n    "Name": "regulati...buting to obesity"\n  }', input_type=str]

In [62]:
ollama.tokenizer

AttributeError: 'Ollama' object has no attribute 'tokenizer'

In [61]:
response = query_engine.query(query_str, variant="rs1421085", phenotype="obesity")
print(str(response))

TypeError: got an unexpected keyword argument 'variant'

In [46]:
import json
subset_go_df = {"ID": [], "Name": [], "Reason": [], "Adjusted P-value": []}
llm_res = json.loads(response.response)
for res in llm_res:
    row = enrichr_df[enrichr_df["Term"].str.contains(res["Name"], case=False)]
    if len(row) == 0:
        print(f"Couldn't find {res['Name']}")
        continue
    elif len(row) > 1:
        row = row.head(1)
    go_id, name, reason, pval = row["ID"].iloc[0], row["Term"].iloc[0], \
                                res["Reason"], row["Adjusted P-value"].iloc[0]
    if go_id not in subset_go_df["ID"]:
        subset_go_df["ID"].append(go_id)
        subset_go_df["Name"].append(name)
        subset_go_df["Reason"].append(reason)
        subset_go_df["Adjusted P-value"].append(pval)

Couldn't find Regulation Of Cellular Response To Low-Density Lipoprotein Particle Stimulus


In [33]:
pd.DataFrame(subset_go_df)

Unnamed: 0,ID,Name,Reason,Adjusted P-value
0,GO:1904177,Regulation Of Adipose Tissue Development,This term is relevant as it involves the devel...,0.004956
1,GO:0071404,Cellular Response To Low-Density Lipoprotein P...,Low-density lipoprotein (LDL) is often referre...,0.003305
2,GO:0045598,Regulation Of Fat Cell Differentiation,This term is relevant as it involves the diffe...,0.026643
3,GO:0009893,Positive Regulation Of Metabolic Process,Obesity can be caused by an imbalance in metab...,5e-06
4,GO:0009892,Negative Regulation Of Metabolic Process,A decrease in certain metabolic activities can...,0.001581
5,GO:0045923,Positive Regulation Of Fatty Acid Metabolic Pr...,An increase in the metabolism of fatty acids c...,0.001479
6,GO:0033211,Adiponectin-Activated Signaling Pathway,Adiponectin is a hormone that regulates glucos...,0.015115
7,GO:0071385,Cellular Response To Glucocorticoid Stimulus,Glucocorticoids can influence carbohydrate and...,0.014414
8,GO:0051247,Positive Regulation Of Protein Metabolic Process,An increase in protein metabolism can lead to ...,0.000776
9,GO:0050728,Negative Regulation Of Inflammatory Response,Obesity is often associated with chronic low-g...,0.000179


In [26]:
pd.DataFrame(subset_go_df)

Unnamed: 0,ID,Name,Reason,Adjusted P-value
0,GO:1904177,Regulation Of Adipose Tissue Development,This term directly relates to the development ...,0.004956
1,GO:0071404,Cellular Response To Low-Density Lipoprotein P...,Low-density lipoprotein particles are associat...,0.003305
2,GO:0045598,Regulation Of Fat Cell Differentiation,The differentiation of fat cells plays a cruci...,0.026643
3,GO:0045923,Positive Regulation Of Fatty Acid Metabolic Pr...,Fatty acid metabolism is closely linked to obe...,0.001479
4,GO:0033211,Adiponectin-Activated Signaling Pathway,Adiponectin is a key hormone involved in regul...,0.015115
5,GO:0071385,Cellular Response To Glucocorticoid Stimulus,Glucocorticoids can impact metabolism and infl...,0.014414
6,GO:0051247,Positive Regulation Of Protein Metabolic Process,Protein metabolism is essential for various ph...,0.000776
7,GO:0009892,Negative Regulation Of Metabolic Process,Dysregulation of metabolic processes can contr...,0.001581
8,GO:0050728,Negative Regulation Of Inflammatory Response,Inflammation is associated with obesity-relate...,0.000179
9,GO:0009893,Positive Regulation Of Metabolic Process,Efficient metabolic processes are crucial for ...,5e-06


In [47]:
pd.DataFrame(subset_go_df)

Unnamed: 0,ID,Name,Reason,Adjusted P-value
0,GO:1904177,Regulation Of Adipose Tissue Development,The causal relationship between SNP rs1421085 ...,0.004956
1,GO:0045923,Positive Regulation Of Fatty Acid Metabolic Pr...,SNP rs1421085 may influence the regulation of ...,0.001479
2,GO:0045598,Regulation Of Fat Cell Differentiation,The SNP may affect the differentiation of fat ...,0.026643
3,GO:0009893,Positive Regulation Of Metabolic Process,Changes in metabolic processes can contribute ...,5e-06
4,GO:0033211,Adiponectin-Activated Signaling Pathway,Adiponectin is a hormone that regulates glucos...,0.015115
5,GO:0050728,Negative Regulation Of Inflammatory Response,Inflammation is a known contributor to obesity...,0.000179
6,GO:0051247,Positive Regulation Of Protein Metabolic Process,Changes in protein metabolism can impact energ...,0.000776
7,GO:0071385,Cellular Response To Glucocorticoid Stimulus,Glucocorticoids play a critical role in regula...,0.014414
8,GO:0009892,Negative Regulation Of Metabolic Process,The negative regulation of metabolic process t...,0.001581


In [80]:
terms = []
with open("/mnt/hdd_2/abdu/gtex/brown_preadipocytes_irx3_enrichr_terms.txt", "r") as f:
    for line in f:
        go_id = line.split("(")[0]
        terms.append(go_id.strip().upper())
len(terms)

679