In [1]:
import os
from llama_index.core.llms import ChatMessage
from llama_index.llms.openai import OpenAI
from llama_index.llms.ollama import Ollama
from llama_index.llms.anthropic import Anthropic
import pandas as pd
import numpy as np

In [2]:
# Directory that holds the benchmark "*_step2.for_llm.tsv" / "*_step2.labels" files.
# Set the GWAS_BENCHMARK_DIR environment variable to point the notebook at another
# location without editing this cell; the original path remains the default.
BENCHMARK_DIR = os.environ.get(
    "GWAS_BENCHMARK_DIR",
    "/mnt/hdd_2/abdu/llm_exp/zenodo_directory/data/benchmark_datasets",
)

In [3]:
# API keys are read from the environment so that no secret is stored in the notebook.
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

# The three chat models under comparison, each configured with temperature 0.0.
gpt4 = OpenAI(
    model="gpt-4-0613",
    api_key=openai_api_key,
    temperature=0.0,
)
claude35 = Anthropic(
    model="claude-3-5-sonnet-20240620",
    api_key=anthropic_api_key,
    temperature=0.0,
)
llama3 = Ollama(model="llama3", temperature=0.0, request_timeout=120.0)

# Smoke test: send one trivial completion request to every backend and show the replies.
gpt4_response = gpt4.complete("What is the capital of France?")
llama3_response = llama3.complete("What is the capital of France?")
claude_response = claude35.complete("What is the capital of France?")
print(
    f"gpt4: {gpt4_response}\nllama3: {llama3_response}\nclaude: {claude_response}"
)

gpt4: The capital of France is Paris.
llama3: The capital of France is Paris.
claude: The capital of France is Paris.


In [4]:
# System prompt sent with every query: it states the task (pick the likely causal
# gene in a locus) and the JSON keys ('causal_gene', 'confidence', 'reason') that
# the evaluation code reads from each reply.
# NOTE: the wording was corrected ("excludin g" -> "excluding"); outputs recorded
# before this change were produced with the misspelled prompt.
system_prompt = """You are an expert in biology and genetics.

Your task is to identify likely causal genes within a locus for a given GWAS phenotype based on literature evidence.

From the list, provide the likely causal gene (matching one of the given genes), confidence (0: very unsure to 1: very confident), and a brief reason (50 words or less) for your choice.

Return your response in JSON format, excluding the GWAS phenotype name and gene list in the locus. JSON keys should be ‘causal_gene’,‘confidence’,‘reason’.
Don't add any additional information to the response.
"""

In [5]:
def _load_benchmark(prefix):
    """Load one benchmark's locus table and label table and join them side by side.

    Reads ``{BENCHMARK_DIR}/{prefix}_step2.for_llm.tsv`` and
    ``{BENCHMARK_DIR}/{prefix}_step2.labels`` with ``pd.read_table``, renames the
    label columns to ``ground_truth_symbol`` / ``ground_truth_gene_id`` and
    concatenates the two tables column-wise.

    Args:
        prefix: File-name prefix of the benchmark, e.g. ``"opentargets"``.

    Returns:
        Tuple ``(data, labels, merged)`` of DataFrames.

    Raises:
        ValueError: If the two files do not have the same number of rows. The
            column-wise concat pairs rows by index, so a mismatch would attach
            ground-truth labels to the wrong loci (or leave NaNs) without any error.
    """
    data = pd.read_table(f"{BENCHMARK_DIR}/{prefix}_step2.for_llm.tsv")
    labels = pd.read_table(f"{BENCHMARK_DIR}/{prefix}_step2.labels")
    labels.columns = ["ground_truth_symbol", "ground_truth_gene_id"]
    if len(data) != len(labels):
        raise ValueError(
            f"{prefix}: {len(data)} loci but {len(labels)} labels; "
            "the two files must have the same number of rows"
        )
    return data, labels, pd.concat([data, labels], axis=1)


# OpenTargets
open_target_data, open_target_labels, open_target_df = _load_benchmark("opentargets")
# GWAS Catalog
gwas_catalog_data, gwas_catalog_labels, gwas_catalog_df = _load_benchmark("gwas_catalog")
# Pharmaprojects
pharmaprojects_data, pharmaprojects_labels, pharmaprojects_df = _load_benchmark("pharmaprojects")
open_target_df.head()

Unnamed: 0,row_number,description,symbol_gene_string,ensembl_gene_string,ground_truth_symbol,ground_truth_gene_id
0,1,Carotenoid and tocopherol levels (beta-caroten...,"{ATMIN},{BCO1},{C16orf46},{CDYL2},{CENPN},{CMC...","{ENSG00000103121},{ENSG00000135697},{ENSG00000...",BCO1,ENSG00000135697
1,2,Bilirubin levels,"{ATG16L1},{DGKD},{HJURP},{MROH2A},{SAG},{SPP2}...","{ENSG00000072080},{ENSG00000077044},{ENSG00000...",UGT1A8,ENSG00000242366
2,3,Bilirubin levels,"{ARHGAP4},{ATP6AP1},{AVPR2},{CTAG1A},{CTAG1B},...","{ENSG00000007350},{ENSG00000013563},{ENSG00000...",G6PD,ENSG00000160211
3,4,Sphingolipid levels (SM 14:0Mol%) (sphingomyel...,"{ESR2},{GPHB5},{PPP2R5E},{RHOJ},{SGPP1},{SYNE2...","{ENSG00000054654},{ENSG00000126785},{ENSG00000...",SGPP1,ENSG00000126821
4,5,Fasting glucose-related traits (FPG) (D-Glucose),"{ABCB11},{CERS6},{DHRS9},{G6PC2},{LRP2},{NOSTR...","{ENSG00000073734},{ENSG00000073737},{ENSG00000...",G6PC2,ENSG00000152254


In [6]:
# Preview the first five rows of the merged GWAS Catalog table.
gwas_catalog_df.head(n=5)

Unnamed: 0,row_number,description,symbol_gene_string,ensembl_gene_string,ground_truth_symbol,ground_truth_gene_id
0,1,High fluorescence immature platelet fraction,"{ENSG00000239395},{GCSAML},{NLRP3},{OR11L1},{O...","{ENSG00000135747},{ENSG00000153230},{ENSG00000...",GCSAML,ENSG00000169224
1,2,Immature platelet fraction,"{ENSG00000239395},{GCSAML},{NLRP3},{OR11L1},{O...","{ENSG00000135747},{ENSG00000153230},{ENSG00000...",GCSAML,ENSG00000169224
2,3,Metabolite levels (cis-4-decenoate (10:1n6)),"{ACADM},{ASB17},{MSH4},{RABGGTB},{SLC44A5},{ST...","{ENSG00000057468},{ENSG00000117054},{ENSG00000...",ACADM,ENSG00000117054
3,4,Metabolite levels (glutarylcarnitine (C5); glu...,"{CPT2},{CZIB},{DMRTB1},{ECHDC2},{GLIS1},{LRP8}...","{ENSG00000116171},{ENSG00000121310},{ENSG00000...",CPT2,ENSG00000157184
4,5,Metabolite levels (imidazole lactate; imidazol...,"{GBP1},{GBP2},{GBP3},{GBP4},{GBP5},{GBP6},{GBP...","{ENSG00000065243},{ENSG00000117226},{ENSG00000...",KYAT3,ENSG00000137944


In [7]:
# Preview the first five rows of the merged Pharmaprojects table.
pharmaprojects_df.head(n=5)

Unnamed: 0,row_number,description,symbol_gene_string,ensembl_gene_string,ground_truth_symbol,ground_truth_gene_id
0,1,Lysosomal acid lipase deficiency,"{ACTA2},{ANKRD22},{CH25H},{FAS},{IFIT1},{IFIT1...","{ENSG00000026103},{ENSG00000107796},{ENSG00000...",LIPA,ENSG00000107798
1,2,Recurrent respiratory papillomatosis,"{C21orf62},{CRYZL1},{DNAJC28},{DONSON},{ENSG00...","{ENSG00000142166},{ENSG00000142188},{ENSG00000...",IFNAR2,ENSG00000159110
2,3,"Cholestasis, progressive familial intrahepatic 1","{BIVM},{BIVM-ERCC5},{CCDC168},{ERCC5},{METTL21...","{ENSG00000125255},{ENSG00000134897},{ENSG00000...",SLC10A2,ENSG00000125255
3,4,"Periodic fever, familial, autosomal dominant","{CHCHD5},{CKAP2L},{IL1A},{IL1B},{IL1F10},{IL1R...","{ENSG00000114999},{ENSG00000115008},{ENSG00000...",IL1B,ENSG00000125538
4,5,Aromatic amino acid decarboxylase deficiency,"{COBL},{DDC},{FIGNL1},{GRB10},{IKZF1}","{ENSG00000106070},{ENSG00000106078},{ENSG00000...",DDC,ENSG00000132437


In [8]:
# Worked example: one locus written in the same "{GENE},{GENE},..." format as the
# benchmark tables (the first gene previously lacked its opening brace).
ex_query = (
    "GWAS Phenotype: Type 2 diabetes (type II diabetes mellitus)\n"
    "Genes: {ADA},{CCN5},{FITM2},{GDAP1L1},{HNF4A},{JPH2},{KCNK15},{OSER1},{PABPC1L},{PKIG},{R3HDML},{RIMS4},{SERINC3},{TOX2},{TTPAL},{YWHAB}"
)
messages = [
    ChatMessage(role="system", content=system_prompt),
    ChatMessage(role="user", content=ex_query),
]
# Send the identical conversation to each model and print the raw chat responses.
gpt4_response = gpt4.chat(messages)
llama3_response = llama3.chat(messages)
claude_response = claude35.chat(messages)
print(f"gpt4: {gpt4_response}\nllama3: {llama3_response}\nclaude: {claude_response}")

gpt4: assistant: {"causal_gene": "HNF4A", "confidence": 0.8, "reason": "HNF4A is associated with MODY1, a form of diabetes. It plays a crucial role in glucose metabolism and insulin regulation."}
llama3: assistant: {
"causal_gene": "HNF4A",
"confidence": 0.8,
"reason": "Strong evidence from multiple studies suggests that HNF4A is a key regulator of glucose and insulin homeostasis, making it a strong candidate for type 2 diabetes susceptibility."
}
claude: assistant: {
  "causal_gene": "HNF4A",
  "confidence": 0.9,
  "reason": "HNF4A is a well-established transcription factor crucial for pancreatic beta-cell function and insulin secretion. Mutations in HNF4A are associated with maturity-onset diabetes of the young (MODY), a monogenic form of diabetes."
}


In [10]:
import json
from tqdm import tqdm

def _parse_gene_symbols(gene_string):
    """Return the set of gene symbols listed in a locus string such as "{A},{B}"."""
    symbols = set()
    for token in str(gene_string).split(","):
        symbol = token.strip().strip("{}").strip()
        if symbol:
            symbols.add(symbol)
    return symbols


def _loads_json_reply(reply_text):
    """Parse a model reply as JSON, tolerating text or code fences around the object."""
    try:
        return json.loads(reply_text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, e.g. when the reply is wrapped
        # in a Markdown code fence or preceded by a sentence.
        start, end = reply_text.find("{"), reply_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(reply_text[start:end + 1])


def _query_model_json(llm, messages, model_name, max_attempts=2):
    """Chat with ``llm`` and return its reply parsed as JSON, re-querying on parse failure."""
    reply = None
    last_error = None
    for _ in range(max_attempts):
        # json.loads needs the reply text, not the ChatResponse wrapper.
        reply = llm.chat(messages).message.content
        try:
            return _loads_json_reply(reply)
        except (TypeError, ValueError) as exc:  # JSONDecodeError is a ValueError
            last_error = exc
    raise ValueError(
        f"{model_name}: reply was not valid JSON after {max_attempts} attempts: {reply!r}"
    ) from last_error


def _score_prediction(predicted_gene, locus_symbols, ground_truth_symbol):
    """Score one prediction: 1 correct, 0 in-locus but wrong, -1 not in the locus."""
    # Compare whole symbols: a substring test on the raw locus string would accept
    # fragments such as "APO" for a locus containing "{APOA1}".
    if str(predicted_gene).strip().strip("{}").strip() not in locus_symbols:
        return -1  # hallucination
    return 1 if predicted_gene == ground_truth_symbol else 0


def evaluate_llm_on_benchmark(seed, benchmark_df, num_samples=100):
    """Query GPT-4, Claude 3.5 and Llama 3 on a random sample of benchmark loci.

    Uses the module-level ``gpt4``, ``claude35`` and ``llama3`` clients and the
    module-level ``system_prompt``. For every sampled row the three models get
    the same system/user messages, and each reply is parsed as JSON with the keys
    ``causal_gene``, ``confidence`` and ``reason``.

    Args:
        seed: Seed for the row sampling. A private ``RandomState`` is used, which
            draws the same sample as seeding the global NumPy generator with
            ``seed`` but leaves the global generator untouched.
        benchmark_df: DataFrame with the columns ``description``,
            ``symbol_gene_string`` and ``ground_truth_symbol``.
        num_samples: Number of rows to sample without replacement; capped at
            ``len(benchmark_df)``.

    Returns:
        Tuple ``(results, gpt4_pred, claude35_pred, llama3_pred)``. ``results`` is
        a dict of equal-length lists (one entry per sampled row) ready for
        ``pd.DataFrame``. Each ``*_pred`` list holds one score per row: 1 if the
        predicted gene equals the ground truth, 0 if it is another gene of the
        locus, -1 if it is not one of the locus genes (hallucination).

    Raises:
        ValueError: If a model's reply cannot be parsed as JSON even after one retry.
        KeyError: If a parsed reply lacks one of the expected keys.
    """
    models = {"gpt4": gpt4, "claude35": claude35, "llama3": llama3}

    total_samples = len(benchmark_df)
    # Sampling without replacement cannot draw more rows than exist.
    num_samples = min(num_samples, total_samples)
    sample_indices = np.random.RandomState(seed).choice(
        total_samples, num_samples, replace=False
    )

    results = {"GWAS Phenotype": [], "Genes": [], "ground_truth_symbol": []}
    for model_name in models:
        results[f"{model_name}_pred"] = []
        results[f"{model_name}_confidence"] = []
        results[f"{model_name}_reason"] = []
    scores = {model_name: [] for model_name in models}

    for idx in tqdm(sample_indices):
        row = benchmark_df.iloc[idx]
        gwas_phenotype = row["description"]
        genes = row["symbol_gene_string"]
        ground_truth_symbol = row["ground_truth_symbol"]
        messages = [
            ChatMessage(role="system", content=system_prompt),
            ChatMessage(
                role="user",
                content=f"GWAS Phenotype: {gwas_phenotype}\nGenes: {genes}",
            ),
        ]

        # Collect all three replies before recording anything, so a failure never
        # leaves the result lists with different lengths.
        responses = {
            model_name: _query_model_json(llm, messages, model_name)
            for model_name, llm in models.items()
        }
        locus_symbols = _parse_gene_symbols(genes)

        results["GWAS Phenotype"].append(gwas_phenotype)
        results["Genes"].append(genes)
        results["ground_truth_symbol"].append(ground_truth_symbol)
        for model_name, response in responses.items():
            scores[model_name].append(
                _score_prediction(response["causal_gene"], locus_symbols, ground_truth_symbol)
            )
            results[f"{model_name}_pred"].append(response["causal_gene"])
            results[f"{model_name}_confidence"].append(response["confidence"])
            results[f"{model_name}_reason"].append(response["reason"])

    return results, scores["gpt4"], scores["claude35"], scores["llama3"]

In [16]:
# Seed passed to every benchmark run so the row sampling is repeatable.
seed = 42
(
    open_target_results,
    gpt4_pred,
    claude35_pred,
    llama3_pred,
) = evaluate_llm_on_benchmark(seed=seed, benchmark_df=open_target_df)

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [09:21<00:00,  5.61s/it]


In [17]:
# Tally the per-locus scores of each model.
# Score codes: -1 = hallucination, 1 = correct, 0 = incorrect.
n_hallucination_gpt4, n_correct_gpt4, n_incorrect_gpt4 = [
    sum(score == code for score in gpt4_pred) for code in (-1, 1, 0)
]
n_hallucination_claud35, n_correct_claud35, n_incorrect_claud35 = [
    sum(score == code for score in claude35_pred) for code in (-1, 1, 0)
]
n_hallucination_llama3, n_correct_llama3, n_incorrect_llama3 = [
    sum(score == code for score in llama3_pred) for code in (-1, 1, 0)
]

print(f"OpenTargets: GPT-4 hallucination: {n_hallucination_gpt4}, correct: {n_correct_gpt4}, incorrect: {n_incorrect_gpt4}")
print(f"OpenTargets: Claude3.5 hallucination: {n_hallucination_claud35}, correct: {n_correct_claud35}, incorrect: {n_incorrect_claud35}")
print(f"OpenTargets: Llama3 hallucination: {n_hallucination_llama3}, correct: {n_correct_llama3}, incorrect: {n_incorrect_llama3}")

OpenTargets: GPT-4 hallucination: 0, correct: 80, incorrect: 20
OpenTargets: Claude3.5 hallucination: 0, correct: 81, incorrect: 19
OpenTargets: Llama3 hallucination: 4, correct: 63, incorrect: 33


In [19]:
# Precision = correct / (correct + incorrect): hallucinated answers are excluded.
# Recall    = correct / (correct + hallucination + incorrect): share of all sampled loci.
# Every ratio falls back to 0.0 when its denominator is zero instead of raising
# ZeroDivisionError (e.g. a model whose answers were all hallucinations).
gpt4_answered = n_correct_gpt4 + n_incorrect_gpt4
gpt4_total = n_correct_gpt4 + n_hallucination_gpt4 + n_incorrect_gpt4
gpt4_precision = n_correct_gpt4 / gpt4_answered if gpt4_answered else 0.0
gpt4_recall = n_correct_gpt4 / gpt4_total if gpt4_total else 0.0

claude35_answered = n_correct_claud35 + n_incorrect_claud35
claude35_total = n_correct_claud35 + n_hallucination_claud35 + n_incorrect_claud35
claude35_precision = n_correct_claud35 / claude35_answered if claude35_answered else 0.0
claude35_recall = n_correct_claud35 / claude35_total if claude35_total else 0.0

llama3_answered = n_correct_llama3 + n_incorrect_llama3
llama3_total = n_correct_llama3 + n_hallucination_llama3 + n_incorrect_llama3
llama3_precision = n_correct_llama3 / llama3_answered if llama3_answered else 0.0
llama3_recall = n_correct_llama3 / llama3_total if llama3_total else 0.0

# F1 is the harmonic mean of precision and recall; it is 0.0 when both are 0.
gpt4_f1 = (
    2 * (gpt4_precision * gpt4_recall) / (gpt4_precision + gpt4_recall)
    if (gpt4_precision + gpt4_recall)
    else 0.0
)
claude35_f1 = (
    2 * (claude35_precision * claude35_recall) / (claude35_precision + claude35_recall)
    if (claude35_precision + claude35_recall)
    else 0.0
)
llama3_f1 = (
    2 * (llama3_precision * llama3_recall) / (llama3_precision + llama3_recall)
    if (llama3_precision + llama3_recall)
    else 0.0
)

print(f"OpenTargets: GPT-4 precision: {gpt4_precision}, recall: {gpt4_recall}, F1: {gpt4_f1}")
print(f"OpenTargets: Claude3.5 precision: {claude35_precision}, recall: {claude35_recall}, F1: {claude35_f1}")
print(f"OpenTargets: Llama3 precision: {llama3_precision}, recall: {llama3_recall}, F1: {llama3_f1}")

OpenTargets: GPT-4 precision: 0.8, recall: 0.8, F1: 0.8000000000000002
OpenTargets: Claude3.5 precision: 0.81, recall: 0.81, F1: 0.81
OpenTargets: Llama3 precision: 0.65625, recall: 0.63, F1: 0.6428571428571429


In [22]:
# Turn the dict of per-locus lists into a table and show its first 50 rows.
open_target_results_df = pd.DataFrame(data=open_target_results)
open_target_results_df.head(n=50)

Unnamed: 0,GWAS Phenotype,Genes,ground_truth_symbol,gpt4_pred,gpt4_confidence,gpt4_reason,claude35_pred,claude35_confidence,claude35_reason,llama3_pred,llama3_confidence,llama3_reason
0,Total Cholesterol,"{CYP7A1},{FAM110B},{NSMAF},{SDCBP},{TOX},{UBXN2B}",CYP7A1,CYP7A1,0.9,"CYP7A1 is involved in cholesterol metabolism, ...",CYP7A1,0.9,CYP7A1 encodes cholesterol 7-alpha-hydroxylase...,CYP7A1,0.8,CYP7A1 is a well-established enzyme involved i...
1,Heart failure,"{APH1B},{CA12},{DAPK2},{FBXL22},{HERC1},{LACTB...",CA12,TPM1,0.7,TPM1 is involved in cardiac muscle contraction...,TPM1,0.8,"TPM1 encodes tropomyosin, a crucial protein fo...",RAB8B,0.7,RAB8B has been implicated in cardiac developme...
2,Type 2 diabetes (type II diabetes mellitus),"{ARF5},{FSCN3},{GCC1},{GRM8},{PAX4},{SND1},{ZN...",PAX4,PAX4,0.8,PAX4 is a known regulator of pancreatic beta c...,PAX4,0.9,PAX4 is a transcription factor crucial for pan...,PAX4,0.7,PAX4 has been previously implicated in type 2 ...
3,metabolite: glycine (glycine measurement),"{ACADL},{CPS1},{LANCL1},{MYL1}",CPS1,CPS1,0.9,"CPS1 is directly involved in the urea cycle, w...",CPS1,0.9,"CPS1 encodes carbamoyl phosphate synthetase I,...",CPS1,0.8,CPS1 is the primary enzyme responsible for con...
4,Disorders of lipoid metabolism (Disorder of li...,"{ANKDD1B},{ANKRD31},{CERT1},{FAM169A},{GCNT4},...",HMGCR,HMGCR,0.9,HMGCR is directly involved in cholesterol bios...,HMGCR,0.95,"HMGCR encodes HMG-CoA reductase, the rate-limi...",HMGCR,0.8,HMGCR is a well-established gene involved in c...
5,Type 2 diabetes (type II diabetes mellitus),"{BLOC1S4},{C4orf50},{CRMP1},{ENSG00000170846},...",WFS1,WFS1,0.9,"WFS1 gene is associated with Wolfram syndrome,...",WFS1,0.9,"WFS1 mutations cause Wolfram syndrome, which i...",WFS1,0.8,WFS1 has been previously implicated in type 2 ...
6,Psoriasis,"{EBF1},{IL12B},{RNF145},{UBLCP1}",IL12B,IL12B,0.9,"IL12B is involved in the immune response, and ...",IL12B,0.9,"IL12B encodes interleukin-12 subunit beta, a c...",IL12B,0.8,IL12B has been implicated in the pathogenesis ...
7,"Nerve, nerve root and plexus disorders (periph...","{CCDC85A},{CCDC88A},{CFAP36},{EFEMP1},{PNPT1},...",EFEMP1,CCDC88A,0.7,"CCDC88A is involved in axon guidance, a critic...",PNPT1,0.8,PNPT1 mutations are associated with various pe...,CFAP36,0.8,CFAP36 is a peripheral myelin protein involved...
8,reticulocyte volume,"{AIM2},{CD1A},{CD1B},{CD1C},{CD1D},{CD1E},{IFI...",SPTA1,SPTA1,0.7,SPTA1 gene is involved in the production of sp...,SPTA1,0.9,"SPTA1 encodes spectrin alpha, a key component ...",IFI16,0.8,IFI16 is a known regulator of erythropoiesis a...
9,treatment: statin use (low density lipoprotein...,"{ACP5},{ANGPTL8},{AP1M2},{C19orf38},{CARM1},{C...",LDLR,LDLR,0.9,LDLR is directly involved in low-density lipop...,LDLR,0.9,LDLR encodes the low-density lipoprotein recep...,LDLR,0.8,LDLR is a well-established gene involved in LD...


In [37]:
# Show the full (untruncated) Claude 3.5 reason for the row at position 41.
open_target_results_df["claude35_reason"].iloc[41]

'FTO is strongly associated with obesity risk. It regulates energy homeostasis and adipocyte differentiation. Variants in FTO are consistently linked to increased BMI and obesity in multiple populations.'

In [31]:
# Persist the OpenTargets results. To reuse them without re-querying the models, load the
# file back instead: pd.read_csv("/mnt/hdd_2/abdu/llm_exp/open_targets_results_llms.csv")
open_target_results_df.to_csv(
    "/mnt/hdd_2/abdu/llm_exp/open_targets_results_llms.csv",
    index=False,
)
open_target_results_df.tail(n=5)

Unnamed: 0,GWAS Phenotype,Genes,ground_truth_symbol,gpt4_pred,gpt4_confidence,gpt4_reason,claude35_pred,claude35_confidence,claude35_reason,llama3_pred,llama3_confidence,llama3_reason
95,Disorders of lipoid metabolism (Disorder of li...,"{ACOT11},{BSND},{DHCR24},{FAM151A},{LEXM},{MRO...",PCSK9,PCSK9,0.9,PCSK9 is well-documented in literature for its...,PCSK9,0.9,PCSK9 is well-known to regulate LDL cholestero...,PCSK9,0.8,PCSK9 is a well-established gene involved in l...
96,treatment with nicorandil (coronary artery dis...,"{CDKN2A},{CDKN2B},{DMRTA1},{MTAP}",CDKN2B,CDKN2A,0.7,CDKN2A is associated with coronary artery dise...,CDKN2B,0.8,CDKN2B is associated with coronary artery dise...,CDKN2A,0.8,CDKN2A is a well-established regulator of cell...
97,Type 2 diabetes (type II diabetes mellitus),"{ANKH},{OTULIN},{OTULINL},{TRIO}",ANKH,ANKH,0.7,ANKH gene is associated with type 2 diabetes t...,ANKH,0.7,"ANKH regulates pyrophosphate levels, which are...",ANKH,0.8,ANKH has been previously implicated in type 2 ...
98,Type 2 diabetes (type II diabetes mellitus),"{AK3},{CDC37L1},{GLIS3},{PLPP6},{SLC1A1},{SPAT...",GLIS3,GLIS3,0.9,Multiple studies have identified GLIS3 as a ge...,GLIS3,0.9,GLIS3 is a transcription factor involved in pa...,GLIS3,0.8,GLIS3 has been previously implicated in type 2...
99,Fasting Glucose (D-Glucose),"{AEBP1},{BLVRA},{CAMK2B},{COA1},{DBNL},{DDX56}...",GCK,GCK,0.9,GCK (Glucokinase) is a key regulator of glucos...,GCK,0.9,GCK (Glucokinase) is a key enzyme in glucose m...,GCK,0.8,GCK is a well-established gene involved in glu...


In [27]:
# Run the same evaluation on the GWAS Catalog benchmark, reusing the seed set earlier.
(
    gwas_catalog_results,
    gwas_catalog_gpt4_pred,
    gwas_catalog_claude35_pred,
    gwas_catalog_llama3_pred,
) = evaluate_llm_on_benchmark(seed=seed, benchmark_df=gwas_catalog_df)

100%|█████████████████████████████████████████████████████████████████████████████| 100/100 [09:40<00:00,  5.80s/it]


In [28]:
# Tally the per-locus scores of each model on the GWAS Catalog sample.
# Score codes: -1 = hallucination, 1 = correct, 0 = incorrect.
n_hallucination_gpt4, n_correct_gpt4, n_incorrect_gpt4 = [
    sum(score == code for score in gwas_catalog_gpt4_pred) for code in (-1, 1, 0)
]
n_hallucination_claud35, n_correct_claud35, n_incorrect_claud35 = [
    sum(score == code for score in gwas_catalog_claude35_pred) for code in (-1, 1, 0)
]
n_hallucination_llama3, n_correct_llama3, n_incorrect_llama3 = [
    sum(score == code for score in gwas_catalog_llama3_pred) for code in (-1, 1, 0)
]

print(f"GWAS Catalog: GPT-4 hallucination: {n_hallucination_gpt4}, correct: {n_correct_gpt4}, incorrect: {n_incorrect_gpt4}")
print(f"GWAS Catalog: Claude3.5 hallucination: {n_hallucination_claud35}, correct: {n_correct_claud35}, incorrect: {n_incorrect_claud35}")
print(f"GWAS Catalog: Llama3 hallucination: {n_hallucination_llama3}, correct: {n_correct_llama3}, incorrect: {n_incorrect_llama3}")

GWAS Catalog: GPT-4 hallucination: 0, correct: 62, incorrect: 38
GWAS Catalog: Claude3.5 hallucination: 0, correct: 69, incorrect: 31
GWAS Catalog: Llama3 hallucination: 2, correct: 26, incorrect: 72


In [36]:
# Precision = correct / (correct + incorrect): hallucinated answers are excluded.
# Recall    = correct / (correct + hallucination + incorrect): share of all sampled loci.
# Every ratio falls back to 0.0 when its denominator is zero instead of raising
# ZeroDivisionError (e.g. a model whose answers were all hallucinations).
gpt4_answered = n_correct_gpt4 + n_incorrect_gpt4
gpt4_total = n_correct_gpt4 + n_hallucination_gpt4 + n_incorrect_gpt4
gpt4_precision = n_correct_gpt4 / gpt4_answered if gpt4_answered else 0.0
gpt4_recall = n_correct_gpt4 / gpt4_total if gpt4_total else 0.0

claude35_answered = n_correct_claud35 + n_incorrect_claud35
claude35_total = n_correct_claud35 + n_hallucination_claud35 + n_incorrect_claud35
claude35_precision = n_correct_claud35 / claude35_answered if claude35_answered else 0.0
claude35_recall = n_correct_claud35 / claude35_total if claude35_total else 0.0

llama3_answered = n_correct_llama3 + n_incorrect_llama3
llama3_total = n_correct_llama3 + n_hallucination_llama3 + n_incorrect_llama3
llama3_precision = n_correct_llama3 / llama3_answered if llama3_answered else 0.0
llama3_recall = n_correct_llama3 / llama3_total if llama3_total else 0.0

# F1 is the harmonic mean of precision and recall; it is 0.0 when both are 0.
gpt4_f1 = (
    2 * (gpt4_precision * gpt4_recall) / (gpt4_precision + gpt4_recall)
    if (gpt4_precision + gpt4_recall)
    else 0.0
)
claude35_f1 = (
    2 * (claude35_precision * claude35_recall) / (claude35_precision + claude35_recall)
    if (claude35_precision + claude35_recall)
    else 0.0
)
llama3_f1 = (
    2 * (llama3_precision * llama3_recall) / (llama3_precision + llama3_recall)
    if (llama3_precision + llama3_recall)
    else 0.0
)

print(f"GWAS Catalog: GPT-4 precision: {gpt4_precision}, recall: {gpt4_recall}, F1: {gpt4_f1}")
print(f"GWAS Catalog: Claude3.5 precision: {claude35_precision}, recall: {claude35_recall}, F1: {claude35_f1}")
print(f"GWAS Catalog: Llama3 precision: {llama3_precision}, recall: {llama3_recall}, F1: {llama3_f1}")

GWAS Catalog: GPT-4 precision: 0.62, recall: 0.62, F1: 0.62
GWAS Catalog: Claude3.5 precision: 0.69, recall: 0.69, F1: 0.69
GWAS Catalog: Llama3 precision: 0.2653061224489796, recall: 0.26, F1: 0.2626262626262626


In [32]:
# Tabulate and persist the GWAS Catalog results. To reuse them without re-querying the models,
# load the file back instead: pd.read_csv("/mnt/hdd_2/abdu/llm_exp/gwas_catalog_results_llms.csv")
gwas_catalog_results_df = pd.DataFrame(data=gwas_catalog_results)
gwas_catalog_results_df.to_csv(
    "/mnt/hdd_2/abdu/llm_exp/gwas_catalog_results_llms.csv",
    index=False,
)
gwas_catalog_results_df.tail(n=5)

Unnamed: 0,GWAS Phenotype,Genes,ground_truth_symbol,gpt4_pred,gpt4_confidence,gpt4_reason,claude35_pred,claude35_confidence,claude35_reason,llama3_pred,llama3_confidence,llama3_reason
95,Multi-trait sex score,"{ATP6V1B2},{CSGALNACT1},{INTS10},{LPL},{LZTS1}...",LPL,LPL,0.7,LPL gene is associated with lipid metabolism a...,LPL,0.8,"LPL encodes lipoprotein lipase, which plays a ...",CSGALNACT1,0.8,CSGALNACT1 is a known sex-determining gene in ...
96,Metabolite levels (N-acetylglucosaminylasparag...,"{CCR6},{CEP43},{ENSG00000249141},{ENSG00000272...",UNC93A,RPS6KA2,0.7,RPS6KA2 is involved in the regulation of cellu...,RNASET2,0.7,RNASET2 is involved in RNA metabolism and cell...,ENSG00...002790,0.8,Previous studies have linked GPR31 to the regu...
97,Multi-trait sex score,"{ATP6V1B2},{CSGALNACT1},{INTS10},{LPL},{LZTS1}...",LPL,LPL,0.7,LPL gene is associated with lipid metabolism a...,LPL,0.7,"LPL encodes lipoprotein lipase, involved in li...",CSGALNACT1,0.8,CSGALNACT1 is a known sex-determining gene in ...
98,Metabolite levels (propyl 4-hydroxybenzoate su...,"{APOBR},{ATP2A1},{ATXN2L},{CD19},{CLN3},{EIF3C...",SULT1A1,SULT1A1,0.8,SULT1A1 is involved in the sulfation and metab...,SULT1A1,0.9,SULT1A1 encodes a sulfotransferase enzyme invo...,SULT1A1,0.8,"Sulfotransferase enzymes, including SULT1A1, a..."
99,Metabolite levels (cysteine-glutathione disulf...,"{ADORA2A},{CABIN1},{ENSG00000286070},{GGT1},{G...",GGT1,GGT1,0.9,GGT1 is directly involved in glutathione metab...,GGT1,0.9,"GGT1 encodes gamma-glutamyltransferase 1, an e...",GGT1,0.8,GGT1 is a well-established enzyme responsible ...


In [34]:
# Rows where GPT-4's predicted gene differs from the ground-truth symbol.
gwas_catalog_gpt4_miss = gwas_catalog_results_df.loc[
    gwas_catalog_results_df["ground_truth_symbol"].ne(gwas_catalog_results_df["gpt4_pred"])
]
gwas_catalog_gpt4_miss
# To read the untruncated text of each miss, iterate over
# gwas_catalog_gpt4_miss.iterrows() and print the row's 'GWAS Phenotype', 'Genes',
# 'ground_truth_symbol' and the per-model *_pred / *_confidence / *_reason fields.

Unnamed: 0,GWAS Phenotype,Genes,ground_truth_symbol,gpt4_pred,gpt4_confidence,gpt4_reason,claude35_pred,claude35_confidence,claude35_reason,llama3_pred,llama3_confidence,llama3_reason
5,Multi-trait sex score,"{APOA1},{APOA4},{APOA5},{APOC3},{BUD13},{PAFAH...",APOA4,APOA5,0.7,APOA5 is associated with lipid metabolism and ...,APOA5,0.8,APOA5 is involved in lipid metabolism and has ...,APOA4,0.8,APOA4 has been implicated in sex determination...
7,Multi-trait sex score,"{ANKRD29},{CABLES1},{LAMA3},{NPC1},{RBBP8},{RI...",TMEM241,NPC1,0.7,NPC1 gene is associated with lipid metabolism ...,NPC1,0.8,NPC1 is involved in cholesterol trafficking an...,ANKRD29,0.7,ANKRD29 has been implicated in sex determinati...
9,Multi-trait sex score,"{ARL2BP},{BBS2},{CCL17},{CCL22},{CETP},{CIAPIN...",CETP,ARL2BP,0.7,ARL2BP is involved in sex determination and di...,CETP,0.8,CETP (Cholesteryl Ester Transfer Protein) is s...,ARL2BP,0.8,ARL2BP has been implicated in sex determinatio...
16,Triglyceride to HDL cholesterol ratio,"{APOA1},{APOA4},{APOA5},{APOC3},{BACE1},{BUD13...",CEP164,APOA5,0.9,APOA5 is directly involved in triglyceride met...,APOA5,0.9,APOA5 is strongly associated with triglyceride...,APOA1,0.8,APOA1 is a well-established regulator of trigl...
18,Anterior amygdaloid area volume,"{BANK1},{MANBA},{NFKB1},{SLC39A8}",SLC39A8,NFKB1,0.7,NFKB1 is involved in neurodevelopment and neur...,SLC39A8,0.7,SLC39A8 is involved in zinc transport and has ...,BANK1,0.9,BANK1 has been implicated in the development a...
19,Multi-trait sex score,"{BMP8A},{BMP8B},{CAP1},{HEYL},{HPCAL4},{MACF1}...",MACF1,BMP8B,0.7,BMP8B is involved in reproductive processes an...,MFSD2A,0.7,MFSD2A is involved in lipid transport and meta...,BMP8A,0.7,BMP8A has been implicated in sex determination...
20,Triglyceride to HDL cholesterol ratio,"{IGF2R},{LPA},{MAP3K4},{PLG},{SLC22A1},{SLC22A...",SLC22A1,LPA,0.8,LPA gene is associated with lipid metabolism a...,LPA,0.9,"LPA encodes lipoprotein(a), strongly associate...",LPA,0.8,Strong association with triglyceride levels an...
23,Body mass index,"{CACNA1D},{DCP1A},{ENSG00000243696},{ENSG00000...",ITIH3,CACNA1D,0.7,"CACNA1D is involved in calcium signaling, whic...",PRKCD,0.7,"PRKCD encodes protein kinase C delta, involved...",CACNA1D,0.8,CACNA1D has been implicated in obesity and met...
24,Multi-trait sex score,"{ADAP2},{ATAD5},{BLMH},{CPD},{CRLF3},{GOSR1},{...",ATAD5,SLC6A4,0.7,SLC6A4 is involved in the regulation of neurot...,SLC6A4,0.8,"SLC6A4 encodes the serotonin transporter, whic...",ADAP2,0.8,ADAP2 has been implicated in sex-specific gene...
27,Medial nucleus volume,"{BANK1},{PPP3CA},{SLC39A8}",SLC39A8,PPP3CA,0.7,PPP3CA is involved in neurodevelopment and syn...,SLC39A8,0.7,SLC39A8 is involved in metal ion transport and...,BANK1,0.9,BANK1 has been implicated in neurodevelopmenta...


In [35]:
# Rows where Claude 3.5's predicted gene differs from the ground-truth symbol.
gwas_catalog_claude35_miss = gwas_catalog_results_df.loc[
    gwas_catalog_results_df["ground_truth_symbol"].ne(gwas_catalog_results_df["claude35_pred"])
]
gwas_catalog_claude35_miss

Unnamed: 0,GWAS Phenotype,Genes,ground_truth_symbol,gpt4_pred,gpt4_confidence,gpt4_reason,claude35_pred,claude35_confidence,claude35_reason,llama3_pred,llama3_confidence,llama3_reason
5,Multi-trait sex score,"{APOA1},{APOA4},{APOA5},{APOC3},{BUD13},{PAFAH...",APOA4,APOA5,0.7,APOA5 is associated with lipid metabolism and ...,APOA5,0.8,APOA5 is involved in lipid metabolism and has ...,APOA4,0.8,APOA4 has been implicated in sex determination...
7,Multi-trait sex score,"{ANKRD29},{CABLES1},{LAMA3},{NPC1},{RBBP8},{RI...",TMEM241,NPC1,0.7,NPC1 gene is associated with lipid metabolism ...,NPC1,0.8,NPC1 is involved in cholesterol trafficking an...,ANKRD29,0.7,ANKRD29 has been implicated in sex determinati...
16,Triglyceride to HDL cholesterol ratio,"{APOA1},{APOA4},{APOA5},{APOC3},{BACE1},{BUD13...",CEP164,APOA5,0.9,APOA5 is directly involved in triglyceride met...,APOA5,0.9,APOA5 is strongly associated with triglyceride...,APOA1,0.8,APOA1 is a well-established regulator of trigl...
19,Multi-trait sex score,"{BMP8A},{BMP8B},{CAP1},{HEYL},{HPCAL4},{MACF1}...",MACF1,BMP8B,0.7,BMP8B is involved in reproductive processes an...,MFSD2A,0.7,MFSD2A is involved in lipid transport and meta...,BMP8A,0.7,BMP8A has been implicated in sex determination...
20,Triglyceride to HDL cholesterol ratio,"{IGF2R},{LPA},{MAP3K4},{PLG},{SLC22A1},{SLC22A...",SLC22A1,LPA,0.8,LPA gene is associated with lipid metabolism a...,LPA,0.9,"LPA encodes lipoprotein(a), strongly associate...",LPA,0.8,Strong association with triglyceride levels an...
23,Body mass index,"{CACNA1D},{DCP1A},{ENSG00000243696},{ENSG00000...",ITIH3,CACNA1D,0.7,"CACNA1D is involved in calcium signaling, whic...",PRKCD,0.7,"PRKCD encodes protein kinase C delta, involved...",CACNA1D,0.8,CACNA1D has been implicated in obesity and met...
24,Multi-trait sex score,"{ADAP2},{ATAD5},{BLMH},{CPD},{CRLF3},{GOSR1},{...",ATAD5,SLC6A4,0.7,SLC6A4 is involved in the regulation of neurot...,SLC6A4,0.8,"SLC6A4 encodes the serotonin transporter, whic...",ADAP2,0.8,ADAP2 has been implicated in sex-specific gene...
31,COVID-19 hospitalization or osteoarthritis (MTAG),"{CCR1},{CCR2},{CCR3},{CCR5},{CCR9},{CCRL2},{CX...",FYCO1,CCR5,0.8,CCR5 has been implicated in COVID-19 severity ...,CCR5,0.8,CCR5 is involved in immune response and has be...,CCR5,0.8,"As a chemokine receptor, CCR5 has been implica..."
32,Multi-trait sex score,"{APOA1},{APOA4},{APOA5},{APOC3},{BUD13},{PAFAH...",APOA4,APOA1,0.7,APOA1 is involved in lipid metabolism and has ...,APOA5,0.8,APOA5 is involved in lipid metabolism and has ...,ZPR1,0.8,ZPR1 has been implicated in sex determination ...
34,Primary open angle glaucoma (multi-trait analy...,"{CFAP52},{DHRS7C},{GAS7},{GLP2R},{GSG1L2},{MYH...",MYH13,MYH1,0.7,MYH1 has been implicated in eye muscle develop...,GAS7,0.8,GAS7 is associated with intraocular pressure r...,GAS7,0.6,GAS7 has been implicated in the regulation of ...
