### Baseline Experiment: 
- Prepare the Query Embedding
- Retrieve Abstracts and Compute Embeddings
- Calculate Cosine Similarity Using SBERT's `semantic_search` Utility
- Select the Best Abstract for Each Edge
- Ensure at Least One Selection per Edge

In [3]:
import torch
import json
import time
import os
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm import tqdm
import random
import text_util
import openai
from openai import OpenAI
from agatha.construct.semrep_handler import SemRepHandler
from agatha.util.sqlite3_lookup import Sqlite3LookupTable
from sentence_transformers import SentenceTransformer, util
import warnings
from openai_llm import oai_get_response

# Set before any tokenizer work happens in this kernel. The variable is read by the
# HuggingFace tokenizers library (presumably to avoid its fork/parallelism warning -- TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Silence every Python warning for the rest of the session so cell output stays readable.
warnings.filterwarnings(action='ignore')

In [4]:
# Absolute location of the shared Agatha sentence database (cluster-specific path).
agatha_sent_db_path = '/work/acslab/shared/Agatha_shared/sentences.sqlite3'

# Lookup-table handle over that SQLite file; the abstract-retrieval helpers below read from it.
sents_db = Sqlite3LookupTable(agatha_sent_db_path)

### Retrieve Paths from the HGCR Component for the Pair:

In [5]:
# Load the candidate paths for the amiodarone/mefloquine pair.
# NOTE: pickle files can execute arbitrary code on load -- only read files you trust.
df = pd.read_pickle('amiodarone_mefloquine.pkl')

# Keep the first 20 rows of the frame for the baseline run.
top_20_df = df.head(20)
top_20_df

Unnamed: 0,path,score_std,score_mean,dec_path,context_pmids
155,"[C0002598, C0055661, C1412079, C0025153]",0.0,0.998306,"[amiodarone, chrysin, ABCC4 gene, mefloquine]","{('C0002598', 'C0055661'): ['34534572'], ('C00..."
392,"[C0002598, C0018995, C0037285, C0025153]",0.0,0.973623,"[amiodarone, Hemochromatosis, Skin Manifestati...","{('C0002598', 'C0018995'): ['9141611'], ('C001..."
351,"[C0002598, C1420133, C0024537, C0025153]",0.0181,0.966783,"[amiodarone, SLCO2B1 gene, Malaria, Vivax, mef...","{('C0002598', 'C1420133'): ['19285480'], ('C14..."
295,"[C0002598, C0051231, C0210995, C0025153]",0.006013,0.966562,"[amiodarone, allyl isothiocyanate, mismatch re...","{('C0002598', 'C0051231'): ['25069801'], ('C00..."
329,"[C0002598, C0002390, C0035891, C0025153]",0.0,0.964968,"[amiodarone, Extrinsic allergic alveolitis, ro...","{('C0002598', 'C0002390'): ['33792260'], ('C00..."
126,"[C0002598, C0029132, C0005699, C0025153]",0.025416,0.953859,"[amiodarone, Disorder of the optic nerve, Blas...","{('C0002598', 'C0029132'): ['18760119', '97752..."
5,"[C0002598, C0010194, C0286738, C0025153]",0.0,0.946645,"[amiodarone, Cotinine, saquinavir, mefloquine]","{('C0002598', 'C0010194'): ['21308701'], ('C00..."
273,"[C0002598, C2976303, C0068788, C0025153]",0.031924,0.941339,"[amiodarone, sofosbuvir, nitazoxanide, mefloqu...","{('C0002598', 'C2976303'): ['27503387', '26416..."
363,"[C0002598, C1414236, C0037285, C0025153]",0.0,0.940781,"[amiodarone, EBP gene, Skin Manifestations, me...","{('C0002598', 'C1414236'): ['32286791'], ('C14..."
10,"[C0002598, C0085786, C0034068, C0025153]",0.0,0.939823,"[amiodarone, Hamman-Rich syndrome, Pulmonary E...","{('C0002598', 'C0085786'): ['21686894', '65192..."


In [6]:
# Notebook-wide SBERT encoder; explicitly placed on the CPU.
model_sbert = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device="cpu",
)

### Generate a Baseline Response per Path via the gpt-4o Model:

In [7]:
def get_embedding(text):
    """Encode ``text`` with the notebook-level ``model_sbert`` and return the result as a tensor."""
    embedding = model_sbert.encode(text, convert_to_tensor=True)
    return embedding

def retrieve_single_abstracts_from_pmid(pmid):
    """Return the abstract text for a single PMID.

    The lookup goes through ``text_util.get_abstr_text`` against the
    notebook-level ``sents_db`` handle (not passed in as an argument).
    """
    return text_util.get_abstr_text(pmid, sents_db)

def retrieve_abstracts_from_pmids(pmids, sents_db):
    """Return the abstract text for every PMID in ``pmids``, in input order.

    Args:
        pmids: Iterable of PMIDs to look up.
        sents_db: Database handle forwarded to ``text_util.get_abstr_text``.

    Returns:
        A list with one entry per PMID, as returned by ``text_util.get_abstr_text``.
    """
    # Wrap the iterable so a progress bar is shown while the lookups run.
    progress = tqdm(pmids, desc='Retrieving abstracts')
    return [text_util.get_abstr_text(one_pmid, sents_db) for one_pmid in progress]

def find_best_selection_context_per_path_sbert(source, target, edges_pmids, verbose=True):
    """Pick, for every edge of a path, the candidate abstract closest to the source/target query.

    The query is the string ``"<source> <target>"``, embedded via ``get_embedding``.
    For each edge, the abstracts of its candidate PMIDs are embedded with ``model_sbert``
    and ranked against the query with ``util.semantic_search``; the top-ranked PMID and
    its score are recorded for that edge.

    Args:
        source: Name of the source term (first half of the query string).
        target: Name of the target term (second half of the query string).
        edges_pmids: Mapping ``edge -> sequence of candidate PMIDs``.
        verbose: When True (the default, matching the previous behaviour) print the
            query, the retrieved abstract texts and the candidate count per edge.
            Pass False to keep notebook output small.

    Returns:
        A tuple ``(best_selection, cosine_similarities, search_results)`` where
        ``best_selection`` maps each processed edge to its top-ranked PMID,
        ``cosine_similarities`` maps each processed edge to that PMID's score, and
        ``search_results`` is the raw ``util.semantic_search`` output of the LAST
        edge processed (``None`` when no edge was processed).
    """
    query = f"{source} {target}"
    query_embedding = get_embedding(query)
    if verbose:
        print(f"Query: {query}")

    best_selection = {}
    cosine_similarities = {}
    # Initialised up front so the return statement is valid even when the loop body
    # never runs (empty mapping, or every edge has no candidates).
    search_results = None
    for edge, pmids in edges_pmids.items():
        if len(pmids) == 0:
            # Nothing to rank for this edge; indexing the search result would fail.
            if verbose:
                print(f"No candidate PMIDs for edge {edge}; skipping")
            continue

        pmid_texts = [retrieve_single_abstracts_from_pmid(pmid) for pmid in pmids]
        pmid_embeddings = model_sbert.encode(pmid_texts, convert_to_tensor=True, show_progress_bar=False)

        # Rank every candidate so the first hit is the overall best for this edge.
        top_k = len(pmid_embeddings)
        if verbose:
            print(f"PMID Texts: {pmid_texts}")
            print(f"top_k: {top_k}")

        search_results = util.semantic_search(query_embedding, pmid_embeddings, top_k=top_k)
        # search_results[0] holds the hits for the single query; its first entry is the top hit.
        best_hit = search_results[0][0]

        best_selection[edge] = pmids[best_hit['corpus_id']]
        cosine_similarities[edge] = best_hit['score']

    return best_selection, cosine_similarities, search_results

In [8]:
def process_dataframe_row(row, source="amiodarone", target="mefloquine"):
    """Run the SBERT per-edge selection for one path row.

    Intended for ``DataFrame.apply(..., axis=1)``; ``source``/``target`` can be
    overridden through ``apply``'s keyword arguments and default to the pair this
    notebook was written for.

    Args:
        row: A row exposing ``row['context_pmids']`` (mapping of edge -> candidate PMIDs).
        source: Source term used to build the similarity query.
        target: Target term used to build the similarity query.

    Returns:
        A ``pd.Series`` with two entries: ``best_selection_sbert`` (edge -> chosen PMID)
        and ``cosine_similarity`` (edge -> score of the chosen PMID).
    """
    edges_pmids = row['context_pmids']
    # The third return value (raw search results) is not needed here.
    best_selection, cosine_similarities, _ = find_best_selection_context_per_path_sbert(
        source, target, edges_pmids
    )
    return pd.Series({'best_selection_sbert': best_selection, 'cosine_similarity': cosine_similarities})

In [9]:
def generate_llm_response(row, source="amiodarone", target="mefloquine"):
    """Ask the LLM to describe the indirect source/target relationship for one path row.

    The abstracts of the PMIDs chosen by the SBERT step are retrieved, appended to a
    fixed question about ``source`` and ``target``, and sent through ``oai_get_response``.
    Intended for ``DataFrame.apply(..., axis=1)``; ``source``/``target`` default to the
    pair this notebook was written for.

    Args:
        row: A row exposing ``row['best_selection_sbert']`` (mapping of edge -> chosen PMID).
        source: Source term inserted into the prompt.
        target: Target term inserted into the prompt.

    Returns:
        Whatever ``oai_get_response`` returns for the assembled prompt.
    """
    best_selection = row['best_selection_sbert']
    pmids = list(best_selection.values())
    print("Here is the list of PMIDs according to sbert:", ', '.join(map(str, pmids)))

    path_context_abstracts = retrieve_abstracts_from_pmids(pmids, sents_db)

    llm_prompt_template = "How would you describe an indirect relationship between {source} and {target} given the following scientific abstracts as contexts?"
    llm_fix_prompt_str = "\n\n".join(path_context_abstracts)
    llm_fix_prompt_combo = llm_prompt_template.format(source=source, target=target) + "\n\n" + llm_fix_prompt_str

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    # Near-zero temp/top_p with a fixed seed -- presumably to make the response as
    # repeatable as possible; confirm against oai_get_response.
    llm_resp_path = oai_get_response(llm_fix_prompt_combo, temp=1e-19, top_p=1e-9, seed=1234)
    duration = time.perf_counter() - start_time
    print(f"Time taken for oai_get_response gpt4o: {duration:.2f} seconds")

    return llm_resp_path

In [10]:
# Stage 1: for every path row, let SBERT choose one PMID per edge.
intermediate_df = top_20_df.apply(process_dataframe_row, axis=1)

# Stage 2: build the result frame as a copy of the input plus the two SBERT columns
# (top_20_df itself is left unmodified).
res_df = top_20_df.assign(
    best_selection_sbert=intermediate_df['best_selection_sbert'],
    cosine_similarity=intermediate_df['cosine_similarity'],
)

# Stage 3: one LLM call per row, using the abstracts selected in stage 1.
res_df['llm_response_bl'] = res_df.apply(generate_llm_response, axis=1)

# Persisting the results is currently disabled; uncomment to save them.
# res_df.to_pickle('Baseline_SBERT_gpt4o.pkl')
res_df

Query: amiodarone mefloquine
PMID Texts: ["Experimental study on the effect of chrysin on skin injury induced by amiodarone extravasation in rats. Amiodarone is the first choice for the treatment of arrhythmia, but it is easy to cause extravasation during infusion, after extravasation, it often cause skin injury. The healing of skin injury induced by amiodarone is an inflammatory process. Chrysin, a natural flavonoid, has been investigated to have anti-inflammatory and antioxidant effects. It was reported that chrysin can promote wound healing. So this study aims to investigate the effect of chrysin on amiodarone extravasation-induced skin injury model in rats. The rat model of skin extravasation injury was established by subcutaneous injection of 0.5 mL of amiodarone. After successful modeling, the rats were randomly assigned to the five groups: control group, 10% DMSO group, and low-dose, medium-dose, and high-dose chrysin groups (10, 20 and 40 mg/mL). The extravasation injury model 

Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 12865.96it/s]


Time taken for oai_get_response gpt4o: 9.52 seconds
Here is the list of PMIDs according to sbert: 9141611, 31496470, 16938613


Retrieving abstracts: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 9078.58it/s]


Time taken for oai_get_response gpt4o: 12.25 seconds
Here is the list of PMIDs according to sbert: 19285480, 28975866, 1603018


Retrieving abstracts: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 7354.13it/s]


Time taken for oai_get_response gpt4o: 7.57 seconds
Here is the list of PMIDs according to sbert: 25069801, 24672635, 16004972


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 12520.31it/s]


Time taken for oai_get_response gpt4o: 18.87 seconds
Here is the list of PMIDs according to sbert: 33792260, 17008299, 17118354


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 16384.00it/s]


Time taken for oai_get_response gpt4o: 8.83 seconds
Here is the list of PMIDs according to sbert: 18760119, 31109500, 31276961


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 13079.95it/s]


Time taken for oai_get_response gpt4o: 10.39 seconds
Here is the list of PMIDs according to sbert: 21308701, 20950334, 15923343


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 14234.06it/s]


Time taken for oai_get_response gpt4o: 12.82 seconds
Here is the list of PMIDs according to sbert: 25853664, 35953881, 25773183


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 12892.33it/s]


Time taken for oai_get_response gpt4o: 8.63 seconds
Here is the list of PMIDs according to sbert: 32286791, 34490468, 16938613


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 16236.02it/s]


Time taken for oai_get_response gpt4o: 9.55 seconds
Here is the list of PMIDs according to sbert: 6519217, 11144923, 18041895


Retrieving abstracts: 100%|████████████████████████████████████████████████████| 3/3 [00:00<00:00, 9716.53it/s]


Time taken for oai_get_response gpt4o: 7.67 seconds
Here is the list of PMIDs according to sbert: 35654917, 34546608, 16938613


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 12620.77it/s]


Time taken for oai_get_response gpt4o: 7.82 seconds
Here is the list of PMIDs according to sbert: 16902058, 35625015, 35833815


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 14546.72it/s]


Time taken for oai_get_response gpt4o: 9.05 seconds
Here is the list of PMIDs according to sbert: 9641477, 31330137, 11561091


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 11224.72it/s]


Time taken for oai_get_response gpt4o: 10.08 seconds
Here is the list of PMIDs according to sbert: 36417914, 15967050, 29107173


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 14106.40it/s]


Time taken for oai_get_response gpt4o: 9.96 seconds
Here is the list of PMIDs according to sbert: 23992288, 15656803, 16938613


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 21435.97it/s]


Time taken for oai_get_response gpt4o: 8.76 seconds
Here is the list of PMIDs according to sbert: 19703566, 16041239, 11125902


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 18027.09it/s]


Time taken for oai_get_response gpt4o: 12.56 seconds
Here is the list of PMIDs according to sbert: 19841450, 22711111, 32903470


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 18950.17it/s]


Time taken for oai_get_response gpt4o: 15.57 seconds
Here is the list of PMIDs according to sbert: 18749804, 32360350, 26216464


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 18342.44it/s]


Time taken for oai_get_response gpt4o: 8.24 seconds
Here is the list of PMIDs according to sbert: 25933611, 26735991, 2834931


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 16710.37it/s]


Time taken for oai_get_response gpt4o: 8.39 seconds
Here is the list of PMIDs according to sbert: 28098197, 20395154, 3265137


Retrieving abstracts: 100%|███████████████████████████████████████████████████| 3/3 [00:00<00:00, 19269.39it/s]


Time taken for oai_get_response gpt4o: 11.04 seconds


Unnamed: 0,path,score_std,score_mean,dec_path,context_pmids,best_selection_sbert,cosine_similarity,llm_response_bl
155,"[C0002598, C0055661, C1412079, C0025153]",0.0,0.998306,"[amiodarone, chrysin, ABCC4 gene, mefloquine]","{('C0002598', 'C0055661'): ['34534572'], ('C00...","{('C0002598', 'C0055661'): '34534572', ('C0055...","{('C0002598', 'C0055661'): 0.30451369285583496...",An indirect relationship between amiodarone an...
392,"[C0002598, C0018995, C0037285, C0025153]",0.0,0.973623,"[amiodarone, Hemochromatosis, Skin Manifestati...","{('C0002598', 'C0018995'): ['9141611'], ('C001...","{('C0002598', 'C0018995'): '9141611', ('C00189...","{('C0002598', 'C0018995'): 0.31887203454971313...","Based on the provided abstracts, an indirect r..."
351,"[C0002598, C1420133, C0024537, C0025153]",0.0181,0.966783,"[amiodarone, SLCO2B1 gene, Malaria, Vivax, mef...","{('C0002598', 'C1420133'): ['19285480'], ('C14...","{('C0002598', 'C1420133'): '19285480', ('C1420...","{('C0002598', 'C1420133'): 0.13925690948963165...","Based on the provided abstracts, an indirect r..."
295,"[C0002598, C0051231, C0210995, C0025153]",0.006013,0.966562,"[amiodarone, allyl isothiocyanate, mismatch re...","{('C0002598', 'C0051231'): ['25069801'], ('C00...","{('C0002598', 'C0051231'): '25069801', ('C0051...","{('C0002598', 'C0051231'): 0.14810478687286377...",An indirect relationship between amiodarone an...
329,"[C0002598, C0002390, C0035891, C0025153]",0.0,0.964968,"[amiodarone, Extrinsic allergic alveolitis, ro...","{('C0002598', 'C0002390'): ['33792260'], ('C00...","{('C0002598', 'C0002390'): '33792260', ('C0002...","{('C0002598', 'C0002390'): 0.3570442497730255,...",An indirect relationship between amiodarone an...
126,"[C0002598, C0029132, C0005699, C0025153]",0.025416,0.953859,"[amiodarone, Disorder of the optic nerve, Blas...","{('C0002598', 'C0029132'): ['18760119', '97752...","{('C0002598', 'C0029132'): '18760119', ('C0029...","{('C0002598', 'C0029132'): 0.4765419363975525,...",An indirect relationship between amiodarone an...
5,"[C0002598, C0010194, C0286738, C0025153]",0.0,0.946645,"[amiodarone, Cotinine, saquinavir, mefloquine]","{('C0002598', 'C0010194'): ['21308701'], ('C00...","{('C0002598', 'C0010194'): '21308701', ('C0010...","{('C0002598', 'C0010194'): 0.18645814061164856...",An indirect relationship between amiodarone an...
273,"[C0002598, C2976303, C0068788, C0025153]",0.031924,0.941339,"[amiodarone, sofosbuvir, nitazoxanide, mefloqu...","{('C0002598', 'C2976303'): ['27503387', '26416...","{('C0002598', 'C2976303'): '25853664', ('C2976...","{('C0002598', 'C2976303'): 0.39631110429763794...",An indirect relationship between amiodarone an...
363,"[C0002598, C1414236, C0037285, C0025153]",0.0,0.940781,"[amiodarone, EBP gene, Skin Manifestations, me...","{('C0002598', 'C1414236'): ['32286791'], ('C14...","{('C0002598', 'C1414236'): '32286791', ('C1414...","{('C0002598', 'C1414236'): 0.3766741156578064,...",An indirect relationship between amiodarone an...
10,"[C0002598, C0085786, C0034068, C0025153]",0.0,0.939823,"[amiodarone, Hamman-Rich syndrome, Pulmonary E...","{('C0002598', 'C0085786'): ['21686894', '65192...","{('C0002598', 'C0085786'): '6519217', ('C00857...","{('C0002598', 'C0085786'): 0.45052632689476013...",An indirect relationship between amiodarone an...


In [11]:
res_df

Unnamed: 0,path,score_std,score_mean,dec_path,context_pmids,best_selection_sbert,cosine_similarity,llm_response_bl
155,"[C0002598, C0055661, C1412079, C0025153]",0.0,0.998306,"[amiodarone, chrysin, ABCC4 gene, mefloquine]","{('C0002598', 'C0055661'): ['34534572'], ('C00...","{('C0002598', 'C0055661'): '34534572', ('C0055...","{('C0002598', 'C0055661'): 0.30451369285583496...",An indirect relationship between amiodarone an...
392,"[C0002598, C0018995, C0037285, C0025153]",0.0,0.973623,"[amiodarone, Hemochromatosis, Skin Manifestati...","{('C0002598', 'C0018995'): ['9141611'], ('C001...","{('C0002598', 'C0018995'): '9141611', ('C00189...","{('C0002598', 'C0018995'): 0.31887203454971313...","Based on the provided abstracts, an indirect r..."
351,"[C0002598, C1420133, C0024537, C0025153]",0.0181,0.966783,"[amiodarone, SLCO2B1 gene, Malaria, Vivax, mef...","{('C0002598', 'C1420133'): ['19285480'], ('C14...","{('C0002598', 'C1420133'): '19285480', ('C1420...","{('C0002598', 'C1420133'): 0.13925690948963165...","Based on the provided abstracts, an indirect r..."
295,"[C0002598, C0051231, C0210995, C0025153]",0.006013,0.966562,"[amiodarone, allyl isothiocyanate, mismatch re...","{('C0002598', 'C0051231'): ['25069801'], ('C00...","{('C0002598', 'C0051231'): '25069801', ('C0051...","{('C0002598', 'C0051231'): 0.14810478687286377...",An indirect relationship between amiodarone an...
329,"[C0002598, C0002390, C0035891, C0025153]",0.0,0.964968,"[amiodarone, Extrinsic allergic alveolitis, ro...","{('C0002598', 'C0002390'): ['33792260'], ('C00...","{('C0002598', 'C0002390'): '33792260', ('C0002...","{('C0002598', 'C0002390'): 0.3570442497730255,...",An indirect relationship between amiodarone an...
126,"[C0002598, C0029132, C0005699, C0025153]",0.025416,0.953859,"[amiodarone, Disorder of the optic nerve, Blas...","{('C0002598', 'C0029132'): ['18760119', '97752...","{('C0002598', 'C0029132'): '18760119', ('C0029...","{('C0002598', 'C0029132'): 0.4765419363975525,...",An indirect relationship between amiodarone an...
5,"[C0002598, C0010194, C0286738, C0025153]",0.0,0.946645,"[amiodarone, Cotinine, saquinavir, mefloquine]","{('C0002598', 'C0010194'): ['21308701'], ('C00...","{('C0002598', 'C0010194'): '21308701', ('C0010...","{('C0002598', 'C0010194'): 0.18645814061164856...",An indirect relationship between amiodarone an...
273,"[C0002598, C2976303, C0068788, C0025153]",0.031924,0.941339,"[amiodarone, sofosbuvir, nitazoxanide, mefloqu...","{('C0002598', 'C2976303'): ['27503387', '26416...","{('C0002598', 'C2976303'): '25853664', ('C2976...","{('C0002598', 'C2976303'): 0.39631110429763794...",An indirect relationship between amiodarone an...
363,"[C0002598, C1414236, C0037285, C0025153]",0.0,0.940781,"[amiodarone, EBP gene, Skin Manifestations, me...","{('C0002598', 'C1414236'): ['32286791'], ('C14...","{('C0002598', 'C1414236'): '32286791', ('C1414...","{('C0002598', 'C1414236'): 0.3766741156578064,...",An indirect relationship between amiodarone an...
10,"[C0002598, C0085786, C0034068, C0025153]",0.0,0.939823,"[amiodarone, Hamman-Rich syndrome, Pulmonary E...","{('C0002598', 'C0085786'): ['21686894', '65192...","{('C0002598', 'C0085786'): '6519217', ('C00857...","{('C0002598', 'C0085786'): 0.45052632689476013...",An indirect relationship between amiodarone an...
