In [None]:
from datasets import concatenate_datasets
from datasets import load_dataset
from openai import OpenAI
from pathlib import Path
from tqdm import tqdm
from prompts import *
import pandas as pd
import tiktoken
import pickle
import json
import time
import re

# Mined interaction tables: drug-drug (DDI), molecular relations, drug-protein (DPI).
ddi_subset = pd.read_csv("../data/mined_data/final_DDI.csv")
molecular_relations_df = pd.read_csv("../data/OAI/molecular_interactions/molecular_interactions_df.csv")
dpi_subset = pd.read_csv("../data/mined_data/final_DPI.csv")

# Every drug that appears on either side of a DDI row, and every protein in the DPI table.
all_drugs = set(ddi_subset["drug_1_name"].unique()) | set(ddi_subset["drug_2_name"].unique())
all_proteins = dpi_subset["protein_name"].unique()

# For use during Mol questions: drug name -> SMILES, filled row by row
# (a later row overwrites an earlier entry for the same drug name).
drug_smiles_mapping = {}
for row in ddi_subset.itertuples():
    drug_smiles_mapping.update({row.drug_1_name: row.drug_1_SMILES,
                                row.drug_2_name: row.drug_2_SMILES})

client = OpenAI()
gpt_tokenizer = tiktoken.encoding_for_model("gpt-4o")

# Common Functions

In [None]:
def print_sample(sample_list):
    """Print the first request of `sample_list` for a quick visual check.

    The content of the first two messages of that request is printed,
    separated by one blank line.
    """
    messages = sample_list[0]["body"]["messages"]
    print("\n\n".join((messages[0]["content"], messages[1]["content"])))

def count_tokens(sample_list):
    """Print the total token count of the first two messages of every request.

    Tokens are counted with the module-level `gpt_tokenizer`; the total is
    printed, nothing is returned.
    """
    def _n_tokens(text):
        return len(gpt_tokenizer.encode(text))

    total_tokens = sum(
        _n_tokens(sample["body"]["messages"][0]["content"])
        + _n_tokens(sample["body"]["messages"][1]["content"])
        for sample in sample_list
    )
    print(total_tokens)

def retrieve_text(entity_name, entity_type):
    """Return the background text stored for an entity.

    The file under `Wiki_complexified/` is used when it exists; otherwise the
    file under `Wiki/` is read (and a missing file raises as usual).

    Args:
        entity_name: File stem of the entity's text file.
        entity_type: Prefix of the `<entity_type>_data` directory.
    """
    source_path = Path(f"../data/background_information_data/{entity_type}_data/Wiki_complexified/{entity_name}.txt")
    if not source_path.exists():
        # No complexified version on disk: fall back to the plain Wiki text.
        source_path = Path(f"../data/background_information_data/{entity_type}_data/Wiki/{entity_name}.txt")
    return source_path.read_text()

def save_batch(batch, output_path):
    """Write `batch` to `output_path` as JSON Lines (one JSON object per line).

    The file is opened in write mode, so existing content is replaced.
    """
    with Path(output_path).open('w') as out_file:
        out_file.writelines(json.dumps(sample) + '\n' for sample in batch)

def create_batches(formatted_samples, base_path, max_tokens=90_000):
    """Split requests into token-bounded batches and write each one to disk.

    Samples are packed in order. A batch is flushed to
    `{base_path}/batch_{id}_input.jsonl` as soon as adding the next sample
    would push the batch's token count (first two messages of every request,
    counted with `gpt_tokenizer`) above `max_tokens`.

    Args:
        formatted_samples: Iterable of request dicts with
            `sample["body"]["messages"][0..1]["content"]`.
        base_path: Directory the batch files are written into.
        max_tokens: Token budget per batch file (default 90K, the limit this
            notebook was written against).

    Notes:
        A single sample that is larger than `max_tokens` is written alone in
        its own batch and a warning is printed. The previous implementation
        never advanced past such a sample and looped forever, writing empty
        batch files.
    """
    batch = []
    batch_tokens = 0
    batch_id = 0
    for sample in formatted_samples:
        messages = sample["body"]["messages"]
        # Tokenize once per sample; the count is reused for both the check and the running total.
        sample_tokens = len(gpt_tokenizer.encode(messages[0]["content"])) + \
                        len(gpt_tokenizer.encode(messages[1]["content"]))
        if sample_tokens > max_tokens:
            print(f"Warning: sample {sample.get('custom_id', '<unknown>')} has {sample_tokens} tokens, "
                  f"above the {max_tokens} limit; it is written in a batch of its own.")
        # Flush BEFORE appending so the sample that would overflow starts the next batch.
        # `batch and ...` guarantees we never flush an empty batch.
        if batch and batch_tokens + sample_tokens > max_tokens:
            save_batch(batch, f"{base_path}/batch_{batch_id}_input.jsonl")
            batch_id += 1
            batch = []
            batch_tokens = 0
        batch.append(sample)
        batch_tokens += sample_tokens
    # Remaining samples (an empty file is still written for empty input, as before).
    save_batch(batch, f"{base_path}/batch_{batch_id}_input.jsonl")

# Wiki Complexify 

In [None]:
def create_text_dict(entity_list, entity_type):
    """Load the plain Wiki text of every entity together with its token count.

    Returns:
        dict mapping entity -> (text, number of `gpt_tokenizer` tokens in text).
    """
    entity_text = {}
    for entity in entity_list:
        wiki_path = Path(f"../data/background_information_data/{entity_type}_data/Wiki/{entity}.txt")
        text = wiki_path.read_text()
        entity_text[entity] = (text, len(gpt_tokenizer.encode(text)))
    return entity_text

def create_formatted_inputs_for_complexify(entity, text):
    """Build the batch request record for the Wiki-complexify prompt of one entity.

    The user prompt template is filled positionally with `entity` and `text`.
    Returns a dict in the custom_id / method / url / body layout used by every
    batch input file in this notebook.
    """
    custom_id = f"{entity}-complexify"
    messages = [
        {"role": "developer", "content": WIKI_COMPLEXIFY_DEVELOPER_PROMPT},
        {"role": "user", "content": WIKI_COMPLEXIFY_USER_PROMPT.format(entity, text)},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4o", "messages": messages},
    }

In [None]:
drugs_text = create_text_dict(all_drugs, "drug")

# Only texts with at least 200 tokens get a complexify request.
formatted_samples = []
for drug, text_tup in drugs_text.items():
    if text_tup[1] < 200:
        continue
    formatted_samples.append(create_formatted_inputs_for_complexify(drug, text_tup[0]))

In [None]:
# Token budget check for the complexify requests: the single batch file is only
# written when the first two messages of all requests stay under 90K tokens.
total_tokens = 0
for sample in formatted_samples:
    total_tokens += len(gpt_tokenizer.encode(sample["body"]["messages"][0]["content"]))
    total_tokens += len(gpt_tokenizer.encode(sample["body"]["messages"][1]["content"]))
if total_tokens < 90_000:
    print(f"Total tokens : {total_tokens}. Fine for batching everything.")
    # Same JSONL writer as every other batch file in this notebook.
    save_batch(formatted_samples, "../data/OAI/complexify/batch_input.jsonl")
else:
    # Previously this case was silent, so a stale or missing batch_input.jsonl
    # went unnoticed until the next cell tried to read it.
    print(f"Total tokens : {total_tokens}. Too many for a single batch; nothing was written. "
          "Use create_batches(...) to split the requests.")

In [None]:
# Optional: load the complexify batch file back, one parsed request per line.
with Path("../data/OAI/complexify/batch_input.jsonl").open('r') as file:
    s = list(map(json.loads, file))

# Molecular Interactions

In [None]:
def create_formatted_inputs_for_MI(row):
    """Build the molecular-interaction batch request for one drug pair.

    Args:
        row: Record exposing `drug_1_name`, `drug_2_name`, `drug_1_SMILES`
            and `drug_2_SMILES` attributes.

    Only the two SMILES strings go into the user prompt; the drug names are
    used for the `custom_id`.
    """
    custom_id = f"{row.drug_1_name}-{row.drug_2_name}-MI"
    messages = [
        {"role": "developer", "content": MOLECULAR_INTERACTIONS_DEVELOPER_PROMPT},
        {"role": "user", "content": MOLECULAR_INTERACTIONS_USER_PROMPT.format(row.drug_1_SMILES, row.drug_2_SMILES)},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4o", "messages": messages},
    }

In [None]:
# One molecular-interaction request per DDI row, then split into token-bounded batch files.
formatted_samples = [create_formatted_inputs_for_MI(row)
                     for row in ddi_subset.itertuples(index=False)]
create_batches(formatted_samples, "../data/OAI/molecular_interactions")

In [None]:
# Sanity check output: the three batch files, read back in order, must
# reproduce the request list exactly.
with Path("../data/OAI/molecular_interactions/batch_0_input.jsonl").open('r') as file:
    s = list(map(json.loads, file))

with Path("../data/OAI/molecular_interactions/batch_1_input.jsonl").open('r') as file:
    y = list(map(json.loads, file))

with Path("../data/OAI/molecular_interactions/batch_2_input.jsonl").open('r') as file:
    w = list(map(json.loads, file))

assert s + y + w == formatted_samples

In [None]:
# Had to post-process batch_0 & split it into 2 halves for rate limits.
with Path("../data/OAI/molecular_interactions/batch_0_input.jsonl").open('r') as file:
    s = [json.loads(line) for line in file]

batch_0 = s[:len(s)//2]
batch_1 = s[len(s)//2:]
# save_batch takes an output path, not a batch index: passing 0 / 1 made
# Path(output_path) raise TypeError. The halves are written to their own files
# so the batch_0 / batch_1 files produced by create_batches stay untouched and
# this cell can be re-run without halving an already-halved file.
save_batch(batch_0, "../data/OAI/molecular_interactions/batch_0_part_0_input.jsonl")
save_batch(batch_1, "../data/OAI/molecular_interactions/batch_0_part_1_input.jsonl")

# DDI Questions

## Bio relationships

### 1-hop

In [None]:
def create_DDI_Bio_1_hop_samples(drug_1, drug_1_text, drug_2, drug_2_text, triple):
    """Build the batch request record for one 1-hop DDI-Bio sample.

    The user prompt template is filled positionally with each drug followed by
    its text, then the triple string.
    """
    custom_id = f"{drug_1}-{drug_2}-DDI-Bio-1_hop"
    messages = [
        {"role": "developer", "content": DDI_BIO_ONE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_BIO_ONE_HOP_USER_PROMPT.format(
            drug_1, drug_1_text,
            drug_2, drug_2_text,
            triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# One 1-hop DDI-Bio request per row of ddi_subset.
formatted_samples = []
for row in ddi_subset.itertuples(index=False):
    drug_1, drug_2 = row.drug_1_name, row.drug_2_name
    triple = f"{drug_1}-{row.relationship}-{drug_2}"
    drug_1_text = retrieve_text(drug_1, "drug")
    drug_2_text = retrieve_text(drug_2, "drug")
    formatted_samples.append(
        create_DDI_Bio_1_hop_samples(drug_1=drug_1, drug_1_text=drug_1_text,
                                     drug_2=drug_2, drug_2_text=drug_2_text,
                                     triple=triple)
    )

# Everything goes into a single file; the token-bounded alternative is kept for reference:
#create_batches(formatted_samples, "../data/OAI/Questions/DDI_BIO_1_hop/")
save_batch(formatted_samples, "../data/OAI/Questions/DDI_Bio/1_hop/batch_input.jsonl")

### 2-hop

In [None]:
def create_DDI_Bio_2_hop_samples(drug_1, drug_1_text, drug_2, drug_2_text, drug_3, drug_3_text, drug_1_2_triple, drug_2_3_triple):
    """Build the batch request record for one 2-hop DDI-Bio sample.

    The user prompt template is filled positionally with the three drugs (each
    followed by its text) and then the two triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{drug_3}-DDI-Bio-2_hop"
    messages = [
        {"role": "developer", "content": DDI_BIO_TWO_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_BIO_TWO_HOP_USER_PROMPT.format(
            drug_1, drug_1_text,
            drug_2, drug_2_text,
            drug_3, drug_3_text,
            drug_1_2_triple, drug_2_3_triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DDI_Bio_two_hop_list.pkl").open("rb") as file:
    ddi_bio_two_hop_tuples = pickle.load(file)

formatted_samples = []
for ddi_tup in ddi_bio_two_hop_tuples:
    drug_1, drug_2, drug_3 = ddi_tup[0], ddi_tup[1], ddi_tup[2]

    # Relationship of each hop = first ddi_subset row matching the ordered drug pair.
    drug_1_2_rel = ddi_subset.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].relationship
    drug_2_3_rel = ddi_subset.query("drug_1_name == @drug_2 and drug_2_name == @drug_3").iloc[0].relationship

    drug_1_2_triple = f"{drug_1}-{drug_1_2_rel}-{drug_2}"
    drug_2_3_triple = f"{drug_2}-{drug_2_3_rel}-{drug_3}"

    drug_1_text = retrieve_text(drug_1, "drug")
    drug_2_text = retrieve_text(drug_2, "drug")
    drug_3_text = retrieve_text(drug_3, "drug")

    formatted_samples.append(
        create_DDI_Bio_2_hop_samples(drug_1=drug_1, drug_1_text=drug_1_text,
                                     drug_2=drug_2, drug_2_text=drug_2_text,
                                     drug_3=drug_3, drug_3_text=drug_3_text,
                                     drug_1_2_triple=drug_1_2_triple,
                                     drug_2_3_triple=drug_2_3_triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DDI_Bio/2_hop/batch_input.jsonl")

### 3-hop

In [None]:
def create_DDI_Bio_3_hop_samples(drug_1, drug_1_text, drug_2, drug_2_text, drug_3, drug_3_text, drug_4, drug_4_text, 
                                 drug_1_2_triple, drug_2_3_triple, drug_3_4_triple):
    """Build the batch request record for one 3-hop DDI-Bio sample.

    The user prompt template is filled positionally with the four drugs (each
    followed by its text) and then the three triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{drug_3}-{drug_4}-DDI-Bio-3_hop"
    messages = [
        {"role": "developer", "content": DDI_BIO_THREE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_BIO_THREE_HOP_USER_PROMPT.format(
            drug_1, drug_1_text,
            drug_2, drug_2_text,
            drug_3, drug_3_text,
            drug_4, drug_4_text,
            drug_1_2_triple, drug_2_3_triple, drug_3_4_triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DDI_Bio_three_hop_list.pkl").open("rb") as file:
    ddi_bio_three_hop_tuples = pickle.load(file)

formatted_samples = []
for ddi_tup in ddi_bio_three_hop_tuples:
    drug_1, drug_2, drug_3, drug_4 = ddi_tup[0], ddi_tup[1], ddi_tup[2], ddi_tup[3]

    # Relationship of each hop = first ddi_subset row matching the ordered drug pair.
    drug_1_2_rel = ddi_subset.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].relationship
    drug_2_3_rel = ddi_subset.query("drug_1_name == @drug_2 and drug_2_name == @drug_3").iloc[0].relationship
    drug_3_4_rel = ddi_subset.query("drug_1_name == @drug_3 and drug_2_name == @drug_4").iloc[0].relationship

    drug_1_2_triple = f"{drug_1}-{drug_1_2_rel}-{drug_2}"
    drug_2_3_triple = f"{drug_2}-{drug_2_3_rel}-{drug_3}"
    drug_3_4_triple = f"{drug_3}-{drug_3_4_rel}-{drug_4}"

    drug_1_text = retrieve_text(drug_1, "drug")
    drug_2_text = retrieve_text(drug_2, "drug")
    drug_3_text = retrieve_text(drug_3, "drug")
    drug_4_text = retrieve_text(drug_4, "drug")

    formatted_samples.append(
        create_DDI_Bio_3_hop_samples(drug_1=drug_1, drug_1_text=drug_1_text,
                                     drug_2=drug_2, drug_2_text=drug_2_text,
                                     drug_3=drug_3, drug_3_text=drug_3_text,
                                     drug_4=drug_4, drug_4_text=drug_4_text,
                                     drug_1_2_triple=drug_1_2_triple,
                                     drug_2_3_triple=drug_2_3_triple,
                                     drug_3_4_triple=drug_3_4_triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DDI_Bio/3_hop/batch_input.jsonl")

## Molecular Relationships

### 1-hop

In [None]:
def create_DDI_Mol_1_hop_samples(drug_1, drug_1_SMILES, drug_2, drug_2_SMILES, triple):
    """Build the batch request record for one 1-hop DDI-Mol sample.

    The user prompt template is filled positionally with each drug followed by
    its SMILES string, then the triple string.
    """
    custom_id = f"{drug_1}-{drug_2}-DDI-Mol-1_hop"
    messages = [
        {"role": "developer", "content": DDI_MOL_ONE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_MOL_ONE_HOP_USER_PROMPT.format(
            drug_1, drug_1_SMILES,
            drug_2, drug_2_SMILES,
            triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# One 1-hop DDI-Mol request per row of molecular_relations_df.
formatted_samples = []
for row in molecular_relations_df.itertuples(index=False):
    drug_1, drug_2 = row.drug_1_name, row.drug_2_name
    triple = f"{drug_1}-{row.molecular_interaction}-{drug_2}"
    drug_1_SMILES, drug_2_SMILES = row.drug_1_SMILES, row.drug_2_SMILES
    formatted_samples.append(
        create_DDI_Mol_1_hop_samples(drug_1=drug_1, drug_1_SMILES=drug_1_SMILES,
                                     drug_2=drug_2, drug_2_SMILES=drug_2_SMILES,
                                     triple=triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DDI/Mol/1_hop/batch_input.jsonl")

### 2-hop

In [None]:
def create_DDI_Mol_2_hop_samples(drug_1, drug_1_SMILES, drug_2, drug_2_SMILES, drug_3, drug_3_SMILES, drug_1_2_triple, drug_2_3_triple):
    """Build the batch request record for one 2-hop DDI-Mol sample.

    The user prompt template is filled positionally with the three drugs (each
    followed by its SMILES string) and then the two triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{drug_3}-DDI-Mol-2_hop"
    messages = [
        {"role": "developer", "content": DDI_MOL_TWO_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_MOL_TWO_HOP_USER_PROMPT.format(
            drug_1, drug_1_SMILES,
            drug_2, drug_2_SMILES,
            drug_3, drug_3_SMILES,
            drug_1_2_triple, drug_2_3_triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DDI_Mol_two_hop_list.pkl").open("rb") as file:
    ddi_mol_two_hop_tuples = pickle.load(file)

formatted_samples = []
for ddi_tup in ddi_mol_two_hop_tuples:
    drug_1, drug_2, drug_3 = ddi_tup[0], ddi_tup[1], ddi_tup[2]

    # Molecular interaction of each hop = first molecular_relations_df row matching the ordered pair.
    drug_1_2_rel = molecular_relations_df.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].molecular_interaction
    drug_2_3_rel = molecular_relations_df.query("drug_1_name == @drug_2 and drug_2_name == @drug_3").iloc[0].molecular_interaction

    drug_1_2_triple = f"{drug_1}-{drug_1_2_rel}-{drug_2}"
    drug_2_3_triple = f"{drug_2}-{drug_2_3_rel}-{drug_3}"

    # SMILES strings come from the name -> SMILES map built from ddi_subset.
    drug_1_SMILES = drug_smiles_mapping[drug_1]
    drug_2_SMILES = drug_smiles_mapping[drug_2]
    drug_3_SMILES = drug_smiles_mapping[drug_3]

    formatted_samples.append(
        create_DDI_Mol_2_hop_samples(drug_1=drug_1, drug_1_SMILES=drug_1_SMILES,
                                     drug_2=drug_2, drug_2_SMILES=drug_2_SMILES,
                                     drug_3=drug_3, drug_3_SMILES=drug_3_SMILES,
                                     drug_1_2_triple=drug_1_2_triple,
                                     drug_2_3_triple=drug_2_3_triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DDI/Mol/2_hop/batch_input.jsonl")

### 3-hop

In [None]:
def create_DDI_Mol_3_hop_samples(drug_1, drug_1_SMILES, drug_2, drug_2_SMILES, drug_3, drug_3_SMILES, drug_4, drug_4_SMILES, 
                                 drug_1_2_triple, drug_2_3_triple, drug_3_4_triple):
    """Build the batch request record for one 3-hop DDI-Mol sample.

    The user prompt template is filled positionally with the four drugs (each
    followed by its SMILES string) and then the three triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{drug_3}-{drug_4}-DDI-Mol-3_hop"
    messages = [
        {"role": "developer", "content": DDI_MOL_THREE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DDI_MOL_THREE_HOP_USER_PROMPT.format(
            drug_1, drug_1_SMILES,
            drug_2, drug_2_SMILES,
            drug_3, drug_3_SMILES,
            drug_4, drug_4_SMILES,
            drug_1_2_triple, drug_2_3_triple, drug_3_4_triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DDI_Mol_three_hop_list.pkl").open("rb") as file:
    ddi_mol_three_hop_tuples = pickle.load(file)

formatted_samples = []
for ddi_tup in ddi_mol_three_hop_tuples:
    drug_1, drug_2, drug_3, drug_4 = ddi_tup[0], ddi_tup[1], ddi_tup[2], ddi_tup[3]

    # Molecular interaction of each hop = first molecular_relations_df row matching the ordered pair.
    drug_1_2_rel = molecular_relations_df.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].molecular_interaction
    drug_2_3_rel = molecular_relations_df.query("drug_1_name == @drug_2 and drug_2_name == @drug_3").iloc[0].molecular_interaction
    drug_3_4_rel = molecular_relations_df.query("drug_1_name == @drug_3 and drug_2_name == @drug_4").iloc[0].molecular_interaction

    drug_1_2_triple = f"{drug_1}-{drug_1_2_rel}-{drug_2}"
    drug_2_3_triple = f"{drug_2}-{drug_2_3_rel}-{drug_3}"
    drug_3_4_triple = f"{drug_3}-{drug_3_4_rel}-{drug_4}"

    # SMILES strings come from the name -> SMILES map built from ddi_subset.
    drug_1_SMILES = drug_smiles_mapping[drug_1]
    drug_2_SMILES = drug_smiles_mapping[drug_2]
    drug_3_SMILES = drug_smiles_mapping[drug_3]
    drug_4_SMILES = drug_smiles_mapping[drug_4]

    formatted_samples.append(
        create_DDI_Mol_3_hop_samples(drug_1=drug_1, drug_1_SMILES=drug_1_SMILES,
                                     drug_2=drug_2, drug_2_SMILES=drug_2_SMILES,
                                     drug_3=drug_3, drug_3_SMILES=drug_3_SMILES,
                                     drug_4=drug_4, drug_4_SMILES=drug_4_SMILES,
                                     drug_1_2_triple=drug_1_2_triple,
                                     drug_2_3_triple=drug_2_3_triple,
                                     drug_3_4_triple=drug_3_4_triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DDI/Mol/3_hop/batch_input.jsonl")

# DPI Questions

## 1-hop

In [None]:
def create_DPI_1_hop_samples(drug, drug_text, protein, protein_text, triple):
    """Build the batch request record for one 1-hop DPI sample.

    The user prompt template is filled positionally with the drug and its
    text, the protein and its text, then the triple string.
    """
    custom_id = f"{drug}-{protein}-DPI-1_hop"
    messages = [
        {"role": "developer", "content": DPI_ONE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DPI_ONE_HOP_USER_PROMPT.format(
            drug, drug_text,
            protein, protein_text,
            triple,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# One 1-hop DPI request per row of dpi_subset.
formatted_samples = []
for row in dpi_subset.itertuples(index=False):
    drug, protein = row.drug_name, row.protein_name
    triple = f"{drug}-{row.relationship}-{protein}"
    drug_text = retrieve_text(drug, "drug")
    protein_text = retrieve_text(protein, "protein")
    formatted_samples.append(
        create_DPI_1_hop_samples(drug=drug, drug_text=drug_text,
                                 protein=protein, protein_text=protein_text,
                                 triple=triple)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DPI/1_hop/batch_input.jsonl")

## 2-hop

In [None]:
def create_DPI_2_hop_samples(drug_1, drug_1_text, drug_2, drug_2_text, protein, protein_text, triple_1_2, triple_2_3):
    """Build the batch request record for one 2-hop DPI sample.

    The user prompt template is filled positionally with the two drugs and the
    protein (each followed by its text) and then the two triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{protein}-DPI-2_hop"
    messages = [
        {"role": "developer", "content": DPI_TWO_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DPI_TWO_HOP_USER_PROMPT.format(
            drug_1, drug_1_text,
            drug_2, drug_2_text,
            protein, protein_text,
            triple_1_2, triple_2_3,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }


# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DPI_two_hop_list.pkl").open("rb") as file:
    dpi_two_hop_tuples = pickle.load(file)

formatted_samples = []
for dpi_tuple in dpi_two_hop_tuples:
    drug_1, drug_2, protein = dpi_tuple[0], dpi_tuple[1], dpi_tuple[2]

    # First hop is drug-drug (ddi_subset), second hop is drug-protein (dpi_subset);
    # in both cases the first matching row supplies the relationship.
    rel_1_2 = ddi_subset.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].relationship
    rel_2_3 = dpi_subset.query("drug_name == @drug_2 and protein_name == @protein").iloc[0].relationship

    triple_1_2 = f"{drug_1}-{rel_1_2}-{drug_2}"
    triple_2_3 = f"{drug_2}-{rel_2_3}-{protein}"

    drug_1_text = retrieve_text(drug_1, "drug")
    drug_2_text = retrieve_text(drug_2, "drug")
    protein_text = retrieve_text(protein, "protein")

    formatted_samples.append(
        create_DPI_2_hop_samples(drug_1=drug_1, drug_1_text=drug_1_text,
                                 drug_2=drug_2, drug_2_text=drug_2_text,
                                 protein=protein, protein_text=protein_text,
                                 triple_1_2=triple_1_2, triple_2_3=triple_2_3)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DPI/2_hop/batch_input.jsonl")

## 3-hop

In [None]:
def create_DPI_3_hop_samples(drug_1, drug_1_text, drug_2, drug_2_text, drug_3, drug_3_text, protein, protein_text, 
                             triple_1_2, triple_2_3, triple_3_4):
    """Build the batch request record for one 3-hop DPI sample.

    The user prompt template is filled positionally with the three drugs and
    the protein (each followed by its text) and then the three triple strings.
    """
    custom_id = f"{drug_1}-{drug_2}-{drug_3}-{protein}-DPI-3_hop"
    messages = [
        {"role": "developer", "content": DPI_THREE_HOP_DEVELOPER_PROMPT},
        {"role": "user", "content": DPI_THREE_HOP_USER_PROMPT.format(
            drug_1, drug_1_text,
            drug_2, drug_2_text,
            drug_3, drug_3_text,
            protein, protein_text,
            triple_1_2, triple_2_3, triple_3_4,
        )},
    ]
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4.1", "messages": messages},
    }

# NOTE: pickle.load executes arbitrary code; only load hop lists produced by this project.
with Path("../data/mined_data/DPI_three_hop_list.pkl").open("rb") as file:
    dpi_three_hop_tuples = pickle.load(file)

formatted_samples = []
for dpi_tuple in dpi_three_hop_tuples:
    drug_1, drug_2, drug_3, protein = dpi_tuple[0], dpi_tuple[1], dpi_tuple[2], dpi_tuple[3]

    # Hops 1 and 2 are drug-drug (ddi_subset), hop 3 is drug-protein (dpi_subset);
    # the first matching row supplies each relationship.
    rel_1_2 = ddi_subset.query("drug_1_name == @drug_1 and drug_2_name == @drug_2").iloc[0].relationship
    rel_2_3 = ddi_subset.query("drug_1_name == @drug_2 and drug_2_name == @drug_3").iloc[0].relationship
    rel_3_4 = dpi_subset.query("drug_name == @drug_3 and protein_name == @protein").iloc[0].relationship

    triple_1_2 = f"{drug_1}-{rel_1_2}-{drug_2}"
    triple_2_3 = f"{drug_2}-{rel_2_3}-{drug_3}"
    triple_3_4 = f"{drug_3}-{rel_3_4}-{protein}"

    drug_1_text = retrieve_text(drug_1, "drug")
    drug_2_text = retrieve_text(drug_2, "drug")
    drug_3_text = retrieve_text(drug_3, "drug")
    protein_text = retrieve_text(protein, "protein")

    formatted_samples.append(
        create_DPI_3_hop_samples(drug_1=drug_1, drug_1_text=drug_1_text,
                                 drug_2=drug_2, drug_2_text=drug_2_text,
                                 drug_3=drug_3, drug_3_text=drug_3_text,
                                 protein=protein, protein_text=protein_text,
                                 triple_1_2=triple_1_2, triple_2_3=triple_2_3,
                                 triple_3_4=triple_3_4)
    )

save_batch(formatted_samples, "../data/OAI/Questions/DPI/3_hop/batch_input.jsonl")