In [None]:
from readability import Readability
from datasets import Dataset
from pathlib import Path
import pandas as pd
import tiktoken
import json
import re

def create_df(base_path):
    """Load the batch request/response JSONL files of one directory into a single frame.

    Every regular file directly inside ``base_path`` is inspected: if its stem
    contains ``"input"`` its lines are collected as request records, otherwise
    if its stem contains ``"output"`` its lines are collected as response
    records. Both record sets are turned into DataFrames and joined with
    ``pd.merge`` using its default keys (the columns the two frames share).
    The columns ``method``, ``url``, ``id`` and ``error`` are then dropped.

    Args:
        base_path: Directory (str or path-like) that holds the JSONL files.

    Returns:
        pandas.DataFrame: The merged records without the dropped columns.

    Raises:
        KeyError: If one of the dropped columns is absent after the merge.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    def _read_jsonl(path):
        # One JSON document per line. The encoding is pinned to UTF-8 so the
        # result does not depend on the platform's default code page, and
        # blank lines (e.g. a trailing empty line) are skipped instead of
        # crashing json.loads.
        with path.open("r", encoding="utf-8") as handle:
            return [json.loads(line) for line in handle if line.strip()]

    inputs = []
    outputs = []
    # iterdir() yields entries in arbitrary order; sorting keeps the row order
    # of the result stable from run to run and machine to machine.
    for entry in sorted(Path(base_path).iterdir()):
        if not entry.is_file():
            # Sub-directories cannot be opened as JSONL files.
            continue
        if "input" in entry.stem:
            inputs.extend(_read_jsonl(entry))
        elif "output" in entry.stem:
            outputs.extend(_read_jsonl(entry))

    merged_df = pd.merge(pd.DataFrame(inputs), pd.DataFrame(outputs))
    # Return a new frame rather than mutating in place.
    return merged_df.drop(columns=["method", "url", "id", "error"])

# Creating text sources from the complexify phase

In [None]:
# Read the complexify batch results: one JSON object per line.
# The encoding is explicit so the read does not depend on the platform default;
# blank lines are skipped because json.loads cannot parse them.
with Path("../data/OAI/complexify/batch_output.jsonl").open("r", encoding="utf-8") as batch_file:
    complexify_outputs = [json.loads(line) for line in batch_file if line.strip()]

complexified_dir = Path("../data/background_information_data/drug_data/Wiki_complexified")
# Make sure the target folder exists so the first write cannot fail on a fresh checkout.
complexified_dir.mkdir(parents=True, exist_ok=True)

for response_dict in complexify_outputs:
    # The drug name is everything before the first "-complexify" in the custom_id.
    drug_name = response_dict["custom_id"].split("-complexify")[0]
    complexified_text = response_dict["response"]["body"]["choices"][0]["message"]["content"]
    # Written as UTF-8 so characters outside the locale's code page cannot
    # raise UnicodeEncodeError.
    (complexified_dir / f"{drug_name}.txt").write_text(complexified_text, encoding="utf-8")

# Measuring readability

In [None]:
# Compare token counts and Gunning-Fog readability between the original and
# the complexified text of every drug that has both files on disk.
gpt_tokenizer = tiktoken.encoding_for_model("gpt-4o")

ddi_subset = pd.read_csv("../data/mined_data/final_DDI.csv")
all_drugs = set(ddi_subset["drug_1_name"].unique()).union(set(ddi_subset["drug_2_name"].unique()))

orig_tokens = []
orig_readability = []
comp_tokens = []
comp_readability = []
for drug in all_drugs:
    drug_orig = Path(f"../data/background_information_data/drug_data/Wiki/{drug}.txt")
    drug_comp = Path(f"../data/background_information_data/drug_data/Wiki_complexified/{drug}.txt")
    if not (drug_orig.exists() and drug_comp.exists()):
        continue

    # NOTE(review): assumes both text collections are stored as UTF-8 — confirm
    # for the Wiki/ originals, whose writer is not part of this notebook.
    orig_text = drug_orig.read_text(encoding="utf-8")
    comp_text = drug_comp.read_text(encoding="utf-8")

    # Compute all four metrics before appending so the paired lists always
    # stay the same length, even if a metric raises for one of the texts.
    orig_token_count = len(gpt_tokenizer.encode(orig_text))
    orig_fog = Readability(orig_text).gunning_fog().score
    comp_token_count = len(gpt_tokenizer.encode(comp_text))
    comp_fog = Readability(comp_text).gunning_fog().score

    orig_tokens.append(orig_token_count)
    orig_readability.append(orig_fog)
    comp_tokens.append(comp_token_count)
    comp_readability.append(comp_fog)

if orig_tokens:
    print(f"Avg. Original Tokens: {sum(orig_tokens)/len(orig_tokens)} | Avg. Original Readability: {sum(orig_readability)/len(orig_readability)}")
    print(f"Avg. Complex Tokens: {sum(comp_tokens)/len(comp_tokens)} | Avg. Complex Readability: {sum(comp_readability)/len(comp_readability)}")
else:
    # Without this guard the averages above would divide by zero.
    print("No drug has both an original and a complexified text; nothing to average.")

# Creating the molecular interaction table

In [None]:
# Build the molecular interaction table from the batch requests/responses:
# one CSV row per drug pair whose INTERACTION field is not the text "None".
merged_df = create_df("../data/OAI/molecular_interactions/")

# Both patterns are loop-invariant, so they are compiled once up front.
smiles_regex = re.compile(r"(SMILES )(\d: )(.*)")
pattern = r"^(?P<interaction_field>\*{0,2}INTERACTION\*{0,2}):\s*(?P<interaction>.*?)\n(?P<mechanism_field>\*{0,2}MECHANISM\*{0,2}):\s*(?P<mechanism>.*(?:\n(?!\*{0,2}(EVIDENCE|SEVERITY)\*{0,2}:).*)*)\n(?P<evidence_field>\*{0,2}EVIDENCE\*{0,2}):\s*(?P<evidence>.*(?:\n(?!\*{0,2}SEVERITY\*{0,2}:).*)*)\n(?P<severity_field>\*{0,2}SEVERITY\*{0,2}):\s*(?P<severity>.*)$"
response_regex = re.compile(pattern, re.DOTALL | re.MULTILINE | re.IGNORECASE)

final_rows = []
for row in merged_df.itertuples():
    # The first two "-"-separated parts of the custom_id are taken as the drug names.
    drug_1, drug_2 = row.custom_id.split("-")[:2]
    # The second message of the request is expected to contain exactly two
    # "SMILES <digit>: <value>" lines; the last regex group is the value.
    SMILES_1, SMILES_2 = [x[-1].strip() for x in smiles_regex.findall(row.body["messages"][1]["content"])]
    response = row.response["body"]["choices"][0]["message"]["content"]

    matches = response_regex.search(response)
    if matches is None:
        # Name the offending record instead of failing with an opaque
        # AttributeError on None.
        raise ValueError(
            f"Response for {row.custom_id!r} does not follow the "
            "INTERACTION/MECHANISM/EVIDENCE/SEVERITY layout"
        )

    interaction = matches.group('interaction').strip()
    if interaction == "None": # The literal text "None", not Python's None object.
        continue
    mechanism = matches.group('mechanism').strip()
    evidence = matches.group('evidence').strip()
    severity = matches.group('severity').strip()

    final_rows.append((drug_1, drug_2, SMILES_1, SMILES_2, interaction, mechanism, evidence, severity))

pd.DataFrame(final_rows, columns=["drug_1_name", "drug_2_name", "drug_1_SMILES", "drug_2_SMILES",
                                  "molecular_interaction",
                                  "mechanism",
                                  "evidence",
                                  "severity"]).to_csv("../data/OAI/molecular_interactions/molecular_interactions_df.csv", index=False)

# Assembling dataset

In [None]:
def create_QA_df(base_df, label):
    """Turn merged batch requests/responses into a question/answer table.

    For each row of ``base_df`` the function:

    * builds the entity pair from ``custom_id`` by splitting on ``"-"`` and
      dropping the parts ``DDI``, ``DPI``, ``Bio``, ``Mol``, ``1_hop``,
      ``2_hop`` and ``3_hop``;
    * takes the content of the second request message as the question
      background;
    * strips ``*``/``**`` emphasis markers from the response text and splits
      it into the text after ``Question:`` and the text after ``Answer:``.

    Args:
        base_df: DataFrame with the columns ``custom_id``, ``body`` and
            ``response`` (as produced by ``create_df``).
        label: Value stored in the ``Label`` column of every output row.

    Returns:
        pandas.DataFrame: Columns ``Entities``, ``Question_Background``,
        ``Question``, ``Answer`` and ``Label``; one row per input row.

    Raises:
        ValueError: If a response does not contain ``Question:`` followed by
            ``Answer:``; the message names the offending ``custom_id``.
    """
    # Loop-invariant helpers, built once per call rather than once per row.
    routing_parts = {"DDI", "DPI", "Bio", "Mol", "1_hop", "2_hop", "3_hop"}
    emphasis_regex = re.compile(r'\*{1,2}(.*?)\*{1,2}')
    qa_regex = re.compile(r"Question:\s*(.*?)\s*Answer:\s*(.*)", re.DOTALL)

    final_rows = []
    for row in base_df.itertuples(index=False):
        entity_pair = "-".join(
            part for part in row.custom_id.split("-") if part not in routing_parts
        )
        question_background = row.body["messages"][1]["content"]
        raw_response = row.response["body"]["choices"][0]["message"]["content"]
        # Remove markdown emphasis so "**Question:**" reads as "Question:".
        response = emphasis_regex.sub(r'\1', raw_response)

        match = qa_regex.search(response)
        if match is None:
            # Point at the record instead of failing with an AttributeError on None.
            raise ValueError(
                f"Response for {row.custom_id!r} has no 'Question: ... Answer: ...' structure"
            )

        question = match.group(1).strip()
        answer = match.group(2).strip()
        final_rows.append((entity_pair, question_background, question, answer, label))
    return pd.DataFrame(final_rows, columns=["Entities", "Question_Background", "Question", "Answer", "Label"])

# One (batch directory, dataset label) pair per question set, in the order
# the frames are concatenated below.
question_sources = [
    ("../data/OAI/Questions/DDI/Bio/1_hop/", "DDI_Bio_1_hop"),
    ("../data/OAI/Questions/DDI/Bio/2_hop/", "DDI_Bio_2_hop"),
    ("../data/OAI/Questions/DDI/Bio/3_hop/", "DDI_Bio_3_hop"),
    ("../data/OAI/Questions/DDI/Mol/1_hop/", "DDI_Mol_1_hop"),
    ("../data/OAI/Questions/DDI/Mol/2_hop/", "DDI_Mol_2_hop"),
    ("../data/OAI/Questions/DDI/Mol/3_hop/", "DDI_Mol_3_hop"),
    ("../data/OAI/Questions/DPI/1_hop/", "DPI_1_hop"),
    ("../data/OAI/Questions/DPI/2_hop/", "DPI_2_hop"),
    ("../data/OAI/Questions/DPI/3_hop/", "DPI_3_hop"),
]

# The per-set names df1..df9 are kept in case other cells refer to them.
df1, df2, df3, df4, df5, df6, df7, df8, df9 = [
    create_QA_df(create_df(source_dir), source_label)
    for source_dir, source_label in question_sources
]

final_QA_df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9])
final_QA_df.to_csv("../data/OAI/Questions/all_questions.csv", index=False)

# Build the Hugging Face dataset; "Label" is class-encoded so it can be used
# as the stratification column in the splits below.
dataset = Dataset.from_pandas(final_QA_df, preserve_index=False)
dataset = dataset.class_encode_column("Label")

# First split: 80% train, 20% held out.
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column="Label", seed=42)
dataset["train"].to_csv("../dataset_for_hf/train.csv")

# Second split: the held-out part is halved into validation and test.
val_test_ds = dataset["test"].train_test_split(test_size=0.5, stratify_by_column="Label", seed=42)

# The "test"/"train" keys of this second split are arbitrary; its "test" half
# is stored as the validation set because it has more samples.
val_test_ds["test"].to_csv("../dataset_for_hf/validation.csv")
val_test_ds["train"].to_csv("../dataset_for_hf/test.csv")