In [12]:
import sys
sys.path.append("../common_scripts/")

from common_functions import save_batch, print_sample, count_tokens, create_formatted_samples_for_eval
from eval_prompts import *
from pathlib import Path
import pandas as pd
import json


from datetime import datetime
import time

import fsspec
import os

# Load test.csv into `df`, the frame the later cells index and build prompts from.
df = pd.read_csv("../../dataset_for_hf/test.csv")

In [4]:
# System prompt shared by every RAG request: format_openai_rag_messages sends it
# as the "system" message and format_rag_prompt prepends it to the user prompt.
# It asks the model to reply in a fixed "REASON: ... / ANSWER: ..." layout.
SYSTEM_PROMPT_RAG = """You are a helpful AI assistant who answers questions about Biomolecular interactions. A question may concern a drug-drug or drug-protein interaction. Your task is to answer the question by providing an entity name, which may be a drug name like 'Xanax' or a protein name like 'alpha-2 adrenergic receptor'. To answer the question, you will be provided with relevant retrieved data.

TASK REQUIREMENTS:
1. Do not write filler language like "Here is the answer", etc.
2. Provide your thought process for arriving at the answer.

Please structure your output as,
REASON: <The justification for your answer>
ANSWER: <The corresponding drug or protein name>"""

# User-message template with two positional `{}` slots, filled via
# USER_PROMPT_RAG.format(retrieved_context, question): first the retrieved
# background text, then the question.
USER_PROMPT_RAG = """BACKGROUND DATA: 
{}

QUESTION: {}"""


## Helper functions

In [None]:
def extract_modalities(df):
    """Split each row's background into text-only and graph-only variants.

    Every line of ``Question_Background`` that contains the marker
    ``'INTERACTION TRIPLE'`` is treated as graph data; every other line is
    treated as text. Three columns are written onto ``df`` in place:

    * ``TextOnly``  - the non-triple lines, joined with newlines
    * ``GraphOnly`` - the triple lines, joined with newlines
    * ``TextGraph`` - the full background

    A background that is not a string (for example the float NaN pandas
    produces for an empty CSV cell) is treated as an empty background, so the
    three columns hold ``""`` for that row instead of the call failing on
    ``.splitlines()``.

    Args:
        df: DataFrame with a ``Question_Background`` column.

    Returns:
        The same DataFrame object, with the three columns added or overwritten.
    """
    text_only_corpus = []
    graph_only_corpus = []
    text_graph_corpus = []

    for background in df["Question_Background"]:
        # Missing cells are not strings; normalise them so the string
        # operations below (and later tokenisation of TextGraph) still work.
        if not isinstance(background, str):
            background = ""

        text_parts = []
        graph_parts = []
        for line in background.splitlines():
            if 'INTERACTION TRIPLE' in line:
                graph_parts.append(line)
            else:
                text_parts.append(line)

        text_only_corpus.append("\n".join(text_parts))
        graph_only_corpus.append("\n".join(graph_parts))
        text_graph_corpus.append(background)

    df["TextOnly"] = text_only_corpus
    df["GraphOnly"] = graph_only_corpus
    df["TextGraph"] = text_graph_corpus  # full background, always a string
    return df

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

def simple_tokenizer(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped, so it stays attached to the neighbouring
    characters. Returns the list of tokens (empty for blank input).
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

def build_bm25_index(corpus):
    """Tokenise every document in ``corpus`` and build a BM25Okapi index.

    Args:
        corpus: iterable of document strings.

    Returns:
        A ``(index, tokenized_corpus)`` pair: the BM25Okapi object and the
        list of token lists it was built from, in corpus order.
    """
    tokenized_corpus = list(map(simple_tokenizer, corpus))
    index = BM25Okapi(tokenized_corpus)
    return index, tokenized_corpus

def retrieve_context(bm25, tokenized_corpus, query, top_k=5):
    """Return the positions of the ``top_k`` best-scoring documents for ``query``.

    Args:
        bm25: index object exposing ``get_scores(tokens)``.
        tokenized_corpus: accepted for call-site symmetry; not read here.
        query: query string, tokenised with ``simple_tokenizer``.
        top_k: how many positions to keep.

    Returns:
        List of integer positions ordered from highest to lowest score; ties
        keep their original corpus order.
    """
    query_tokens = simple_tokenizer(query)
    scores = bm25.get_scores(query_tokens)

    # Stable in-place sort of the positions, best score first.
    ranked = list(range(len(scores)))
    ranked.sort(key=lambda position: scores[position], reverse=True)
    return ranked[:top_k]


In [None]:
def format_rag_prompt(row, retrieved_context, modality):
    """Build one plain-text RAG sample for ``row``.

    Args:
        row: record exposing ``Entities``, ``Label`` and ``Question`` attributes.
        retrieved_context: text placed in the template's background slot.
        modality: label embedded in the sample id.

    Returns:
        Dict with ``custom_id`` ("<Entities>_<Label>:<modality>_rag") and
        ``input_text`` (system prompt, blank line, filled-in user prompt).
    """
    sample_id = f"{row.Entities}_{row.Label}:{modality}_rag"
    user_part = USER_PROMPT_RAG.format(retrieved_context, row.Question)
    full_prompt = f"{SYSTEM_PROMPT_RAG}\n\n{user_part}"
    return {"custom_id": sample_id, "input_text": full_prompt}


In [None]:
def generate_rag_samples(df, modality, bm25, tokenized_corpus, top_k=5):
    """Build one plain-text RAG sample per row of ``df``.

    For each row, the row's ``Question`` is used to retrieve ``top_k``
    document positions from ``bm25``; the ``modality`` column values at those
    positions are joined with ``"\\n---\\n"`` and passed to
    ``format_rag_prompt``.

    Args:
        df: DataFrame with ``Question``, ``Entities``, ``Label`` and the
            ``modality`` column. Retrieved positions are looked up
            positionally in this frame, so ``bm25`` is expected to have been
            built over ``df[modality]`` in the same row order.
        modality: name of the column supplying the retrieved documents.
        bm25: index object passed through to ``retrieve_context``.
        tokenized_corpus: passed through to ``retrieve_context``.
        top_k: number of documents to retrieve per question.

    Returns:
        List of sample dicts, one per row, in row order.

    Raises:
        KeyError: if ``modality`` is not a column of ``df``.
    """
    # Pull the column once so each hit is a plain list lookup instead of
    # constructing a full row Series via df.iloc[i] for every retrieved doc.
    documents = df[modality].tolist()

    all_samples = []
    for row in df.itertuples():
        indices = retrieve_context(bm25, tokenized_corpus, row.Question, top_k=top_k)
        retrieved_context = "\n---\n".join(documents[i] for i in indices)
        all_samples.append(format_rag_prompt(row, retrieved_context, modality))
    return all_samples


In [None]:
def save_jsonl(samples, filename):
    """Write ``samples`` to ``filename`` as JSON Lines.

    Each sample is serialised with ``json.dumps`` and followed by a newline.
    An existing file at ``filename`` is overwritten.
    """
    with open(filename, 'w') as handle:
        handle.writelines(json.dumps(record) + '\n' for record in samples)


# Build OpenAI batch requests for the test set

In [11]:
def format_openai_rag_messages(row, retrieved_context, modality, model="o4-mini"):
    """Build one chat-completions batch request for ``row``.

    Args:
        row: record exposing ``Entities``, ``Label`` and ``Question`` attributes.
        retrieved_context: text placed in the user template's background slot.
        modality: label embedded in the request id.
        model: model name written into the request body. Defaults to
            ``"o4-mini"``, the value that was previously fixed in the body.

    Returns:
        Dict with ``custom_id`` ("<Entities>_<Label>:<modality>_rag"),
        ``method`` ("POST"), ``url`` ("/v1/chat/completions") and a ``body``
        holding the model name and a system + user message pair.
    """
    return {
        "custom_id": f"{row.Entities}_{row.Label}:{modality}_rag",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT_RAG},
                {"role": "user", "content": USER_PROMPT_RAG.format(retrieved_context, row.Question)}
            ]
        }
    }


In [14]:
def generate_openai_rag_batch(df, modality, bm25, tokenized_corpus, top_k=5):
    """Build one chat-completions batch request per row of ``df``.

    For each row, the row's ``Question`` is used to retrieve ``top_k``
    document positions from ``bm25``; the ``modality`` column values at those
    positions are joined with ``"\\n---\\n"`` and passed to
    ``format_openai_rag_messages``.

    Args:
        df: DataFrame with ``Question``, ``Entities``, ``Label`` and the
            ``modality`` column. Retrieved positions are looked up
            positionally in this frame, so ``bm25`` is expected to have been
            built over ``df[modality]`` in the same row order.
        modality: name of the column supplying the retrieved documents.
        bm25: index object passed through to ``retrieve_context``.
        tokenized_corpus: passed through to ``retrieve_context``.
        top_k: number of documents to retrieve per question.

    Returns:
        List of request dicts, one per row, in row order.

    Raises:
        KeyError: if ``modality`` is not a column of ``df``.
    """
    # Pull the column once so each hit is a plain list lookup instead of
    # constructing a full row Series via df.iloc[i] for every retrieved doc.
    documents = df[modality].tolist()

    batch = []
    for row in df.itertuples():
        indices = retrieve_context(bm25, tokenized_corpus, row.Question, top_k=top_k)
        retrieved_context = "\n---\n".join(documents[i] for i in indices)
        batch.append(format_openai_rag_messages(row, retrieved_context, modality))
    return batch


In [16]:
# Preview the first rows.
# NOTE(review): the saved output already lists TextOnly / GraphOnly / TextGraph,
# columns that extract_modalities() only adds in a later cell — presumably the
# cells were run out of order; re-run top to bottom to confirm.
df.head()


Unnamed: 0,Entities,Question_Background,Question,Answer,Label,TextOnly,GraphOnly,TextGraph
0,Candesartan-Citalopram-Fluphenazine-Histamine ...,DRUG 1 NAME: Candesartan\nDRUG 1 BACKGROUND IN...,Consider a high-affinity phenothiazine antipsy...,Histamine H1 receptor,8,DRUG 1 NAME: Candesartan\nDRUG 1 BACKGROUND IN...,DRUG 1 - DRUG 2 INTERACTION TRIPLE (subject-pr...,DRUG 1 NAME: Candesartan\nDRUG 1 BACKGROUND IN...
1,Lamotrigine-Oxaprozin-Sildenafil,DRUG 1 NAME: Lamotrigine\nDRUG 1 SMILES: C1=CC...,"Which drug, whose structure features a pyrazol...",Sildenafil,4,DRUG 1 NAME: Lamotrigine\nDRUG 1 SMILES: C1=CC...,DRUG 1 - DRUG 2 INTERACTION TRIPLE (subject-pr...,DRUG 1 NAME: Lamotrigine\nDRUG 1 SMILES: C1=CC...
2,Ranitidine-Sibutramine-Zolpidem,DRUG 1 NAME: Ranitidine\nDRUG 1 SMILES: CNC(=C...,"Which drug, containing an imidazopyridine core...",Zolpidem,4,DRUG 1 NAME: Ranitidine\nDRUG 1 SMILES: CNC(=C...,DRUG 1 - DRUG 2 INTERACTION TRIPLE (subject-pr...,DRUG 1 NAME: Ranitidine\nDRUG 1 SMILES: CNC(=C...
3,Cetirizine-Dipyridamole-Sildenafil-Zaleplon,DRUG 1 NAME: Cetirizine\nDRUG 1 BACKGROUND INF...,A selective PDE5 inhibitor used primarily for ...,Zaleplon,2,DRUG 1 NAME: Cetirizine\nDRUG 1 BACKGROUND INF...,DRUG 1 - DRUG 2 INTERACTION TRIPLE (subject-pr...,DRUG 1 NAME: Cetirizine\nDRUG 1 BACKGROUND INF...
4,Alprazolam-Bumetanide-Fenofibrate-Modafinil,DRUG 1 NAME: Alprazolam\nDRUG 1 SMILES: CC1=NN...,"Which drug, characterized by a sulfinyl-linked...",Modafinil,5,DRUG 1 NAME: Alprazolam\nDRUG 1 SMILES: CC1=NN...,DRUG 1 - DRUG 2 INTERACTION TRIPLE (subject-pr...,DRUG 1 NAME: Alprazolam\nDRUG 1 SMILES: CC1=NN...


In [22]:
# Add the TextOnly / GraphOnly / TextGraph columns; extract_modalities writes
# them onto the frame it is given and returns that same frame.
df = extract_modalities(df)

# Build indexes
# One BM25 index per modality, each built over that modality's column of df so
# retrieved positions line up with df's row order.
bm25_text, tok_text = build_bm25_index(df["TextOnly"])
bm25_graph, tok_graph = build_bm25_index(df["GraphOnly"])
bm25_both, tok_both = build_bm25_index(df["TextGraph"])

# Generate OpenAI RAG input files
# Each call pairs a modality column with the index built from that same column
# and uses the default top_k=5.
openai_text_rag = generate_openai_rag_batch(df, "TextOnly", bm25_text, tok_text)
openai_graph_rag = generate_openai_rag_batch(df, "GraphOnly", bm25_graph, tok_graph)
openai_textgraph_rag = generate_openai_rag_batch(df, "TextGraph", bm25_both, tok_both)

# Save to JSONL
# Files are written to the current working directory, one request per line.
save_jsonl(openai_text_rag, "openai_rag_textonly.jsonl")
save_jsonl(openai_graph_rag, "openai_rag_graphonly.jsonl")
save_jsonl(openai_textgraph_rag, "openai_rag_textgraph.jsonl")
