# Automated Question-Answer Generation from Pharmaceutical Data

This notebook systematically generates question-answer pairs from pharmaceutical data sources including:
1. PubMed target passages
2. DrugBank tables
3. Related pharmaceutical data

Each QA pair will include:
- Question
- Answer
- Source text/passage
- Related table

In [16]:
# Import required libraries
import os
import pandas as pd
import json
from pathlib import Path
from openai import OpenAI
import csv
from tqdm import tqdm
from collections import defaultdict
import re

In [5]:
# Configuration and paths
PUBMED_TARGETS_DIR = '../data/Pharma/pubmed-targets'
DRUGBANK_TABLES_DIR = '../data/Pharma/drugbank-tables'
MAPPING_FILE = '../data/Pharma/pubmed-drugbank-tables.gt'
OUTPUT_FILE = 'both_output.gt'

# Initialize your LLM API key if needed
# os.environ["OPENAI_API_KEY"]="<KEY>"
client = OpenAI()


In [6]:
def load_passage_table_mapping():
    """Load the mapping between passages and their relevant tables"""
    mapping = defaultdict(list)
    with open(MAPPING_FILE, 'r') as f:
        for line in f:
            passage_id, table_name = line.strip().split(',')
            mapping[passage_id].append(table_name)
    # print("mapping", mapping)
    return mapping

def load_target_passages():
    """Load all target passages from the pubmed-targets directory"""
    passages = {}
    target_files = Path(PUBMED_TARGETS_DIR).glob('Target-*')
    
    for file_path in target_files:
        target_id = file_path.name
        with open(file_path, 'r') as f:
            passages[target_id] = f.read()
    # print("passages", passages)
    return passages

def load_drugbank_tables():
    """Load all relevant DrugBank tables"""
    tables = {}
    csv_files = Path(DRUGBANK_TABLES_DIR).glob('*.csv')
    
    for file_path in csv_files:
        table_name = file_path.stem
        tables[table_name] = pd.read_csv(file_path)
    
    # print("tables", tables)
    return tables

In [7]:
def get_relevant_table_content(tables, table_names, max_tokens=5000):
    """Extract relevant content from tables for context with token budget control"""
    print("Debug - Available tables:", tables.keys())
    print("Debug - Looking for table_names:", table_names)

    # Filter to only tables that exist
    available_tables = [name.replace('.csv', '') for name in table_names if name.replace('.csv', '') in tables]
    
    if not available_tables:
        print("Debug - No matching tables found")
        return {}
    
    # Calculate token budget per table
    tokens_per_table = max_tokens // len(available_tables)
    
    table_content = {}
    for table_name in available_tables:
        df = tables[table_name]
        columns = list(df.columns)
        
        # Add rows until we reach the token limit for this table
        rows = []
        total_tokens = 0
        
        for i, row in df.iterrows():
            row_dict = row.to_dict()
            # Add table name to each record
            row_str = str(row_dict)
            row_tokens = len(row_str) // 4  # Approximate token count
            
            if total_tokens + row_tokens <= tokens_per_table:
                rows.append(row_dict)
                total_tokens += row_tokens
            else:
                break
        
        table_content[table_name] = {
            'table_name': table_name,
            'columns': columns,
            'sample': rows
        }
        
    return table_content

def generate_questions_for_passage(passage_id, passage_text, tables, relevant_table_names, model="gpt-4o"):
    """Generate questions for a given passage and its relevant tables using LLM"""

    # print("tables", tables)
    # print("relevant_table_names", relevant_table_names)
    
    # Limit passage length if too long (e.g., first 1000 characters)
    passage_text = passage_text[:1000] + "..." if len(passage_text) > 1000 else passage_text
    
    # Limit to maximum 3 relevant tables
    relevant_table_names = relevant_table_names[:3]
    
    # Get relevant table content
    table_content = get_relevant_table_content(tables, relevant_table_names)

    
    prompt = f"""
    Given the following passage and related tables, generate 1-2 meaningful question-answer pairs.
    IMPORTANT: Each question MUST be answerable using information from BOTH the passage AND the tables together.
    Only generate questions that require combining information from both sources. Try and make the question as difficult and technical as possible.
    Focus on pharmaceutical and medical aspects.
    
    
    Passage (ID: {passage_id}):
    {passage_text}
    
    Related Tables:
    {json.dumps(table_content, indent=2)}
    
    Generate questions in the following format:
    1. question: [specific question about drug/treatment]
       answer: [detailed answer combining information from passage and tables]
       text: [passage ID if information from passage was used (e.g. Target-123456789)]
       table: [table name if information from table was used (e.g. drugbank-drug)]
    
    Ensure every question uses information from both the passage and at least one table.
    """
    
    # Call your LLM here with the prompt
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a medical and pharmaceutical expert tasked with generating detailed question-answer pairs about drugs and treatments."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )

    print(response.choices[0].message.content)
    
    # Parse the response into structured QA pairs
    qa_pairs = parse_llm_response(response.choices[0].message.content, passage_id, relevant_table_names)
    return qa_pairs

def parse_llm_response(response_text, passage_id, relevant_table_names):
    """Parse the LLM response into structured QA pairs"""
    qa_pairs = []
    
    # Split the response into individual QA entries
    entries = response_text.strip().split('\n\n')
    
    for entry in entries:
        if not entry.strip():
            continue
            
        lines = entry.strip().split('\n')
        current_qa = {
            'question': '',
            'answer': '',
            'text': passage_id,
            'table': relevant_table_names  # Default value
        }
        
        for line in lines:
            line = line.strip()
            # Skip empty lines and numbering
            if not line or line.replace('.', '').strip().isdigit():
                continue
                
            # Parse each field using more robust splitting
            if 'question:' in line:
                current_qa['question'] = line.split('question:', 1)[1].strip()
            elif 'answer:' in line:
                current_qa['answer'] = line.split('answer:', 1)[1].strip()
            elif 'text:' in line:
                current_qa['text'] = line.split('text:', 1)[1].strip()
            elif 'table:' in line:
                table_value = line.split('table:', 1)[1].strip()
                # Handle NA, N/A, None cases
                current_qa['table'] = 'None' if table_value.upper() in ['NA', 'N/A', 'NONE'] else table_value
        
        # Only add complete QA pairs that have both question and answer
        if current_qa['question'] and current_qa['answer']:
            qa_pairs.append(current_qa.copy())  # Use copy to avoid reference issues
    
    return qa_pairs

In [8]:
def verify_qa_uses_both_sources(qa_pair, passage_content, table_content, model="gpt-4o"):
        prompt = f"""
        Your task is to determine if the provided answer to a question uses information from BOTH the passage AND the table.
        
        Question: {qa_pair['question']}
        Answer: {qa_pair['answer']}
        
        Passage:
        {passage_content[:2000]}  # Limiting to first 2000 chars for token constraints
        
        Table:
        {json.dumps(table_content, indent=2)}
        
        Does the answer draw specific information from BOTH the passage AND the table? Reply with just "YES" or "NO".
        If the answer only uses information from one source or could be answered using only the passage or only the table, reply "NO".
        """
        
        # Call the LLM for verification
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a verification expert who determines if answers draw from multiple data sources."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1
        )
        
        verification = response.choices[0].message.content.strip().upper()
        return verification
    

In [18]:
def main():
    # Load mappings and data
    print("Loading passage-table mappings...")
    passage_table_mapping = load_passage_table_mapping()
    
    print("Loading target passages...")
    passages = load_target_passages()
    
    print("Loading DrugBank tables...")
    tables = load_drugbank_tables()
    
    # Initialize output list
    qa_pairs = []
    
    # Process each passage with its relevant tables
    for passage_id, passage_text in tqdm(list(passages.items())[:40]):
        if passage_id in passage_table_mapping:
            relevant_tables = passage_table_mapping[passage_id]
            # print(passage_id, passage_text, tables, relevant_tables)
            
            # Generate QA pairs using the passage and its relevant tables
            new_qa_pairs = generate_questions_for_passage(
                passage_id,
                passage_text,
                tables,
                relevant_tables
            )

            # Get relevant table content
            table_content = get_relevant_table_content(tables, relevant_tables)

            # Verify that the answer uses both sources
            for qa_pair in new_qa_pairs:
                verification = verify_qa_uses_both_sources(qa_pair, passage_text, table_content)
                if verification == "YES":
                    qa_pairs.append(qa_pair)
            
        else:
            print(f"No table mapping found for passage {passage_id}")
    
    # Save results
    with open(OUTPUT_FILE, 'w', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(['question', 'answer', 'text', 'table'])  # header
        for qa_pair in qa_pairs:
            writer.writerow([
                qa_pair['question'],
                qa_pair['answer'],
                qa_pair['text'],
                qa_pair['table']
            ])
        
    print(f"Generated {len(qa_pairs)} question-answer pairs")

In [19]:
if __name__ == "__main__":
    main()

Loading passage-table mappings...
Loading target passages...
Loading DrugBank tables...


  0%|          | 0/40 [00:00<?, ?it/s]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

  2%|▎         | 1/40 [00:22<14:41, 22.60s/it]

No table mapping found for passage Target-20133511
No table mapping found for passage Target-22294487
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_

 10%|█         | 4/40 [00:31<04:04,  6.79s/it]

No table mapping found for passage Target-10445394
No table mapping found for passage Target-20552178
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_

 18%|█▊        | 7/40 [00:39<02:28,  4.49s/it]

No table mapping found for passage Target-10234600
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions',

 22%|██▎       | 9/40 [00:47<02:14,  4.32s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 25%|██▌       | 10/40 [00:57<02:40,  5.36s/it]

No table mapping found for passage Target-10901704
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions',

 30%|███       | 12/40 [01:05<02:17,  4.92s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 32%|███▎      | 13/40 [01:18<02:58,  6.62s/it]

No table mapping found for passage Target-9630826
No table mapping found for passage Target-7648771
No table mapping found for passage Target-15585441
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drug

 42%|████▎     | 17/40 [01:28<01:40,  4.36s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 45%|████▌     | 18/40 [01:38<01:56,  5.30s/it]

No table mapping found for passage Target-17586865
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions',

 50%|█████     | 20/40 [01:47<01:40,  5.05s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 52%|█████▎    | 21/40 [01:55<01:46,  5.59s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 55%|█████▌    | 22/40 [02:03<01:49,  6.11s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 57%|█████▊    | 23/40 [02:14<02:03,  7.26s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 60%|██████    | 24/40 [02:24<02:04,  7.78s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 62%|██████▎   | 25/40 [02:31<01:57,  7.80s/it]

1. **Question:** What are the K(m) values for the hOat2-mediated uptake of theophylline and erythromycin, and how does this relate to the known action of transporters listed in the drugbank-transporters table?
   **Answer:** The K(m) values for the hOat2-mediated uptake of theophylline and erythromycin are 12.6 µM and 18.5 µM, respectively. These values indicate the affinity of hOat2 for these substrates, with a lower K(m) value suggesting a higher affinity. In the drugbank-transporters table, various transporters, such as "Solute carrier family 22 member 2" (id: BE0003647), are listed with "unknown" known actions, indicating that the specific interactions and affinities for many transporters are not fully characterized, similar to the less defined role of hOat2 in drug interactions.
   **Text:** Target-15708966
   **Table:** drugbank-transporters

2. **Question:** How does the interaction between theophylline and erythromycin mediated by hOat2 compare with the types of drugs listed un

 65%|██████▌   | 26/40 [02:40<01:51,  7.98s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 68%|██████▊   | 27/40 [02:48<01:44,  8.07s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 70%|███████   | 28/40 [02:58<01:41,  8.43s/it]

No table mapping found for passage Target-3878759
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 

 75%|███████▌  | 30/40 [03:08<01:09,  6.99s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 78%|███████▊  | 31/40 [03:16<01:04,  7.17s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 80%|████████  | 32/40 [03:23<00:57,  7.23s/it]

No table mapping found for passage Target-8591723
No table mapping found for passage Target-26952092
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_e

 88%|████████▊ | 35/40 [03:33<00:25,  5.15s/it]

No table mapping found for passage Target-8750925
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 

 92%|█████████▎| 37/40 [03:45<00:16,  5.45s/it]

Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions', 'drugbank-drug_external_identifiers', 'drugbank-en

 95%|█████████▌| 38/40 [03:54<00:12,  6.07s/it]

No table mapping found for passage Target-15482171
Debug - Available tables: dict_keys(['drugbank-targets_polypeptides_ext_id', 'drugbank-drug_prices', 'drugbank-drugs_links', 'drugbank-drug_carriers_articles', 'drugbank-drug_experimental_properties', 'drugbank-transporters_polypeptides_go', 'drugbank-drug_enzymes_links', 'drugbank-drug_atc_codes', 'drugbank-drug_carriers_attachments', 'drugbank-transporters_polypeptides', 'drugbank-carriers_polypeptides', 'drugbank-query_ddi_table', 'drugbank-enzymes_polypeptides_ext_id', 'drugbank-drug_international_brands', 'drugbank-targets_polypeptides_syn', 'drugbank-transporters_polypeptides_ext_id', 'drugbank-drug_reactions_enzymes', 'drugbank-carriers_polypeptides_syn', 'drugbank-drug_groups', 'drugbank-drug_targ_links', 'drugbank-carriers_polypeptides_pfams', 'drugbank-transporters_polypeptides_pfams', 'drugbank-enzymes_polypeptides_syn', 'drugbank-enzymes_polypeptides_pfams', 'drugbank-drug_external_links', 'drugbank-drug_food_interactions',

100%|██████████| 40/40 [04:03<00:00,  6.09s/it]

Generated 15 question-answer pairs





In [21]:
def paraphrase_questions(input_csv, output_csv, model="gpt-4o"):
    """
    Paraphrase questions to remove references to specific tables or passages
    
    Args:
        input_csv: Path to input CSV file with original questions
        output_csv: Path to output CSV file for paraphrased questions
        model: LLM model to use for paraphrasing
    """
    # Read the input CSV
    df = pd.read_csv(input_csv, quoting=csv.QUOTE_ALL)
    
    # New list to hold the updated rows
    paraphrased_data = []
    
    # Process each question
    for _, row in tqdm(list(df.iterrows()), desc="Paraphrasing questions"):
        question = row['question']
        
        # Create prompt for the LLM
        prompt = f"""
        Your task is to paraphrase the following question to remove references to specific tables, datasets, or passages.
        If the question is solely about dataset structure or tables (e.g., "Does the dataset provide any links..."), return "REMOVE" to indicate it should be filtered out.
        Otherwise, paraphrase the question to make it more general while preserving its core content.

        Examples:
        - "Does the data set provide any links or references for Flumazenil in the drugbank-drugs_links table?" → "REMOVE"
        - "Which muscarinic receptor antagonist examined in the passage has affinity values closest to that of atropine and is also a biotech drug listed in the drugbank-drug table?" → "Which muscarinic receptor antagonist has affinity values closest to that of atropine and is also a biotech drug?"
        - "What is the significance of identifying Lys 190 as the modification site of HSA in the study, and how does it contrast with any protein-related drugs in the tables?" → "What is the significance of identifying Lys 190 as the modification site of HSA, and how does it contrast with any protein-related drugs?"

        Original question: {question}
        Paraphrased question:
        """
        
        # Call the LLM for paraphrasing
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert in reformulating pharmaceutical questions to make them more general and independent of specific data sources."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        
        paraphrased = response.choices[0].message.content.strip()
        
        # Skip questions that should be removed
        if paraphrased == "REMOVE":
            continue
            
        # Add the paraphrased question and original answer/text/table to the new data
        paraphrased_data.append({
            'question': paraphrased,
            'answer': row['answer'],
            'text': row['text'],
            'table': row['table']
        })
    
    # Write the paraphrased data to the output CSV
    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(['question', 'answer', 'text', 'table'])  # header
        for item in paraphrased_data:
            writer.writerow([
                item['question'],
                item['answer'],
                item['text'],
                item['table']
            ])
    
    print(f"Processed {len(df)} questions, kept {len(paraphrased_data)} after paraphrasing and filtering")
    return paraphrased_data

In [22]:
paraphrased_data = paraphrase_questions('both_output.gt', 'both_output_paraphrased.gt')

Paraphrasing questions: 100%|██████████| 15/15 [00:13<00:00,  1.08it/s]

Processed 15 questions, kept 15 after paraphrasing and filtering





In [23]:

def load_ground_truth(gt_file: str):
    """Load ground truth data from CSV file."""
    gt_data = []
    with open(gt_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for row in reader:
            gt_data.append(row)
    return gt_data

def generate_llm_answer(question: str, model: str = "gpt-4o"):
    """
    Generate an answer to a question using an LLM.
    
    Args:
        question: The question to answer
        model: Model to use for generation
        
    Returns:
        The generated answer
    """
    try:
        # Create a simple prompt with just the question
        prompt = f"Answer this question with detailed information: {question}"
        print(f"Prompt: {prompt}")
        
        # Call OpenAI API
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Use low temperature for more consistent answers
            max_tokens=500
        )
        
        # Extract the response text
        return response.choices[0].message.content.strip()
            
    except Exception as e:
        print(f"Error generating answer: {e}")
        time.sleep(2)  # Sleep to handle rate limits
        return ""

def calculate_llm_correctness(hypothesis: str, reference: str, question: str, model: str = "gpt-4o"):
    """
    Use an LLM to evaluate the correctness of the hypothesis compared to the reference.
    
    Args:
        hypothesis: The system-generated answer
        reference: The ground truth answer
        question: The original question
        model: Model to use for evaluation
        
    Returns:
        A score between 0 and 1 representing correctness (1 = fully correct, 0 = incorrect)
    """
    try:
        # Create prompt for the LLM
        prompt = f"""
You are an expert evaluator assessing the correctness of an answer to a question.

Question: {question}

Ground Truth Answer: {reference}

System Answer: {hypothesis}

Evaluate how correct the System Answer is compared to the Ground Truth Answer. Be very critical in your evaluation/analysis.
Give a score from 0 to 1 where:
- 1.0 means the System Answer is fully correct and contains all the information from the Ground Truth
- 0.0 means the System Answer is completely incorrect
- Values between 0 and 1 indicate partial correctness

Output a single line with just the score as a decimal between 0 and 1.
"""

        # Call OpenAI API
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0,  # Use low temperature for more consistent evaluations
            max_tokens=300
        )
        
        # Extract the response text
        response_text = response.choices[0].message.content.strip()
        
        # Extract the score from the response - find the last number between 0 and 1
        score_matches = re.findall(r'(?:^|\s)(0(?:\.\d+)?|1(?:\.0+)?)(?:$|\s)', response_text)
        if score_matches:
            score = float(score_matches[-1])  # Take the last match as the final score
            return min(max(score, 0.0), 1.0)  # Ensure score is between 0 and 1
        else:
            print(f"Could not extract a score from LLM response: {response_text}")
            return 0.0
            
    except Exception as e:
        print(f"Error in LLM evaluation: {e}")
        time.sleep(2)  # Sleep to handle rate limits
        return 0.0

def filter_qa_pairs(gt_data,  llm_model: str = "gpt-4o", threshold: float = 0.5):
    """
    Filter question-answer pairs based on LLM correctness score.
    
    Args:
        gt_data: List of dictionaries containing ground truth data
        llm_model: Model to use for generation and evaluation
        threshold: Correctness threshold above which pairs are removed
        
    Returns:
        Filtered list of dictionaries
    """
    filtered_data = []
    
    for qa_pair in tqdm(gt_data, desc="Filtering QA pairs"):
        question = qa_pair['question']
        reference_answer = qa_pair['answer']
        
        # Generate an answer using just the question
        llm_answer = generate_llm_answer(question)
        
        # If we couldn't get an answer, keep the pair
        if not llm_answer:
            filtered_data.append(qa_pair)
            continue
        
        # Evaluate the correctness
        correctness_score = calculate_llm_correctness(
            llm_answer, reference_answer, question
        )
        
        print(f"Question: {question}")
        print(f"LLM Answer: {llm_answer[:100]}...")
        print(f"Correctness Score: {correctness_score}")
        
        # Keep pairs that the LLM couldn't answer correctly
        if correctness_score <= threshold:
            filtered_data.append(qa_pair)
    
    return filtered_data

def save_filtered_data(filtered_data, output_file):
    """Save filtered data to a CSV file."""
    if not filtered_data:
        print("No data to save!")
        return
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=filtered_data[0].keys(), quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for row in filtered_data:
            writer.writerow(row)
    


In [24]:
# Load ground truth data
print(f"Loading ground truth data")
paraphrased_output_file = OUTPUT_FILE.replace(".gt", "_paraphrased.gt")
gt_data = load_ground_truth(paraphrased_output_file)
print(f"Loaded {len(gt_data)} QA pairs")

# Filter pairs
filtered_data = filter_qa_pairs(gt_data, 0.5)
print(f"Filtered to {len(filtered_data)} QA pairs")

filtered_output_file = OUTPUT_FILE.replace(".gt", "_correctness_filtered.gt")
# Save filtered data
save_filtered_data(filtered_data, filtered_output_file)
print(f"Saved filtered data to {filtered_output_file}")

# generate_llm_answer("What is the main target of the drug?")

Loading ground truth data
Loaded 15 QA pairs


Filtering QA pairs:   0%|          | 0/15 [00:00<?, ?it/s]

Prompt: Answer this question with detailed information: How does the involvement of the bile salt export pump (BSEP) relate to the hepatocanalicular transport mechanisms affected by bosentan, and what evidence supports this?


Filtering QA pairs:   7%|▋         | 1/15 [00:07<01:50,  7.89s/it]

Question: How does the involvement of the bile salt export pump (BSEP) relate to the hepatocanalicular transport mechanisms affected by bosentan, and what evidence supports this?
LLM Answer: The bile salt export pump (BSEP), also known as ABCB11, is a crucial transporter protein located in ...
Correctness Score: 0.8
Prompt: Answer this question with detailed information: Which specific transporter is likely associated with the observed increase in serum bile salt levels in humans, considering the safety analysis of bosentan from clinical trials?


Filtering QA pairs:  13%|█▎        | 2/15 [00:12<01:16,  5.85s/it]

Question: Which specific transporter is likely associated with the observed increase in serum bile salt levels in humans, considering the safety analysis of bosentan from clinical trials?
LLM Answer: Bosentan is a dual endothelin receptor antagonist used primarily in the treatment of pulmonary arter...
Correctness Score: 1.0
Prompt: Answer this question with detailed information: What is the relationship between glyburide's mechanism of action in modulating ion transport and its potential application in sepsis, and how does it compare to other drugs in terms of target specificity?


Filtering QA pairs:  20%|██        | 3/15 [00:21<01:30,  7.51s/it]

Question: What is the relationship between glyburide's mechanism of action in modulating ion transport and its potential application in sepsis, and how does it compare to other drugs in terms of target specificity?
LLM Answer: Glyburide, also known as glibenclamide, is a sulfonylurea class drug primarily used to treat type 2 ...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: How might the absence of specific references impact the validation and acceptance of metoprolol's off-label use for supraventricular tachycardia?


Filtering QA pairs:  27%|██▋       | 4/15 [00:51<02:59, 16.35s/it]

Question: How might the absence of specific references impact the validation and acceptance of metoprolol's off-label use for supraventricular tachycardia?
LLM Answer: The absence of specific references can significantly impact the validation and acceptance of metopro...
Correctness Score: 0.8
Prompt: Answer this question with detailed information: Which drug is associated with a vitamin K-dependent gamma-glutamyl carboxylase enzyme process in the liver, and what is the known target of this drug?


Filtering QA pairs:  33%|███▎      | 5/15 [00:58<02:07, 12.74s/it]

Question: Which drug is associated with a vitamin K-dependent gamma-glutamyl carboxylase enzyme process in the liver, and what is the known target of this drug?
LLM Answer: The drug associated with the vitamin K-dependent gamma-glutamyl carboxylase enzyme process in the li...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: Considering the affinities of amoxapine and loxapine for various receptors, which drugs could potentially have a similar mechanism of action based on known receptor targets, and why?


Filtering QA pairs:  40%|████      | 6/15 [01:05<01:38, 10.95s/it]

Question: Considering the affinities of amoxapine and loxapine for various receptors, which drugs could potentially have a similar mechanism of action based on known receptor targets, and why?
LLM Answer: Amoxapine and loxapine are both antipsychotic medications that belong to the dibenzoxazepine class. ...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: What is the potential therapeutic application of nicotine derivatives for Alzheimer's disease, and which drug is related to a similar target in humans?


Filtering QA pairs:  47%|████▋     | 7/15 [01:14<01:21, 10.15s/it]

Question: What is the potential therapeutic application of nicotine derivatives for Alzheimer's disease, and which drug is related to a similar target in humans?
LLM Answer: The potential therapeutic application of nicotine derivatives for Alzheimer's disease (AD) is primar...
Correctness Score: 0.7
Prompt: Answer this question with detailed information: What role does the Solute carrier organic anion transporter family member 1B3 (SLCO1B3) play in relation to SLCO1B3 polymorphism, and how is this connected to androgen-deprivation therapy in patients with advanced prostate cancer?


Filtering QA pairs:  53%|█████▎    | 8/15 [01:22<01:07,  9.67s/it]

Question: What role does the Solute carrier organic anion transporter family member 1B3 (SLCO1B3) play in relation to SLCO1B3 polymorphism, and how is this connected to androgen-deprivation therapy in patients with advanced prostate cancer?
LLM Answer: The Solute Carrier Organic Anion Transporter Family Member 1B3 (SLCO1B3) is a protein that plays a c...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: How does the SLCO1B3 polymorphism influence the duration to androgen independence in prostate cancer patients receiving ADT, and what is known about the action of this transporter?


Filtering QA pairs:  60%|██████    | 9/15 [01:39<01:11, 11.96s/it]

Question: How does the SLCO1B3 polymorphism influence the duration to androgen independence in prostate cancer patients receiving ADT, and what is known about the action of this transporter?
LLM Answer: The SLCO1B3 gene encodes a liver-specific organic anion transporting polypeptide, OATP1B3, which is ...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: What is the primary mechanism of action of chlorambucil in chronic lymphocytic leukemia (CLL), and how does this compare to the biological targets of other drugs?


Filtering QA pairs:  67%|██████▋   | 10/15 [01:49<00:56, 11.22s/it]

Question: What is the primary mechanism of action of chlorambucil in chronic lymphocytic leukemia (CLL), and how does this compare to the biological targets of other drugs?
LLM Answer: Chlorambucil is a chemotherapy drug primarily used in the treatment of chronic lymphocytic leukemia ...
Correctness Score: 0.8
Prompt: Answer this question with detailed information: What are the different types of drugs that could potentially overcome chlorambucil resistance, which involves elevated glutathione S-transferase (GST) activity, through alternative mechanisms?


Filtering QA pairs:  73%|███████▎  | 11/15 [01:59<00:44, 11.05s/it]

Question: What are the different types of drugs that could potentially overcome chlorambucil resistance, which involves elevated glutathione S-transferase (GST) activity, through alternative mechanisms?
LLM Answer: Chlorambucil is a chemotherapy drug used primarily in the treatment of chronic lymphocytic leukemia ...
Correctness Score: 0.5
Prompt: Answer this question with detailed information: How does the interaction of vecuronium with muscarinic M2 receptors relate to its cardiovascular side effects, and where can one find the official label for a drug with similar characteristics?


Filtering QA pairs:  80%|████████  | 12/15 [02:08<00:31, 10.42s/it]

Question: How does the interaction of vecuronium with muscarinic M2 receptors relate to its cardiovascular side effects, and where can one find the official label for a drug with similar characteristics?
LLM Answer: Vecuronium is a non-depolarizing neuromuscular blocking agent primarily used to induce muscle relaxa...
Correctness Score: 0.7
Prompt: Answer this question with detailed information: Considering the allosteric regulation of muscarinic M2 receptors by neuromuscular blocking drugs, what can be inferred about the role of Chinese hamster ovary (CHO) cells in the study?


Filtering QA pairs:  87%|████████▋ | 13/15 [02:15<00:18,  9.36s/it]

Question: Considering the allosteric regulation of muscarinic M2 receptors by neuromuscular blocking drugs, what can be inferred about the role of Chinese hamster ovary (CHO) cells in the study?
LLM Answer: The study of allosteric regulation of muscarinic M2 receptors by neuromuscular blocking drugs often ...
Correctness Score: 0.7
Prompt: Answer this question with detailed information: What is the significance of P-glycoprotein (P-gp) in the transport of nadolol, and how is it typically represented in databases?


Filtering QA pairs:  93%|█████████▎| 14/15 [02:24<00:09,  9.31s/it]

Question: What is the significance of P-glycoprotein (P-gp) in the transport of nadolol, and how is it typically represented in databases?
LLM Answer: P-glycoprotein (P-gp), also known as multidrug resistance protein 1 (MDR1) or ATP-binding cassette s...
Correctness Score: 0.3
Prompt: Answer this question with detailed information: How does the potential for resistance to fluoroquinolones in Streptococcus pneumoniae with a preexisting parC mutation relate to the presence of any specific targets?


Filtering QA pairs: 100%|██████████| 15/15 [02:38<00:00, 10.55s/it]

Question: How does the potential for resistance to fluoroquinolones in Streptococcus pneumoniae with a preexisting parC mutation relate to the presence of any specific targets?
LLM Answer: The potential for resistance to fluoroquinolones in *Streptococcus pneumoniae* is a significant conc...
Correctness Score: 0.8
Filtered to 7 QA pairs
Saved filtered data to both_output_correctness_filtered.gt



