### Hemonc

In [19]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from Bio import Entrez
from urllib.error import HTTPError
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

In [20]:
# Columns kept from each raw HemOnc table.
columns_ref = ['study', 'condition', 'pmid', 'pub.date']
columns_result = ['study', 'condition', 'regimen', 'comparator', 'efficacy']
columns_indication = ['study', 'condition', 'stage_or_status']


def preprocess(df, col):
    """Return the `col` columns of `df`, cleaned and in a stable order.

    Rows with a missing value in any selected column are dropped, exact
    duplicate rows are removed, the rows are sorted by `col`, and the index
    is renumbered from zero.
    """
    selected = df[col]
    selected = selected.dropna()
    selected = selected.drop_duplicates()
    selected = selected.sort_values(col)
    return selected.reset_index(drop=True)


ref = preprocess(pd.read_csv("Data/Raw/Hemonc/ref.table.csv"), columns_ref)
result = preprocess(pd.read_csv("Data/Raw/Hemonc/study_results.csv"), columns_result)
indication = preprocess(pd.read_csv("Data/Raw/Hemonc/indications.csv"), columns_indication)

# Raw efficacy string -> standardized label.
efficacy = pd.read_excel("Data/Raw/Hemonc/efficacy.xlsx")
raw2std = dict(zip(efficacy['efficacy_raw'], efficacy['efficacy_std']))
hedge = 'Might Be '
efficacy2label = {}
for raw_label, std_label in raw2std.items():
    # Labels containing the hedge lose their first len(hedge) characters
    # (presumably the hedge is always a leading prefix -- TODO confirm).
    if hedge in std_label:
        efficacy2label[raw_label] = std_label[len(hedge):]
    else:
        efficacy2label[raw_label] = std_label

# Raw stage/status string -> standardized label.
stage = pd.read_excel("Data/Raw/Hemonc/stage.xlsx")
stage2label = dict(zip(stage['stage_raw'], stage['stage_std']))

In [21]:
# (study, condition) -> set of PMIDs listed for that study in `ref`.
study2pmids = defaultdict(set)
for _, row in ref.iterrows():
    key = (row['study'], row['condition'])
    study2pmids[key].add(row['pmid'])

# (study, condition) -> set of (regimen, comparator, label) triples.
# Each comparison is stored in both directions, with the label mirrored.
study2result = defaultdict(set)
for _, row in result.iterrows():
    key = (row['study'], row['condition'])
    if key not in study2pmids:
        continue
    label = efficacy2label[row['efficacy']]
    if label == 'Other':
        continue
    if row['regimen'] == row['comparator']:
        continue
    # Mirror the label for the reversed comparison; labels containing neither
    # word are left as they are.
    if 'Inferior' in label:
        switched = label.replace('Inferior', 'Superior')
    else:
        switched = label.replace('Superior', 'Inferior')
    triples = study2result[key]
    triples.add((row['regimen'], row['comparator'], label))
    triples.add((row['comparator'], row['regimen'], switched))

# Forget the references of studies that ended up with no usable result.
for study in [k for k in study2pmids if k not in study2result]:
    study2pmids.pop(study)

# (study, condition) -> set of standardized stage/status labels.
study2stage = defaultdict(set)
for _, row in indication.iterrows():
    key = (row['study'], row['condition'])
    if key in study2result:
        study2stage[key].add(stage2label[row['stage_or_status']])

In [22]:
Entrez.email = "yuxin102@mit.edu"
def get_text(pmid, retries=50):
    """Fetch the title and abstract of a PubMed article.

    Args:
        pmid: PubMed identifier passed to ``Entrez.efetch``.
        retries: Maximum number of attempts when the server answers HTTP 429.

    Returns:
        A ``(pmid, doc)`` tuple where ``doc`` is the article title followed,
        when the record has an abstract, by its abstract sections, all joined
        with single spaces.

    Raises:
        Exception: On any HTTP error other than 429, or when every attempt
            was answered with HTTP 429.
    """
    for attempt in range(retries):
        try:
            handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="text")
            try:
                record = Entrez.read(handle)
            finally:
                # Release the connection even when parsing fails.
                handle.close()
            article = record['PubmedArticle'][0]['MedlineCitation']['Article']
            doc = article['ArticleTitle']
            if 'Abstract' in article:
                doc = ' '.join([doc] + article['Abstract']['AbstractText'])
            return pmid, doc
        except HTTPError as e:
            if e.code != 429:
                # Not a rate limit: retrying will not help, so fail right away
                # and keep the original error attached as the cause.
                raise Exception(f"Failed to fetch data for PMID {pmid}: HTTP {e.code}.") from e
            # Rate limited: wait a little longer after each failed attempt.
            time.sleep(attempt)
    # Every attempt was rate limited. Raise instead of silently returning
    # None, which callers would fail to unpack as (pmid, doc).
    raise Exception(f"Failed to fetch data for PMID {pmid} after {retries} retries.")

# Start from the on-disk cache of already fetched documents, when present.
path_doc = "Data/Raw/Hemonc/docs.csv"
pmid2doc = {}
if os.path.exists(path_doc):
    cached = pd.read_csv(path_doc)
    pmid2doc = dict(zip(cached['pmid'], cached['doc']))

# PMIDs still to fetch: those never cached plus those cached with a NaN doc.
wanted = {pmid for pmids in study2pmids.values() for pmid in pmids}
pmids_missing = wanted - set(pmid2doc)
pmids_missing |= {
    pmid for pmid, doc in pmid2doc.items()
    if not isinstance(doc, str) and np.isnan(doc)
}

if pmids_missing:
    pmid2doc_missing = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        futures = [pool.submit(get_text, pmid) for pmid in pmids_missing]
        for finished in tqdm(as_completed(futures), total=len(futures)):
            try:
                pmid, doc = finished.result()
            except Exception as err:
                # Best effort: report the failure and keep going.
                print(f"Error: {err}")
            else:
                pmid2doc_missing[pmid] = doc

    # Second, sequential attempt for any document that came back as NaN.
    for pmid, doc in pmid2doc_missing.items():
        if not isinstance(doc, str) and np.isnan(doc):
            pmid2doc_missing[pmid] = get_text(pmid)[1]

    # Merge the new documents and rewrite the whole cache file.
    pmid2doc.update(pmid2doc_missing)
    pmid2doc_df = pd.DataFrame(list(pmid2doc.items()), columns=['pmid', 'doc'])
    pmid2doc_df.to_csv(path_doc, index=False)

In [7]:
# Chat-completions client and the model name used by call_gpt.
openai_client = openai.OpenAI(api_key="")
openai_model = "gpt-4o"


def call_gpt(message_user, retries=50):
    """Send one user message to the chat model and return the reply text.

    Args:
        message_user: Content of the user turn.
        retries: Maximum number of attempts when the API reports a rate limit.

    Returns:
        The content of the first choice of the completion.

    Raises:
        Exception: Any non-rate-limit error is printed and re-raised as is;
            a generic Exception is raised once all attempts were rate limited.
    """
    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": message_user},
    ]
    attempt = 0
    while attempt < retries:
        try:
            response = openai_client.chat.completions.create(
                model=openai_model,
                messages=messages,
                max_tokens=4096,  # original inline note: 512
            )
            return response.choices[0].message.content
        except openai.RateLimitError:
            # Wait a little longer after each rate-limited attempt.
            time.sleep(attempt)
        except Exception as e:
            print(f"An error occurred: {e}")
            raise
        attempt += 1
    raise Exception(f"Failed to call GPT-4 after {retries} retries.")

In [10]:
import re  # imported in this cell so the reply parsing below does not depend on another cell

# Hand-written question templates. {REGIMEN}, {COMPARATOR} and {CONDITION} are
# filled in with str.format when the dataset is assembled.
templates_qn = [
    'Choose an option that best describes the efficacy of {REGIMEN} compared to {COMPARATOR} when used to treat {CONDITION}.',
    'Select the option that most accurately reflects the effectiveness of {REGIMEN} versus {COMPARATOR} in treating {CONDITION}.',
    'Which option best summarizes the comparative efficacy of {REGIMEN} and {COMPARATOR} for managing {CONDITION}?',
    'Identify the option that best summarizes the effectiveness of {REGIMEN} versus {COMPARATOR} in treating {CONDITION}.',
    'Which option most effectively illustrates the efficacy of {REGIMEN} when compared with {COMPARATOR} for {CONDITION}?'
]
# Prompt asking the model for paraphrases of the templates above.
# NOTE(review): the prompt requests 20 new versions but only the lines numbered
# 5-19 are parsed below -- confirm which of the two is intended.
rephrase = \
f'''
### Instruction
Do not respond to the question. 
Instead, rephrase the given question template into 20 other versions that are semantically equivalent.

## Version 0: {templates_qn[0]}
## Version 1: {templates_qn[1]}
## Version 2: {templates_qn[2]}
## Version 3: {templates_qn[3]}
## Version 4: {templates_qn[4]}
'''
# A missing reply is treated as empty text so the parsing below finds nothing
# instead of raising.
response = call_gpt(rephrase) or ''
matches = [re.search(fr"## Version {i}: (.*)", response) for i in range(5, 20)]

# Accept a generated template only if it can be used exactly like the
# hand-written ones: all three placeholders present, and no other field or
# stray brace that would make str.format raise later on.
required_fields = ('{REGIMEN}', '{COMPARATOR}', '{CONDITION}')
for found in matches:
    if found is None:
        continue
    candidate = found.group(1).strip()
    if not candidate:
        continue
    if not all(field in candidate for field in required_fields):
        continue
    try:
        candidate.format(REGIMEN='', COMPARATOR='', CONDITION='')
    except (KeyError, IndexError, ValueError):
        continue
    templates_qn.append(candidate)

In [23]:
# Answer label -> option number used in the exported dataset.
option2idx = {'superior':1, 'inferior':2, 'no difference':3}
dataset = []
for key, values in study2result.items():
    # Evidence is the text of every publication listed for the study. A
    # publication whose text could not be fetched (absent or NaN in pmid2doc)
    # would leave the question without its grounding, so the study is skipped.
    docs = [pmid2doc.get(pmid) for pmid in study2pmids[key]]
    if not all(isinstance(doc, str) for doc in docs):
        print(f"Skipping {key}: missing document text for at least one PMID.")
        continue
    evidence = '\n\n'.join(docs)
    # Distinct local names so the `stage` and `efficacy` tables loaded earlier
    # are not overwritten by loop variables.
    stage_suffix = '' if key not in study2stage else ' ({})'.format(', '.join(sorted(study2stage[key])))
    condition = key[1] + stage_suffix
    # Sorted because the iteration order of a set of string tuples changes
    # between interpreter runs (string hash randomization); sorting makes the
    # row order of the exported file reproducible.
    for regimen, comparator, label in sorted(values):
        if regimen == comparator: continue
        questions = [each.format(**{'REGIMEN':regimen, 'COMPARATOR':comparator, 'CONDITION':condition}) for each in templates_qn]
        answer = option2idx[label.lower()]
        dataset.append((evidence, *questions, answer))

columns = ['evidence'] + [f'question {i}' for i in range(1, len(templates_qn)+1)] + ['answer']
dataset = pd.DataFrame(dataset, columns=columns)
# keep=False drops every row that shares its evidence and questions with
# another row, so no copy of such a row survives.
dataset = dataset.dropna().drop_duplicates(subset=columns[:-1], keep=False).reset_index(drop=True)
# One constant column per answer option, holding the option text.
for option, idx in option2idx.items():
    dataset[f'option {idx}'] = option
dataset.to_csv('Data/Input/Hemonc.csv', index=False)

In [4]:
# Bare expression: the notebook renders the `dataset` DataFrame as the cell output.
dataset

Unnamed: 0,evidence,question 1,question 2,question 3,question 4,question 5,question 6,question 7,question 8,question 9,...,question 15,question 16,question 17,question 18,question 19,question 20,answer,option 1,option 2,option 3
0,"A double-blind, placebo-controlled, randomized...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Cisplati...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Cisplatin an...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
1,"A double-blind, placebo-controlled, randomized...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Cisplati...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,"Which selection best reflects how Cisplatin, T...",Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
2,Thalidomide maintenance treatment increases pr...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Interfer...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Interferon a...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
3,Thalidomide maintenance treatment increases pr...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Interfer...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Interferon a...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
4,"Randomized, Double-Blind, Placebo-Controlled P...",Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Placebo ...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Placebo perf...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6207,Adjuvant Abemaciclib Plus Endocrine Therapy fo...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Abemacic...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Abemaciclib ...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
6208,Topotecan versus paclitaxel for the treatment ...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Paclitax...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Paclitaxel m...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
6209,Topotecan versus paclitaxel for the treatment ...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how Topoteca...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how Topotecan mo...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,1,superior,inferior,no difference
6210,Bendamustine prolongs progression-free surviva...,Choose an option that best describes the effic...,Select the option that most accurately reflect...,Which option best summarizes the comparative e...,Identify the option that best summarizes the e...,Which option most effectively illustrates the ...,Pick the option that best depicts how CMF comp...,Select the statement that most clearly describ...,Which choice best compares the treatment effic...,Find the option that best outlines the compara...,...,Point out the choice that best reflects the ef...,Select the option that best indicates the effe...,Choose the answer that most accurately showcas...,Which selection best reflects how CMF performs...,Identify the option that most precisely illust...,What is the best choice that outlines the diff...,2,superior,inferior,no difference
