In [None]:
import openai
import pandas as pd
from collections import defaultdict
import json
import os

from src.utils import call_gpt3_api, call_gpt4_api
from src.mcq_frq import *
from src.eval import eval_experiment
from tqdm.auto import tqdm

In [None]:
# Setup OpenAI API credentials
# Replace these paths with the files holding your OpenAI key and organization ID.
# The value is taken from the first line of each file (surrounding whitespace
# stripped). Context managers close each handle as soon as it has been read
# instead of leaving it open for the lifetime of the kernel.
with open("../keys/openai_key.txt", "r") as key_file:
    openai.api_key = key_file.readlines()[0].strip()

with open("../keys/rajpurkarlab_org_id.txt", "r") as org_file:
    openai.organization = org_file.readlines()[0].strip()

In [None]:
# Load the tab-separated dataset file into a DataFrame.
dataset = pd.read_csv(
    "./data/dataset_final.tsv",
    sep="\t",
)

# Experiment

In [None]:
def vignette_experiment(all_cases, case, case_idx, experiment_name, gpt_model_type, n_trials=10):
    """Query a GPT model repeatedly with the prompt for one case and one experiment.

    Args:
        all_cases: Passed through to ``get_choices`` for the "mcq_4" experiment.
        case: Case content handed to the prompt builder.
        case_idx: Passed through to ``get_choices`` for the "mcq_4" experiment.
        experiment_name: One of "mcq_4", "mcq_many" or "frq"; any other value
            raises ``KeyError`` when the prompt builder is looked up.
        gpt_model_type: "gpt-3.5" or "gpt-4"; selects the API helper to call.
        n_trials: Number of independent API calls to make with the same prompt.

    Returns:
        A list with one entry per trial, each being whatever the selected API
        helper returned for that call.
    """
    prompt_builders = {'mcq_4': get_mcq_prompt, 'mcq_many': get_mcq_prompt, 'frq': get_frq_prompt}
    api_by_model = {"gpt-3.5": call_gpt3_api, "gpt-4": call_gpt4_api}

    if experiment_name == "mcq_4" or experiment_name == "mcq_many":
        # Both multiple-choice variants share a prompt builder and differ only
        # in where the answer options come from.
        if experiment_name == "mcq_4":
            answer_options = get_choices(all_cases, case_idx)
        else:
            answer_options = get_all_choices()
        prompt_text = prompt_builders[experiment_name](case, answer_options)
    else:
        prompt_text = prompt_builders[experiment_name](case)

    # The whole prompt is sent as a single system message.
    messages = [{"role": "system", "content": prompt_text}]

    # One request per trial; the model helper is resolved on every iteration,
    # so nothing is looked up when n_trials is 0.
    return [
        api_by_model[gpt_model_type](messages, n_responses=1)
        for _ in range(n_trials)
    ]

In [None]:
### Run MCQ-4, MCQ-many and FRQ for all case vignettes in the dataset.

keys = ["mcq_4", "mcq_many", "frq"]
res = {"gpt-3.5":{}, "gpt-4":{}}

# `Series.items()` is a lazy iterator without a length, so the row count is
# passed as `total`; otherwise the progress bar cannot show a percentage or ETA.
for case_idx, case_id in tqdm(dataset.case_id.items(), total=len(dataset)):
    # Create an empty result slot per experiment for every model.
    for model_name in res:
        res[model_name][case_id] = {key: {"responses": None} for key in keys}
    case_desc = dataset.loc[case_idx, "case_desc"]

    # For each experiment, query every model (in the order the models appear
    # in `res`) with 10 trials on this case.
    for key in keys:
        for model_name in res:
            res[model_name][case_id][key]["responses"] = vignette_experiment(
                dataset, case_desc, case_idx, key, model_name, n_trials=10
            )

# Evaluation

In [None]:
### Run evaluation on all experiments (MCQ-4, MCQ-many, FRQ) in the dataset.

# Collect evaluations in fresh per-model dicts. `res.copy()` is only a shallow
# copy: the inner per-model dicts would be shared with `res`, so each
# assignment below would overwrite the raw responses stored in `res` and a
# re-run of this cell would evaluate already-evaluated entries.
# NOTE(review): if `eval_experiment` itself modifies its first argument in
# place, `res` is still affected -- confirm against src.eval.
evaluated_res = {model_name: {} for model_name in res}
for case in tqdm(dataset.case_id):
    for model_name, model_res in res.items():
        evaluated_res[model_name][case] = eval_experiment(
            model_res[case],
            case_id=case,
            exp_keys=["mcq_4", "mcq_many", "frq"],
            method="autoeval",
            full_dataset=dataset,
            n_trials=10,
        )

# Save Result

In [None]:
# Write the evaluated results to disk. The context manager flushes and closes
# the file deterministically instead of leaving that to garbage collection.
with open("./results/final_results_vignette.json", "w") as results_file:
    json.dump(evaluated_res, results_file)