In [None]:
import copy
import json
import os
from collections import defaultdict

import openai
import pandas as pd

# Local imports keep their original relative order: the star import can shadow
# names, so anything imported after it originally must still come after it.
from src.utils import call_gpt3_api, call_gpt4_api
from src.mcq_frq import *
from src.eval import eval_experiment
from tqdm.auto import tqdm

In [None]:
# Set up OpenAI API credentials (used by both the GPT-3.5 and GPT-4 calls below).
# Each secret is the first line of its key file; the files are read through
# context managers so the handles are closed as soon as the value is loaded.
with open("../keys/openai_key.txt", "r") as key_file:
    openai.api_key = key_file.readline().strip()
if not openai.api_key:
    raise ValueError("../keys/openai_key.txt is empty; expected the API key on its first line")

with open("../keys/rajpurkarlab_org_id.txt", "r") as org_file:
    openai.organization = org_file.readline().strip()
if not openai.organization:
    raise ValueError("../keys/rajpurkarlab_org_id.txt is empty; expected the organization id on its first line")

In [None]:
# Locations of the raw conversation files, one JSON file per model.
path_gpt3, path_gpt4 = (
    "./results/conversations_raw/conversations_gpt3.json",
    "./results/conversations_raw/conversations_gpt4.json",
)

In [None]:
# Load the case table from its tab-separated file. Later cells read the
# `case_id`, `age`, `sex` and `physical_exam` columns from it.
dataset = pd.read_csv(
    filepath_or_buffer="./data/dataset_final.tsv",
    delimiter="\t",
)

# Experiment

In [None]:
def singleturn_withPE_experiment(all_cases, singleturn, case_idx, experiment_name, gpt_model_type):
    """Ask one GPT model a single question built from a case plus its physical exam.

    The prompt is assembled from the `singleturn` text and the case's age, sex
    and physical-exam fields, then sent as a lone system message.

    Args:
        all_cases: Table indexed with ``.loc[case_idx, column]``; must provide
            the "age", "sex" and "physical_exam" columns.
        singleturn: Text forwarded as the first argument of the prompt builder
            (presumably the single-turn conversation content -- confirm
            against the callers).
        case_idx: Row label of the case inside `all_cases`.
        experiment_name: "mcq_4" (choices from ``get_choices``), "mcq_many"
            (choices from ``get_all_choices``) or "frq" (no choices).
        gpt_model_type: "gpt-3.5" or "gpt-4"; selects the API wrapper.

    Returns:
        Whatever the selected API wrapper returns for ``n_responses=1``.

    Raises:
        KeyError: If `experiment_name` or `gpt_model_type` is not one of the
            supported values.
    """
    prompt_builders = {
        'mcq_4': get_mcq_withPE_prompt,
        'mcq_many': get_mcq_withPE_prompt,
        'frq': get_frq_withPE_prompt,
    }
    api_by_model = {"gpt-3.5": call_gpt3_api, "gpt-4": call_gpt4_api}

    # Pull the three case attributes that every prompt variant needs.
    age, sex, pe = [all_cases.loc[case_idx, column] for column in ("age", "sex", "physical_exam")]

    build_prompt = prompt_builders[experiment_name]
    if experiment_name in ("mcq_4", "mcq_many"):
        # Multiple-choice variants differ only in where the answer options come from.
        if experiment_name == "mcq_4":
            choices = get_choices(all_cases, case_idx)
        else:
            choices = get_all_choices()
        prompt = build_prompt(singleturn, age, sex, pe, choices)
    else:
        prompt = build_prompt(singleturn, age, sex, pe)

    messages = [{"role": "system", "content": prompt}]
    return api_by_model[gpt_model_type](messages, n_responses=1)

In [None]:
# Run the single-turn + physical-exam experiment for every case, question
# format and trial, for both models.
keys = ["mcq_4", "mcq_many", "frq"]
n_trials = 10

with open("./results/final_results_conversation.json", "r") as results_file:
    res = json.load(results_file)
# Everything below (and the evaluation cell) refers to the loaded results as
# `res_with_exam`; bind that name explicitly so the notebook works on a fresh
# kernel instead of relying on a leftover variable.
res_with_exam = res

with open(path_gpt3, "r") as convo_file:
    gpt3_convos = json.load(convo_file)
with open(path_gpt4, "r") as convo_file:
    gpt4_convos = json.load(convo_file)

convos_by_model = {"gpt-3.5": gpt3_convos, "gpt-4": gpt4_convos}

# Start each model's "single-turn" section from scratch.
for model_name in convos_by_model:
    res_with_exam.setdefault(model_name, {})["single-turn"] = {}

for case_idx, case_id in tqdm(dataset.case_id.items(), total=len(dataset)):
    for model_name, convos in convos_by_model.items():
        # Create the per-case container exactly once. Re-creating it inside the
        # key/trial loops would discard every response collected so far.
        case_results = {key: {"responses": []} for key in keys}
        res_with_exam[model_name]["single-turn"][case_id] = case_results

        for j in range(n_trials):
            # Entry [1] of the trial's doctor responses is used as the
            # single-turn text -- NOTE(review): confirm against the
            # conversation file layout.
            singleturn = convos[case_id][f"trial_{j}_doctor_responses"][1]["content"]

            for key in keys:
                response = singleturn_withPE_experiment(dataset, singleturn, case_idx, key, model_name)
                # Accumulate one response per trial. The wrapper's return shape
                # for n_responses=1 is not visible here, so accept either a
                # single response or a list of responses.
                if isinstance(response, list):
                    case_results[key]["responses"].extend(response)
                else:
                    case_results[key]["responses"].append(response)

# Evaluation

In [None]:
# Score every case's single-turn responses for both models.
# A deep copy is required here: dict.copy() is shallow, so the nested per-model
# dicts would be shared and each assignment below would also overwrite the raw
# responses in `res_with_exam`, making this cell unsafe to re-run.
evaluated_res = copy.deepcopy(res_with_exam)
for case in dataset.case_id:
    for model_name in ("gpt-3.5", "gpt-4"):
        # Feed the copied entry to the evaluator so that `res_with_exam` stays
        # untouched even if the evaluator modifies its input.
        evaluated_res[model_name]["single-turn"][case] = eval_experiment(
            evaluated_res[model_name]["single-turn"][case],
            case_id=case,
            exp_keys=["mcq_4", "mcq_many", "frq"],
            method="autoeval",
            full_dataset=dataset,
            n_trials=10,
        )

In [None]:
# Persist the evaluated results. Serialise first: opening the file with "w"
# truncates it immediately, and this file is also the experiment's input, so a
# serialisation error must not be able to wipe it. The context manager
# guarantees the data is flushed and the handle closed.
serialized_results = json.dumps(evaluated_res)
with open("./results/final_results_conversation.json", "w") as results_file:
    results_file.write(serialized_results)