In [2]:
!pip install anthropic --quiet

import pandas as pd
import numpy as np
from tqdm import tqdm
import asyncio
import nest_asyncio

result_df = pd.read_csv('TempdefGemini20Prod.csv', index_col=0)

from openai import AsyncOpenAI
eval_client = AsyncOpenAI(api_key = "")

nest_asyncio.apply()

In [3]:
# Instruction text for the diagnosis-suggestion step. The evaluation loop prepends it
# (followed by `patient_data` and the row's 'GPT_input') to rebuild the original query.
# This is runtime prompt text: any edit, including whitespace, changes what the model sees.
prompt_no_example = '''You are an expert diagnostician machine for use by doctors. If the user input is not patient data, you politely decline the request. Please suggest diagnoses and conditions, followed by the evidence points supporting each diagnosis in the form of bullet points. Include previous diagnoses and pertinent information about the patient's medical history (if any). Pay close attention to all the history and investigations provided.  Put asterisks around the diagnoses to highlight them. Give each evidence points as a separate bullet point beneath the diagnosis. Include in your evidence points any relevant clinical scores that can be calculated from the information I have given. Do not explain the evidence points, only state them. For every diagnosis you list, if there are alternative differentials possible, state the most likely three in a bullet point beneath the evidence points (you do not need to state the evidence supporting them - you only need to do that for the main diagnoses). For the main diagnoses, give only confirmed diagnoses and evidence points that can be inferred solely based on the information I have given - do not use any other information. Only give me the information I have asked for - do not give me any other information. Do not give me any introductions or conclusions, safety instructions, or safety warnings. Use British English.

                                To illustrate how the information should be presented:

                                *MAIN DIAGNOSIS 1 AS HEADING*
                                evidence points to support MAIN DIAGNOSIS 1
                                The final bullet point is alternative differentials to consider: alternative 1, alternative 2, alternative 3

                                *MAIN DIAGNOSIS 2 AS HEADING*
                                evidence points to support MAIN DIAGNOSIS 2
                                The final bullet point is alternative differentials to consider: alternative 1, alternative 2, alternative 3

                                and so on...

Before finalising your answer check if you haven't missed any abnormal data points and hence any diagnoses or alternative differentials that could be made based on them. If you did, add them to your reply. If two diagnoses are commonly caused by the same underlying disease, have them under one header, which is the underlying disease.
'''

# Empty placeholder; not referenced by any cell visible in this notebook.
example = ''''''

# Separator placed between the instruction prompt and the row's 'GPT_input' text
# when the evaluation loop rebuilds the original query.
patient_data = '''

Patient data:\n'''

# Second-turn prompt for the evaluator model. The evaluation loop appends the
# clinicians' diagnoses (relabelled D1:, D2:, ...) after the trailing "Actual diagnoses:" line.
# The downstream parser (`analyze_results`) depends on the exact "Question 1: " /
# "Question 2: " labels and on answers starting with Yes/No, so keep the wording in sync.
# NOTE(review): the text has small defects (missing "and" before "answer the following",
# a stray ' after "signs of withdrawal", a stray " after the D4 example). They are left
# untouched here because editing the prompt would change evaluator output.
follow_up_message = """Below are the actual diagnoses of the same patient reported by clinicians.
Go through the actual diagnoses and cross-check each actual diagnosis with \
the initial list of diagnoses you provided answer the following two questions:

Question 1: Is this actual diagnosis a new disease, not directly related to any of the diagnoses or alternatives you suggested \
in your initial list? If an actual diagnosis is a complication of, a more specific version of, or falls under a broader \
category of a diagnosis / alternative you initially listed, it should not be considered a new disease. If an actual diagnosis \
affects the same organ as a diagnosis / alternative you initially listed, but it has a different onset and progression \
(for example, the actual diagnosis is chronic but you initially listed the acute disease), then your answer should be 'No'. \
If an actual diagnosis is caused by the same pathogen as a diagnosis in your initial list, the answer should also be 'No'. \
If an actual diagnosis is not a medical diagnosis, your answer should be 'No'.

If your answer to Question 1 was 'No', put N/A as answer for Question 2 and skip to the Example below. If your answer to Question 1 was 'Yes', always answer Question 2!

Question 2: Would it be possible to directly infer this actual diagnosis from the patient data provided in the initial query?
If yes, support with facts: quote exact numbers or text from the initial query.
If no, in case the data contradicts the diagnosis, quote the data and say why it does not support the diagnosis. \
Otherwise, please specify what additional data would have been helpful to establish this diagnosis.

Example:
If the patient data is:
"Blood report: min potassium: 3.1, avg hemoglobin: 14.5, max sodium: 139, avg wbc: 13.9
Blood gas report: ph: 7.2
Imaging report: patient with polysubstance abuse, lungs look normal"

and your initial list in your previous response contained the following suggested diagnoses:
*Acidosis*
- ph of 7.2
- Alternative differentials to consider: respiratory acidosis, metabolic acidosis, mixed acid-base disorder

*Polysubstance abuse*
- The imaging report mentions "patient with polysubstance abuse"
- Alternative differentials to consider: alcohol abuse, drug abuse, signs of withdrawal'

*Leukocytosis*
- avg wbc of 13.9
- Alternative differentials to consider: infection, inflammatory condition, myeloproliferative disorder

and actual diagnoses are:
D1: Poisoning by cocaine
D2: Hypokalemia
D3: Hypernatremia
D4: Severe sepsis

Then your answer should be:
D1: Poisoning by cocaine
Question 1: No, this is similar to diagnosis *Polysubstance abuse*
Question 2: N/A

D2: Hypokalemia
Question 1: Yes
Question 2: Yes, the blood report mentions "min potassium: 3.1"

D3: Hypernatremia
Question 1: Yes
Question 2: No, the blood report mentions "max sodium: 139", but only sodium levels above 145 mmol/L indicate hypernatremia, \
hence the data does not support hypernatremia.

D4: Severe sepsis
Question 1: Yes
Question 2: No, additional data such as fever, increased heart rate, increased respiratory rate, positive blood cultures, or evidence of organ dysfunction would have been helpful to establish this diagnosis. "

Before finalizing your answer check if you haven't missed noticing any diagnoses from your initial list that are related to \
any of the actual diagnoses you answered the two questions for! If you did, modify the answers to the questions accordingly!

Actual diagnoses:\n"""

In [5]:
async def get_followup(query, response, follow_up, model="gpt-4-1106-preview"):
    """Ask the evaluator model to grade one earlier answer against the actual diagnoses.

    The original exchange is replayed as chat history (``query`` as the user turn,
    ``response`` as the assistant turn) and ``follow_up`` is sent as the next user
    turn, at temperature 0.

    Args:
        query: Text of the original user query.
        response: Text of the assistant answer being evaluated.
        follow_up: Evaluation instructions plus the actual diagnoses.
        model: Name of the chat model to call. This was previously ignored in favour
            of a hard-coded name; the default keeps the old behaviour.

    Returns:
        The chat-completion object returned by ``eval_client``; callers read
        ``.choices[0].message.content`` from it.
    """
    # Use a separate name for the API result so the `response` argument is not clobbered.
    completion = await eval_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant who gives reasons for all answers."},
            {"role": "user", "content": query},
            {"role": "assistant", "content": response},
            {"role": "user", "content": follow_up},
        ],
    )
    return completion

async def get_evaluation(queries, responses, follow_ups):
    """Run one evaluator call per (query, response, follow_up) triple concurrently.

    Args:
        queries: Iterable of original query texts.
        responses: Iterable of the answers being evaluated, aligned with ``queries``.
        follow_ups: Iterable of follow-up prompts, aligned with ``queries``.

    Returns:
        A list of ``get_followup`` results in input order (empty if the inputs are empty).
    """
    tasks = [
        get_followup(query, response, follow_up)
        for query, response, follow_up in zip(queries, responses, follow_ups)
    ]
    # Await the gather directly: calling loop.run_until_complete() from inside a running
    # coroutine raises RuntimeError unless nest_asyncio has patched the loop.
    return await asyncio.gather(*tasks)

# Number of admissions evaluated concurrently per batch.
query_per_call = 25
# Start positions of batches that raised, so they can be re-run afterwards.
repeat = []
for i in tqdm(range(300,1000,query_per_call)):
    try:
        idx_from = i
        # Clamp to the frame length so a short final batch is still evaluated instead of
        # raising IndexError and being discarded as a whole.
        idx_to = min(i + query_per_call, len(result_df))
        if idx_from >= idx_to:
            continue  # nothing left at or beyond this position
        batch = result_df.iloc[idx_from:idx_to, :]
        # Rebuild the original query exactly as it was sent for the diagnosis step.
        queries = [prompt_no_example +  patient_data + gpt_input for gpt_input in batch['GPT_input']]
        responses = batch['GPT-Diagnoses']
        # Relabel the numbered diagnosis lines as D1:, D2:, ... to match the prompt's example.
        follow_ups = [follow_up_message + diagnoses.replace('\n', '\nD').replace('1:', 'D1:', 1) for diagnoses in batch['diagnoses']]
        result = asyncio.run(get_evaluation(queries, responses, follow_ups))
        # Iterate over what actually came back rather than assuming query_per_call items.
        contents = [completion.choices[0].message.content for completion in result]
        hadm_ids = list(batch.index)
        result_df.loc[hadm_ids, 'GPT-Eval'] = contents
    except Exception as e:
        # Best effort: log the failed batch and carry on; `repeat` records what to retry.
        print('Error happened at iteration i: ' + str(i))
        repeat.append(i)
        print(e)

100%|██████████| 28/28 [20:02<00:00, 42.94s/it]


In [6]:
# Output file for the frame after the evaluation loop has filled in 'GPT-Eval'.
save_name = 'TempdefGemini20ProdDiagandEval.csv'

result_df.to_csv(save_name)
# Colab-only helper: this cell will not run outside Google Colab.
from google.colab import files
import time

# Short pause before triggering the browser download — presumably to let the file
# finish writing; TODO confirm whether it is still needed.
time.sleep(5)
files.download(save_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def analyze_results(text, index):
    """Parse one evaluator reply and tally how each actual diagnosis was graded.

    The reply is expected to contain numbered entries (optionally prefixed by a word
    such as "D"), each followed by "Question 1: ..." and "Question 2: ..." lines.
    Entries are classified as:

    * hit           - Question 1 starts with 'No'
    * excluded      - Question 1 starts with 'No' and its text contains
                      'not a medical diagnosis' (the index and entry text are printed)
    * mistake       - Question 1 is not 'No' and Question 2 starts with 'Yes'
    * noninferable  - Question 1 is not 'No' and Question 2 starts with 'No'

    Parsing stops at the first entry number that cannot be found in the text.

    Args:
        text: The evaluator reply. Non-string values (e.g. NaN for rows that were
            never evaluated) produce an all-zero tally instead of raising.
        index: Row label, used only in diagnostic printouts.

    Returns:
        pd.Series of nine values: the counts of hits, noninferables, mistakes and
        excluded; the matching entry numbers joined with '; ' in the same order;
        and the total number of parsed entries (a grouped range counts once).
    """
    import re  # local import: the notebook only imports `re` in a later cell

    if not isinstance(text, str):
        return pd.Series([0, 0, 0, 0, '', '', '', '', 0])

    mistakes = []
    hits = []
    excluded = [] #not a medical diagnosis
    noninferables = []
    current = 1
    total_adjust = 0
    #for conditions that GPT-4 grouped together - still doesn't capture issue with hadmid 23707730
    grouped = re.findall(r'\n\d+-\d+:', text)
    from_nums = [str(int(elem.split('-')[0])) for elem in grouped] #str(int()) for safety
    to_nums = [str(int(elem.split('-')[1].strip(':'))) for elem in grouped]

    while True:
        try:
            if current == 1:
                number = str(current)
                #sometimes GPT-4 adds words like Diagnosis or Actual diagnosis, and we want to capture that
                pre_word = text.split(number, 1)[0]
            #for conditions that GPT-4 grouped together
            elif str(current) in from_nums:
                idx = from_nums.index(str(current))
                number = grouped[idx]
                total_adjust += int(to_nums[idx]) - current
                current = int(to_nums[idx])
            else:
                number = '\n' + pre_word + str(current)
            next_marker = '\n' + pre_word + str(current+1)

            # Everything after this entry's marker; IndexError here means the marker is
            # absent, i.e. there are no more entries to parse.
            section = text.split(number, 1)[1]
            q1_answer = section.split('Question 1: ', 1)[1]
            if q1_answer[:2] == 'No':
                # Limit the search to this entry's own Question 1 text.
                q1_reason = q1_answer.split('Question 2: ', 1)[0].split(next_marker, 1)[0]
                if 'not a medical diagnosis' in q1_reason:
                    print(index)
                    print(section.split('Question 1: ', 1)[0])
                    excluded.append(str(current))
                else:
                    hits.append(str(current))
            else:
                q2_answer = section.split('Question 2: ', 1)[1]
                if q2_answer[:3] == 'Yes':
                    mistakes.append(str(current))
                elif q2_answer[:2] == 'No':
                    noninferables.append(str(current))
                else:
                    print("Unable to parse text when looking at diagnosis number: ")
                    print(current)
                    print("At index: ")
                    print(index)
        except IndexError:
            # A failed split lookup is the loop's normal termination signal. Catching only
            # IndexError lets KeyboardInterrupt and genuine bugs propagate.
            total = current - 1 - total_adjust
            break
        current += 1
    return pd.Series([len(hits), len(noninferables), len(mistakes), len(excluded), '; '.join(hits), '; '.join(noninferables), '; '.join(mistakes), '; '.join(excluded), total])

In [None]:
import re

# Grade every row's evaluator reply; each row yields one tally record.
analyzed_df = result_df.apply(
    lambda row: analyze_results(row['GPT-Eval'], row.name),
    axis=1,
)
analyzed_df.columns = [
    'no_hits',
    'no_noninferables',
    'no_mistakes',
    'no_excluded',
    'hits',
    'noninferables',
    'mistakes',
    'excluded',
    'total_ICD_diagnoses',
]

# Per-row error rate: mistakes as a share of hits plus mistakes; sensitivity is its complement.
analyzed_df['error'] = analyzed_df['no_mistakes'].div(
    analyzed_df['no_hits'] + analyzed_df['no_mistakes']
)
analyzed_df['sensitivity'] = analyzed_df['error'].rsub(1)

# First the mean of the per-row sensitivities, then the pooled sensitivity over all rows.
print(analyzed_df['sensitivity'].mean())
print(
    1 - (
        analyzed_df['no_mistakes'].sum()
        / (analyzed_df['no_hits'].sum() + analyzed_df['no_mistakes'].sum())
    )
)

# Original columns and the tallies side by side.
results = pd.concat(
    [result_df, analyzed_df],
    axis=1,
)

14
: Unspecified place or not applicable

38
: Patient room in hospital as the place of occurrence of the external cause

47
: Other place in hospital as the place of occurrence of the external cause

50
: Do not resuscitate

50
: Encounter for immunization

102
: Examination of participant in clinical trial

105
: Personal history of antineoplastic chemotherapy

105
: Personal history of tobacco use

105
: Do not resuscitate status

105
: Encounter for palliative care

116
: Unspecified place in hospital as the place of occurrence of the external cause

116
: Do not resuscitate

116
: Encounter for immunization

129
: Alcohol abuse, in remission

129
: Personal history of tobacco use

138
: Unspecified place in unspecified non-institutional (private) residence as the place of occurrence of the external cause

144
: Examination of participant in clinical trial

158
: Unspecified place in other non-institutional residence as the place of occurrence of the external cause

172
: Unspecifi

In [None]:
# Print one total per line, each the sum of the listed tally columns:
# hits; excluded; hits + mistakes; excluded + noninferables; all four categories.
print(
    *(
        sum(results[column].sum() for column in group)
        for group in (
            ('no_hits',),
            ('no_excluded',),
            ('no_hits', 'no_mistakes'),
            ('no_excluded', 'no_noninferables'),
            ('no_hits', 'no_mistakes', 'no_excluded', 'no_noninferables'),
        )
    ),
    sep='\n',
)

6761
164
6790
7612
14402
