In [1]:
from openai import AsyncOpenAI
client = AsyncOpenAI(api_key = "")
from aiohttp import ClientSession
import pandas as pd
import asyncio
import nest_asyncio
from tqdm import tqdm
import numpy as np
import re

In [4]:
result_df = pd.read_csv('MIMIC-IV_input.csv', index_col=0)

result_df['GPT-Diagnoses'] = np.nan
result_df['GPT-Eval'] = np.nan

In [5]:
nest_asyncio.apply()

async def get_completion(prompt, model="gpt-4o-2024-05-13"):
    messages = [{"role": "user", "content": prompt}]
    response = await client.chat.completions.create(
        model=model,
        temperature = 0,
        messages=messages)
    return response#.choices[0].message["content"]

async def get_diagnoses(queries):
    loop = asyncio.get_event_loop()
    tasks = []
    for query in queries:
        tasks.append(get_completion(query))
    all_data = loop.run_until_complete(asyncio.gather(*tasks))
    return all_data

In [6]:
prompt_no_example = '''Suggest as many potential diagnoses as possible from the following patient data.
In addition, include previously diagnosed conditions and information about patient's medical history (if any).
Give exact numbers and/or text quotes from the data that made you think of each of the diagnoses \
and, if necessary, give further tests that could confirm the diagnosis.
Once you're done, suggest further, more complex diseases that may be ongoing based on the existing diagnoses you already made.
Use the International Classification of Disease (ICD) naming standard for reporting the diagnoses, but you don't have to specify the codes.

Before finalizing your answer check if you haven't missed any abnormal data points and hence any diagnoses that could be made \
based on them. If you did, add them to your list of diagnoses.'''

example = '''

For example, if the patient data mentions:

"Blood report:
min glucose: 103, max glucose: 278, avg glucose: 156.5, max inr: 2.1, max pt: 22.4, max ptt: 150, \
avg wbc: 13.8, max wbc: 14.1, max lactate: 5.9, max bun: 101, max creatinine: 5.8, avg bun: 38.15, avg creatinine: 2.78
Blood gas report:
3 hours after admission the blood gas results from venous blood are: ph: 7.2
Imaging report:
Status post left total shoulder replacement
Chest X-Ray Possible small right pleural effusion and Mild, bibasilar atelectasis.. \
Lung volumes have slightly increased but areas of atelectasis are seen at both the left and the right lung bases
Microbiology tests:
24 hours after admission the microbiology culture test MRSA SCREEN obtained via MRSA SCREEN \
identified POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS
Vitalsigns data from ICU:
max temperature: 38, min peripheral oxygen saturation: 70, max respiration rate: 29"

then your answer may be:

1: Methicillin resistant Staphylococcus aureus infection, site unspecified
Foundational data: Microbiology culture test identifying "POSITIVE FOR METHICILLIN RESISTANT STAPH AUREUS"

2: Atelectasis
Foundational data from Chest X-Ray: "Mild, bibasilar atelectasis.. \
Lung volumes have slightly increased but areas of atelectasis are seen at both the left and the right lung bases"

3: Pleural effusion, not elsewhere classified
Foundational data from Chest X-Ray: "Possible small right pleural effusion."
Further tests: Thoracentesis, CT chest

4: Acidosis
Foundational data: "ph: 7.2"
Further tests: Urine pH, Anion Gap

5: Lactic acidosis
Foundational data: "max lactate: 5.9"

6: Acquired coagulation factor deficiency
Foundational data: "max inr: 2.1, max pt: 22.4, max ptt: 150"
Further tests: Antiphospholipid Antibodies (APL), Protein C, Protein S, Antithrombin III, Factor V Leiden, Fibrinogen test

7: Hyperglycemia, unspecified
Foundational data: "max glucose: 278, avg glucose: 156.5".
Further tests: Hemoglobin A1c (HbA1c) test

8: Hypoxemia
Foundational data: "min peripheral oxygen saturation: 70"
Further tests: Measure PaO2 in blood

9: Leukocytosis
Foundational data: "max wbc: 14.1, avg wbc: 13.8". The patient's white blood cell count is consistently elevated which may suggest an ongoing inflammatory response or infection.
Further tests: Infection markers such as CRP or PCT, Assessment of symptoms like fever, inflammation or fatigue.

10. Unspecified acute kidney failure:
Foundational data: "max bun: 101, max creatinine: 5.8, avg bun: 38.15, avg creatinine: 2.78"
Further tests: Urine output measurements for oliguria, ultrasound to rule out obstruction

11. Presence of left artificial shoulder joint
Foundational data: The imaging report mentions: "Status post left total shoulder replacement"

Further diseases based on these diagnoses (continued the indexing from the previous number in the list):

12: Unspecified septicemia
Foundational data: positive MRSA screen, systemic inflammatory response: "max respiration rate: 29", "max temperature: 38", leukocytosis
Further tests: HR, BP, wound culture, respiratory excretion tests

13: Septic shock
Foundational data: Septicemia with acidosis and lactic acidosis may suggest septic shock
Further tests: patient examination (low BP, mental disorientation, nausea, pale skin may confirm the finding)

14: Acute respiratory failure, with hypoxia or hypercapnia
Foundational data: hypoxemia and the presence of atelectasis
Further tests: Clinical symptoms (severe shortness of breath, rapid breathing, and confusion), \
arterial blood gas measurements showing hypoxia or hypercapnia

15: Type 2 diabetes mellitus with diabetic chronic kidney disease
Foundational data: Hyperglycemia and kidney failure
Further tests: urine test, hemoglobin (A1C) test, GFR, BP, physical examination (swelling, nausea, weakness, eye disease)'''

patient_data = '''

Patient data:\n'''

In [7]:
query_per_call = 50
repeat = []
for i in tqdm(range(0,1000,query_per_call)):
    try:
        idx_from = i
        idx_to = i + query_per_call
        queries = [prompt_no_example + example + patient_data +  result_df.iloc[i]['GPT_input'] for i in range(idx_from,idx_to)]
        result = asyncio.run(get_diagnoses(queries))
        contents = [result[i].choices[0].message.content for i in range(query_per_call)]
        hadm_ids = [result_df.index[i] for i in range(idx_from,idx_to)]
        result_df.loc[hadm_ids, 'GPT-Diagnoses'] = contents
    except Exception as e:
        print('Error happened at iteration i: ' + str(i))
        repeat.append(i)
        print(e)

  result_df.loc[hadm_ids, 'GPT-Diagnoses'] = contents
100%|██████████| 20/20 [10:19<00:00, 30.96s/it]


In [8]:
async def get_followup(query, response, follow_up, model="gpt-4-1106-preview"): #
    response = await client.chat.completions.create(
        model="gpt-4-1106-preview",
        temperature = 0,
        messages=[
            {"role": "system", "content": "You are a helpful assistant who gives reasons for all answers."},
            {"role": "user", "content": query},
            {"role": "assistant", "content": response},
            {"role": "user", "content": follow_up}
        ])
    return response#.choices[0].message["content"]

In [9]:
follow_up_message = """Below are the actual diagnoses of the same patient reported by clinicians.
Go through the actual diagnoses and cross-check each actual diagnosis with \
the initial list of diagnoses you provided answer the following two questions:

Question 1: Is this actual diagnosis a new disease, not directly related to any of the diagnoses you suggested \
in your initial list? If an actual diagnosis is a complication of, a more specific version of, or falls under a broader \
category of a diagnosis you initially listed, it should not be considered a new disease. If an actual diagnosis \
affects the same organ as a diagnosis you initially listed, but it has a different onset and progression \
(for example, the actual diagnosis is chronic but you initially listed the acute disease), then your answer should be 'No'. \
If an actual diagnosis is caused by the same pathogen as a diagnosis in your initial list, the answer should also be 'No'. \
If an actual diagnosis is not a medical diagnosis, your answer should be 'No'.

If your answer to Question 1 was 'No', put N/A as answer for Question 2 and skip to the Example below.

Question 2: Would it be possible to directly infer this actual diagnosis from the patient data provided in the initial query?
If yes, support with facts: quote exact numbers or text from the initial query.
If no, in case the data contradicts the diagnosis, quote the data and say why it does not support the diagnosis. \
Otherwise, please specify what additional data would have been helpful to establish this diagnosis.

Example:
If the patient data is:
"Blood report: min potassium: 3.1, avg hemoglobin: 14.5, max sodium: 139, avg wbc: 13.9
Blood gas report: ph: 7.2
Imaging report: patient with polysubstance abuse, lungs look normal"

and your initial list in your previous response contained the following suggested diagnoses:
1: Acidosis
Foundational data: "ph: 7.2"
Further tests: Urine pH, Anion Gap

2: Polysubstance abuse, not elsewhere classified
Foundational data: The imaging report mentions "patient with polysubstance abuse"

3: Leukocytosis
Foundational data: "avg wbc: 13.9".
Further tests: Infection markers such as CRP or PCT, Assessment of symptoms like fever, inflammation or fatigue.

and actual diagnoses are:
D1: Poisoning by cocaine
D2: Hypokalemia
D3: Hypernatremia
D4: Severe sepsis

Then your answer should be:
D1: Poisoning by cocaine
Question 1: No, this is similar to diagnosis 2: Polysubstance abuse, not elsewhere classified.
Question 2: N/A

D2: Hypokalemia
Question 1: Yes
Question 2: Yes, the blood report mentions "min potassium: 3.1"

D3: Hypernatremia
Question 1: Yes
Question 2: No, the blood report mentions "max sodium: 139", but only sodium levels above 145 mmol/L indicate hypernatremia, \
hence the data does not support hypernatremia.

D4: Severe sepsis
Question 1: Yes
Question 2: No, additional data such as fever, increased heart rate, increased respiratory rate, positive blood cultures, or evidence of organ dysfunction would have been helpful to establish this diagnosis. "

Before finalizing your answer check if you haven't missed noticing any diagnoses from your initial list that are related to \
any of the actual diagnoses you answered the two questions for! If you did, modify the answers to the questions accordingly!

Actual diagnoses:\n"""

async def get_evaluation(queries, responses, follow_ups):
    loop = asyncio.get_event_loop()
    tasks = []
    for query, response, follow_up in zip(queries, responses, follow_ups):
        tasks.append(get_followup(query, response, follow_up))
    all_data = loop.run_until_complete(asyncio.gather(*tasks))
    return all_data

In [10]:
query_per_call = 50
repeat = []
for i in tqdm(range(0,1000,query_per_call)):
    try:
        idx_from = i
        idx_to = i + query_per_call
        queries = [prompt_no_example +  patient_data + result_df.iloc[i]['GPT_input'] for i in range(idx_from,idx_to)]
        responses = result_df.iloc[idx_from:idx_to,:]['GPT-Diagnoses']
        follow_ups = [follow_up_message + result_df.iloc[i]['diagnoses'].replace('\n', '\nD').replace('1:', 'D1:', 1) for i in range(idx_from,idx_to)]
        result = asyncio.run(get_evaluation(queries, responses, follow_ups))
        contents = [result[i].choices[0].message.content for i in range(query_per_call)]
        hadm_ids = [result_df.index[i] for i in range(idx_from,idx_to)]
        result_df.loc[hadm_ids, 'GPT-Eval'] = contents
    except Exception as e:
        print('Error happened at iteration i: ' + str(i))
        repeat.append(i)
        print(e)

  result_df.loc[hadm_ids, 'GPT-Eval'] = contents
100%|██████████| 20/20 [16:14<00:00, 48.74s/it]


In [11]:
def analyze_results(text, index):
    mistakes = []
    hits = []
    excluded = [] #not a medical diagnosis
    noninferables = []
    current = 1
    total_adjust = 0
    #for conditions that GPT-4 grouped together - still doesn't capture issue with hadmid 23707730
    from_nums = []
    to_nums = []
    grouped = re.findall(r'\n\d+-\d+:', text)
    if len(grouped) > 0:
        #print("Grouping found!")
        #print("At index: ")
        #print(index)
        for elem in grouped:
            from_nums.append(str(int(elem.split('-')[0]))) #str(int()) for safety
            to_nums.append(str(int(elem.split('-')[1].strip(':'))))
    while 1:
        try:
            if current == 1:
                number = str(current)
                #sometimes GPT-4 adds words like Diagnosis or Actual diagnosis, and we want to capture that
                pre_word = text.split(number, 1)[0]
            #for conditions that GPT-4 grouped together
            elif str(current) in from_nums:
                idx = from_nums.index(str(current))
                number = grouped[idx]
                total_adjust += int(to_nums[idx]) - current
                current = int(to_nums[idx])
            else:
                number = '\n' + pre_word + str(current)
            if text.split(number, 1)[1].split('Question 1: ', 1)[1][:2] == 'No':
                if 'not a medical diagnosis' in text.split(number, 1)[1].split('Question 1: ', 1)[1].split('Question 2: ', 1)[0]:
                    print(text.split(number, 1)[1].split('Question 1: ', 1)[0])
                    excluded.append(str(current))
                else:
                    hits.append(str(current))
            elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:3] == 'Yes':
                mistakes.append(str(current))
            elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:2] == 'No':
                noninferables.append(str(current))
            else:
                print("Unable to parse text when looking at diagnosis number: ")
                print(current)
                print("At index: ")
                print(index)
        except:
            #print('Diagnosis number not found in text: ')
            #print(current)
            total = current - 1 - total_adjust
            break
        current += 1
    return pd.Series([len(hits), len(noninferables), len(mistakes), len(excluded), '; '.join(hits), '; '.join(noninferables), '; '.join(mistakes), '; '.join(excluded), total])

In [12]:
def analyze_results_new(text, index):
    mistakes = []
    hits = []
    excluded = [] #not a medical diagnosis
    noninferables = []
    current = 1
    total_adjust = 0
    #for conditions that GPT-4 grouped together - still doesn't capture issue with hadmid 23707730
    from_nums = []
    to_nums = []
    grouped = re.findall(r'\n\d+-\d+:', text)
    if len(grouped) > 0:
        #print("Grouping found!")
        #print("At index: ")
        #print(index)
        for elem in grouped:
            from_nums.append(str(int(elem.split('-')[0]))) #str(int()) for safety
            to_nums.append(str(int(elem.split('-')[1].strip(':'))))
    while 1:
        try:
            if current == 1:
                number = str(current)
                #sometimes GPT-4 adds words like Diagnosis or Actual diagnosis, and we want to capture that
                pre_word = text.split(number, 1)[0]
            #for conditions that GPT-4 grouped together
            elif str(current) in from_nums:
                idx = from_nums.index(str(current))
                number = grouped[idx]
                total_adjust += int(to_nums[idx]) - current
                current = int(to_nums[idx])
            else:
                number = '\n' + pre_word + str(current)
            nextOne = '\n' + pre_word + str(current+1)
            if text.split(number, 1)[1].split('Question 1: ', 1)[1][:2] == 'No':
                if 'not a medical diagnosis' in text.split(number, 1)[1].split('Question 1: ', 1)[1].split('Question 2: ', 1)[0].split(nextOne, 1)[0]:
                    print(index)
                    print(text.split(number, 1)[1].split('Question 1: ', 1)[0])
                    excluded.append(str(current))
                else:
                    hits.append(str(current))
            elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:3] == 'Yes':
                mistakes.append(str(current))
            elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:2] == 'No':
                noninferables.append(str(current))
            else:
                print("Unable to parse text when looking at diagnosis number: ")
                print(current)
                print("At index: ")
                print(index)
        except:
            #print('Diagnosis number not found in text: ')
            #print(current)
            total = current - 1 - total_adjust
            break
        current += 1
    return pd.Series([len(hits), len(noninferables), len(mistakes), len(excluded), '; '.join(hits), '; '.join(noninferables), '; '.join(mistakes), '; '.join(excluded), total])


In [13]:
analyzed_df = result_df.apply(lambda row: analyze_results(row['GPT-Eval'], row.name),1)
analyzed_df.columns = ['no_hits', 'no_noninferables', 'no_mistakes', 'no_excluded', 'hits', 'noninferables', 'mistakes', 'excluded', 'total_ICD_diagnoses']
analyzed_df['error'] = analyzed_df['no_mistakes'] / (analyzed_df['no_hits'] + analyzed_df['no_mistakes'])
analyzed_df['sensitivity'] = 1-analyzed_df['error']
print(analyzed_df['sensitivity'].mean())
print(1-(analyzed_df['no_mistakes'].sum() / (analyzed_df['no_hits'].sum() + analyzed_df['no_mistakes'].sum())))

results = pd.concat([result_df, analyzed_df], axis=1)

: Unspecified place or not applicable

: Encounter for palliative care

: Patient room in hospital as the place of occurrence of the external cause

: Other place in hospital as the place of occurrence of the external cause

: Do not resuscitate

: Encounter for immunization

: Operating room of hospital as the place of occurrence of the external cause

: Unspecified place in unspecified non-institutional (private) residence as the place of occurrence of the external cause

: Unspecified place or not applicable

: Encounter for examination for normal comparison and control in clinical research program

: Unspecified place or not applicable

: Awaiting organ transplant status

: Unspecified place in unspecified non-institutional (private) residence as the place of occurrence of the external cause

Unable to parse text when looking at diagnosis number: 
1
At index: 
87
Unable to parse text when looking at diagnosis number: 
2
At index: 
87
Unable to parse text when looking at diagnosis n

In [14]:
analyzed_df = result_df.apply(lambda row: analyze_results_new(row['GPT-Eval'], row.name),1)
analyzed_df.columns = ['no_hits', 'no_noninferables', 'no_mistakes', 'no_excluded', 'hits', 'noninferables', 'mistakes', 'excluded', 'total_ICD_diagnoses']
analyzed_df['error'] = analyzed_df['no_mistakes'] / (analyzed_df['no_hits'] + analyzed_df['no_mistakes'])
analyzed_df['sensitivity'] = 1-analyzed_df['error']
print(analyzed_df['sensitivity'].mean())
print(1-(analyzed_df['no_mistakes'].sum() / (analyzed_df['no_hits'].sum() + analyzed_df['no_mistakes'].sum())))

results_new = pd.concat([result_df, analyzed_df], axis=1)

14
: Unspecified place or not applicable

34
: Encounter for palliative care

38
: Patient room in hospital as the place of occurrence of the external cause

47
: Other place in hospital as the place of occurrence of the external cause

50
: Do not resuscitate

50
: Encounter for immunization

52
: Operating room of hospital as the place of occurrence of the external cause

52
: Unspecified place in unspecified non-institutional (private) residence as the place of occurrence of the external cause

72
: Unspecified place or not applicable

75
: Encounter for examination for normal comparison and control in clinical research program

76
: Unspecified place or not applicable

78
: Awaiting organ transplant status

80
: Unspecified place in unspecified non-institutional (private) residence as the place of occurrence of the external cause

Unable to parse text when looking at diagnosis number: 
1
At index: 
87
Unable to parse text when looking at diagnosis number: 
2
At index: 
87
Unable to

In [15]:
print(results['no_hits'].sum() + results['no_mistakes'].sum())
print(results['no_excluded'].sum() + results['no_noninferables'].sum())
print(results['no_hits'].sum() + results['no_mistakes'].sum() + results['no_excluded'].sum() + results['no_noninferables'].sum())
results['no_excluded'].sum()

7189
7194
14383


221

In [16]:
print(results_new['no_hits'].sum() + results_new['no_mistakes'].sum())
print(results_new['no_excluded'].sum() + results_new['no_noninferables'].sum())
print(results_new['no_hits'].sum() + results_new['no_mistakes'].sum() + results_new['no_excluded'].sum() + results_new['no_noninferables'].sum())
results_new['no_excluded'].sum()

7189
7194
14383


221

In [17]:
result_df.to_csv('NewRun_05-13-GPT4o.csv')

from google.colab import files
import time

time.sleep(5)
files.download('NewRun_05-13-GPT4o.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>