In [1]:
from aiohttp import ClientSession
import pandas as pd
import asyncio
import nest_asyncio
from tqdm import tqdm
import numpy as np
import re

In [2]:
import os
import openai
import dotenv

# Load variables from a local .env file (if present) into the process environment.
dotenv.load_dotenv()

# NOTE(review): every assignment below unconditionally overwrites the variable with an
# empty string, replacing anything load_dotenv() may have just loaded. Presumably the
# real values were blanked before sharing the notebook — confirm, and keep the actual
# secrets in .env / the environment rather than in this cell.
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["SEARCH_ENDPOINT"] = ""
os.environ["SEARCH_KEY"] = ""
os.environ["SEARCH_INDEX_NAME"] = ""
os.environ["EMBEDDING_ENDPOINT"] = ""
os.environ["EMBEDDING_KEY"] = ""

In [11]:
# Load the per-patient prompts. Later cells read the 'GPT_input' column (patient data
# text) and the 'diagnoses' column (numbered reference diagnoses) from this frame.
result_df = pd.read_csv('PaLM2input.csv')

# Shorten verbose statistic names in the prompt text. The patterns are plain
# substrings, so regex=False is stated explicitly: the default of `regex` differs
# between pandas versions. Order matters: 'maximum' is shortened before the
# 'maxntprobnp' fix-up is applied.
result_df['GPT_input'] = (
    result_df['GPT_input']
    .str.replace('minimum', 'min', regex=False)
    .str.replace('maximum', 'max', regex=False)
    .str.replace('average', 'avg', regex=False)
    .str.replace('maxntprobnp', 'max ntprobnp', regex=False)
)

# Placeholder columns for the model outputs. They are created with object dtype
# (still filled with NaN) so that writing strings into them later does not force
# an incompatible-dtype upcast from float64.
result_df['GPT-Diagnoses'] = pd.Series(np.nan, index=result_df.index, dtype=object)
result_df['GPT-Eval'] = pd.Series(np.nan, index=result_df.index, dtype=object)

In [3]:
# Patch asyncio to allow nested event loops; the batch cells below call asyncio.run()
# from inside the notebook, which normally already has an event loop running.
nest_asyncio.apply()

In [4]:
endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
api_key = os.environ["AZURE_OPENAI_API_KEY"]
# set the deployment name for the model we want to use
# NOTE(review): the original inline comment said "GPT-4o!" — confirm which model this
# Azure deployment actually serves.
deployment = "gpt-4"

# Client for the plain OpenAI API (used by the evaluation follow-up calls).
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable instead of
# being hard-coded in the notebook. The key that was previously written in this cell
# has been exposed and must be revoked/rotated.
from openai import AsyncOpenAI
client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Client for the Azure OpenAI "extensions" route of the chosen deployment (used by the
# retrieval-augmented diagnosis calls).
async_client = openai.AsyncAzureOpenAI(
    base_url=f"{endpoint}/openai/deployments/{deployment}/extensions",
    api_key=api_key,
    api_version="2023-09-01-preview"
)

In [5]:
async def get_completion(prompt):
    """Ask the Azure deployment for one chat completion of `prompt`.

    The request carries a single Azure Cognitive Search data source (configured
    from the SEARCH_* and EMBEDDING_* environment variables) in `extra_body`.

    Args:
        prompt: text sent as the only user message.

    Returns:
        The object returned by `async_client.chat.completions.create`.
    """
    fields_mapping = {
        "contentFields": ["content"],
        "filepathField": "filepath",
        "titleField": "title",
        "urlField": "url",
        "vectorFields": ["contentVector"],
    }
    search_parameters = {
        "endpoint": os.environ["SEARCH_ENDPOINT"],
        "key": os.environ["SEARCH_KEY"],
        "indexName": os.environ["SEARCH_INDEX_NAME"],
        "embeddingEndpoint": os.environ["EMBEDDING_ENDPOINT"],
        "embeddingKey": os.environ["EMBEDDING_KEY"],
        # parameters below copied from web playground request response
        "semanticConfiguration": "default",
        "queryType": "vector",
        "fieldsMapping": fields_mapping,
        "inScope": False,  # probably to restrict generation to use only inputs from retrieval
        "filter": None,
        "strictness": 3,
        "topNDocuments": 5,
    }
    search_source = {"type": "AzureCognitiveSearch", "parameters": search_parameters}
    user_message = {"role": "user", "content": prompt}

    return await async_client.chat.completions.create(
        messages=[user_message],
        max_tokens=4096,
        model=deployment,
        extra_body={"dataSources": [search_source]},
    )

In [6]:
# Instruction text placed at the start of every diagnosis prompt. It describes the
# required answer layout: *DIAGNOSIS* headings, evidence bullet points and a final
# bullet listing alternative differentials.
prompt_no_example = '''You are an expert diagnostician machine for use by doctors. If the user input is not patient data, you politely decline the request. Please suggest diagnoses and conditions, followed by the evidence points supporting each diagnosis in the form of bullet points. Include previous diagnoses and pertinent information about the patient's medical history (if any). Pay close attention to all the history and investigations provided.  Put asterisks around the diagnoses to highlight them. Give each evidence points as a separate bullet point beneath the diagnosis. Include in your evidence points any relevant clinical scores that can be calculated from the information I have given. Do not explain the evidence points, only state them. For every diagnosis you list, if there are alternative differentials possible, state the most likely three in a bullet point beneath the evidence points (you do not need to state the evidence supporting them - you only need to do that for the main diagnoses). For the main diagnoses, give only confirmed diagnoses and evidence points that can be inferred solely based on the information I have given - do not use any other information. Only give me the information I have asked for - do not give me any other information. Do not give me any introductions or conclusions, safety instructions, or safety warnings. Use British English.
                
                                To illustrate how the information should be presented:
                
                                *MAIN DIAGNOSIS 1 AS HEADING*
                                evidence points to support MAIN DIAGNOSIS 1
                                The final bullet point is alternative differentials to consider: alternative 1, alternative 2, alternative 3
                
                                *MAIN DIAGNOSIS 2 AS HEADING*
                                evidence points to support MAIN DIAGNOSIS 2
                                The final bullet point is alternative differentials to consider: alternative 1, alternative 2, alternative 3
                
                                and so on...
                
Before finalising your answer check if you haven't missed any abnormal data points and hence any diagnoses or alternative differentials that could be made based on them. If you did, add them to your reply. If two diagnoses are commonly caused by the same underlying disease, have them under one header, which is the underlying disease.
'''

# Optional worked example inserted between the instructions and the patient data;
# it is currently an empty string.
example = ''''''

# Header that immediately precedes the patient's data in the assembled prompt.
patient_data = '''

Patient data:\n'''

async def get_diagnoses(queries):
    """Run `get_completion` concurrently for every prompt in `queries`.

    Args:
        queries: iterable of prompt strings.

    Returns:
        List with one completion result per prompt, in the same order as `queries`
        (an empty list when `queries` is empty).

    Raises:
        Whatever the first failing `get_completion` call raises.
    """
    # Await the gathered calls directly. The previous version called
    # loop.run_until_complete() from inside this coroutine, which raises
    # "This event loop is already running" unless nest_asyncio has patched the loop.
    return await asyncio.gather(*(get_completion(query) for query in queries))

In [16]:
query_per_call = 20 #roughly around 40000 tokens / min (limit) at max
repeat = []  # start offsets of batches that raised and need to be re-run
n_rows = len(result_df)  # iterate over the rows actually present, not a fixed 1000
for i in tqdm(range(0, n_rows, query_per_call)):
    try:
        idx_from = i
        idx_to = min(i + query_per_call, n_rows)  # the last batch may be shorter
        # One prompt per row: instructions + (empty) example + header + patient data.
        queries = [prompt_no_example + example + patient_data + result_df.iloc[pos]['GPT_input'] for pos in range(idx_from, idx_to)]
        result = asyncio.run(get_diagnoses(queries))
        contents = [completion.choices[0].message.content for completion in result]
        # Row labels of this batch, used to write the answers back with .loc.
        hadm_ids = [result_df.index[pos] for pos in range(idx_from, idx_to)]
        result_df.loc[hadm_ids, 'GPT-Diagnoses'] = contents
    except Exception as e:
        # Best effort: remember the failed batch and carry on with the next one.
        print('Error happened at iteration i: ' + str(i))
        repeat.append(i)
        print(e)

100%|████████████████████████████████████████████████████████████████████████████████| 50/50 [1:18:42<00:00, 94.44s/it]


In [46]:
# Re-run the diagnosis request for a single row (positional index 765).
query_per_call = 1
for i in tqdm([765]):
    idx_from, idx_to = i, i + query_per_call
    try:
        queries = [
            prompt_no_example + example + patient_data + result_df.iloc[pos]['GPT_input']
            for pos in range(idx_from, idx_to)
        ]
        result = asyncio.run(get_diagnoses(queries))
        contents = [result[k].choices[0].message.content for k in range(query_per_call)]
        hadm_ids = [result_df.index[pos] for pos in range(idx_from, idx_to)]
        result_df.loc[hadm_ids, 'GPT-Diagnoses'] = contents
    except Exception as e:
        print(f'Error happened at iteration i: {i}')
        print(e)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.55s/it]


In [47]:
# Show the stored diagnosis text for the row at positional index 765.
print(result_df.iloc[765]['GPT-Diagnoses'])

*Anaemia*
- Min haemoglobin: 7.3 g/dL
- Min haematocrit: 22.9%
- Min RBC: 2.77 million cells/µL
- Min MCH: 25.7 pg
- Min MCHC: 30.7 g/dL
- Min MCV: 83 fL
- Min iron: (not provided, but low haemoglobin and haematocrit suggest iron deficiency)
- Min reticulocyte count: (not provided, but would be useful for further evaluation)
- Alternative differentials to consider: Thalassemia, Chronic disease anaemia, Vitamin B12 deficiency

*Liver Dysfunction*
- Max ALT: 303 U/L
- Max AST: 337 U/L
- Max ALP: 503 U/L
- Max total bilirubin: 1.5 mg/dL
- Min albumin: 3.3 g/dL
- Alternative differentials to consider: Hepatitis, Alcoholic liver disease, Non-alcoholic fatty liver disease

*Electrolyte Imbalance*
- Min calcium: 7.9 mg/dL
- Min potassium: 3.3 mmol/L
- Min sodium: 136 mmol/L
- Max sodium: 142 mmol/L
- Min chloride: 101 mmol/L
- Max chloride: 106 mmol/L
- Alternative differentials to consider: Renal dysfunction, Endocrine disorders, Dehydration

*Respiratory Acidosis*
- Initial blood gas pH: 7.

In [48]:
# Persist the frame (including the 'GPT-Diagnoses' column) so the evaluation step
# can be resumed from disk.
result_df.to_csv('GPT4o_diag_result.csv')

In [10]:
# Reload the saved diagnoses; the first CSV column is the index written by to_csv.
result_df = pd.read_csv('GPT4o_diag_result.csv', index_col=0)

In [12]:
async def get_followup(query, response, follow_up, model="gpt-4-1106-preview"):
    """Continue a recorded conversation with a follow-up question.

    The conversation sent to the model is: a fixed system message, the original
    user `query`, the earlier assistant `response`, and the new `follow_up`.

    Args:
        query: the original user prompt.
        response: the assistant reply previously given to `query`.
        follow_up: the new user message to answer.
        model: model name passed to the API (previously this argument was ignored
            and the default value was always used).

    Returns:
        The completion object returned by `client.chat.completions.create`.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant who gives reasons for all answers."},
        {"role": "user", "content": query},
        {"role": "assistant", "content": response},
        {"role": "user", "content": follow_up},
    ]
    # Use a separate name for the API result instead of rebinding the `response` argument.
    completion = await client.chat.completions.create(
        model=model,
        temperature=0,
        messages=conversation,
    )
    return completion

In [13]:
# Follow-up prompt for the evaluation step. It asks the model to compare each actual
# diagnosis (labelled D1, D2, ...) with its earlier answer and to reply with
# "Question 1: ..." / "Question 2: ..." lines per diagnosis. The list of actual
# diagnoses is appended after the final "Actual diagnoses:" line.
follow_up_message = """Below are the actual diagnoses of the same patient reported by clinicians.
Go through the actual diagnoses and cross-check each actual diagnosis with \
the initial list of diagnoses you provided answer the following two questions:

Question 1: Is this actual diagnosis a new disease, not directly related to any of the diagnoses or alternatives you suggested \
in your initial list? If an actual diagnosis is a complication of, a more specific version of, or falls under a broader \
category of a diagnosis / alternative you initially listed, it should not be considered a new disease. If an actual diagnosis \
affects the same organ as a diagnosis / alternative you initially listed, but it has a different onset and progression \
(for example, the actual diagnosis is chronic but you initially listed the acute disease), then your answer should be 'No'. \
If an actual diagnosis is caused by the same pathogen as a diagnosis in your initial list, the answer should also be 'No'. \
If an actual diagnosis is not a medical diagnosis, your answer should be 'No'.

If your answer to Question 1 was 'No', put N/A as answer for Question 2 and skip to the Example below. 

Question 2: Would it be possible to directly infer this actual diagnosis from the patient data provided in the initial query? 
If yes, support with facts: quote exact numbers or text from the initial query. 
If no, in case the data contradicts the diagnosis, quote the data and say why it does not support the diagnosis. \
Otherwise, please specify what additional data would have been helpful to establish this diagnosis.

Example:
If the patient data is:
"Blood report: min potassium: 3.1, avg hemoglobin: 14.5, max sodium: 139, avg wbc: 13.9
Blood gas report: ph: 7.2
Imaging report: patient with polysubstance abuse, lungs look normal"

and your initial list in your previous response contained the following suggested diagnoses:
*Acidosis*
- ph of 7.2
- Alternative differentials to consider: respiratory acidosis, metabolic acidosis, mixed acid-base disorder

*Polysubstance abuse*
- The imaging report mentions "patient with polysubstance abuse"
- Alternative differentials to consider: alcohol abuse, drug abuse, signs of withdrawal'

*Leukocytosis*
- avg wbc of 13.9 
- Alternative differentials to consider: infection, inflammatory condition, myeloproliferative disorder

and actual diagnoses are:
D1: Poisoning by cocaine
D2: Hypokalemia
D3: Hypernatremia
D4: Severe sepsis

Then your answer should be:
D1: Poisoning by cocaine
Question 1: No, this is similar to diagnosis *Polysubstance abuse*
Question 2: N/A

D2: Hypokalemia
Question 1: Yes
Question 2: Yes, the blood report mentions "min potassium: 3.1"

D3: Hypernatremia
Question 1: Yes
Question 2: No, the blood report mentions "max sodium: 139", but only sodium levels above 145 mmol/L indicate hypernatremia, \
hence the data does not support hypernatremia.

D4: Severe sepsis
Question 1: Yes
Question 2: No, additional data such as fever, increased heart rate, increased respiratory rate, positive blood cultures, or evidence of organ dysfunction would have been helpful to establish this diagnosis. "

Before finalizing your answer check if you haven't missed noticing any diagnoses from your initial list that are related to \
any of the actual diagnoses you answered the two questions for! If you did, modify the answers to the questions accordingly!

Actual diagnoses:\n"""

async def get_evaluation(queries, responses, follow_ups):
    """Run `get_followup` concurrently for matching (query, response, follow_up) triples.

    Args:
        queries: iterable of original prompts.
        responses: iterable of the earlier model answers, aligned with `queries`.
        follow_ups: iterable of follow-up messages, aligned with `queries`.

    Returns:
        List with one completion result per triple, in input order. The inputs are
        zipped, so the shortest iterable determines the number of calls.

    Raises:
        Whatever the first failing `get_followup` call raises.
    """
    # Await the gathered calls directly. The previous version called
    # loop.run_until_complete() from inside this coroutine, which raises
    # "This event loop is already running" unless nest_asyncio has patched the loop.
    return await asyncio.gather(
        *(
            get_followup(query, response, follow_up)
            for query, response, follow_up in zip(queries, responses, follow_ups)
        )
    )

In [14]:
query_per_call = 25
repeat = []  # start offsets of batches that raised and need to be re-run
n_rows = len(result_df)  # iterate over the rows actually present, not a fixed 1000
for i in tqdm(range(0, n_rows, query_per_call)):
    try:
        idx_from = i
        idx_to = min(i + query_per_call, n_rows)  # the last batch may be shorter
        # Rebuild the prompt that produced each stored diagnosis answer.
        queries = [prompt_no_example +  patient_data + result_df.iloc[pos]['GPT_input'] for pos in range(idx_from, idx_to)]
        responses = result_df.iloc[idx_from:idx_to,:]['GPT-Diagnoses']
        # Prefix every numbered reference diagnosis with "D" ("1:..." -> "D1:...") to
        # match the labels used in the follow-up prompt's example.
        follow_ups = [follow_up_message + result_df.iloc[pos]['diagnoses'].replace('\n', '\nD').replace('1:', 'D1:', 1) for pos in range(idx_from, idx_to)]
        result = asyncio.run(get_evaluation(queries, responses, follow_ups))
        contents = [completion.choices[0].message.content for completion in result]
        # Row labels of this batch, used to write the evaluations back with .loc.
        hadm_ids = [result_df.index[pos] for pos in range(idx_from, idx_to)]
        result_df.loc[hadm_ids, 'GPT-Eval'] = contents
    except Exception as e:
        # Best effort: remember the failed batch and carry on with the next one.
        print('Error happened at iteration i: ' + str(i))
        repeat.append(i)
        print(e)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [33:55<00:00, 50.90s/it]


In [52]:
# Re-run the evaluation request for a single row (positional index 360).
query_per_call = 1
for i in tqdm([360]):
    idx_from, idx_to = i, i + query_per_call
    try:
        queries = [
            prompt_no_example + patient_data + result_df.iloc[pos]['GPT_input']
            for pos in range(idx_from, idx_to)
        ]
        responses = result_df.iloc[idx_from:idx_to, :]['GPT-Diagnoses']
        follow_ups = [
            follow_up_message
            + result_df.iloc[pos]['diagnoses'].replace('\n', '\nD').replace('1:', 'D1:', 1)
            for pos in range(idx_from, idx_to)
        ]
        result = asyncio.run(get_evaluation(queries, responses, follow_ups))
        contents = [result[k].choices[0].message.content for k in range(query_per_call)]
        hadm_ids = [result_df.index[pos] for pos in range(idx_from, idx_to)]
        result_df.loc[hadm_ids, 'GPT-Eval'] = contents
    except Exception as e:
        print(f'Error happened at iteration i: {i}')
        repeat.append(i)
        print(e)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:40<00:00, 40.23s/it]


In [15]:
def analyze_results(text, index):
    """Parse one evaluation reply and classify every numbered diagnosis in it.

    The reply is scanned for diagnosis headings numbered 1, 2, 3, ... (with whatever
    prefix precedes the first "1", e.g. "D"). For each heading the text following
    "Question 1: " and "Question 2: " decides the category:

    * hit          - Question 1 starts with "No"
    * excluded     - Question 1 starts with "No" and its answer contains
                     "not a medical diagnosis" (the row label and heading are printed)
    * mistake      - otherwise, Question 2 starts with "Yes"
    * noninferable - otherwise, Question 2 starts with "No"

    Scanning stops at the first diagnosis number that cannot be located.

    Args:
        text: the evaluation reply to parse.
        index: row label, used only in diagnostic print-outs.

    Returns:
        pd.Series with nine values: the counts of hits, noninferables, mistakes and
        excluded; the '; '-joined diagnosis numbers of those four categories in the
        same order; and the total number of diagnosis headings found.
    """
    mistakes = []
    hits = []
    excluded = [] #not a medical diagnosis
    noninferables = []
    current = 1
    total_adjust = 0
    #for conditions that GPT-4 grouped together - still doesn't capture issue with hadmid 23707730
    # Headings such as "\n3-5:" cover a range of diagnoses under one answer.
    grouped = re.findall(r'\n\d+-\d+:', text)
    from_nums = [str(int(elem.split('-')[0])) for elem in grouped] #str(int()) for safety
    to_nums = [str(int(elem.split('-')[1].strip(':'))) for elem in grouped]
    while True:
        try:
            if current == 1:
                number = str(current)
                #sometimes GPT-4 adds words like Diagnosis or Actual diagnosis, and we want to capture that
                pre_word = text.split(number, 1)[0]
            #for conditions that GPT-4 grouped together
            elif str(current) in from_nums:
                idx = from_nums.index(str(current))
                number = grouped[idx]
                # A grouped heading counts once, so remember how many numbers were skipped.
                total_adjust += int(to_nums[idx]) - current
                current = int(to_nums[idx])
            else:
                number = '\n' + pre_word + str(current)
            nextOne = '\n' + pre_word + str(current+1)

            # Everything after this diagnosis heading; IndexError here means the
            # heading does not exist, i.e. all diagnoses have been processed.
            section = text.split(number, 1)[1]
            q1_answer = section.split('Question 1: ', 1)[1]
            if q1_answer[:2] == 'No':
                # Restrict the search to this diagnosis' own Question 1 answer.
                q1_only = q1_answer.split('Question 2: ', 1)[0].split(nextOne, 1)[0]
                if 'not a medical diagnosis' in q1_only:
                    print(index)
                    print(section.split('Question 1: ', 1)[0])
                    excluded.append(str(current))
                else:
                    hits.append(str(current))
            else:
                q2_answer = section.split('Question 2: ', 1)[1]
                if q2_answer[:3] == 'Yes':
                    mistakes.append(str(current))
                elif q2_answer[:2] == 'No':
                    noninferables.append(str(current))
                else:
                    print("Unable to parse text when looking at diagnosis number: ")
                    print(current)
                    print("At index: ")
                    print(index)
        except Exception:
            # Any parsing failure ends the scan (as before), but KeyboardInterrupt and
            # SystemExit are no longer swallowed by a bare `except:`.
            total = current - 1 - total_adjust
            break
        current += 1
    return pd.Series([len(hits), len(noninferables), len(mistakes), len(excluded), '; '.join(hits), '; '.join(noninferables), '; '.join(mistakes), '; '.join(excluded), total])

In [53]:
# Parse every evaluation reply into one row of counts and diagnosis-number lists.
analyzed_df = result_df.apply(
    lambda row: analyze_results(row['GPT-Eval'], row.name),
    axis=1,
)
analyzed_df.columns = [
    'no_hits', 'no_noninferables', 'no_mistakes', 'no_excluded',
    'hits', 'noninferables', 'mistakes', 'excluded',
    'total_ICD_diagnoses',
]

# Per-row error rate: mistakes as a share of hits plus mistakes.
mistake_counts = analyzed_df['no_mistakes']
hit_counts = analyzed_df['no_hits']
analyzed_df['error'] = mistake_counts / (hit_counts + mistake_counts)
analyzed_df['sensitivity'] = 1 - analyzed_df['error']

# First the mean of the per-row sensitivities, then the sensitivity pooled over all rows.
print(analyzed_df['sensitivity'].mean())
pooled_mistakes = analyzed_df['no_mistakes'].sum()
pooled_hits = analyzed_df['no_hits'].sum()
print(1 - (pooled_mistakes / (pooled_hits + pooled_mistakes)))

# Join the parsed columns onto the original frame.
results = pd.concat([result_df, analyzed_df], axis=1)

7
: Unspecified fall

14
: Unspecified place or not applicable

47
: Other place in hospital as the place of occurrence of the external cause

58
: Long-term (current) use of aspirin

78
: Awaiting organ transplant status

88
: Examination of participant in clinical trial

94
: 31 weeks gestation of pregnancy

102
: Anxiety state, unspecified

105
: Do not resuscitate status

105
: Encounter for palliative care

141
: Presence of cardiac pacemaker

141
: Presence of other heart-valve replacement

158
: Unspecified place in other non-institutional residence as the place of occurrence of the external cause

182
: Personal history of tobacco use

211
: Accidental fall from ladder

221
: Unspecified place or not applicable

230
: Other motor vehicle traffic accident involving collision with motor vehicle injuring passenger in motor vehicle other than motorcycle

239
: Do not resuscitate

261
: Dysthymic disorder

287
: Unspecified place or not applicable

290
: Accidents occurring in unspe

In [54]:
# Number of reference diagnoses per row: 'diagnoses' holds one diagnosis per line.
results['num_real_diag'] = results['diagnoses'].str.split('\n').apply(len)
# Row labels where the parsed total disagrees with the reference count
# (an empty list means every evaluation reply was parsed completely).
results[results['total_ICD_diagnoses'] != results['num_real_diag']].index.tolist()

[]

In [44]:
# Regenerating results where the number of analysed diagnoses was not equal to the
# true number of diagnoses.
# This redefines follow_up_message with a reworded variant of the evaluation prompt
# (e.g. "answer both of the following two questions", "Only if your answer ...").

follow_up_message = """Below are the actual diagnoses of the same patient reported by clinicians.
Go through the actual diagnoses and cross-check each actual diagnosis with \
the initial list of diagnoses you provided and answer both of the following two questions:

Question 1: Is this actual diagnosis a new disease, not directly related to any of the diagnoses you suggested \
in your initial list? If an actual diagnosis is a complication of, a more specific version of, or falls under a broader \
category of a diagnosis you initially listed, it should not be considered a new disease. If an actual diagnosis \
affects the same organ as a diagnosis you initially listed, but it has a different onset and progression \
(for example, the actual diagnosis is chronic but you initially listed the acute disease), then your answer should be 'No'. \
If an actual diagnosis is caused by the same pathogen as a diagnosis in your initial list, the answer should also be 'No'. \
If an actual diagnosis is not a medical diagnosis, your answer should be 'No'.

Only if your answer to Question 1 was 'No', put N/A as answer for Question 2 and skip to the Example below. 

Question 2: Would it be possible to directly infer this actual diagnosis from the patient data provided in the initial query? 
If yes, support with facts: quote exact numbers or text from the initial query. 
If no, in case the data contradicts the diagnosis, quote the data and say why it does not support the diagnosis. \
Otherwise, please specify what additional data would have been helpful to establish this diagnosis.

Example:
If the patient data is:
"Blood report: min potassium: 3.1, avg hemoglobin: 14.5, max sodium: 139, avg wbc: 13.9
Blood gas report: ph: 7.2
Imaging report: patient with polysubstance abuse, lungs look normal"

and your initial list in your previous response contained the following suggested diagnoses:
*Acidosis*
- ph of 7.2
- Alternative differentials to consider: respiratory acidosis, metabolic acidosis, mixed acid-base disorder

*Polysubstance abuse*
- The imaging report mentions "patient with polysubstance abuse"
- Alternative differentials to consider: alcohol abuse, drug abuse, signs of withdrawal'

*Leukocytosis*
- avg wbc of 13.9 
- Alternative differentials to consider: infection, inflammatory condition, myeloproliferative disorder

and actual diagnoses are:
D1: Poisoning by cocaine
D2: Hypokalemia
D3: Hypernatremia
D4: Severe sepsis

Then your answer should be:
D1: Poisoning by cocaine
Question 1: No, this is similar to diagnosis *Polysubstance abuse*
Question 2: N/A

D2: Hypokalemia
Question 1: Yes
Question 2: Yes, the blood report mentions "min potassium: 3.1"

D3: Hypernatremia
Question 1: Yes
Question 2: No, the blood report mentions "max sodium: 139", but only sodium levels above 145 mmol/L indicate hypernatremia, \
hence the data does not support hypernatremia.

D4: Severe sepsis
Question 1: Yes
Question 2: No, additional data such as fever, increased heart rate, increased respiratory rate, positive blood cultures, or evidence of organ dysfunction would have been helpful to establish this diagnosis. "

Before finalizing your answer check if you haven't missed noticing any diagnoses from your initial list that are related to \
any of the actual diagnoses you answered the two questions for! If you did, modify the answers to the questions accordingly!

Actual diagnoses:\n"""

# Row labels whose parsed diagnosis total differs from the number of reference diagnoses.
regenerate = results[results['total_ICD_diagnoses'] != results['num_real_diag']].index.tolist()
query_per_call = len(regenerate)

async def regenerate_evaluation(regenerate):
    """Re-request the evaluation for the rows labelled in `regenerate`.

    Args:
        regenerate: list of `result_df` row labels to evaluate again.

    Returns:
        List of completion results from `get_followup`, one per label, in order
        (an empty list when `regenerate` is empty).
    """
    # Build one full prompt per row explicitly; the previous code added a one-element
    # list to a NumPy array and relied on broadcasting to concatenate the strings.
    queries = [
        prompt_no_example + patient_data + gpt_input
        for gpt_input in result_df.loc[regenerate, 'GPT_input']
    ]
    responses = result_df.loc[regenerate, 'GPT-Diagnoses'].values
    # Prefix every numbered reference diagnosis with "D" ("1:..." -> "D1:...").
    follow_ups = [
        follow_up_message + result_df.loc[reg, 'diagnoses'].replace('\n', '\nD').replace('1:', 'D1:', 1)
        for reg in regenerate
    ]
    # Await the calls directly instead of calling loop.run_until_complete() from
    # inside a coroutine, which only works when nest_asyncio has patched the loop.
    return await asyncio.gather(
        *(
            get_followup(query, response, follow_up)
            for query, response, follow_up in zip(queries, responses, follow_ups)
        )
    )

try:
    result = asyncio.run(regenerate_evaluation(regenerate))
    contents = [completion.choices[0].message.content for completion in result]
    result_df.loc[regenerate, 'GPT-Eval'] = contents
except Exception as e:
    # The old message printed `i`, a leftover loop variable from another cell that is
    # stale (or undefined on a fresh kernel); report the affected rows instead.
    print('Error happened while regenerating evaluations for rows: ' + str(regenerate))
    print(e)

In [200]:
# def debug_analyze(text):
#     mistakes = []
#     hits = []
#     excluded = [] #not a medical diagnosis
#     noninferables = []
#     current = 1
#     total_adjust = 0
#     from_nums = []
#     to_nums = []
#     grouped = re.findall(r'\n\d+-\d+:', text)
#     if len(grouped) > 0:
#         #print("Grouping found!")
#         #print("At index: ")
#         #print(index)
#         for elem in grouped:
#             from_nums.append(str(int(elem.split('-')[0]))) #str(int()) for safety
#             to_nums.append(str(int(elem.split('-')[1].strip(':'))))
#     while 1:
#         try:
#             if current == 1:
#                 number = str(current)
#                 #sometimes GPT-4 adds words like Diagnosis or Actual diagnosis, and we want to capture that
#                 pre_word = text.split(number, 1)[0]
#             #for conditions that GPT-4 grouped together
#             elif str(current) in from_nums:
#                 idx = from_nums.index(str(current))
#                 number = grouped[idx]
#                 total_adjust += int(to_nums[idx]) - current
#                 current = int(to_nums[idx])
#             else:
#                 number = '\n' + pre_word + str(current)
#             nextOne = '\n' + pre_word + str(current+1)
#             if text.split(number, 1)[1].split('Question 1: ', 1)[1][:2] == 'No':
#                 if 'not a medical diagnosis' in text.split(number, 1)[1].split('Question 1: ', 1)[1].split('Question 2: ', 1)[0].split(nextOne, 1)[0]:
#                     print(text.split(number, 1)[1].split('Question 1: ', 1)[0])
#                     print(text.split(number, 1)[1].split('Question 1: ', 1)[1].split('Question 2: ', 1)[0].split(nextOne, 1)[0])
#                     excluded.append(str(current))
#                 else:
#                     hits.append(str(current))
#             elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:3] == 'Yes':
#                 mistakes.append(str(current))
#             elif text.split(number, 1)[1].split('Question 2: ', 1)[1][:2] == 'No':
#                 noninferables.append(str(current))
#             else:
#                 print("Unable to parse text when looking at diagnosis number: ")
#                 print(current)
#         except Exception as e:
#             print(e)
#             print('Diagnosis number not found in text: ')
#             print(current)
#             total = current - 1 - total_adjust
#             break
#         current += 1

In [2]:
# Reload the combined results table from disk.
# NOTE(review): this file is written by the cell that follows in the notebook, so on a
# fresh kernel this read only succeeds if the file exists from an earlier session.
results = pd.read_csv('results_GPT4o_ProdRAG_TurboEval.csv', index_col=0)

In [55]:
# Save the raw frame (prompts, diagnoses, evaluations) ...
result_df.to_csv('result_df_GPT4o_ProdRAG_TurboEval.csv')

# ... rebuild the combined table from the raw frame plus the parsed columns ...
results = pd.concat([result_df, analyzed_df], axis=1)

# ... and save the combined table as well.
results.to_csv('results_GPT4o_ProdRAG_TurboEval.csv')

In [17]:
# Rows in which at least two diagnoses were classified as mistakes.
results.loc[results.no_mistakes>=2]

Unnamed: 0,hadm_id,diagnoses,GPT_input,GPT-Diagnoses,GPT-Eval,no_hits,no_noninferables,no_mistakes,no_excluded,hits,noninferables,mistakes,excluded,total_ICD_diagnoses,error,sensitivity
569,23088928,1:Nonrheumatic aortic (valve) stenosis\n2:Unsp...,Blood report: \nThe patient stayed in the hosp...,*Anaemia*\n- Min haemoglobin: 9.4 g/dL\n- Min ...,D1: Nonrheumatic aortic (valve) stenosis\nQues...,2,10,2,0,1; 3,2; 4; 5; 7; 8; 9; 11; 12; 13; 14,6; 10,,14,0.5,0.5


In [3]:
# Total number of diagnoses classified as mistakes across all rows.
results.no_mistakes.sum()

18

In [4]:
# Hits plus mistakes: the denominator used for the pooled sensitivity.
results['no_hits'].sum() + results['no_mistakes'].sum()

7604

In [5]:
# Total number of diagnoses classified as hits across all rows.
results['no_hits'].sum()

7586

In [6]:
# Total number of diagnoses classified as noninferable across all rows.
results['no_noninferables'].sum()

6704

In [7]:
# Total number of diagnoses excluded as "not a medical diagnosis" across all rows.
results['no_excluded'].sum()

94

In [8]:
# Sum over all four categories; compare with the number of reference diagnoses below
# to see how many diagnoses were left unclassified by the parser.
results['no_hits'].sum() + results['no_mistakes'].sum() + results['no_noninferables'].sum() + results['no_excluded'].sum()

14402

In [9]:
# Total number of reference diagnoses: one per line of the 'diagnoses' column.
results['diagnoses'].str.split('\n').apply(len).sum()

14403

In [10]:
# Number of reference diagnoses per row (one diagnosis per line of 'diagnoses').
results['num_real_diag'] = results['diagnoses'].str.split('\n').apply(len)

In [11]:
# Total number of diagnosis headings the parser found in the evaluation replies.
results['total_ICD_diagnoses'].sum()

14403

In [12]:
# Consistency check: rows where the parsed total differs from the reference count.
results[results['total_ICD_diagnoses'] != results['num_real_diag']] #this should never happen

Unnamed: 0,hadm_id,diagnoses,GPT_input,GPT-Diagnoses,GPT-Eval,no_hits,no_noninferables,no_mistakes,no_excluded,hits,noninferables,mistakes,excluded,total_ICD_diagnoses,error,sensitivity,num_real_diag


In [13]:
def get_identified_diagnoses(text, numbers):
    """Translate stored diagnosis numbers into diagnosis names.

    Args:
        text: the reference diagnoses, one per line, each normally starting with
            its number and a colon (for example "2:Hypokalemia").
        numbers: ';'-separated 1-based line numbers (for example "1; 3"). An empty
            string or the string 'nan' means "no diagnoses".

    Returns:
        List of the selected lines with the first "<digits>:" occurrence removed.
        If a number cannot be converted to int, the list of numbers is printed and
        the names collected so far are returned.
    """
    hit_names = []
    if numbers == '' or numbers == 'nan':
        return hit_names
    nums = numbers.split(';')
    names = text.split('\n')
    try:
        for num in nums:
            diag = names[int(num.strip())-1]
            # `count` is passed by keyword: passing it positionally to re.sub is
            # deprecated since Python 3.13.
            name = re.sub('[0-9]+:', '', diag, count=1)
            hit_names.append(name)
    except ValueError:
        print(nums)
    return hit_names

# Resolve the stored diagnosis numbers of each category into diagnosis names.
# The number columns are cast to str first, so missing values arrive as 'nan'.
for numbers_col, names_col in (
    ('hits', 'hit_names'),
    ('mistakes', 'mistake_names'),
    ('noninferables', 'noninferable_names'),
):
    results[numbers_col] = results[numbers_col].astype('str')
    results[names_col] = results.apply(
        lambda row: get_identified_diagnoses(row['diagnoses'], row[numbers_col]),
        axis=1,
    )

In [14]:
# Total number of resolved hit names across all rows.
results['hit_names'].apply(len).sum()

7586

In [15]:
# Number of distinct hit names: .sum() concatenates the per-row lists into one list.
len(np.unique(np.array(results['hit_names'].sum())))

1733

In [16]:
# Flatten the per-row name lists into one flat list per category. Comprehensions are
# used so that no loop variables (previously `i`, `hits`, `mistakes`) are left behind
# in the notebook namespace, where other cells could pick them up by accident.
all_hits = [name for row_names in results['hit_names'].values for name in row_names]

all_mistakes = [name for row_names in results['mistake_names'].values for name in row_names]

In [17]:
# The ten most frequent diagnosis names among the hits.
pd.DataFrame(all_hits).value_counts().head(10)

Acute kidney failure, unspecified                                                                                         218
Acidosis                                                                                                                  129
Congestive heart failure, unspecified                                                                                     127
Anemia, unspecified                                                                                                       108
Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled                105
Acute posthemorrhagic anemia                                                                                               99
Chronic kidney disease, unspecified                                                                                        98
Unspecified essential hypertension                                                                                    

In [19]:
# Frequency of every diagnosis name among the mistakes.
pd.DataFrame(all_mistakes).value_counts()

Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled    4
Type 2 diabetes mellitus without complications                                                                4
Hypoxemia                                                                                                     2
Anemia, unspecified                                                                                           1
Bradycardia, unspecified                                                                                      1
Essential (primary) hypertension                                                                              1
Hypopotassemia                                                                                                1
Hypotension, unspecified                                                                                      1
Prediabetes                                                                                             

In [35]:
# Summary statistics of the number of reference diagnoses per row.
results['num_real_diag'].describe()

count    1000.000000
mean       14.403000
std         6.812121
min         1.000000
25%         9.000000
50%        14.000000
75%        19.000000
max        39.000000
Name: num_real_diag, dtype: float64

In [36]:
# Median number of reference diagnoses per row.
results['num_real_diag'].median()

14.0