Make sure you have the MIMIC-IV datasets downloaded, particularly discharge.csv and diagnoses_icd.csv.

In [4]:
import numpy as np
import pandas as pd
from collections import defaultdict
import random


In [2]:
cd ~/Downloads

/Users/roshanswaroop/Downloads


In [12]:
N = 5000    # number of ICD-10 coded admissions to sample
SEED = 42   # fixed seed so the sampled cohort is the same on every run

# Load MIMIC-IV data (clinical notes and diagnoses) and keep only ICD-10 diagnoses.
discharge_df = pd.read_csv('discharge.csv')
diagnoses_df = pd.read_csv('diagnoses_icd.csv')
diagnoses_df = diagnoses_df.query("icd_version == 10")

# Sample admissions. The ids are sorted first so the draw depends only on SEED,
# and the sample size is capped so a small extract does not raise ValueError.
unique_hadm_ids = sorted(diagnoses_df["hadm_id"].unique().tolist())
icd_10_hadm_ids = random.Random(SEED).sample(
    unique_hadm_ids, min(N, len(unique_hadm_ids))
)

# Filter clinical notes down to the sampled admissions.
discharge_df = discharge_df[discharge_df["hadm_id"].isin(icd_10_hadm_ids)]

# hadm_id -> subject_id. drop_duplicates keeps the first row per admission,
# which matches a top-to-bottom scan that ignores later repeats.
hadm_to_subject_id = (
    diagnoses_df.drop_duplicates("hadm_id")
    .set_index("hadm_id")["subject_id"]
    .to_dict()
)
icd_10_subject_ids = [hadm_to_subject_id[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df = pd.DataFrame(
    {"hadm_id": icd_10_hadm_ids, "subject_id": icd_10_subject_ids}
)

# hadm_id -> space-separated ICD codes, in file order. One groupby pass replaces
# a full-frame boolean mask per sampled admission.
sampled_diagnoses = diagnoses_df[diagnoses_df["hadm_id"].isin(icd_10_hadm_ids)]
hadm_to_icd = (
    sampled_diagnoses.groupby("hadm_id", sort=False)["icd_code"]
    .agg(lambda codes: " ".join(codes).strip())
    .to_dict()
)
icd_10_codes = [hadm_to_icd[hadm_id] for hadm_id in icd_10_hadm_ids]
query_df["icd_codes"] = icd_10_codes

# (text, ICD-10 codes) pairs for evaluation. Sampled admissions without a
# discharge note drop out in this inner merge.
merged_df = pd.merge(discharge_df, query_df, on='hadm_id')
eval_df = merged_df[['text', 'icd_codes']]

In [16]:
# Display the evaluation frame: one row per discharge note, with columns
# 'text' (the note) and 'icd_codes' (space-separated ICD-10 codes).
eval_df

Unnamed: 0,text,icd_codes
0,\nName: ___ Unit No: ___\n \nA...,J441 I4892 Z9981 I480 J45998 Z87891 I2510 Z966...
1,\nName: ___ Unit No: ___\n \nA...,J441 J45909 H6991 F419 G4700 I4891 Z7901 I10 I...
2,\nName: ___ Unit No: ___...,S68522A B182 W312XXA Y9269 F1290
3,\nName: ___ Unit No: ___\n ...,I481 I5023 N179 I130 I2510 E785 M06042 M06041 ...
4,\nName: ___ Unit No: ___\n...,T80211A A4181 R6521 N186 K767 G9341 T8242XA Z1...
...,...,...
4012,\nName: ___ Unit No: ___\...,R0789 E1122 I129 N183 K219 Z21 I2510 Z955 E785...
4013,\nName: ___ Unit No: ___\n...,I481 T8111XA J9601 I97710 Z7901 Y840 Y92238 Z5...
4014,\nName: ___. Unit No: ___\n \...,K529 K830 R109 G8929 J449 R634 Z6823 R110 Z878...
4015,\nName: ___ Unit No: ___\n...,S066X0A D696 E119 I4891 G40909 I10 E785 I6521 ...


In [23]:
import json

# Load the ICD code -> description mapping. JSON is UTF-8 encoded, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('/Users/roshanswaroop/rema/rema/codemCodes.json', 'r', encoding='utf-8') as f:
    icd_code_descriptions = json.load(f)

In [49]:
def print_code_descriptions(index, show_text=False):
    """Print each ICD code stored for one row of ``eval_df`` with its description.

    Reads two module-level objects: ``eval_df`` (columns ``text`` and
    ``icd_codes``) and ``icd_code_descriptions`` (code -> description mapping).

    Args:
        index: Label of the row in ``eval_df``; looked up with ``.loc``.
        show_text: If True, print the row's note text before the codes.
            Defaults to False, in which case only the codes are printed.

    Returns:
        None. Everything is written to stdout.
    """
    # Get the row corresponding to the index
    row = eval_df.loc[index]

    if show_text:
        print("Text:", row['text'], "\n")

    # 'icd_codes' is a whitespace-separated string of codes.
    for code in row['icd_codes'].split():
        # Codes missing from the mapping get a placeholder instead of a KeyError.
        description = icd_code_descriptions.get(code, "No description available")
        print("Code:", code)
        print("Description:", description)
        print()


In [25]:
# Given a row index of eval_df, print each ICD-10 code recorded for that row
# together with its description.
print_code_descriptions(0)

Text:  
Name:  ___             Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
IV Dye, Iodine Containing Contrast Media / Oxycodone / 
cilostazol / Varenicline
 
Attending: ___.
 
Chief Complaint:
cough, dyspnea
 
Major Surgical or Invasive Procedure:
None

 
History of Present Illness:
HPI: ___ year old female with history of COPD (on home O2), HTN, 
Afib admitted with dyspnea and cough.  
 Pt states inc dyspnea since this am, also one episode of 
retrosternal chest pressure lasting 2minuts on way to ED. No cp 
currently. on home O2. no fevers/chills or abd sx.  
 Patient was recently admitted from ___ with COPD flare 
and afib with RVR. She could not receive azithromycin due to 
concern for QTc prolongation and so was treated with 
ceftriaxone/cefpodoxime. She was treated with 60mg PO prednisone 
and discharged with a prednisone taper of 10 mg decrease q3d 
until at 10 mg, then stay 

In [31]:
import os

import openai

# Read the key from the environment so no secret is stored in the notebook.
# A missing OPENAI_API_KEY raises KeyError here, before any API call is made.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [36]:
# standard GPT-3.5/4

# for Azure
# openai.api_type = "azure"
# openai.api_key = "..."
# openai.api_base = "https://example-endpoint.openai.azure.com"
# openai.api_version = "2023-03-15-preview"

def call_gpt4(prompt, model="gpt-4", max_tokens=200, temperature=1):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``openai`` client through ``openai.ChatCompletion.create``.

    Args:
        prompt: Full text of the user message.
        model: Chat model name. Defaults to "gpt-4".
        max_tokens: Cap on generated tokens; longer replies are cut off.
            Defaults to 200.
        temperature: Sampling temperature. Defaults to 1; pass 0 for more
            repeatable output.

    Returns:
        The content string of the first (and only) returned choice.
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,    # Maximum number of tokens in the generated text; output beyond this is cut off.
        temperature=temperature,  # Controls randomness: higher values give more random output, lower values more deterministic.
        top_p=1,                  # Nucleus sampling threshold; 1 leaves the token distribution untruncated.
        frequency_penalty=0.0,    # Penalizes tokens in proportion to how often they already appear in the text so far. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes tokens that have already appeared in the text so far. Ranges from -2.0 to 2.0.
        n=1,                      # Number of completions to generate; only the first is returned below.
        stream=False,             # If true, the response is streamed back incrementally.
        stop=None,                # A sequence (or list of sequences) where the API will stop generating further tokens.
    )
    return completion.choices[0].message.content

In [37]:
# Instruction prefix placed in front of each patient record sent to the model.
# The trailing newline separates the instructions from the record text.
prompt = (
    'You are a medical coder. '
    'You must identify all correct ICD-10 codes for the following patient record. '
    'Be as specific as possible. '
    'Return your answer in the following format: J44.1, E09.52, L89.213\n'
)

In [50]:
# Row of eval_df to send to the model. It is set explicitly so the cell runs on
# a fresh kernel; to evaluate every note, wrap the lines below in
# `for i in range(len(eval_df)):` instead.
i = 0
# Select the column first, then the row by position; this avoids integer
# indexing into a label-indexed Series.
note = eval_df['text'].iloc[i]
inference = call_gpt4(prompt + note)
print_code_descriptions(i)     # codes recorded in eval_df for this row
print('actual: ', inference)   # the model's reply

    

Code: J441
Description: Chronic obstructive pulmonary disease with (acute) exacerbation

Code: I4892
Description: Unspecified atrial flutter

Code: Z9981
Description: Dependence on supplemental oxygen

Code: I480
Description: Paroxysmal atrial fibrillation

Code: J45998
Description: Other asthma

Code: Z87891
Description: Personal history of nicotine dependence

Code: I2510
Description: Atherosclerotic heart disease of native coronary artery without angina pectoris

Code: Z96649
Description: Presence of unspecified artificial hip joint

Code: I10
Description: Essential (primary) hypertension

Code: E785
Description: Hyperlipidemia, unspecified

Code: D509
Description: Iron deficiency anemia, unspecified

Code: I739
Description: Peripheral vascular disease, unspecified

Code: F419
Description: Anxiety disorder, unspecified

Code: K5900
Description: Constipation, unspecified

Code: M1990
Description: Unspecified osteoarthritis, unspecified site

Code: Z825
Description: Family history of 