# Setup

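Make sure you have the MIMIC-IV datasets downloaded, particularly `discharge.csv` (clinical notes) and `diagnoses_icd.csv` (diagnosis codes). The notebook reads both files from the current working directory.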

In [1]:
import os
import random
from collections import defaultdict

import numpy as np
import pandas as pd

In [2]:
# IPython automagic `cd`: the CSV files below are read by relative name,
# so switch to the folder that holds the MIMIC-IV downloads first.
cd ~/Downloads

/Users/roshanswaroop/Downloads


In [3]:
N = 5000    # number of ICD-10 coded admissions to sample
SEED = 42   # fixed seed so a fresh "Restart & Run All" selects the same admissions

# Load MIMIC-IV data (clinical notes and diagnoses); keep only ICD-10 coded rows
discharge_df = pd.read_csv('discharge.csv')
diagnoses_df = pd.read_csv('diagnoses_icd.csv')
diagnoses_df = diagnoses_df.query("icd_version == 10")

# Sample N distinct admissions. Sorting the unique ids first makes the draw depend
# only on SEED rather than on set iteration order. As before, random sampling
# raises ValueError if fewer than N ICD-10 admissions exist.
unique_hadm_ids = sorted(set(diagnoses_df["hadm_id"].values.tolist()))
icd_10_hadm_ids = random.Random(SEED).sample(unique_hadm_ids, N)

# Filter clinical notes down to the sampled admissions
discharge_df = discharge_df[discharge_df["hadm_id"].isin(icd_10_hadm_ids)]

# Map every ICD-10 admission to its patient: the first subject_id seen per hadm_id
# (vectorized replacement for the row-by-row iterrows() scan)
hadm_to_subject_id = (
    diagnoses_df.drop_duplicates("hadm_id")
    .set_index("hadm_id")["subject_id"]
    .to_dict()
)
icd_10_subject_ids = [hadm_to_subject_id[hadm_id] for hadm_id in icd_10_hadm_ids]

# Join each sampled admission's ICD codes, in file order, into one space-separated
# string. A single groupby replaces one full-frame boolean mask per admission.
sampled_diagnoses_df = diagnoses_df[diagnoses_df["hadm_id"].isin(icd_10_hadm_ids)]
hadm_to_icd = (
    sampled_diagnoses_df.groupby("hadm_id")["icd_code"]
    .agg(lambda codes: " ".join(codes).strip())
    .to_dict()
)
icd_10_codes = [hadm_to_icd[hadm_id] for hadm_id in icd_10_hadm_ids]

# One row per sampled admission: ids plus the joined ICD-code string
query_df = pd.DataFrame({
    "hadm_id": icd_10_hadm_ids,
    "subject_id": icd_10_subject_ids,
    "icd_codes": icd_10_codes,
})

# text, ICD-10 mapping for eval purposes.
# pd.merge defaults to an inner join, so sampled admissions without a discharge
# note drop out and eval_df can end up with fewer than N rows.
merged_df = pd.merge(discharge_df, query_df, on='hadm_id')
eval_df = merged_df[['text', 'icd_codes']]

In [4]:
# Preview the evaluation pairs: discharge note text and its space-separated ICD-10 codes
eval_df

Unnamed: 0,text,icd_codes
0,\nName: ___ Unit No: __...,G3183 F0280 R441 R296 E785 Z8546
1,\nName: ___ Unit No: ___\n \n...,C675 I10 D259 Z87891 E785 E890
2,\nName: ___ Unit No: ___\n \nA...,J441 N179 Z9981 I4891 D649 I10 E785 G5622 I251...
3,\nName: ___ Unit No: ___\...,K31811 B1910 S0990XA G629 D62 F1120 I452 I6523...
4,\nName: ___ Unit No: ___\n...,T8453XA D62 N179 D709 B9562 D696 I10 E785 I251...
...,...,...
3960,\nName: ___ Unit No: ...,K3580 K388 H578 Z006 M549 G8929
3961,\nName: ___ Unit No: ___\n...,E1110 N179 I10 E8770 E669 D72829 E785 Z7984 Z9...
3962,\nName: ___ Unit No: ___\...,R079 G3184 F22 I2510 Z951 Z955 Z8673 I10 E785 ...
3963,\nName: ___ Unit No: ___\n \...,E6601 E559 I10 Z6843 E780 L709 K660 D509 L309 ...


In [None]:
# Rank rows by the character length of their joined ICD-code string, longest first.
# The helper column and variable names still say "text" (kept so later cells that
# reference them keep working), but what is measured and printed is the
# 'icd_codes' column, not the discharge note text.
sorted_df = eval_df.assign(text_length=eval_df['icd_codes'].str.len()).sort_values('text_length', ascending=False)

# ICD-code strings of the 10 rows with the longest code strings
top_10_longest_texts = sorted_df.head(10)['icd_codes'].tolist()

# Print them with a label that matches what is actually shown
for rank, icd_code_string in enumerate(top_10_longest_texts, start=1):
    print(f"ICD codes {rank}: {icd_code_string}")


In [7]:
import json

# Location of the ICD code -> description JSON file. Set the
# ICD_CODE_DESCRIPTIONS_PATH environment variable to point elsewhere; the
# default is the original author's local path, so existing setups keep working.
ICD_CODE_DESCRIPTIONS_PATH = os.environ.get(
    'ICD_CODE_DESCRIPTIONS_PATH',
    '/Users/roshanswaroop/rema/rema/codemCodes.json',
)

# JSON is UTF-8; be explicit so parsing does not depend on the platform's default encoding
with open(ICD_CODE_DESCRIPTIONS_PATH, 'r', encoding='utf-8') as f:
    # Load JSON data from file
    icd_code_descriptions = json.load(f)

In [8]:
def print_code_descriptions(index):
    """Print the ICD codes of one `eval_df` row together with their descriptions.

    Args:
        index: Row label, looked up with `eval_df.loc`.

    Prints two lines: the list of codes split from the row's space-separated
    'icd_codes' string, and the parallel list of descriptions taken from
    `icd_code_descriptions` ("No description available" for unknown codes).
    Returns None. The note text itself is not printed.
    """
    # Only the codes are needed here; the 'text' column is deliberately not read.
    icd_codes = eval_df.loc[index, 'icd_codes'].split()

    # Look up each code, keeping the descriptions aligned with the codes
    description_list = [
        icd_code_descriptions.get(code, "No description available")
        for code in icd_codes
    ]
    print("Code:", icd_codes)
    print("Description:", description_list)
        

In [9]:
# Input a row index; prints MIMIC-IV's ICD-10 codes for that row and their descriptions
# (the clinical text itself is not printed by this function)
print_code_descriptions(0)

Code: ['G3183', 'F0280', 'R441', 'R296', 'E785', 'Z8546']
Description: ['Dementia with Lewy bodies', 'Dementia in other diseases classified elsewhere without behavioral disturbance', 'Visual hallucinations', 'Repeated falls', 'Hyperlipidemia, unspecified', 'Personal history of malignant neoplasm of prostate']


In [31]:
import openai

# Read the API key from the environment instead of hardcoding it in the notebook.
# SECURITY: a literal secret key used to be committed on this line; treat that key
# as compromised and revoke it.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

In [36]:
# standard GPT-3.5/4

# for Azure
# openai.api_type = "azure"
# openai.api_key = "..."
# openai.api_base = "https://example-endpoint.openai.azure.com"
# openai.api_version = "2023-03-15-preview"

def call_gpt4(prompt, model="gpt-4", max_tokens=3000, temperature=1):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Full text of the user message.
        model: Chat model name passed to the API. Defaults to "gpt-4".
        max_tokens: Maximum number of tokens in the generated text; a longer
            response is cut off. Defaults to 3000.
        temperature: Controls randomness. Values closer to 0 make the output
            more deterministic. Defaults to 1.

    Returns:
        The `message.content` of the first choice (the only one, since n=1).
    """
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=1,                  # Nucleus sampling threshold; an alternative to temperature for controlling diversity.
        frequency_penalty=0.0,    # Penalizes tokens by how often they already appear in the text so far. Ranges from -2.0 to 2.0.
        presence_penalty=0.0,     # Penalizes tokens that have already appeared in the text so far. Ranges from -2.0 to 2.0.
        n=1,                      # Number of completions to generate; only the first is returned below.
        stream=False,             # The full response is returned at once rather than streamed.
        stop=None,                # No custom stop sequences.
    )
    return completion.choices[0].message.content

In [37]:
# Instruction prefix for the model; the commented-out call in a later cell uses it as `prompt + note`.
# NOTE(review): the format example uses dotted codes (J44.1), whereas eval_df['icd_codes']
# stores undotted codes (e.g. G3183) — presumably predictions need normalizing before comparison; confirm.
prompt ='You are a medical coder. You must identify all correct ICD-10 codes for the following patient record. Be as specific as possible. Return your answer in the following format: J44.1, E09.52, L89.213\n'

In [12]:
# Show the first discharge note next to its reference ICD-10 codes.
# Select the column first, then the row by position: the previous
# `eval_df[['text']].iloc[0][0]` indexed a label-indexed Series with an integer,
# a positional fallback that pandas has deprecated.
# for i in range(len(eval_df)):
note = eval_df['text'].iloc[0]
#inference = call_gpt4(prompt + note)
print(note)
print_code_descriptions(0)
#print('actual: ', inference)
    #break

    

 
Name:  ___                    Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   M
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___.
 
Chief Complaint:
Visual hallucinations
 
Major Surgical or Invasive Procedure:
N/A

 
History of Present Illness:
___ male with ___ disease, dyslipidemia, and a
history of prostate cancer (s/p prostatectomy) who was referred
to the ED by his neurologist for worsening gait, falls, and
visual hallucinations. 

The following history is taken from chart review: 

The patient was seen by his neurologist on ___ at which time he
was noted to have visual hallucinations and worsening gait
freezing. For his gait freezing, his mirapex was increased by
0.125 mg every week to a goal dose of 0.75 mg t.i.d. He
successfully up-titrated the medicine to 0.75/0.625/0.625 but
began to have visual hallucinations and confusion so on ___ his
neurologist recommended d