In [24]:
# imports 
import pandas as pd
import re
import simple_icd_10_cm as icd_desc
from sklearn.model_selection import GroupKFold
import numpy as np
import json


In [10]:
# functions 

# Ordered [regex pattern, replacement] pairs applied one after another by
# preprocess_re_sub. Patterns are raw strings so that "\." reaches the regex
# engine as an escaped period instead of being an invalid Python escape.
#
# Order matters: a pattern that contains another pattern as a substring must
# come first. "q.o.d." contains "o.d.", so the q.o.d. rules are listed before
# the o.d. rules; otherwise "q.o.d." would be rewritten to
# "q.in the right eye" and never be recognised as "every other day".
replace_LIST = [
    # titles are removed
    [r'dr\.', ''],
    [r'DR\.', ''],
    [r'm\.d\.', ''],
    [r'M\.D\.', ''],
    # route / frequency abbreviations are spelled out
    [r'p\.o', 'orally'],
    [r'P\.O', 'orally'],
    [r'q\.d\.', 'once a day'],
    [r'Q\.D\.', 'once a day'],
    [r'I\.M\.', 'intramuscularly'],
    [r'i\.m\.', 'intramuscularly'],
    [r'b\.i\.d\.', 'twice a day'],
    [r'B\.I\.D\.', 'twice a day'],
    [r'Subq\.', 'subcutaneous'],
    [r'SUBQ\.', 'subcutaneous'],
    [r't\.i\.d\.', 'three times a day'],
    [r'T\.I\.D\.', 'three times a day'],
    [r'q\.i\.d\.', 'four times a day'],
    [r'Q\.I\.D\.', 'four times a day'],
    [r'I\.V\.', 'intravenous'],
    [r'i\.v\.', 'intravenous'],
    [r'q\.h\.s\.', 'before bed'],
    [r'Q\.H\.S\.', 'before bed'],
    # must precede the o.d. rules below (see note above)
    [r'q\.o\.d\.', 'every other day'],
    [r'Q\.O\.D\.', 'every other day'],
    [r'O\.D\.', 'in the right eye'],
    [r'o\.d\.', 'in the right eye'],
    [r'5X', 'a day five times a day'],
    [r'5x', 'a day five times a day'],
    [r'O\.S\.', 'in the left eye'],
    [r'o\.s\.', 'in the left eye'],
    [r'q\.4h', 'every four hours'],
    [r'Q\.4H', 'every four hours'],
    [r'O\.U\.', 'in both eyes'],
    [r'o\.u\.', 'in both eyes'],
    [r'q\.6h', 'every six hours'],
    [r'Q\.6H', 'every six hours'],
    [r'prn\.', 'as needed'],
    [r'PRN\.', 'as needed'],
    # any run of digits followed by a period is removed
    [r'[0-9]+\.', ''],
    # bracketed [** ... **] placeholders are removed; the match is non-greedy
    # so that text between two placeholders on the same line is kept
    [r'\[\*.+?\*\]', ''],
]


def preprocess_re_sub(x):
    """Apply every substitution in ``replace_LIST`` to ``x``, in list order.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` after all [pattern, replacement] pairs have been applied
        sequentially with ``re.sub`` (each rule sees the output of the
        previous one).
    """
    processed_text = x
    for pattern, replacement in replace_LIST:
        processed_text = re.sub(pattern, replacement, processed_text)
    return processed_text


def filter_admission_text(text):
    """
    Filter text information by section and only keep sections that are known on admission time.

    Each section is located case-insensitively by its header text and runs up
    to the next blank line that is followed by another ``<header>:``. A section
    that is not followed by such a header is not matched and is treated as
    empty. Line breaks inside a section are collapsed to single spaces.

    Parameters
    ----------
    text : str
        Note text.

    Returns
    -------
    str
        The kept sections, each rendered as ``"<LABEL>: <content>"`` and
        separated by blank lines (sections that were not found are rendered
        with empty content), or the sentinel ``"MISSING"`` when the chief
        complaint, present illness or medical history section is absent or
        empty.
    """
    admission_sections = {
        "CHIEF_COMPLAINT": "chief complaint:",
        "PRESENT_ILLNESS": "present illness:",
        "MEDICAL_HISTORY": "medical history:",
        "MEDICATION_ADM": "medications on admission:",
        "ALLERGIES": "allergies:",
        "PHYSICAL_EXAM": "physical exam:",
        "FAMILY_HISTORY": "family history:",
        "SOCIAL_HISTORY": "social history:"
    }

    # Encode real line breaks as the two characters backslash + "n" so that
    # "." in the section regex can run across what used to be line breaks.
    text = text.replace("\n", "\\n")

    # extract each section by regex
    notes_dict = {}
    for key, section in admission_sections.items():
        # The section body is captured lazily up to an encoded blank line that
        # is followed by the next header. That header may not contain a
        # backslash (so it cannot span an encoded line break), a digit, a
        # period, a parenthesis or "|".
        match = re.search(r'(?i){}(.+?)\\n\\n[^(\\|\d|\.)]+?:'.format(section), text)
        if match is None:
            notes_dict[key] = ""
        else:
            # only the first occurrence of a section is used
            notes_dict[key] = match.group(1).replace('\\n', ' ').strip()

    # filter notes with missing main information
    main_sections = ('CHIEF_COMPLAINT', 'PRESENT_ILLNESS', 'MEDICAL_HISTORY')
    if any(notes_dict[key] == "" for key in main_sections):
        return "MISSING"

    rendered_sections = [
        "CHIEF COMPLAINT: " + notes_dict['CHIEF_COMPLAINT'],
        "PRESENT ILLNESS: " + notes_dict['PRESENT_ILLNESS'],
        "MEDICAL HISTORY: " + notes_dict['MEDICAL_HISTORY'],
        "MEDICATION ON ADMISSION: " + notes_dict['MEDICATION_ADM'],
        "ALLERGIES: " + notes_dict['ALLERGIES'],
        "PHYSICAL EXAM: " + notes_dict['PHYSICAL_EXAM'],
        "FAMILY HISTORY: " + notes_dict['FAMILY_HISTORY'],
        "SOCIAL HISTORY: " + notes_dict['SOCIAL_HISTORY'],
    ]
    return '\n\n'.join(rendered_sections)
    


## load data

In [11]:
# Load all notes, then keep only the discharge-summary reports.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# is how a warning raised on that line is rendered -- presumably pandas'
# mixed-dtype warning; confirm it is gone after this change.
patient_notes = pd.read_csv('../../datasets/mimic/NOTEEVENTS.csv', low_memory=False)
patient_notes = patient_notes[
    (patient_notes['CATEGORY'] == 'Discharge summary')
    & (patient_notes['DESCRIPTION'] == 'Report')
].reset_index(drop=True)
patient_notes.head()

  patient_notes = pd.read_csv('../../datasets/mimic/NOTEEVENTS.csv')


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...


In [12]:
# Diagnosis codes per admission. ICD9_CODE is read as text: the codes are
# identifiers, and letting pandas infer a numeric dtype for all-digit values
# would drop leading zeros and break the join against the mapping table.
icd = pd.read_csv(
    '../../datasets/mimic/DIAGNOSES_ICD.csv',
    dtype={'ICD9_CODE': str},
).drop('ROW_ID', axis=1)
# Normalise codes: no dots, no surrounding whitespace, upper case.
# A missing code becomes the literal string 'NAN' because of str().
icd['ICD9_CODE'] = icd['ICD9_CODE'].apply(lambda x: str(x).replace('.','').strip().upper())

icd.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,109,172335,1.0,40301
1,109,172335,2.0,486
2,109,172335,3.0,58281
3,109,172335,4.0,5855
4,109,172335,5.0,4254


In [13]:
# ICD-9 -> ICD-10 lookup table (pipe-separated, no header row).
# Both code columns are read as text so that all-digit codes keep any leading
# zeros instead of being parsed as numbers.
# NOTE(review): the saved output shows ICD9_CODE "10" next to a cholera
# description, which suggests leading zeros were being lost -- confirm
# against the raw file.
mapping = pd.read_csv(
    '../../datasets/icd/icd9to10dictionary.txt',
    sep='|',
    header=None,
    names=['ICD9_CODE', 'ICD10_CODE', 'ICD10_CODE_DESCRIPTION'],
    dtype={'ICD9_CODE': str, 'ICD10_CODE': str},
)
# Normalise codes the same way on both sides of the mapping: no dots, no
# surrounding whitespace, upper case (ICD-10 codes additionally lose any
# apostrophes).
mapping['ICD9_CODE'] = mapping['ICD9_CODE'].apply(lambda x: str(x).replace('.','').strip().upper())
mapping['ICD10_CODE'] = mapping['ICD10_CODE'].apply(lambda x: str(x).replace('.','').replace("'",'').strip().upper())
# str() turns a missing value into 'NAN' after upper-casing; map it back to
# None. The code '8409' is rewritten to 'J8409'.
mapping['ICD10_CODE'] = mapping['ICD10_CODE'].apply(lambda x: None if x == 'NAN' else 'J8409' if  x == '8409' else x)

mapping.head()

Unnamed: 0,ICD9_CODE,ICD10_CODE,ICD10_CODE_DESCRIPTION
0,10,A000,Cholera due to Vibrio cholerae 01 biovar chole...
1,11,A001,Cholera due to Vibrio cholerae 01 biovar eltor
2,19,A009,Cholera unspecified
3,20,A0100,Typhoid fever unspecified
4,21,A011,Paratyphoid fever A


## convert ICD-9 codes to ICD-10

In [14]:
# Attach the ICD-10 code and description to every diagnosis row; diagnoses
# whose ICD-9 code is not in the mapping keep their row with missing ICD-10
# fields (left join).
# NOTE(review): this rebinds `icd`, so running the cell a second time merges
# the mapping columns in again -- re-run the loading cells first.
icd = pd.merge(
    icd,
    mapping,
    how='left',
    on='ICD9_CODE',
)
icd.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ICD10_CODE,ICD10_CODE_DESCRIPTION
0,109,172335,1.0,40301,I120,Hypertensive chronic kidney disease with stage...
1,109,172335,2.0,486,J189,Pneumonia unspecified organism
2,109,172335,3.0,58281,N08,Glomerular disorders in diseases classified el...
3,109,172335,4.0,5855,N185,Chronic kidney disease stage 5
4,109,172335,5.0,4254,I425,Other restrictive cardiomyopathy


## clean note data 

In [15]:

# Work on a copy of the notes: apply the regex substitutions to each note,
# then reduce it to its admission-time sections.
patient_notes_cleaned = patient_notes.assign(
    TEXT=lambda notes: notes['TEXT'].apply(preprocess_re_sub).apply(filter_admission_text)
)
# Notes flagged with the "MISSING" sentinel are discarded.
patient_notes_cleaned = patient_notes_cleaned.loc[
    patient_notes_cleaned['TEXT'].ne('MISSING')
]
patient_notes_cleaned

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,"CHIEF COMPLAINT: 81 yo F smoker w/ COPD, sever..."
3,177,13702,196489.0,2124-08-18,,,Discharge summary,Report,,,CHIEF COMPLAINT: COPD exacerbation/Shortness o...
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,CHIEF COMPLAINT: Mr. after a mechanical fall ...
5,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,CHIEF COMPLAINT: Meningioma\n\nPRESENT ILLNESS...
6,180,20646,134727.0,2112-12-10,,,Discharge summary,Report,,,CHIEF COMPLAINT: Hypoxia\n\nPRESENT ILLNESS: 8...
...,...,...,...,...,...,...,...,...,...,...,...
55172,53613,43691,147266.0,2147-03-01,,,Discharge summary,Report,,,"CHIEF COMPLAINT: hearing loss, visual disturba..."
55173,53614,80847,129802.0,2190-06-05,,,Discharge summary,Report,,,CHIEF COMPLAINT: unresponsiveness\n\nPRESENT I...
55174,53615,41074,182558.0,2121-06-14,,,Discharge summary,Report,,,CHIEF COMPLAINT: ICH\n\nPRESENT ILLNESS: 74 y/...
55175,53616,76397,184741.0,2182-04-22,,,Discharge summary,Report,,,CHIEF COMPLAINT: trauma\n\nPRESENT ILLNESS: 19...


In [16]:
# Admissions whose primary diagnosis (SEQ_NUM == 1) has no ICD-10 code.
primary_missing = icd.loc[
    icd['ICD10_CODE'].isna() & icd['SEQ_NUM'].eq(1.0),
    'HADM_ID',
].unique()

patient_notes_cleaned = (
    patient_notes_cleaned
    # remove admissions where the primary diagnosis ICD10 is missing
    .loc[lambda notes: ~notes['HADM_ID'].isin(primary_missing)]
    # keep only 1 note per admission (the first one encountered)
    .drop_duplicates(subset='HADM_ID')
    .reset_index(drop=True)
)
patient_notes_cleaned.shape

(39248, 11)

## merge icd data with note data

In [17]:
# One row per (note, diagnosis): join each cleaned note to the diagnoses of
# the same patient and admission, then discard every row that has a missing
# value in any column.
training_data = (
    pd.merge(
        patient_notes_cleaned.loc[:, ['SUBJECT_ID', 'HADM_ID', 'TEXT']],
        icd,
        how='left',
        on=['SUBJECT_ID', 'HADM_ID'],
    )
    .dropna()
    .reset_index(drop=True)
)

training_data.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,SEQ_NUM,ICD9_CODE,ICD10_CODE,ICD10_CODE_DESCRIPTION
0,13702,196489.0,CHIEF COMPLAINT: COPD exacerbation/Shortness o...,1.0,51884,J9620,Acute and chronic respiratory failure unspecif...
1,13702,196489.0,CHIEF COMPLAINT: COPD exacerbation/Shortness o...,2.0,5849,N179,Acute kidney failure unspecified
2,13702,196489.0,CHIEF COMPLAINT: COPD exacerbation/Shortness o...,3.0,34830,G9340,Encephalopathy unspecified
3,13702,196489.0,CHIEF COMPLAINT: COPD exacerbation/Shortness o...,4.0,49121,J441,Chronic obstructive pulmonary disease with (ac...
4,13702,196489.0,CHIEF COMPLAINT: COPD exacerbation/Shortness o...,5.0,2760,E870,Hyperosmolality and hypernatremia


In [20]:
# Derive two label levels from each ICD-10 code: the first three characters
# of the code and that code's parent as reported by simple_icd_10_cm, each
# with its description. Then keep one row per patient, admission and parent
# group.
training_data = (
    training_data
    .assign(
        ICD10_L3=lambda frame: frame['ICD10_CODE'].str[:3],
        ICD10_L3_DESCRIPTION=lambda frame: frame['ICD10_L3'].apply(icd_desc.get_description),
        ICD10_GROUP=lambda frame: frame['ICD10_L3'].apply(icd_desc.get_parent),
        ICD10_GROUP_DESCRIPTION=lambda frame: frame['ICD10_GROUP'].apply(icd_desc.get_description),
    )
    .loc[:, [
        'SUBJECT_ID',
        'HADM_ID',
        'TEXT',
        'ICD10_L3',
        'ICD10_L3_DESCRIPTION',
        'ICD10_GROUP',
        'ICD10_GROUP_DESCRIPTION',
    ]]
    # drop duplicates
    .drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID', 'ICD10_GROUP'])
    .reset_index(drop=True)
)


## break up data into multiple sets for training

In [29]:
# Split the data into 5 subject-disjoint subsets: GroupKFold groups notes by
# SUBJECT_ID, and the held-out part of each fold becomes one dataset file.
splitter = GroupKFold(n_splits=5)
folds = splitter.split(
    X=patient_notes_cleaned['TEXT'],
    y=None,
    groups=patient_notes_cleaned['SUBJECT_ID'],
)

for i, (_, held_out_rows) in enumerate(folds):
    # subjects whose notes fall in this fold's held-out part
    fold_subjects = patient_notes_cleaned['SUBJECT_ID'].iloc[held_out_rows].unique()
    # keep only text and label descriptions in the exported file
    data_subset = (
        training_data
        .loc[training_data['SUBJECT_ID'].isin(fold_subjects)]
        .drop(columns=['SUBJECT_ID', 'HADM_ID', 'ICD10_L3', 'ICD10_GROUP'])
        .reset_index(drop=True)
    )
    data_subset.to_csv(f'../../datasets/training_data/dataset_{i+1}.csv', index=False)

## clean ICD dataset and label as original dataset

In [38]:
# Build dataset_0 from the ICD catalogue file: the code description is used
# as the text and is labelled with the same two description levels as the
# note-based datasets.
# The context manager closes the file handle once the JSON has been parsed.
with open("../../datasets/icd/icd_json.json", "r") as icd_file:
    df = pd.DataFrame(json.load(icd_file))

# each entry of the `text` column is accessed like a mapping via .get
df['TEXT'] = df.text.apply(lambda i: i.get("codeDescription"))

# first three characters of the ICD-10 code, plus its description
df['ICD10_L3'] = df.icd10Code.str[0:3]
df['ICD10_L3_DESCRIPTION'] = df['ICD10_L3'].apply(icd_desc.get_description)

# parent of that code as reported by simple_icd_10_cm, plus its description
df['ICD10_GROUP'] = df['ICD10_L3'].apply(icd_desc.get_parent)
df['ICD10_GROUP_DESCRIPTION'] = df['ICD10_GROUP'].apply(icd_desc.get_description)
df = df[["TEXT", "ICD10_L3_DESCRIPTION", "ICD10_GROUP_DESCRIPTION"]]
df.to_csv('../../datasets/training_data/dataset_0.csv', index=False)