In [None]:
import pandas as pd

Find ICD-9 Codes for Each Hospital Admission
============================================

In [None]:
def normalise_icd9_code(code) :
    """Insert the implied decimal point into a dot-less ICD-9 diagnosis code.

    MIMIC stores diagnosis codes without the decimal point. For numeric and
    V codes the point belongs after the third character ('4019' -> '401.9',
    'V1582' -> 'V15.82'); for E codes it belongs after the fourth character
    ('E8497' -> 'E849.7').

    Parameters
    ----------
    code : str
        ICD-9 diagnosis code without a decimal point.

    Returns
    -------
    str
        The code with the decimal point inserted. A code that has no digits
        after the category part is returned unchanged, so no trailing '.'
        is produced ('486' -> '486').
    """
    # E codes have a four-character category; every other code has three.
    split_at = 4 if code[:1].upper() == 'E' else 3
    category, detail = code[:split_at], code[split_at:]
    if not detail :
        # Category-only code: nothing to put after the point.
        return category
    return category + '.' + detail

def combine_icd9_codes(groupframe) :
    """Collapse the diagnosis rows of one group into a single code string.

    Parameters
    ----------
    groupframe : pandas.DataFrame
        Rows of one group; must provide the SEQ_NUM and ICD9_CODE columns.

    Returns
    -------
    pandas.Series
        Single entry 'ICD9_CODE' holding the normalised codes joined by ';',
        ordered by ascending SEQ_NUM.
    """
    ordered = groupframe.sort_values(by='SEQ_NUM')
    normalised = map(normalise_icd9_code, ordered['ICD9_CODE'])
    return pd.Series({'ICD9_CODE' : ';'.join(normalised)})

# Replace the path with DIAGNOSES_ICD.csv file on your machine.
# ICD9_CODE is read explicitly as text: leaving it to dtype inference can turn
# all-numeric codes into integers, which drops leading zeros and breaks the
# string slicing done in normalise_icd9_code.
# dropna() discards every row that has a missing value in any column.
df_icd9_codes = pd.read_csv('../../../bigdata/MIMIC/DIAGNOSES_ICD.csv', dtype={'ICD9_CODE' : str}).dropna()
# One row per (SUBJECT_ID, HADM_ID) pair with all of its codes joined into one string.
df_icd9_codes = df_icd9_codes.groupby(['SUBJECT_ID', 'HADM_ID']).apply(combine_icd9_codes)
# Turn the group keys back into ordinary columns.
df_icd9_codes = pd.DataFrame(df_icd9_codes).reset_index()

Clean Discharge Summaries
=========================

In [None]:
# Replace the path with NOTEEVENTS.csv file on your machine
df_notes = pd.read_csv('../../../bigdata/MIMIC/NOTEEVENTS.csv')

columns_to_keep = ['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'DESCRIPTION', 'TEXT']
# Keep discharge summaries whose ISERROR field is null (presumably notes not
# flagged as erroneous - confirm against the MIMIC schema).
# A single .loc selection followed by an explicit copy replaces the chained
# df[mask][columns] lookup, so the column assignment below writes to an
# independent frame instead of a possible view of df_notes.
df_notes_discharge = df_notes.loc[
    (df_notes.CATEGORY == 'Discharge summary') & (pd.isnull(df_notes.ISERROR)),
    columns_to_keep,
].copy()
# Encode the description as a number so that sorting on it places 'Report' (0)
# before 'Addendum' (1); any other value is left untouched by replace().
df_notes_discharge['DESCRIPTION'] = df_notes_discharge['DESCRIPTION'].replace({'Report' : 0, 'Addendum' : 1})

In [None]:
def group_text_reports(groupframe) :
    """Merge all note texts of one group into a single string.

    Rows are ordered by DESCRIPTION first and CHARTDATE second, their TEXT
    values are joined with a single space, and leading/trailing whitespace
    of the joined result is stripped.

    Parameters
    ----------
    groupframe : pandas.DataFrame
        Rows of one group; must provide DESCRIPTION, CHARTDATE and TEXT.

    Returns
    -------
    pandas.Series
        Single entry 'TEXT' holding the concatenated text.
    """
    ordered = groupframe.sort_values(by=['DESCRIPTION', 'CHARTDATE'])
    merged = " ".join(ordered['TEXT'])
    return pd.Series({'TEXT' : merged.strip()})

# Build one combined text per (SUBJECT_ID, HADM_ID) pair, then turn the group
# keys back into ordinary columns.
df_notes_discharge_combined = pd.DataFrame(
    df_notes_discharge
    .groupby(['SUBJECT_ID', 'HADM_ID'])
    .apply(group_text_reports)
).reset_index()

In [None]:
from Transparency.preprocess.vectorizer import cleaner_mimic
from tqdm import tqdm

In [None]:
from multiprocessing import Pool

texts = df_notes_discharge_combined['TEXT'].tolist()

# Run cleaner_mimic over every note in a pool of 5 worker processes.
# imap hands results back in input order, so cleaned_texts lines up with texts;
# tqdm only reports progress.
with Pool(5) as p :
    cleaned_texts = list(tqdm(p.imap(cleaner_mimic, texts), total=len(texts)))

# Swap in the cleaned text, attach the ICD-9 code string of each
# (SUBJECT_ID, HADM_ID) pair, and write the result to disk.
df_notes_discharge_combined['TEXT'] = cleaned_texts
df_notes_discharge_combined = df_notes_discharge_combined.merge(
    df_icd9_codes,
    on=['SUBJECT_ID', 'HADM_ID'],
)
df_notes_discharge_combined.to_csv('cleaned_discharge_summaries.csv', index=False)

Train Word2Vec on discharge summaries
=====================================

In [None]:
from gensim.models import Word2Vec
import logging
# Emit INFO-level log records with a timestamp so training progress is visible.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

# Each cleaned note becomes one token list, split on single spaces.
sentences = [text.split(' ') for text in cleaned_texts]
# Train the embedding on all cleaned notes and save only the word vectors.
model = Word2Vec(
    sentences,
    size=300,
    window=10,
    min_count=2,
    workers=10,
)
model.wv.save("mimic_embedding_model.wv")