In [1]:
# setup main dataframe, initial column is ["SUBJECT_ID", "HADM_ID", "ADMITTIME", "DIAGNOSIS"] from ADMISSIONS
# SUBJECT_ID: Each patient has unique ID
# HADM_ID: Each admission has unique ID
# ADMITTIME: The date and time the patient was admitted to the hospital
# DIAGNOSIS: The primary diagnosis of the patient

import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 0)

# read ADMISSIONS.csv.gz
mainDF = pd.read_csv('../mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz', compression='gzip', usecols=["SUBJECT_ID", "HADM_ID", "ADMITTIME", "DIAGNOSIS"])

# Change the ADMITTIME only have date
mainDF["ADMITTIME"] = pd.to_datetime(mainDF["ADMITTIME"]).dt.date.astype(str)

# Dtop the rows if DIAGNOSIS is NaN
mainDF = mainDF.dropna(subset=["DIAGNOSIS"])

# Separate the DIAGNOSIS if there are multiple, and output to dataframe
data = []
for row in mainDF.itertuples():

  if "\\" in row.DIAGNOSIS:
    diagnosisString = row.DIAGNOSIS.split("\\")[0]
  else:
    diagnosisString = row.DIAGNOSIS
  
  if ";" in diagnosisString:
    diagnosisArray = diagnosisString.split(";")
  else:
    diagnosisArray = [diagnosisString]

  for d in diagnosisArray:
    data.append([row.SUBJECT_ID, row.HADM_ID, row.ADMITTIME, d])

diagnosisDF = pd.DataFrame(data, columns=["SUBJECT_ID", "HADM_ID", "ADMITTIME", "DIAGNOSIS"])
diagnosisDF.to_csv("diagnosis_0.csv", index=False)


In [2]:
# Add the height and weight to the main dataframe by using the ["SUBJECT_ID", "HADM_ID", "ITEMID", "VALUE"] in CHARTEVENTS
# SUBJECT_ID: Each patient has unique ID
# HADM_ID: Each admission has unique ID
# ITEMID: weight index : 762, 763, 3723, 3580, 3581, 3582
#         height index : 920, 1394, 4187, 3486, 3485, 4188
# VALUE: the value of the ITEMID

diagnosisDF = pd.read_csv("diagnosis_0.csv")
# read CHARTEVENTS.csv.gz and drop the missing value, and change the CHARTTIME only have date
CHARTEVENTS = pd.read_csv('../mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz', compression='gzip', usecols=["SUBJECT_ID", "HADM_ID", "CHARTTIME", "ITEMID", "VALUE"]).dropna()
CHARTEVENTS["CHARTTIME"] = pd.to_datetime(CHARTEVENTS["CHARTTIME"]).dt.date.astype(str)

# create weightHeightDict with key = (SUBJECT_ID, HADM_ID, CHARTTIME) and value = [weight, height]
weightHeightDict = {}
for row in CHARTEVENTS.itertuples():
  key = (row.SUBJECT_ID, row.HADM_ID, row.CHARTTIME)

  if row.ITEMID == 762 or row.ITEMID == 763 or row.ITEMID == 3723 or row.ITEMID == 3580: # weight unit is alreayd in KG
    if key in weightHeightDict and weightHeightDict[key][0] == -1: weightHeightDict[key][0] = float(row.VALUE)
    else: weightHeightDict[key] = [float(row.VALUE), -1]
  
  elif row.ITEMID == 3581: # weight unit is in LB
    if key in weightHeightDict and weightHeightDict[key][0] == -1: weightHeightDict[key][0] = float(row.VALUE)*0.4536
    else: weightHeightDict[key] = [float(row.VALUE)*0.4536, -1]
  
  elif row.ITEMID == 3582: # weight unit is in OZ
    if key in weightHeightDict and weightHeightDict[key][0] == -1: weightHeightDict[key][0] = float(row.VALUE)*0.0283
    else: weightHeightDict[key] = [float(row.VALUE)*0.0283, -1]
  
  elif row.ITEMID == 920 or row.ITEMID == 1394 or row.ITEMID == 4187 or row.ITEMID == 3486: # height unit is in inches
    if key in weightHeightDict and weightHeightDict[key][1] == -1: weightHeightDict[key][1] = float(row.VALUE)*0.0254
    else: weightHeightDict[key] = [-1, float(row.VALUE)*0.0254]

  elif row.ITEMID == 3485 or row.ITEMID == 4188: # height unit is in cm
    if key in weightHeightDict and weightHeightDict[key][1] == -1: weightHeightDict[key][1] = float(row.VALUE)*0.01
    else: weightHeightDict[key] = [-1, float(row.VALUE)*0.01]

# filter out if the key only have weight or height
for key, value in list(weightHeightDict.items()):
  if value[0] == -1 or value[1] == -1:
    del weightHeightDict[key]

# Plug the weight and height to the main dataframe
for row in diagnosisDF.itertuples():
  if (row.SUBJECT_ID, row.HADM_ID, row.ADMITTIME) in weightHeightDict:
    weight, height = weightHeightDict[(row.SUBJECT_ID, row.HADM_ID, row.ADMITTIME)]
    diagnosisDF.at[row.Index, "WEIGHT"] = weight
    diagnosisDF.at[row.Index, "HEIGHT"] = height

# Drop the rows if the weight or height is NaN
diagnosisDF = diagnosisDF.dropna()

diagnosisDF.to_csv("diagnosis_1.csv", index=False)


  CHARTEVENTS = pd.read_csv('../mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz', compression='gzip', usecols=["SUBJECT_ID", "HADM_ID", "CHARTTIME", "ITEMID", "VALUE"]).dropna()


In [3]:
# Add the age and gender to the main dataframe by using the ["SUBJECT_ID", "GENDER", "DOB"] in PATIENTS
# SUBJECT_ID: Each patient has unique ID
# GENDER: patient's gender
# DOB: patient's date of birth

import pandas as pd

diagnosisDF = pd.read_csv("diagnosis_1.csv")

# read PATIENTS.csv.gz and change the DOB only have date
PATIENTS = pd.read_csv('../mimic-iii-clinical-database-1.4/PATIENTS.csv.gz', compression='gzip')
PATIENTS["DOB"] = pd.to_datetime(PATIENTS["DOB"]).dt.date.astype(str)

# create two dictionary with key = SUBJECT
patientBrith = dict(zip(PATIENTS["SUBJECT_ID"], PATIENTS["DOB"]))
patientGender = dict(zip(PATIENTS["SUBJECT_ID"], PATIENTS["GENDER"]))

# Plug the Gender to the main dataframe
for row in diagnosisDF.itertuples():
  diagnosisDF.at[row.Index, "GENDER"] = patientGender[row.SUBJECT_ID]

# Plug the Age to the main dataframe
for row in diagnosisDF.itertuples():
  admitYear, admitMonth, admitDate = row.ADMITTIME.split("-")
  birthYear, birthMonth, birthDate = patientBrith[row.SUBJECT_ID].split("-")
  patientMonth = int(admitYear)*12+int(admitMonth) - int(birthYear)*12+int(birthMonth)
  age, remainder = patientMonth//12, patientMonth%12
  if remainder >= 6: age += 1
  diagnosisDF.at[row.Index, "AGE"] = int(age)

# Drop the SUBJECT_ID, HADM_ID, ADMITTIME, since it is not needed anymore
diagnosisDF = diagnosisDF.drop(columns=["SUBJECT_ID", "HADM_ID", "ADMITTIME"])

diagnosisDF.to_csv("diagnosis_2.csv", index=False)


  diagnosisDF.at[row.Index, "GENDER"] = patientGender[row.SUBJECT_ID]


In [4]:
# Replace the typo in the data
diagnosisDF = pd.read_csv("diagnosis_2.csv")
for index, row in enumerate(diagnosisDF.itertuples()):
  original = row.DIAGNOSIS

  if original == "CORNARY ARTERY DISEASE":
    original = "CORONARY ARTERY DISEASE"
    continue
  elif original == "":
    diagnosisDF.drop(index, inplace=True)
    continue
  
  while original[0] == " " or original[0] == "\"" :
    original = original[1:]
  while original[-1] == " " or original[-1] == "\"":
    original = original[:-1]
  diagnosisDF.at[index, "DIAGNOSIS"] = original

# drop the age > 150, don't know why the age is so high
for row in diagnosisDF.itertuples():
  if row.AGE > 150:
    diagnosisDF.drop(row.Index, inplace=True)
    
diagnosisDF.to_csv("diagnosis_3.csv", index=False)

In [5]:
# This section is do the feature extraction, and save the feature extraction to pickle
import pickle

diagnosisDF = pd.read_csv("diagnosis_3.csv")

# Do feature extraction on Diagnosis and Gender, and save it to pickle
diagnosisLabelToInt = dict([(d, index) for index, d in enumerate(set(diagnosisDF["DIAGNOSIS"]))])
diagnosisIntToLabel = dict([(index, d) for index, d in enumerate(set(diagnosisDF["DIAGNOSIS"]))])
with open('../pickleFiles/diagnosisLabelToInt.pkl', 'wb') as f:
  pickle.dump(diagnosisLabelToInt, f)
with open('../pickleFiles/diagnosisIntToLabel.pkl', 'wb') as f:
  pickle.dump(diagnosisIntToLabel, f)
print(f'Finish saving the diagnosisLabelToInt.pkl and diagnosisIntToLabel.pkl')

genderLabel = {"M": 0, "F": 1}
diagnosisDF["DIAGNOSIS"] = [diagnosisLabelToInt[n] for n in diagnosisDF["DIAGNOSIS"]]
diagnosisDF["GENDER"] = [genderLabel[n] for n in diagnosisDF["GENDER"]]

diagnosisDF.to_csv('diagnosis_4.csv', index=False)

Finish saving the diagnosisLabelToInt.pkl and diagnosisIntToLabel.pkl


In [6]:
# This section is scale the X and then save the dataframe to csv
from sklearn.preprocessing import StandardScaler

diagnosisDF = pd.read_csv('./diagnosis_4.csv')

y = diagnosisDF["DIAGNOSIS"]
X = diagnosisDF.drop(["DIAGNOSIS"], axis=1)
Xcolumn = X.columns
scaler = StandardScaler()
X = scaler.fit_transform(X)

# save scaler to pickle
with open('../pickleFiles/scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)

# save X to dataframe, and add y back, then drop the NaN
X = pd.DataFrame(X, columns=Xcolumn)
X["DIAGNOSIS"] = y
X = X.dropna()
print(len(X))
# save X to the csv file
X.to_csv('diagnosis_final.csv', index=False)

5860
