In [None]:
import os
import pickle
import pandas as pd


# Directory of the train split; DIAGNOSES_ICD.csv is read from here to build the code map.
PATH_TRAIN = "../data/mortality/train/"
# Presumably the validation and test split directories (not referenced by the
# code visible in this section) -- assign one of them to `path` to process it.
PATH_VALIDATION = "../data/mortality/validation/"
PATH_TEST = "../data/mortality/test/"
# Presumably where processed output is written; not referenced by the code
# visible in this section.
PATH_OUTPUT = "../data/mortality/processed/"

# Split directory that the preprocessing cell reads MORTALITY.csv, ADMISSIONS.csv
# and DIAGNOSES_ICD.csv from. The code map is built from PATH_TRAIN regardless
# of this value.
path = PATH_TRAIN


def transform(icd9_object):
    """
    Extract the main digits of an ICD-9 code.

    The value is converted to text with ``str()`` and truncated to a prefix:
    the first 4 characters for codes beginning with 'E', the first 3
    characters for every other code.

    :param icd9_object: ICD-9 code (Pandas/Numpy object).
    :return: extracted main digits of ICD-9 code, as a string. An empty input
        yields an empty string. A missing value (NaN) is stringified first and
        therefore comes back as 'nan'.
    """
    icd9_str = str(icd9_object)

    # startswith() instead of icd9_str[0]: indexing an empty string raises
    # IndexError, whereas startswith() is simply False.
    main_length = 4 if icd9_str.startswith('E') else 3

    return icd9_str[:main_length]


def build_codemap(df_icd9, transform):
    """
    Build the mapping from ICD-9 main digits to feature IDs.

    :param df_icd9: DataFrame with an 'ICD9_CODE' column.
    :param transform: callable applied to every 'ICD9_CODE' value to obtain its
        main digits. The returned values must be hashable and mutually
        orderable (e.g. all strings).
    :return: Dict of code map {main-digits of ICD9: unique feature ID}. IDs are
        0..n-1, assigned in sorted order of the main digits, so the mapping is
        identical on every run for the same input data.
    """
    # NOTE: the code map is meant to be built from the train data ONLY;
    # validation/test sets are then encoded with this same map.
    main_digits = df_icd9['ICD9_CODE'].apply(transform)

    # Sort the unique codes before numbering them. Iterating a bare set of
    # strings follows hash order, which varies between interpreter sessions
    # when hash randomization is active, so the IDs would not be reproducible
    # across kernel restarts.
    unique_digits = sorted(set(main_digits))

    return {icd: feature_id for feature_id, icd in enumerate(unique_digits)}

# Build the code map from the TRAIN split only, whatever `path` points to.
# ICD9_CODE is read as text: the codes are identifiers, and letting read_csv
# infer a numeric dtype would strip leading zeros (e.g. "0389" -> 389) and so
# change the extracted main digits.
df_icd9 = pd.read_csv(
    os.path.join(PATH_TRAIN, "DIAGNOSES_ICD.csv"),
    usecols=["ICD9_CODE"],
    dtype={"ICD9_CODE": str},
)
codemap = build_codemap(df_icd9, transform)

In [None]:
# Load the split selected by `path`. ICD9_CODE is read as text so that numeric
# type inference cannot strip leading zeros from the codes.
df_mortality = pd.read_csv(os.path.join(path, "MORTALITY.csv"))
df_admissions = pd.read_csv(os.path.join(path, "ADMISSIONS.csv"))
df_diag = pd.read_csv(os.path.join(path, "DIAGNOSES_ICD.csv"), dtype={"ICD9_CODE": str})

# Replace every raw code by its feature ID from the train-built code map.
df_diag['ICD9_CODE'] = df_diag['ICD9_CODE'].apply(transform).map(codemap)
# Codes that are absent from the code map (possible when `path` is not the
# train split) come back from .map() as NaN, which also turns the whole column
# into floats. Drop those rows and restore integer feature IDs so that no NaN
# or float ID ends up in the visit sequences.
df_diag = df_diag[df_diag['ICD9_CODE'].notna()].astype({'ICD9_CODE': int})

# Keep only the part of ADMITTIME before the first space, so that the grouping
# below is done on that prefix rather than on the full timestamp string.
df_admissions['ADMITTIME'] = df_admissions['ADMITTIME'].apply(lambda x: x.split(' ')[0])

# Attach the admission time to each diagnosis row, then the mortality label of
# the patient. SUBJECT_ID exists in both merged frames, hence the '_x' suffix.
df_join = pd.merge(df_diag, df_admissions, on='HADM_ID')
df_join = df_join[['SUBJECT_ID_x', 'ADMITTIME', 'ICD9_CODE']]
df_join = pd.merge(df_join, df_mortality, left_on='SUBJECT_ID_x', right_on='SUBJECT_ID').drop(columns='SUBJECT_ID_x')

# One row per (patient, admission time): the list of feature IDs recorded for
# that visit and the patient's mortality label.
df_grouped = df_join.groupby(['SUBJECT_ID', 'ADMITTIME']).agg({'ICD9_CODE': list, 'MORTALITY': lambda x: x.iloc[0]})
# Order visits by patient, then by admission time.
df_grouped = df_grouped.sort_values(by=(['SUBJECT_ID','ADMITTIME']), ascending=True)

# Parallel lists, one entry per patient:
#   patient_ids[i] - the patient's SUBJECT_ID (first level of the index)
#   labels[i]      - MORTALITY value taken from the patient's first visit row
#   seq_data[i]    - list of visits, each visit being a list of ICD9_CODE values
patient_ids, labels, seq_data = [], [], []

for _, visits in df_grouped.groupby(level=0):
    first_visit = visits.iloc[0]
    labels.append(first_visit['MORTALITY'])
    patient_ids.append(visits.index[0][0])
    seq_data.append([list(visit_codes) for visit_codes in visits['ICD9_CODE']])

In [None]:
# Preview only the first rows instead of displaying the whole grouped frame.
df_grouped.head(10)