In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import SGDClassifier 
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from datetime import datetime

import seaborn as sns
sns.set(style="darkgrid")
#local
from tadat.pipeline import plots
from tadat.core import data, vectorizer, features, helpers, embeddings

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
def ethnicity_multi_labels(x):
    """Map a raw ETHNICITY string to a coarse multi-class label.

    The first keyword found in ``x`` (checked in the order below) decides the
    label; anything else is labelled "OTHER".

    Args:
        x: raw ethnicity description (a string; substring tests are used).

    Returns:
        One of "ASIAN", "BLACK", "HISPANIC", "WHITE" or "OTHER".
    """
    # Order matters: it is the precedence used when several keywords occur.
    keyword_to_label = (
        ("ASIAN", "ASIAN"),
        ("AFRICAN", "BLACK"),
        ("HISPANIC", "HISPANIC"),
        ("WHITE", "WHITE"),
    )
    for keyword, label in keyword_to_label:
        if keyword in x:
            return label
    # Values containing "NATIVE" were already returned as "OTHER": the former
    # second statement `return "NATIVE"` was unreachable, so it is dropped and
    # "NATIVE" simply falls through to the default.
    # NOTE(review): only values containing "AFRICAN" map to "BLACK"; a value
    # like "BLACK/HAITIAN" (if present in the data) ends up here - confirm.
    return "OTHER"

def ethnicity_binary_labels(x):
    """Collapse a raw ETHNICITY string into "NON-WHITE", "WHITE" or "OTHER".

    The non-white keywords are tested before "WHITE", so a value containing
    both kinds of keyword is labelled "NON-WHITE".
    """
    non_white_markers = ("ASIAN", "AFRICAN", "HISPANIC", "NATIVE")
    if any(marker in x for marker in non_white_markers):
        return "NON-WHITE"
    if "WHITE" in x:
        return "WHITE"
    return "OTHER"

def get_ethnicity_dataset(df, path_out=None):
    """Build a (binary ethnicity label, text) dataset.

    Args:
        df: frame with at least the columns "ETHNICITY" and "TEXT".
        path_out: optional path; when given, the result is written there as a
            tab-separated file without header and without index.

    Returns:
        A new frame with the columns "ETHNICITY_LABEL" (output of
        ``ethnicity_binary_labels``) and "TEXT".
    """
    # Work on a copy (assign returns a new frame): writing the column into
    # `df` itself would overwrite any existing ETHNICITY_LABEL column of the
    # caller's frame with the binary labels.
    labelled = df.assign(ETHNICITY_LABEL=df["ETHNICITY"].apply(ethnicity_binary_labels))
    df_ethnicity = labelled[["ETHNICITY_LABEL","TEXT"]]
    if path_out:
        df_ethnicity.to_csv(path_out,index=False, sep="\t", header=False)

    return df_ethnicity

def get_mortality_dataset(df, path_out=None):
    """Build the in-hospital mortality dataset.

    Args:
        df: frame with the columns "HOSPITAL_EXPIRE_FLAG", "TEXT", "GENDER",
            "ETHNICITY_LABEL" and "ETHNICITY_BINARY".
        path_out: optional path; when given, the result is written there as a
            tab-separated file without header and without index.

    Returns:
        A new frame with the columns "EXPIRE" (copy of HOSPITAL_EXPIRE_FLAG),
        "TEXT", "GENDER", "ETHNICITY_LABEL" and "ETHNICITY_BINARY".
    """
    columns = ["EXPIRE","TEXT","GENDER","ETHNICITY_LABEL","ETHNICITY_BINARY"]
    # assign() returns a new frame, so the caller's `df` does not silently
    # gain an EXPIRE column as a side effect of building this dataset.
    df_mortality = df.assign(EXPIRE=df["HOSPITAL_EXPIRE_FLAG"])[columns]
    if path_out:
        df_mortality.to_csv(path_out,index=False, sep="\t", header=False)
    return df_mortality

def get_readmissions_dataset(mimic_path, patients_df, path_out=None,
                             readmission_delta=float("inf")):
    """Label every row of ``patients_df`` with a readmission flag.

    A row is labelled 1 when its SUBJECT_ID has two consecutive admissions
    (ordered by ADMITTIME) that are fewer than ``readmission_delta`` days
    apart, and 0 otherwise. With the default (infinity) every subject with
    more than one admission is labelled 1.

    Args:
        mimic_path: prefix to which "ADMISSIONS.CSV.gz" is appended verbatim
            (so a directory must end with a path separator).
        patients_df: frame with the columns "SUBJECT_ID", "TEXT", "GENDER",
            "ETHNICITY_LABEL" and "ETHNICITY_BINARY".
        path_out: optional path; when given, the result is written there as a
            tab-separated file without header and without index.
        readmission_delta: upper bound (exclusive), in whole days, on the gap
            between consecutive admissions that counts as a readmission.

    Returns:
        Frame with the columns "READMISSION", "TEXT", "GENDER",
        "ETHNICITY_LABEL" and "ETHNICITY_BINARY", one row per input row.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    admissions_df = pd.read_csv(mimic_path+"ADMISSIONS.CSV.gz")
    # Group the admission times once instead of re-filtering the whole
    # admissions table for every row of patients_df.
    admissions_by_subject = {
        subject_id: times.sort_values().tolist()
        for subject_id, times in admissions_df.groupby("SUBJECT_ID")["ADMITTIME"]
    }

    rows = []
    for _, patient in patients_df.iterrows():
        admission_times = admissions_by_subject.get(patient["SUBJECT_ID"], [])
        readmitted = 0
        for earlier, later in zip(admission_times, admission_times[1:]):
            gap = datetime.strptime(later, time_format) - datetime.strptime(earlier, time_format)
            if gap.days < readmission_delta:
                readmitted = 1
                break
        # Every row carries all five fields. The positive branch used to
        # append only three, leaving the ethnicity columns empty for
        # readmitted patients.
        rows.append([readmitted, patient["TEXT"], patient["GENDER"],
                     patient["ETHNICITY_LABEL"], patient["ETHNICITY_BINARY"]])
    readm_df = pd.DataFrame(rows, columns=["READMISSION","TEXT","GENDER","ETHNICITY_LABEL","ETHNICITY_BINARY"])
    if path_out:
        readm_df.to_csv(path_out,index=False, sep="\t", header=False)
    return readm_df
    

def extract_patients(mimic_path, sample_size=None, path_out=None):
    """Assemble the patient / admission / note table from the raw MIMIC files.

    Reads "NOTEEVENTS.CSV.gz", "PATIENTS.CSV.gz" and "ADMISSIONS.CSV.gz"
    (each name is appended verbatim to ``mimic_path``), links them on
    SUBJECT_ID and adds the two derived ethnicity columns.

    Args:
        mimic_path: prefix of the raw files (a directory must end with a
            path separator).
        sample_size: when truthy, only the first ``sample_size`` rows of the
            patients table are kept.
        path_out: optional path; when given, the result is written there as a
            tab-separated file without header and without index.

    Returns:
        Frame with the columns SUBJECT_ID, GENDER, INSURANCE, ETHNICITY,
        HOSPITAL_EXPIRE_FLAG, TEXT, ETHNICITY_LABEL, ETHNICITY_BINARY.
    """
    all_notes = pd.read_csv(mimic_path+"NOTEEVENTS.CSV.gz")
    all_patients = pd.read_csv(mimic_path+"PATIENTS.CSV.gz")
    all_admissions = pd.read_csv(mimic_path+"ADMISSIONS.CSV.gz")
    #filter relevant columns (SUBJECT_ID is kept everywhere: it is the link key)
    patients = all_patients[["SUBJECT_ID","GENDER"]]
    admissions = all_admissions[["SUBJECT_ID","INSURANCE","ETHNICITY","HOSPITAL_EXPIRE_FLAG"]]
    notes = all_notes[["SUBJECT_ID","TEXT"]]
    #subsample?
    if sample_size:
        patients = patients[:sample_size]
    # Only clean the notes that can survive the inner merge below.
    notes = notes[notes["SUBJECT_ID"].isin(patients["SUBJECT_ID"])]
    # assign() builds a new frame, avoiding a chained assignment on a slice.
    notes = notes.assign(
        TEXT=notes["TEXT"]
        .str.replace("\n", "", regex=False)
        .str.replace("\t", "", regex=False)
        .str.lower()
    )
    #join dataframes
    # DataFrame.join(other, on="SUBJECT_ID") matches the caller's SUBJECT_ID
    # against the *index* of `other` (here a plain row number), which paired
    # patients with unrelated admissions and notes. merge() matches on the
    # SUBJECT_ID column of both sides.
    # NOTE(review): this yields one row per (admission, note) pair of a
    # subject. If a note should be tied to a single admission, a finer key
    # (an admission id, if both tables carry one) is needed - TODO confirm.
    patients = patients.merge(admissions, how="inner", on="SUBJECT_ID")
    patients = patients.merge(notes, how="inner", on="SUBJECT_ID")
    patients["ETHNICITY_LABEL"] = patients["ETHNICITY"].apply(ethnicity_multi_labels)
    patients["ETHNICITY_BINARY"] = patients["ETHNICITY"].apply(ethnicity_binary_labels)

    if path_out:
        patients.to_csv(path_out,index=False, sep="\t", header=False)

    return patients

def subsample(df, label, ratio=1, path_out=None):
    """Down-sample the largest class relative to the smallest one.

    Keeps every row of the smallest class of ``df[label]`` plus the first
    ``int(minority_size * ratio)`` rows (at most all of them) of the largest
    remaining class. Rows of any further class are dropped.

    Args:
        df: input frame.
        label: name of the class column.
        ratio: number of majority rows kept per minority row.
        path_out: optional path; when given, the result is written there as a
            tab-separated file without header and without index.

    Returns:
        New frame (fresh 0..n-1 index): minority rows first, then the kept
        majority rows.

    Raises:
        ValueError: if ``df`` has no rows to group.
    """
    class_sizes = df.groupby(label).size()
    # idxmin/idxmax return the class *label*; np.argmin/np.argmax return a
    # position, which only matches the label for classes coded 0..k-1.
    minority_class = class_sizes.idxmin()
    minority_size = int(class_sizes.loc[minority_class])

    # Pick the majority among the *other* classes so that tied sizes do not
    # select the same class twice.
    other_sizes = class_sizes.drop(minority_class)
    if other_sizes.empty:
        majority_class = None
        majority_size = 0
    else:
        majority_class = other_sizes.idxmax()
        # Never report more rows than the majority class actually has.
        majority_size = min(int(minority_size * ratio), int(other_sizes.loc[majority_class]))
    all_size = minority_size + majority_size

    print("{} > Minority: {}/{} | Majority: {}/{}".format(label, minority_size, all_size,
                                                           majority_size,all_size))

    df_min = df[df[label] == minority_class]
    if majority_class is None:
        df_maj = df.iloc[0:0]
    else:
        df_maj = df[df[label] == majority_class].head(majority_size)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df_all = pd.concat([df_min, df_maj], ignore_index=True)
    if path_out:
        df_all.to_csv(path_out,index=False, sep="\t", header=False)
    return df_all

def split_data(df, y_label, split=0.8):
    """Split ``df`` into train, test and validation frames.

    ``data.shuffle_split_idx`` is called twice with the same ``split`` value:
    first on the whole frame to separate the test rows, then on the remaining
    rows to separate the validation rows. Its two results are used as
    positional indices.

    Returns:
        (train, test, validation) - note the order.
    """
    # first pass: carve the test rows out of the full frame
    outer_train_idx, test_idx = data.shuffle_split_idx(df[y_label], split)
    outer_train = df.iloc[outer_train_idx, :]
    held_out_test = df.iloc[test_idx, :]
    # second pass: carve the validation rows out of what is left
    inner_train_idx, val_idx = data.shuffle_split_idx(outer_train[y_label], split)
    final_train = outer_train.iloc[inner_train_idx, :]
    validation = outer_train.iloc[val_idx, :]
    return final_train, held_out_test, validation


In [None]:
# Data locations. The defaults are the original author's local directories;
# set MIMIC_RAW_DATA_PATH / MIMIC_OUT_DATA_PATH to run the notebook elsewhere.
# File names are appended to these strings directly, so each value must end
# with a path separator.
raw_data_path = os.environ.get("MIMIC_RAW_DATA_PATH",
                               "/Users/samir/Dev/resources/datasets/MIMIC/full/")
out_data_path = os.environ.get("MIMIC_OUT_DATA_PATH",
                               "/Users/samir/Dev/projects/MIMIC/MIMIC/DATA/input/")

## Extract Data

In [None]:
# Build the patient table from the raw files without subsampling and store it
# as "patients.csv" in the output directory.
#
# Alternative (disabled): work on a sample, or reload a stored sample.
# PATIENT_SAMPLE_SIZE = 20000
# mini_patients = extract_patients(raw_data_path, PATIENT_SAMPLE_SIZE,out_data_path+"mini_patients.csv")
# col_names = ["SUBJECT_ID","GENDER","INSURANCE","ETHNICITY","HOSPITAL_EXPIRE_FLAG","TEXT","ETHNICITY_LABEL","ETHNICITY_BINARY"]
# mini_patients = pd.read_csv(out_data_path+"mini_patients.csv", header=None,
#                             sep="\t", names=col_names)
# mini_patients
patients = extract_patients(
    raw_data_path,
    sample_size=None,
    path_out=out_data_path+"patients.csv",
)
patients

## Mortality

In [None]:
# Mortality dataset derived from the full patient table (nothing is written
# to disk because no output path is passed).
# Disabled sample variant:
# df_mortality = get_mortality_dataset(mini_patients, out_data_path+"mini_mortality.csv")
df_mortality = get_mortality_dataset(patients, path_out=None)
df_mortality

In [None]:
# Fraction of rows per EXPIRE value
expire_counts = df_mortality.groupby("EXPIRE").size()
df_mg = expire_counts/expire_counts.sum()

fig, ax = plt.subplots(1,1, figsize=(10,5))
df_mg.plot(kind="bar", rot=0, title="Hospital Deaths", ax=ax)
plt.tight_layout()

In [None]:
# Row counts per GENDER value in the mortality dataset
mortality_gender_counts = df_mortality.groupby("GENDER").size()
mortality_gender_counts.plot(kind="bar", rot=0)

In [None]:
# Row counts per ETHNICITY_BINARY value in the mortality dataset
mortality_ethnicity_binary_counts = df_mortality.groupby("ETHNICITY_BINARY").size()
mortality_ethnicity_binary_counts.plot(kind="bar", rot=0)

In [None]:
# Row counts per ETHNICITY_LABEL value in the mortality dataset
mortality_ethnicity_label_counts = df_mortality.groupby("ETHNICITY_LABEL").size()
mortality_ethnicity_label_counts.plot(kind="bar", rot=0)

## Split Data

In [None]:
# Small split built from the first SAMPLE rows of the mortality dataset
SAMPLE = 5000
mini_mortality = df_mortality.head(SAMPLE)
df_mortality_train, df_mortality_test, df_mortality_val = split_data(mini_mortality, "EXPIRE")

# Each part goes to its own tab-separated file, with a header row
for filename, frame in (
    ("mini_mortality_train.csv", df_mortality_train),
    ("mini_mortality_test.csv", df_mortality_test),
    ("mini_mortality_val.csv", df_mortality_val),
):
    frame.to_csv(out_data_path+filename, index=False, sep="\t", header=True)

In [None]:
# Split of the complete mortality dataset. This rebinds df_mortality_train /
# df_mortality_test / df_mortality_val, so later cells see the full split.
df_mortality_train, df_mortality_test, df_mortality_val = split_data(df_mortality, "EXPIRE")

# Each part goes to its own tab-separated file, with a header row
for filename, frame in (
    ("mortality_train.csv", df_mortality_train),
    ("mortality_test.csv", df_mortality_test),
    ("mortality_val.csv", df_mortality_val),
):
    frame.to_csv(out_data_path+filename, index=False, sep="\t", header=True)

In [None]:
# Fraction of rows per EXPIRE value in each part of the split
train_counts = df_mortality_train.groupby("EXPIRE").size()
test_counts = df_mortality_test.groupby("EXPIRE").size()
val_counts = df_mortality_val.groupby("EXPIRE").size()
df_mg_tr = train_counts/train_counts.sum()
df_mg_ts = test_counts/test_counts.sum()
df_mg_val = val_counts/val_counts.sum()

# One panel per part, sharing the y axis
fig, ax = plt.subplots(1,3, figsize=(10,5), sharey=True)
panels = (
    (ax[0], df_mg_tr, "Hospital Deaths Train"),
    (ax[1], df_mg_ts, "Hospital Deaths Test"),
    (ax[2], df_mg_val, "Hospital Deaths Val"),
)
for panel_ax, shares, panel_title in panels:
    shares.plot(kind="bar", rot=0, title=panel_title, ax=panel_ax)
plt.tight_layout()

In [None]:
# GENDER counts: train split (left) and test split (right, red)
fig, ax = plt.subplots(1,2, figsize=(15,5))
train_ax, test_ax = ax
train_gender_counts = df_mortality_train.groupby("GENDER").size()
test_gender_counts = df_mortality_test.groupby("GENDER").size()
train_gender_counts.plot(ax=train_ax, kind="bar", rot=0)
test_gender_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

In [None]:
# ETHNICITY_BINARY counts: train split (left) and test split (right, red)
fig, ax = plt.subplots(1,2, figsize=(15,5))
train_ax, test_ax = ax
train_binary_counts = df_mortality_train.groupby("ETHNICITY_BINARY").size()
test_binary_counts = df_mortality_test.groupby("ETHNICITY_BINARY").size()
train_binary_counts.plot(ax=train_ax, kind="bar", rot=0)
test_binary_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

In [None]:
# ETHNICITY_LABEL counts: train split (left) and test split (right, red)
fig, ax = plt.subplots(1,2, figsize=(15,5))
train_ax, test_ax = ax
train_label_counts = df_mortality_train.groupby("ETHNICITY_LABEL").size()
test_label_counts = df_mortality_test.groupby("ETHNICITY_LABEL").size()
train_label_counts.plot(ax=train_ax, kind="bar", rot=0)
test_label_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

## Readmissions

In [None]:
# Reuse the raw-data location defined at the top of the notebook instead of
# repeating the literal, so both names always point at the same directory.
mimic_path = raw_data_path
# out_data_path already ends with a separator; like in the other cells, the
# file name is appended without an extra "/".
# df_readm = get_readmissions_dataset(mimic_path, mini_patients, path_out=out_data_path+"mini_readmissions.csv")
df_readm = get_readmissions_dataset(mimic_path, patients, path_out=out_data_path+"readmissions.csv")
df_readm

In [None]:
# Fraction of rows per READMISSION value
readmission_counts = df_readm.groupby("READMISSION").size()
df_rg = readmission_counts/readmission_counts.sum()

fig, ax = plt.subplots(1,1, figsize=(10,5))
df_rg.plot(kind="bar", rot=0, title="Readmissions", ax=ax)
plt.tight_layout()

In [None]:
# Row counts per GENDER value in the readmissions dataset
readm_gender_counts = df_readm.groupby("GENDER").size()
readm_gender_counts.plot(kind="bar", rot=0)

In [None]:
# Row counts per ETHNICITY_BINARY value in the readmissions dataset
readm_ethnicity_binary_counts = df_readm.groupby("ETHNICITY_BINARY").size()
readm_ethnicity_binary_counts.plot(kind="bar", rot=0)

In [None]:
# Row counts per ETHNICITY_LABEL value in the readmissions dataset
readm_ethnicity_label_counts = df_readm.groupby("ETHNICITY_LABEL").size()
readm_ethnicity_label_counts.plot(kind="bar", rot=0)

## Split Data

In [None]:
# Split of the complete readmissions dataset
df_readm_train, df_readm_test, df_readm_val = split_data(df_readm, "READMISSION")

# Each part goes to its own tab-separated file, with a header row
for filename, frame in (
    ("readmissions_train.csv", df_readm_train),
    ("readmissions_test.csv", df_readm_test),
    ("readmissions_val.csv", df_readm_val),
):
    frame.to_csv(out_data_path+filename, index=False, sep="\t", header=True)

In [None]:
# Small split built from the first SAMPLE rows of the readmissions dataset.
# NOTE(review): this rebinds df_readm_train / df_readm_test / df_readm_val,
# so cells run after this one see the SAMPLE-row split, not the full one.
SAMPLE = 5000
mini_readm = df_readm.head(SAMPLE)
df_readm_train, df_readm_test, df_readm_val = split_data(mini_readm, "READMISSION")

# Each part goes to its own tab-separated file, with a header row
for filename, frame in (
    ("mini_readmissions_train.csv", df_readm_train),
    ("mini_readmissions_test.csv", df_readm_test),
    ("mini_readmissions_val.csv", df_readm_val),
):
    frame.to_csv(out_data_path+filename, index=False, sep="\t", header=True)

In [None]:
# Row counts per READMISSION value in each part of the split. These are raw
# counts: the division by the total that would turn them into fractions is
# intentionally left disabled, as in the original cell.
df_mg_tr = df_readm_train.groupby("READMISSION").size()
df_mg_ts = df_readm_test.groupby("READMISSION").size()
df_mg_val = df_readm_val.groupby("READMISSION").size()

# One panel per part, sharing the y axis
fig, ax = plt.subplots(1,3, figsize=(10,5), sharey=True)
panels = (
    (ax[0], df_mg_tr, "Readmissions Train"),
    (ax[1], df_mg_ts, "Readmissions Test"),
    (ax[2], df_mg_val, "Readmissions Val"),
)
for panel_ax, counts, panel_title in panels:
    counts.plot(kind="bar", rot=0, title=panel_title, ax=panel_ax)
plt.tight_layout()

In [None]:
# GENDER counts: train split (left) and test split (right, red), shared y axis
fig, ax = plt.subplots(1,2, figsize=(15,5), sharey=True)
train_ax, test_ax = ax
train_gender_counts = df_readm_train.groupby("GENDER").size()
test_gender_counts = df_readm_test.groupby("GENDER").size()
train_gender_counts.plot(ax=train_ax, kind="bar", rot=0)
test_gender_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

In [None]:
# ETHNICITY_BINARY counts: train split (left) and test split (right, red),
# shared y axis
fig, ax = plt.subplots(1,2, figsize=(15,5), sharey=True)
train_ax, test_ax = ax
train_binary_counts = df_readm_train.groupby("ETHNICITY_BINARY").size()
test_binary_counts = df_readm_test.groupby("ETHNICITY_BINARY").size()
train_binary_counts.plot(ax=train_ax, kind="bar", rot=0)
test_binary_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

In [None]:
# ETHNICITY_LABEL counts: train split (left) and test split (right, red),
# shared y axis
fig, ax = plt.subplots(1,2, figsize=(15,5), sharey=True)
train_ax, test_ax = ax
train_label_counts = df_readm_train.groupby("ETHNICITY_LABEL").size()
test_label_counts = df_readm_test.groupby("ETHNICITY_LABEL").size()
train_label_counts.plot(ax=train_ax, kind="bar", rot=0)
test_label_counts.plot(ax=test_ax, kind="bar", rot=0, color="r")
plt.tight_layout()

In [None]:
#subsample majority class (negative)
# df_mortality_balanced = subsample(df_mortality,"EXPIRE",1, out_data_path+"/mini_mortality_balanced.csv")


In [None]:
#subsample majority class (negative)
# df_readm_balanced = subsample(df_readm,"READMISSION",1, out_data_path+"/mini_readmissions_balanced.csv")
