In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

## Read imputed files and creating training matrix using windows

In [337]:
def create_train_data(window_size=6, impute_method='mean'):
    X_train = []
    y_train = []
    features = None

    i = 0
    for file in os.listdir('../data/train_%s_imputed/'%impute_method):

        # Read file
        patient_df = []
        with open('../data/train_%s_imputed/%s' % (impute_method, file)) as f:

            if not features:
                features = f.readline().rstrip('\n').split('|')
            else:
                # This skips the headers
                f.readline()

            for idx, line in enumerate(f):
                line = line.rstrip('\n')
                patient_df.append(line.split('|'))

        patient_df = np.array(patient_df)
        
        # Get sliding-window data
        window_start = 0
        while (window_start + window_size) <= patient_df.shape[0]:
            window_data = patient_df[window_start:window_start + window_size, 1:37].astype(np.float)
            assert window_data.shape[0] == window_size

            x_i = np.reshape(window_data, (window_size*window_data.shape[1],))
            X_train.append(x_i)

            label = int(patient_df[window_start + window_size - 1, 41])
            y_train.append(label)

            window_start += 1
    
    return np.array(X_train), np.array(y_train), features

In [338]:
X_train, y_train, features = create_train_data()

In [339]:
X_train.shape, y_train.shape

((130976, 216), (130976,))

In [340]:
features[1:37]

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Age',
 'Gender']

In [341]:
y_train.sum()

1799

## Train Models and Predict

In [259]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score

In [371]:
# Read test files, create windows, and predict. 
def predict(model, model_type, window_size=6, impute_method='mean'):
    X_test = []
    y_test = []
    features = None
    
    for file in os.listdir('../data/test_%s_imputed/'%(impute_method)):
        X = []
        y = []
        
        # Read file 
        patient_df = []
        with open('../data/test_%s_imputed/%s' % (impute_method, file)) as f:

            if not features:
                features = f.readline().rstrip('\n').split('|')
            else:
                # This skips the headers
                f.readline()

            for idx, line in enumerate(f):
                line = line.rstrip('\n') 
                patient_df.append(line.split('|'))
        
        # Create patient df
        patient_df = np.array(patient_df)
        X_patient = np.concatenate((patient_df[:,1:37], patient_df[:,41][:,None]), axis=1).astype(np.float)
        
        assert X_patient.shape[0] == patient_df.shape[0]
        
        # Append 5 rows of mean values to top of df 
        mean_vals = np.mean(X_patient, axis=0)
        X_patient = np.concatenate(([mean_vals]*max(1,window_size-1), X_patient))
    
        # Get data for every window
        window_start = 0
        while (window_start + window_size) <= X_patient.shape[0]:
            window_data = X_patient[window_start:window_start + window_size, 0:X_patient.shape[1]-1]
            assert window_data.shape[0] == window_size

            x_i = np.reshape(window_data, (window_size*window_data.shape[1],))
            X.append(x_i)
            
            label = int(X_patient[window_start+window_size-1, X_patient.shape[1]-1])
            y.append(label)

            window_start += 1
        
        # Add X_test to X
        X_test.extend(list(X))
        y_test.extend(list(y))
        
        # Predict for every time step
        y_pred = model.predict(X)
        y_pred_prob = model.predict_proba(X)
        
        assert y_pred.shape[0] == len(y)
        
        # Save to file
        if not os.path.isdir('../data/model_predictions/%s_pred_%s_imputed_%d'%(model_type, impute_method, window_size)):
            os.mkdir('../data/model_predictions/%s_pred_%s_imputed_%d'%(model_type, impute_method, window_size))
            
        pred = np.transpose(np.vstack((y_pred_prob[:,1], y_pred)))

        pred_df = pd.DataFrame(pred, columns=["PredictedProbability", "PredictedLabel"])
        pred_df.to_csv('../data/model_predictions/%s_pred_%s_imputed_%d/%s'%(model_type, impute_method, window_size, file), sep='|', index=False)
        
    return np.array(X_test), np.array(y_test)

## Regularized Log. Reg. 

In [374]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

lr_model = LogisticRegressionCV(cv=5, solver='lbfgs', max_iter=1000, class_weight='balanced')

# window size, imp method, train acc, train recall, train prec., test acc, test auc, 
# test recall, test prec.
 
lr_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        lr_model = lr_model.fit(X_train, y_train)
        
        res.append(lr_model.score(X_train, y_train))
        y_pred = lr_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(lr_model, 'RLR', window_size=ws, impute_method=imp)
        res.append(lr_model.score(X_test, y_test))
        
        y_pred_prob = lr_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = lr_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        results.append(res)

In [380]:
lr_results = np.array(results)

## Random Forest

In [382]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(max_depth=5, class_weight='balanced')

rf_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        rf_model = rf_model.fit(X_train, y_train)
        
        res.append(rf_model.score(X_train, y_train))
        y_pred = rf_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(rf_model, 'RF', window_size=ws, impute_method=imp)
        res.append(rf_model.score(X_test, y_test))
        
        y_pred_prob = rf_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = rf_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        rf_results.append(res)

Working on ws: 1, imp: mean




Working on ws: 1, imp: forw
Working on ws: 2, imp: mean
Working on ws: 2, imp: forw
Working on ws: 3, imp: mean
Working on ws: 3, imp: forw
Working on ws: 4, imp: mean
Working on ws: 4, imp: forw
Working on ws: 5, imp: mean
Working on ws: 5, imp: forw
Working on ws: 6, imp: mean
Working on ws: 6, imp: forw


In [384]:
rf_results = np.array(rf_results)
rf_results

array([['1', 'mean', '0.841935142009326', '0.48801534036433364',
        '0.042747963382884016', '0.8386048808379032',
        '0.7370893571137952', '0.48816029143898', '0.04324673228981765'],
       ['1', 'forw', '0.8308671576939382', '0.6860019175455417',
        '0.054387898597544754', '0.811575746549887',
        '0.7223680742613077', '0.4972677595628415',
        '0.037670760314612944'],
       ['2', 'mean', '0.8398037774874809', '0.49754901960784315',
        '0.04312725727639686', '0.8389679003121915', '0.743989422678479',
        '0.5344506517690876', '0.04726613965744401'],
       ['2', 'forw', '0.8299790441976922', '0.7029411764705882',
        '0.05554479606460859', '0.8127918456653415',
        '0.7180933347045236', '0.4767225325884544',
        '0.03661850951223001'],
       ['3', 'mean', '0.862886078782453', '0.5231388329979879',
        '0.05280259951259139', '0.858233049603757', '0.7532411762392888',
        '0.48044692737430167', '0.048752834467120185'],
       ['3', '

## SVM

In [None]:
from sklearn.linear import SVC

## XG Boost

# AdaBoost

In [230]:
from sklearn.ensemble import AdaBoostClassifier
adaboost_model = AdaBoostClassifier(n_estimators=100).fit(X_train, y_train)

In [226]:
adaboost_model.score(X_train, y_train)

0.8549199853408258

In [235]:
y_pred = adaboost_model.predict(X_train)
recall_score(y_train, y_pred), precision_score(y_train, y_pred)

(0.018343524180100056, 0.1952662721893491)

In [236]:
confusion_matrix(y_train, y_pred)

array([[129041,    136],
       [  1766,     33]])

In [229]:
X_test, y_test = predict(adaboost_model, 'AB')

In [231]:
X_test = X_test.astype(np.float)
y_test = y_test.astype(np.int)

In [232]:
y_pred = adaboost_model.predict(X_test)
recall_score(y_test, y_pred), precision_score(y_test, y_pred)

(0.00558659217877095, 0.057692307692307696)

In [233]:
roc_auc_score(y_test, adaboost_model.predict_proba(X_test)[:,1])
confusion_matrix(y_test, y_pred)

array([[36891,    49],
       [  534,     3]])