In [1]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

## Read imputed files and create the training matrix using sliding windows

In [250]:
# Build the training matrix: every run of `window_size` consecutive rows of a
# patient file becomes one flattened feature vector, labelled with column 41
# of the window's last row.
X_train = []
y_train = []
window_size = 4
features = None

# os.listdir() makes no ordering guarantee; sort so that the row order of
# X_train / y_train is the same on every run.
for file in sorted(os.listdir('../data/train_mean_imputed/')):

    # Read file: the first line is the '|'-separated header, the rest are rows
    with open('../data/train_mean_imputed/%s' % (file)) as f:
        header = f.readline().rstrip('\n').split('|')
        if not features:
            # Column names are taken from the first file that is read
            features = header

        patient_df = np.array([line.rstrip('\n').split('|') for line in f])

    # Slide the window one row at a time. A patient with fewer than
    # `window_size` rows yields an empty range and contributes no samples.
    for window_start in range(patient_df.shape[0] - window_size + 1):
        # Columns 1..36 are the model features (see features[1:37]).
        # `np.float` was removed in NumPy 1.24; builtin `float` is the same dtype.
        window_data = patient_df[window_start:window_start + window_size, 1:37].astype(float)
        assert window_data.shape[0] == window_size

        # Flatten (window_size, n_features) -> (window_size * n_features,)
        X_train.append(window_data.reshape(-1))

        # The label of the window's last row is the label of the sample
        y_train.append(int(patient_df[window_start + window_size - 1, 41]))
    

In [252]:
# Turn the accumulated Python lists into NumPy arrays (one sample per row).
X_train = np.array(X_train)
y_train = np.array(y_train)

In [253]:
# Sanity check: both arrays should share the same first dimension (one label per window).
X_train.shape, y_train.shape

((138976, 144), (138976,))

In [255]:
# Names of the columns selected by the 1:37 slice that is used when building the windows.
features[1:37]

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Age',
 'Gender']

In [256]:
# `np.float` was removed in NumPy 1.24 (it was only an alias of the builtin
# `float`), so use the builtin to get the same float64 dtype.
X_train = X_train.astype(float)

In [257]:
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin
# `int`), so use the builtin directly.
y_train = y_train.astype(int)

In [258]:
# Sum of the labels; with 0/1 labels this is the number of positive windows.
y_train.sum()

1928

## Train Models and Predict

In [259]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score

In [334]:
# Read test files, create windows, and predict. 
def predict(model, model_type, impute_method='mean'):
    """Predict every time step of every test patient and save the results.

    For each file in ``../data/test_<impute_method>_imputed/`` the feature
    columns (1..36) are turned into one sliding window per time step, the
    windows are scored with ``model``, and the predictions are written to
    ``../data/<model_type>_pred_<impute_method>_imputed/<file>`` as a
    '|'-separated table with the columns ``PredictedProbability`` and
    ``PredictedLabel``.

    The window length is read from the notebook-level ``window_size``
    variable, so it has to be defined before this function is called.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    model_type : str
        Tag used in the name of the output directory (e.g. ``'RLR'``).
    impute_method : str, default 'mean'
        Selects which imputed test directory is read.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)``: the flattened windows of all patients stacked row-wise,
        and for each window the label (column 41) of its last time step.
    """
    in_dir = '../data/test_%s_imputed/' % (impute_method)
    out_dir = '../data/%s_pred_%s_imputed' % (model_type, impute_method)

    # os.listdir() makes no ordering guarantee; sort so the returned rows come
    # back in the same order on every run.
    files = sorted(os.listdir(in_dir))

    # The output directory does not depend on the file being processed, so it
    # is checked once here instead of once per file.
    if files and not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    X = []
    y = []

    for file in files:
        # Read file; the header line is not needed here, so it is skipped
        with open(in_dir + file) as f:
            f.readline()
            patient_df = np.array([line.rstrip('\n').split('|') for line in f])

        # `np.float` was removed in NumPy 1.24; builtin `float` is the same dtype
        X_patient = patient_df[:, 1:37].astype(float)
        n_steps = X_patient.shape[0]

        # Prepend (window_size - 1) copies of the patient's column means so
        # that every time step, including the earliest ones, ends a full
        # window. Building the list this way also works for window_size == 1,
        # where there is nothing to prepend.
        mean_row = np.mean(X_patient, axis=0)[np.newaxis, :]
        X_padded = np.concatenate([mean_row] * (window_size - 1) + [X_patient])

        # Window t covers padded rows t .. t + window_size - 1; its last row
        # is original row t, so it takes the label of row t.
        windows = [X_padded[t:t + window_size].reshape(-1) for t in range(n_steps)]
        labels = [int(patient_df[t, 41]) for t in range(n_steps)]

        X.extend(windows)
        y.extend(labels)

        # Predict for every time step
        y_pred = model.predict(windows)
        y_pred_prob = model.predict_proba(windows)

        assert y_pred.shape[0] == len(labels)

        # Save to file: column 1 of predict_proba is stored as the probability.
        # NOTE(review): this assumes column 1 is the positive class — confirm
        # against model.classes_.
        pred = np.transpose(np.vstack((y_pred_prob[:, 1], y_pred)))
        pred_df = pd.DataFrame(pred, columns=["PredictedProbability", "PredictedLabel"])
        pred_df.to_csv('%s/%s' % (out_dir, file), sep='|', index=False)

    return np.array(X), np.array(y)

## Regularized Logistic Regression

In [308]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Logistic regression with 5-fold cross-validation (cv=5), fitted on the
# training windows with class_weight='balanced'.
lr_model = LogisticRegressionCV(
    cv=5,
    solver='lbfgs',
    max_iter=1000,
    class_weight='balanced',
).fit(X_train, y_train)

In [309]:
# Score on the same data the model was fitted on, so this is not a held-out estimate.
lr_model.score(X_train, y_train)

0.7190018420446695

In [310]:
# Training-set predictions; the tuple displayed below is (recall, precision).
y_pred = lr_model.predict(X_train)
recall_score(y_train, y_pred), precision_score(y_train, y_pred)

(0.5316390041493776, 0.02616531372849339)

In [264]:
# Confusion matrix of the training labels against whatever `y_pred` currently holds
# (here: the logistic-regression training predictions from the previous cell).
confusion_matrix(y_train, y_pred)

array([[98893, 38155],
       [  903,  1025]])

In [None]:
# Writes per-patient predictions under ../data/RLR_pred_mean_imputed/ and
# returns the stacked test windows together with their labels.
X_test, y_test = predict(lr_model, 'RLR')

In [311]:
# ROC AUC on the test windows, scored with column 1 of predict_proba.
y_pred_prob = lr_model.predict_proba(X_test)
roc_auc_score(y_test, y_pred_prob[:,1])

0.6666847139505504

In [312]:
# Test-set predictions. Note the tuple order here is (precision, recall),
# the reverse of the training-set cell.
y_pred = lr_model.predict(X_test)
precision_score(y_test, y_pred), recall_score(y_test, y_pred)

(0.027508243052284505, 0.5437616387337058)

In [313]:
# Confusion matrix of the test labels against the current `y_pred`
# (the logistic-regression test predictions from the previous cell).
confusion_matrix(y_test, y_pred)

array([[26617, 10323],
       [  245,   292]])

## Random Forest

In [329]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is built from random draws, so without a fixed
# `random_state` every run produces a different model and different scores
# in the cells below. Pin the seed to make the results reproducible.
rf_model = RandomForestClassifier(max_depth=5, class_weight='balanced', random_state=0).fit(X_train, y_train)



In [330]:
# Score on the same data the forest was fitted on, so this is not a held-out estimate.
rf_model.score(X_train, y_train)

0.8444263757771125

In [331]:
# Training-set predictions; the tuple displayed below is (recall, precision).
y_pred = rf_model.predict(X_train)
recall_score(y_train, y_pred), precision_score(y_train, y_pred)

(0.5363070539419087, 0.047516198704103674)

In [332]:
# Confusion matrix of the training labels against the current `y_pred`
# (the random-forest training predictions from the previous cell).
confusion_matrix(y_train, y_pred)

array([[116321,  20727],
       [   894,   1034]])

In [None]:
# Writes per-patient predictions under ../data/RF_pred_mean_imputed/ and
# returns the stacked test windows together with their labels.
X_test, y_test = predict(rf_model, 'RF')

In [335]:
# Test-set predictions; the tuple displayed below is (recall, precision).
y_pred = rf_model.predict(X_test)
recall_score(y_test, y_pred), precision_score(y_test, y_pred)

(0.547486033519553, 0.05003403675970047)

In [336]:
# Only the last expression of a cell is displayed, so the AUC has to be
# printed explicitly; otherwise it is computed and silently dropped.
print('ROC AUC:', roc_auc_score(y_test, rf_model.predict_proba(X_test)[:,1]))
confusion_matrix(y_test, y_pred)

array([[31358,  5582],
       [  243,   294]])

## SVM

In [None]:
# SVC lives in sklearn.svm; there is no `sklearn.linear` module.
from sklearn.svm import SVC

## XGBoost (TODO: not yet implemented)

## AdaBoost

In [230]:
from sklearn.ensemble import AdaBoostClassifier
# Pin the seed so repeated runs fit the same ensemble.
# NOTE(review): unlike the two models above, no class re-weighting is passed
# here, which may explain the very low recall in the cells below — confirm.
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

In [226]:
# Score on the same data the model was fitted on, so this is not a held-out estimate.
# NOTE(review): this cell's execution count (226) is lower than that of the fit
# cell above (230), so the recorded output predates the current model — re-run.
adaboost_model.score(X_train, y_train)

0.8549199853408258

In [235]:
# Training-set predictions; the tuple displayed below is (recall, precision).
y_pred = adaboost_model.predict(X_train)
recall_score(y_train, y_pred), precision_score(y_train, y_pred)

(0.018343524180100056, 0.1952662721893491)

In [236]:
# Confusion matrix of the training labels against the current `y_pred`.
# NOTE(review): the recorded output below sums to 130,976 rows, not the
# 138,976 rows reported for X_train earlier, so it appears to come from an
# older run of the notebook — re-run to refresh.
confusion_matrix(y_train, y_pred)

array([[129041,    136],
       [  1766,     33]])

In [229]:
# Writes per-patient predictions under ../data/AB_pred_mean_imputed/ and
# returns the stacked test windows together with their labels.
X_test, y_test = predict(adaboost_model, 'AB')

In [231]:
# `np.float` / `np.int` were removed in NumPy 1.24; they were aliases of the
# builtins, so the builtins give the same dtypes.
X_test = X_test.astype(float)
y_test = y_test.astype(int)

In [232]:
# Test-set predictions; the tuple displayed below is (recall, precision).
y_pred = adaboost_model.predict(X_test)
recall_score(y_test, y_pred), precision_score(y_test, y_pred)

(0.00558659217877095, 0.057692307692307696)

In [233]:
# Only the last expression of a cell is displayed, so the AUC has to be
# printed explicitly; otherwise it is computed and silently dropped.
print('ROC AUC:', roc_auc_score(y_test, adaboost_model.predict_proba(X_test)[:,1]))
confusion_matrix(y_test, y_pred)

array([[36891,    49],
       [  534,     3]])