In [None]:
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sklearn

## Read imputed files and creating training matrix using windows

In [337]:
def create_train_data(window_size=6, impute_method='mean'):
    X_train = []
    y_train = []
    features = None

    i = 0
    for file in os.listdir('../data/train_%s_imputed/'%impute_method):

        # Read file
        patient_df = []
        with open('../data/train_%s_imputed/%s' % (impute_method, file)) as f:

            if not features:
                features = f.readline().rstrip('\n').split('|')
            else:
                # This skips the headers
                f.readline()

            for idx, line in enumerate(f):
                line = line.rstrip('\n')
                patient_df.append(line.split('|'))

        patient_df = np.array(patient_df)
        
        # Get sliding-window data
        window_start = 0
        while (window_start + window_size) <= patient_df.shape[0]:
            window_data = patient_df[window_start:window_start + window_size, 1:37].astype(np.float)
            assert window_data.shape[0] == window_size

            x_i = np.reshape(window_data, (window_size*window_data.shape[1],))
            X_train.append(x_i)

            label = int(patient_df[window_start + window_size - 1, 41])
            y_train.append(label)

            window_start += 1
    
    return np.array(X_train), np.array(y_train), features

In [338]:
X_train, y_train, features = create_train_data()

In [339]:
X_train.shape, y_train.shape

((130976, 216), (130976,))

In [388]:
features[1:37]

['HR',
 'O2Sat',
 'Temp',
 'SBP',
 'MAP',
 'DBP',
 'Resp',
 'EtCO2',
 'BaseExcess',
 'HCO3',
 'FiO2',
 'pH',
 'PaCO2',
 'SaO2',
 'AST',
 'BUN',
 'Alkalinephos',
 'Calcium',
 'Chloride',
 'Creatinine',
 'Bilirubin_direct',
 'Glucose',
 'Lactate',
 'Magnesium',
 'Phosphate',
 'Potassium',
 'Bilirubin_total',
 'TroponinI',
 'Hct',
 'Hgb',
 'PTT',
 'WBC',
 'Fibrinogen',
 'Platelets',
 'Age',
 'Gender']

In [341]:
y_train.sum()

1799

## Train Models and Predict

In [259]:
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score

In [371]:
# Read test files, create windows, and predict. 
def predict(model, model_type, window_size=6, impute_method='mean'):
    X_test = []
    y_test = []
    features = None
    
    for file in os.listdir('../data/test_%s_imputed/'%(impute_method)):
        X = []
        y = []
        
        # Read file 
        patient_df = []
        with open('../data/test_%s_imputed/%s' % (impute_method, file)) as f:

            if not features:
                features = f.readline().rstrip('\n').split('|')
            else:
                # This skips the headers
                f.readline()

            for idx, line in enumerate(f):
                line = line.rstrip('\n') 
                patient_df.append(line.split('|'))
        
        # Create patient df
        patient_df = np.array(patient_df)
        X_patient = np.concatenate((patient_df[:,1:37], patient_df[:,41][:,None]), axis=1).astype(np.float)
        
        assert X_patient.shape[0] == patient_df.shape[0]
        
        # Append 5 rows of mean values to top of df 
        mean_vals = np.mean(X_patient, axis=0)
        X_patient = np.concatenate(([mean_vals]*max(1,window_size-1), X_patient))
    
        # Get data for every window
        window_start = 0
        while (window_start + window_size) <= X_patient.shape[0]:
            window_data = X_patient[window_start:window_start + window_size, 0:X_patient.shape[1]-1]
            assert window_data.shape[0] == window_size

            x_i = np.reshape(window_data, (window_size*window_data.shape[1],))
            X.append(x_i)
            
            label = int(X_patient[window_start+window_size-1, X_patient.shape[1]-1])
            y.append(label)

            window_start += 1
        
        # Add X_test to X
        X_test.extend(list(X))
        y_test.extend(list(y))
        
        # Predict for every time step
        y_pred = model.predict(X)
        y_pred_prob = model.predict_proba(X)
        
        assert y_pred.shape[0] == len(y)
        
        # Save to file
        if not os.path.isdir('../data/model_predictions/%s_pred_%s_imputed_%d'%(model_type, impute_method, window_size)):
            os.mkdir('../data/model_predictions/%s_pred_%s_imputed_%d'%(model_type, impute_method, window_size))
            
        pred = np.transpose(np.vstack((y_pred_prob[:,1], y_pred)))

        pred_df = pd.DataFrame(pred, columns=["PredictedProbability", "PredictedLabel"])
        pred_df.to_csv('../data/model_predictions/%s_pred_%s_imputed_%d/%s'%(model_type, impute_method, window_size, file), sep='|', index=False)
        
    return np.array(X_test), np.array(y_test)

## Regularized Log. Reg. 

In [374]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

lr_model = LogisticRegressionCV(cv=5, solver='lbfgs', max_iter=1000, class_weight='balanced')

# window size, imp method, train acc, train recall, train prec., test acc, test auc, 
# test recall, test prec.
 
lr_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        lr_model = lr_model.fit(X_train, y_train)
        
        res.append(lr_model.score(X_train, y_train))
        y_pred = lr_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(lr_model, 'RLR', window_size=ws, impute_method=imp)
        res.append(lr_model.score(X_test, y_test))
        
        y_pred_prob = lr_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = lr_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        results.append(res)

In [380]:
lr_results = np.array(results)

In [392]:
lr_results.tofile('../results/lr_results', sep='\n')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(max_depth=5, class_weight='balanced')

rf_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        rf_model = rf_model.fit(X_train, y_train)
        
        res.append(rf_model.score(X_train, y_train))
        y_pred = rf_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(rf_model, 'RF', window_size=ws, impute_method=imp)
        res.append(rf_model.score(X_test, y_test))
        
        y_pred_prob = rf_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = rf_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        rf_results.append(res)

In [389]:
rf_results = np.array(rf_results)

In [396]:
rf_results.tofile('../results/rf_results', sep='\n')

## SVM

In [None]:
from sklearn.svm import SVC

svm_model = SVC(class_weight='balanced', max_iter=1000, probability=True)

svm_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        svm_model = svm_model.fit(X_train, y_train)
        
        res.append(svm_model.score(X_train, y_train))
        y_pred = svm_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(svm_model, 'SVM', window_size=ws, impute_method=imp)
        res.append(svm_model.score(X_test, y_test))
        
        y_pred_prob = svm_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = svm_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        svm_results.append(res)

## XG Boost

In [393]:
from sklearn.ensemble import GradientBoostingClassifier

xg_model = GradientBoostingClassifier(max_features='auto')

xg_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        xg_model = xg_model.fit(X_train, y_train)
        
        res.append(xg_model.score(X_train, y_train))
        y_pred = xg_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(xg_model, 'XG', window_size=ws, impute_method=imp)
        res.append(xg_model.score(X_test, y_test))
        
        y_pred_prob = xg_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = xg_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        xg_results.append(res)

Working on ws: 1, imp: mean
Working on ws: 1, imp: forw
Working on ws: 2, imp: mean
Working on ws: 2, imp: forw
Working on ws: 3, imp: mean
Working on ws: 3, imp: forw
Working on ws: 4, imp: mean
Working on ws: 4, imp: forw
Working on ws: 5, imp: mean
Working on ws: 5, imp: forw
Working on ws: 6, imp: mean
Working on ws: 6, imp: forw


In [394]:
xg_results = np.array(xg_results)

In [397]:
xg_results.tofile('../results/xg_results', sep='\n')

In [399]:
xg_results

array([['1', 'mean', '0.986872085629504', '0.05225311601150527',
        '0.956140350877193', '0.9852899134547912', '0.7632749368086555',
        '0.007285974499089253', '0.16'],
       ['1', 'forw', '0.9877530203476049', '0.12607861936720996',
        '0.9100346020761245', '0.9823790836083894', '0.7116494381646905',
        '0.0', '0.0'],
       ['2', 'mean', '0.9871543653385587', '0.0803921568627451',
        '0.9318181818181818', '0.9846572564506231', '0.7707686176889597',
        '0.0037243947858473', '0.047619047619047616'],
       ['2', 'forw', '0.9875966144132375', '0.11372549019607843',
        '0.9392712550607287', '0.9812151452891107', '0.7286990630535802',
        '0.0037243947858473', '0.011695906432748537'],
       ['3', 'mean', '0.9871936548791406', '0.08651911468812877',
        '0.9197860962566845', '0.9843904261280252', '0.7788484320539926',
        '0.00186219739292365', '0.02'],
       ['3', 'forw', '0.9880259623992838', '0.14738430583501005',
        '0.945161290322

# AdaBoost

In [395]:
from sklearn.ensemble import AdaBoostClassifier

ab_model = AdaBoostClassifier(n_estimators=100)

ab_results = []

for ws in range(1,7):
    for imp in ['mean', 'forw']:
        res = [ws, imp]
        print("Working on ws: %d, imp: %s"%(ws,imp))
        
        X_train, y_train, _ = create_train_data(window_size=ws, impute_method=imp)
        ab_model = ab_model.fit(X_train, y_train)
        
        res.append(ab_model.score(X_train, y_train))
        y_pred = ab_model.predict(X_train)
        res.extend([recall_score(y_train, y_pred), precision_score(y_train, y_pred)])
        
        X_test, y_test = predict(ab_model, 'AB', window_size=ws, impute_method=imp)
        res.append(ab_model.score(X_test, y_test))
        
        y_pred_prob = ab_model.predict_proba(X_test)
        res.append(roc_auc_score(y_test, y_pred_prob[:,1]))
        
        y_pred = ab_model.predict(X_test)
        res.extend([recall_score(y_test, y_pred), precision_score(y_test, y_pred)])
        
        ab_results.append(res)

ab_results = np.array(ab_results)

Working on ws: 1, imp: mean
Working on ws: 1, imp: forw
Working on ws: 2, imp: mean
Working on ws: 2, imp: forw
Working on ws: 3, imp: mean
Working on ws: 3, imp: forw
Working on ws: 4, imp: mean
Working on ws: 4, imp: forw
Working on ws: 5, imp: mean
Working on ws: 5, imp: forw
Working on ws: 6, imp: mean
Working on ws: 6, imp: forw


In [398]:
ab_results.tofile('../results/ab_results', sep='\n')

In [400]:
ab_results

array([['1', 'mean', '0.9860706337431114', '0.004314477468839885',
        '0.2571428571428571', '0.9853938716635913', '0.7433725448160046',
        '0.0018214936247723133', '0.06666666666666667'],
       ['1', 'forw', '0.9860573866044934', '0.003835091083413231',
        '0.22857142857142856', '0.9835486134573901',
        '0.6740974606665338', '0.020036429872495445',
        '0.10377358490566038'],
       ['2', 'mean', '0.9858208142826039', '0.011274509803921568',
        '0.25555555555555554', '0.9851375510312992', '0.762122784040555',
        '0.00931098696461825', '0.16666666666666666'],
       ['2', 'forw', '0.9859909100805574', '0.003431372549019608',
        '0.21212121212121213', '0.9828961763214772',
        '0.6838627539348625', '0.027932960893854747',
        '0.11194029850746269'],
       ['3', 'mean', '0.9856549350940018', '0.012575452716297788',
        '0.22123893805309736', '0.9850041358700002',
        '0.7446418219085961', '0.00931098696461825',
        '0.1428571428