In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

In [2]:
data_age = pd.read_csv("id_age_train.csv", sep=",")
data_vitals = pd.read_csv("id_time_vitals_train.csv", sep=",")
data_labels = pd.read_csv("id_label_train.csv", sep=",").set_index('ID')
data_labs = pd.read_csv("id_time_labs_train.csv", sep = ",")
data_timeseries = pd.merge(data_labs, data_vitals)
patients = data_timeseries['ID']
patient_ids = list(data_timeseries['ID'].unique())
labels = np.zeros(len(patients))
for p in patient_ids:
    if int(data_labels.ix[p]['LABEL'])==1:
        data_id = data_timeseries[data_timeseries['ID']==p]
        indices = data_id[data_id['ICU']==1].index
        num = len(indices)
        for i in indices:
            labels[i]=1
data_timeseries['LABEL'] = labels
data = pd.merge( data_age, data_timeseries, on='ID')

In [3]:
data['V6'] = data['V6'].apply(lambda x: 80 if x<80 else 112 if x>112 else x)
data['V5'] = data['V5'].apply(lambda x: 100 if x>100 else x if x>0 else np.nan )
data['V4'] = data['V4'].apply(lambda x: x if x>0 else np.nan )
data['V3'] = data['V3'].apply(lambda x: x if (x>30 and x<220) else np.nan )
data['V2'] = data['V2'].apply(lambda x: x if (x>15 and x<200) else np.nan )
data['V1'] = data['V1'].apply(lambda x: x if (x>30 and x<300) else np.nan )
data['L1'] = data['L1'].apply(lambda x: x if (x>0 and x<14) else np.nan)
data['L2'] = data['L2'].apply(lambda x: 132 if x>132 else x if x>0 else np.nan )
data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']] = data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']].applymap(lambda x: x if x>0 else np.nan )
data['L7'] = data['L7'].apply(lambda x: x if x<700 else 700 if x>700 else np.nan)
data['L8'] = data['L8'].apply(lambda x: x if x<200 else 200 if x>200 else np.nan)
data['L9'] = data['L9'].apply(lambda x: x if x<100 else x/1000)
data['L10'] = data['L10'].apply(lambda x: x if (x>0 and x<100) else np.nan)
data['L11'] = data['L11'].apply(lambda x: x if (x>0 and x<2000) else 2000 if x>2000 else np.nan)
data['L12'] = data['L12'].apply(lambda x: x if x<5 else 5+(x-5)/10)
data['L15'] =data['L15'].apply(lambda x: x if (x>0 and x<20) else 20 if x>20 else np.nan)
data['L18'] =data['L18'].apply(lambda x: x if x<1000 else 1000 if x>1000 else np.nan)
data['L19'] =data['L19'].apply(lambda x: x if x<800 else 800 if x>800 else np.nan)
data['L20'] =data['L20'].apply(lambda x: x/100 if x>1000 else x if x>0 else np.nan)
data['L23'] =data['L23'].apply(lambda x: 3000 if x>3000 else x if x>0 else -1*x)



In [4]:
print "Preprocessing"
data_grouped = data.groupby(['ID','TIME']).mean()
data_nonicu = data[data['ICU']==0]
data_icu = data[data['ICU']==1]
normal = {attribute: data[attribute].mean() for attribute in data.columns}
patient_ids = data['ID'].unique()
for patient in patient_ids:
    data_grouped.loc[patient, 0] = data_grouped.loc[patient, 0].fillna(normal)
data_filled = data_grouped.reset_index()#.fillna(method='ffill')
data_np = data_filled.as_matrix()
print "...done"

Preprocessing
...done


In [5]:
data_without_labels = data_filled[data_filled.columns[:-2]]
data_by_id = data_without_labels.groupby('ID')
timecolumns = data_filled.columns[3:-2]
data_medians = data_by_id.apply(pd.expanding_median)[timecolumns].rename(columns=lambda x: 'MEDIAN_'+x)
data_stds = data_by_id.apply(pd.expanding_std).fillna(0)[timecolumns].rename(columns=lambda x: 'STD_'+x)
data_mins = data_by_id.apply(pd.expanding_min).fillna(0)[timecolumns].rename(columns=lambda x: 'MIN_'+x)
data_maxs = data_by_id.apply(pd.expanding_max)[timecolumns].rename(columns=lambda x: 'MAX_'+x)
data_counts = data_by_id.apply(pd.expanding_count)[timecolumns].rename(columns=lambda x: 'COUNT_'+x)

In [6]:
data_stats_full = pd.concat([data_without_labels[['ID','TIME','AGE']],data_medians, data_stds, data_mins, data_maxs, data_counts, data_filled[['ICU','LABEL']]], axis = 1)
data_stats_np = data_stats_full.as_matrix()

In [16]:
%xdel data_stds

###Preparing Validation Features
Delete Cached dataframes before proceeding

In [17]:
data_age = pd.read_csv("id_age_val.csv", sep=",")
data_vitals = pd.read_csv("id_time_vitals_val.csv", sep=",")
data_labels = pd.read_csv("id_label_val.csv", sep=",", header = None, names = ['ID','LABEL']).set_index('ID')
data_labs = pd.read_csv("id_time_labs_val.csv", sep = ",")
data_timeseries = pd.merge(data_labs, data_vitals)
patients = data_timeseries['ID']
patient_ids = list(data_timeseries['ID'].unique())
labels = np.zeros(len(patients))
#for p in patient_ids:
#    if int(data_labels.ix[p]['LABEL'])==1:
#        data_id = data_timeseries[data_timeseries['ID']==p]
#        indices = data_id[data_id['ICU']==1].index
#        num = len(indices)
#        for i in indices:
#            labels[i]=1
#data_timeseries['LABEL'] = labels
data = pd.merge( data_age, data_timeseries, on='ID')

In [18]:
data['V6'] = data['V6'].apply(lambda x: 80 if x<80 else 112 if x>112 else x)
data['V5'] = data['V5'].apply(lambda x: 100 if x>100 else x if x>0 else np.nan )
data['V4'] = data['V4'].apply(lambda x: x if x>0 else np.nan )
data['V3'] = data['V3'].apply(lambda x: x if (x>30 and x<220) else np.nan )
data['V2'] = data['V2'].apply(lambda x: x if (x>15 and x<200) else np.nan )
data['V1'] = data['V1'].apply(lambda x: x if (x>30 and x<300) else np.nan )
data['L1'] = data['L1'].apply(lambda x: x if (x>0 and x<14) else np.nan)
data['L2'] = data['L2'].apply(lambda x: 132 if x>132 else x if x>0 else np.nan )
#data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']] = data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']].applymap(lambda x: x if x>0 else np.nan )
data['L7'] = data['L7'].apply(lambda x: x if x<700 else 700 if x>700 else np.nan)
data['L8'] = data['L8'].apply(lambda x: x if x<200 else 200 if x>200 else np.nan)
data['L9'] = data['L9'].apply(lambda x: x if x<100 else x/1000)
data['L10'] = data['L10'].apply(lambda x: x if (x>0 and x<100) else np.nan)
data['L11'] = data['L11'].apply(lambda x: x if (x>0 and x<2000) else 2000 if x>2000 else np.nan)
data['L12'] = data['L12'].apply(lambda x: x if x<5 else 5+(x-5)/10)
data['L15'] =data['L15'].apply(lambda x: x if (x>0 and x<20) else 20 if x>20 else np.nan)
data['L18'] =data['L18'].apply(lambda x: x if x<1000 else 1000 if x>1000 else np.nan)
data['L19'] =data['L19'].apply(lambda x: x if x<800 else 800 if x>800 else np.nan)
data['L20'] =data['L20'].apply(lambda x: x/100 if x>1000 else x if x>0 else np.nan)
data['L23'] =data['L23'].apply(lambda x: 3000 if x>3000 else x if x>0 else -1*x)



In [19]:
data_grouped = data.groupby(['ID','TIME']).mean()
for patient in patient_ids:
    data_grouped.loc[patient, 0] = data_grouped.loc[patient, 0].fillna(normal)
data_filled = data_grouped.reset_index()#.fillna(method='ffill')

In [20]:
data_by_id = data_filled.groupby('ID')
timecolumns = data_filled.columns[3:-1]
data_medians = data_by_id.apply(pd.expanding_median)[timecolumns].rename(columns=lambda x: 'MEDIAN_'+x)
data_stds = data_by_id.apply(pd.expanding_std).fillna(0)[timecolumns].rename(columns=lambda x: 'STD_'+x)
data_mins = data_by_id.apply(pd.expanding_min).fillna(0)[timecolumns].rename(columns=lambda x: 'MIN_'+x)
data_maxs = data_by_id.apply(pd.expanding_max)[timecolumns].rename(columns=lambda x: 'MAX_'+x)
data_counts = data_by_id.apply(pd.expanding_count)[timecolumns].rename(columns=lambda x: 'COUNT_'+x)

In [21]:
val_stats_full = pd.concat([data_filled[['ID','TIME','AGE']],data_medians, data_stds, data_mins, data_maxs, data_counts, data_filled['ICU']], axis = 1)
val_stats_np = data_stats_full.as_matrix()

###Preprocessed Validation set
Deleted cached variables

In [22]:
train_feats = data_stats_np[:,2:-2]
features= []
for k in xrange(60,90,5):
    kbest = SelectKBest(k=k)
    kbest.fit(train_feats,data_stats_np[:,-1])
    kfeatures= data_stats_full.columns[2:-2][kbest.get_support()]
    kfeaturesfull = [['ID', 'TIME'],kfeatures,['ICU','LABEL']]
    kfeaturesfullval = [['ID', 'TIME'], kfeatures,['ICU']]
    kfeaturesfull = [item for sublist in kfeaturesfull for item in sublist]
    kfeaturesfullval = [item for sublist in kfeaturesfullval for item in sublist]
    features.append((k,kfeaturesfull,kfeaturesfullval))

In [None]:
features[7][1]

In [None]:
from sklearn.utils import compute_class_weight
from sklearn.naive_bayes import GaussianNB
class_weights = compute_class_weight('auto', [0,1], data_stats_np[:,-1])
class_weight_dictionary = {0:0.35, 1:0.65}
class_weight_dictionary2 = {0:0.25, 1:0.75}
class_weight_dictionary3 = {0:0.2, 1:0.8}
class_weight_dictionary4 = {0: 0.3, 1:0.7}

In [None]:
data_labels = pd.read_csv("id_label_val.csv", header = None, names = ['ID','LABEL'],sep=",").set_index('ID')
#for p in data_stats_full['ID'].unique():
#    if data_labels.ix[p]['LABEL']==1:
#        data_stats_full = data_stats_full.append(data_stats_full[data_stats_full.ID==p])

In [None]:
for triple in features[7:8]:
    train_stats = data_stats_full[triple[1]].as_matrix()
    val_stats = val_stats_full[triple[2]]
    random = [41,np.random.randint(1,150),np.random.randint(1,150),np.random.randint(1,150)]
    print random
    clf1 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary3)
    clf2 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary4)
    #clf3 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary2)
    clf3 = MultinomialNB(fit_prior = True, class_prior= [0.3,0.7])
#    clf4 = MultinomialNB(fit_prior = True)
    clf5 = SGDClassifier(loss = 'log',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary)
    
    clfs = [clf1,clf2,clf5,clf4]
    clf4.partial_fit(train_stats[:,2:-2],data_stats_full.as_matrix()[:,-1],[0,1])
#        clf4.partial_fit(train_stats[:,2:-2],data_stats_np[:,-1],[0,1])

    for clf in clfs[:-1]:
        clf.fit(train_stats[:,2:-2],data_stats_full.as_matrix()[:,-1])
    print 'Testing'
    test_feats_grouped = val_stats.set_index('ID')
    test_ids = test_feats_grouped.index.unique()
    final_preds = [[],[],[],[],[]]
    final_answers = []
    for id in test_ids:
        test_id = test_feats_grouped.ix[id]
        test_id_np = test_id.as_matrix()
        test_id_np = np.atleast_2d(test_id_np)
        icu_indices = np.nonzero(test_id_np[:,-1])[0]
        prev_prediction = 0
        final_answers.append(data_labels.ix[int(id)]['LABEL'])
        flag = [0,0,0,0,0]
        final=[0,0,0,0,0]
        icui=0
        for ind in icu_indices:
            partial_data_feats = test_id_np[:ind+1,1:-1]
            if icui==0 and ind>0:
                for clf in clfs:
                    clf.partial_fit(partial_data_feats[:-1,:], np.zeros((partial_data_feats[:-1,:].shape[0],)))
            if icui>0:
                if ind - icu_indices[icui-1]>1 :
                    for i in range(0,len(clfs)):
                        results = np.empty((partial_data_feats[icu_indices[icui-1]:ind,:].shape[0],))
                        results.fill(int(prev_prediction))
                        clfs[i].partial_fit(partial_data_feats[icu_indices[icui-1]:ind,:], results)
                else:
                    for i in range(0, len(clfs)):
                        prev_icu = np.atleast_2d(partial_data_feats[icu_indices[icui-1],:])
                        clfs[i].partial_fit(prev_icu, prev_prediction)
            for i in range(0,len(clfs)):
                prediction = clfs[i].predict(partial_data_feats[-1,:])
                prev_prediction = prediction
                if flag[i]==0 and int(prediction[0])==1:
                    final[i]=1
                    flag[i] +=1
            icui+=1
        for i in range(0,len(clfs)):
            final_preds[i].append(final[i])
    
        
    print '...done'
    print'Scoring'
    for i in range(0,len(clfs)):
        cm = confusion_matrix(final_answers, final_preds[i])
        TN = cm[0][0]
        FP = cm[0][1]
        FN = cm[1][0]
        TP = cm[1][1]
        specificity = float(TN)/(TN+FP)
        sensitivity = float(TP)/(TP+FN)
        print 'K: ' + str(triple[0]) + '___spec=' + str(specificity) + '___sens= ' + str(sensitivity) + '___for clf' + str(i)
    bagged_clf_preds = [1 if final_preds[1][x] + final_preds[2][x] + final_preds[0][x] + 2*final_preds[3][x]>2 else 0 for x in range(0,len(final_preds[0]))]
    cm = confusion_matrix(final_answers,bagged_clf_preds)
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    specificity = float(TN)/(TN+FP)
    sensitivity = float(TP)/(TP+FN)
    print 'K: ' + str(triple[0]) + '___spec=' + str(specificity) + '___sens= ' + str(sensitivity) + '___for bagged clf'
    print '...done'

    clf1 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary3)
    clf2 = SGDClassifier(loss = 'log',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary2)
    clf3 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary)
    
[11, 129, 85]
Testing
...done
Scoring
K: 30___spec=0.999103942652___sens= 0.0365853658537___for clf0
K: 30___spec=1.0___sens= 0.0___for clf1
K: 30___spec=0.996415770609___sens= 0.0609756097561___for clf2
K: 30___spec=0.999103942652___sens= 0.0365853658537___for bagged clf
...done
[7, 109, 124]
Testing
...done
Scoring
K: 35___spec=0.996415770609___sens= 0.0487804878049___for clf0
K: 35___spec=1.0___sens= 0.0243902439024___for clf1
K: 35___spec=0.979390681004___sens= 0.146341463415___for clf2
K: 35___spec=0.996415770609___sens= 0.0609756097561___for bagged clf
...done
[58, 17, 85]
Testing
...done
Scoring
K: 40___spec=0.998207885305___sens= 0.0365853658537___for clf0
K: 40___spec=1.0___sens= 0.0___for clf1
K: 40___spec=0.994623655914___sens= 0.19512195122___for clf2
K: 40___spec=0.999103942652___sens= 0.0365853658537___for bagged clf
...done
[89, 113, 18]
Testing
...done
Scoring
K: 45___spec=0.994623655914___sens= 0.158536585366___for clf0
K: 45___spec=1.0___sens= 0.0487804878049___for clf1
K: 45___spec=0.990143369176___sens= 0.182926829268___for clf2
K: 45___spec=0.996415770609___sens= 0.134146341463___for bagged clf
...done
[34, 58, 27]
Testing
...done
Scoring
K: 50___spec=0.998207885305___sens= 0.0365853658537___for clf0
K: 50___spec=1.0___sens= 0.0121951219512___for clf1
K: 50___spec=0.991039426523___sens= 0.0975609756098___for clf2
K: 50___spec=0.998207885305___sens= 0.0365853658537___for bagged clf
...done
[128, 148, 117]
Testing
...done
Scoring
K: 55___spec=0.996415770609___sens= 0.0487804878049___for clf0
K: 55___spec=0.999103942652___sens= 0.0243902439024___for clf1
K: 55___spec=0.989247311828___sens= 0.121951219512___for clf2
K: 55___spec=0.997311827957___sens= 0.0487804878049___for bagged clf
...done
[82, 109, 54]
Testing
...done
Scoring
K: 60___spec=0.993727598566___sens= 0.0731707317073___for clf0
K: 60___spec=1.0___sens= 0.0___for clf1
K: 60___spec=0.991039426523___sens= 0.19512195122___for clf2
K: 60___spec=0.998207885305___sens= 0.0609756097561___for bagged clf
...done
[41, 91, 145]
Testing
...done
Scoring
K: 65___spec=0.992831541219___sens= 0.182926829268___for clf0
K: 65___spec=1.0___sens= 0.0365853658537___for clf1
K: 65___spec=0.987455197133___sens= 0.219512195122___for clf2
K: 65___spec=0.995519713262___sens= 0.170731707317___for bagged clf
...done
[69, 41, 50]
Testing
...done
Scoring
K: 70___spec=0.999103942652___sens= 0.0487804878049___for clf0
K: 70___spec=0.997311827957___sens= 0.0609756097561___for clf1
K: 70___spec=0.977598566308___sens= 0.19512195122___for clf2
K: 70___spec=0.997311827957___sens= 0.0487804878049___for bagged clf
...done
[96, 124, 61]
Testing
...done
Scoring
K: 75___spec=0.991935483871___sens= 0.109756097561___for clf0
K: 75___spec=0.991935483871___sens= 0.109756097561___for clf1
K: 75___spec=0.990143369176___sens= 0.19512195122___for clf2
K: 75___spec=0.992831541219___sens= 0.109756097561___for bagged clf
...done
[10, 23, 57]
Testing
...done
Scoring
K: 80___spec=0.981182795699___sens= 0.170731707317___for clf0
K: 80___spec=0.995519713262___sens= 0.0853658536585___for clf1
K: 80___spec=0.974014336918___sens= 0.19512195122___for clf2
K: 80___spec=0.98476702509___sens= 0.134146341463___for bagged clf
...done
[139, 103, 85]
Testing
...done
Scoring
K: 85___spec=0.995519713262___sens= 0.109756097561___for clf0
K: 85___spec=0.995519713262___sens= 0.0853658536585___for clf1
K: 85___spec=0.987455197133___sens= 0.182926829268___for clf2
K: 85___spec=0.995519713262___sens= 0.121951219512___for bagged clf
...done
[16, 79, 81]
Testing
...done
Scoring
K: 90___spec=0.991039426523___sens= 0.170731707317___for clf0
K: 90___spec=0.986559139785___sens= 0.146341463415___for clf1
K: 90___spec=0.976702508961___sens= 0.243902439024___for clf2
K: 90___spec=0.987455197133___sens= 0.170731707317___for bagged clf
...done
[28, 54, 114]
Testing
...done
Scoring
K: 95___spec=0.994623655914___sens= 0.134146341463___for clf0
K: 95___spec=0.998207885305___sens= 0.0853658536585___for clf1
K: 95___spec=0.986559139785___sens= 0.182926829268___for clf2
K: 95___spec=0.996415770609___sens= 0.109756097561___for bagged clf
...done

    clf = SGDClassifier(loss = 'log',warm_start=True, penalty='elasticnet',n_iter=30)

Testing
...done
Scoring
K: 30___spec=0.996415770609___sens= 0.0731707317073
Testing
...done
Scoring
K: 35___spec=0.996415770609___sens= 0.0487804878049
Testing
...done
Scoring
K: 40___spec=0.991935483871___sens= 0.0609756097561
Testing
...done
Scoring
K: 45___spec=0.991935483871___sens= 0.0975609756098
Testing
...done
Scoring
K: 50___spec=0.997311827957___sens= 0.146341463415
Testing
...done
Scoring
K: 55___spec=0.995519713262___sens= 0.0731707317073
Testing
...done
Scoring
K: 60___spec=0.997311827957___sens= 0.134146341463
Testing
...done
Scoring
K: 65___spec=0.998207885305___sens= 0.0487804878049
Testing
...done
Scoring
K: 70___spec=0.992831541219___sens= 0.0975609756098
Testing
...done
Scoring
K: 75___spec=0.986559139785___sens= 0.170731707317
Testing
...done
Scoring
K: 80___spec=0.974910394265___sens= 0.30487804878
Testing
...done
Scoring
K: 85___spec=0.990143369176___sens= 0.121951219512
Testing
...done
Scoring
K: 90___spec=0.974014336918___sens= 0.207317073171
Testing
...done
Scoring
K: 95___spec=0.982974910394___sens= 0.170731707317
...done

    clf = SGDClassifier(loss = 'log',warm_start=True, penalty='elasticnet',n_iter=30, l1_ratio=0.2)

Testing
...done
Scoring
K: 30___spec=0.999103942652___sens= 0.0243902439024
Testing
...done
Scoring
K: 35___spec=0.993727598566___sens= 0.0853658536585
Testing
...done
Scoring
K: 40___spec=0.999103942652___sens= 0.121951219512
Testing
...done
Scoring
K: 45___spec=0.996415770609___sens= 0.0487804878049
Testing
...done
Scoring
K: 50___spec=0.995519713262___sens= 0.0487804878049
Testing
...done
Scoring
K: 55___spec=1.0___sens= 0.0487804878049
Testing
...done
Scoring
K: 60___spec=0.998207885305___sens= 0.0609756097561
Testing
...done
Scoring
K: 65___spec=0.999103942652___sens= 0.0365853658537
Testing
...done
Scoring
K: 70___spec=0.998207885305___sens= 0.121951219512
Testing
...done
Scoring
K: 75___spec=0.997311827957___sens= 0.121951219512
Testing
...done
Scoring
K: 80___spec=0.982078853047___sens= 0.170731707317
Testing
...done
Scoring
K: 85___spec=0.994623655914___sens= 0.109756097561
Testing
...done
Scoring
K: 90___spec=0.997311827957___sens= 0.0487804878049
Testing
...done
Scoring
K: 95___spec=0.992831541219___sens= 0.121951219512
...done

    clf = SGDClassifier(loss = 'log',warm_start=True, penalty='elasticnet',n_iter=30, l1_ratio=0.25)

Testing
...done
Scoring
K: 30___spec=1.0___sens= 0.0121951219512
Testing
...done
Scoring
K: 35___spec=0.999103942652___sens= 0.0487804878049
Testing
...done
Scoring
K: 40___spec=1.0___sens= 0.0609756097561
Testing
...done
Scoring
K: 45___spec=1.0___sens= 0.0
Testing
...done
Scoring
K: 50___spec=0.991935483871___sens= 0.0731707317073
Testing
...done
Scoring
K: 55___spec=0.993727598566___sens= 0.0731707317073
Testing
...done
Scoring
K: 60___spec=0.994623655914___sens= 0.0975609756098
Testing
...done
Scoring
K: 65___spec=0.997311827957___sens= 0.0487804878049
Testing
...done
Scoring
K: 70___spec=0.999103942652___sens= 0.0365853658537
Testing
...done
Scoring
K: 75___spec=0.993727598566___sens= 0.0853658536585
Testing
...done
Scoring
K: 80___spec=0.987455197133___sens= 0.121951219512
Testing
...done
Scoring
K: 85___spec=0.98835125448___sens= 0.146341463415
Testing
...done
Scoring
K: 90___spec=0.975806451613___sens= 0.268292682927
Testing
...done
Scoring
K: 95___spec=0.986559139785___sens= 0.121951219512
...done

    clf = SGDClassifier(loss = 'hinge_squared',warm_start=True, penalty='elasticnet',n_iter=30, l1_ratio=0.25)

Testing
...done
Scoring
K: 30___spec=0.993727598566___sens= 0.0731707317073
Testing
...done
Scoring
K: 35___spec=0.998207885305___sens= 0.0365853658537
Testing
...done
Scoring
K: 40___spec=0.999103942652___sens= 0.0365853658537
Testing
...done
Scoring
K: 45___spec=0.999103942652___sens= 0.0975609756098
Testing
...done
Scoring
K: 50___spec=0.987455197133___sens= 0.134146341463
Testing
...done
Scoring
K: 55___spec=0.991039426523___sens= 0.0975609756098
Testing
...done
Scoring
K: 60___spec=0.998207885305___sens= 0.0609756097561
Testing
...done
Scoring
K: 65___spec=0.997311827957___sens= 0.0609756097561
Testing
...done
Scoring
K: 70___spec=0.996415770609___sens= 0.0731707317073
Testing
...done
Scoring
K: 75___spec=0.998207885305___sens= 0.146341463415
Testing
...done
Scoring
K: 80___spec=0.974910394265___sens= 0.207317073171
Testing
...done
Scoring
K: 85___spec=0.990143369176___sens= 0.121951219512
Testing
...done
Scoring
K: 90___spec=0.996415770609___sens= 0.0975609756098
Testing
...done
Scoring
K: 95___spec=0.985663082437___sens= 0.219512195122
...done

In [None]:
testvitals = 'id_time_vitals_test.csv'
testlabs = 'id_time_labs_test.csv'
testage = 'id_age_test.csv'

In [None]:
test_data = read_test(testvitals,testlabs,testage)

In [None]:
test_data_p = preprocess(test_data,normal)

In [None]:
def preprocess(data, normal):
    data['V6'] = data['V6'].apply(lambda x: 80 if x<80 else 112 if x>112 else x)
    data['V5'] = data['V5'].apply(lambda x: 100 if x>100 else x if x>0 else np.nan )
    data['V4'] = data['V4'].apply(lambda x: x if x>0 else np.nan )
    data['V3'] = data['V3'].apply(lambda x: x if (x>30 and x<220) else np.nan )
    data['V2'] = data['V2'].apply(lambda x: x if (x>15 and x<200) else np.nan )
    data['V1'] = data['V1'].apply(lambda x: x if (x>30 and x<300) else np.nan )
    data['L1'] = data['L1'].apply(lambda x: x if (x>0 and x<14) else np.nan)
    data['L2'] = data['L2'].apply(lambda x: 132 if x>132 else x if x>0 else np.nan )
    data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']] = data[['L3', 'L4', 'L5', 'L6', 'L16','L17', 'L13', 'L14', 'L21', 'L22', 'L24', 'L25']].applymap(lambda x: x if x>0 else np.nan )
    data['L7'] = data['L7'].apply(lambda x: x if x<700 else 700 if x>700 else np.nan)
    data['L8'] = data['L8'].apply(lambda x: x if x<200 else 200 if x>200 else np.nan)
    data['L9'] = data['L9'].apply(lambda x: x if x<100 else x/1000)
    data['L10'] = data['L10'].apply(lambda x: x if (x>0 and x<100) else np.nan)
    data['L11'] = data['L11'].apply(lambda x: x if (x>0 and x<2000) else 2000 if x>2000 else np.nan)
    data['L12'] = data['L12'].apply(lambda x: x if x<5 else 5+(x-5)/10)
    data['L15'] =data['L15'].apply(lambda x: x if (x>0 and x<20) else 20 if x>20 else np.nan)
    data['L18'] =data['L18'].apply(lambda x: x if x<1000 else 1000 if x>1000 else np.nan)
    data['L19'] =data['L19'].apply(lambda x: x if x<800 else 800 if x>800 else np.nan)
    data['L20'] =data['L20'].apply(lambda x: x/100 if x>1000 else x if x>0 else np.nan)
    data['L23'] =data['L23'].apply(lambda x: 3000 if x>3000 else x if x>0 else -1*x)
    data_grouped = data.groupby(['ID','TIME']).mean()
    patient_ids = data['ID'].unique()
    for patient in patient_ids:
        data_grouped.loc[patient, 0] = data_grouped.loc[patient, 0].fillna(normal)
    data_p = data_grouped.reset_index()
    return data_p

In [None]:
def read_test(testvitals, testlabs, testage):
    data_age_test = pd.read_csv(testage, sep=",")
    data_vitals_test = pd.read_csv(testvitals, sep=",")
    data_labs_test = pd.read_csv(testlabs, sep = ",")
    data_timeseries_test = pd.merge(data_labs_test, data_vitals_test)
    #patient_ids_test = list(data_timeseries_test['ID'].unique())
    data_test = pd.merge( data_age_test, data_timeseries_test, on='ID')
    return data_test

In [None]:
test_feat_filled.head()

In [None]:
features[7][2]

In [None]:
test_feat_filled = make_features_test(test_data_p,features[7][2])

In [None]:
def make_features_test(data_p,k_features):
    data_without_labels = data_p[data_p.columns[:-1]]
    data_by_id =  data_without_labels.groupby('ID')
    timecolumns = data_p.columns[3:-1]
    data_medians = data_by_id.apply(pd.expanding_median)[timecolumns].rename(columns=lambda x: 'MEDIAN_'+x)
    data_stds = data_by_id.apply(pd.expanding_std).fillna(0)[timecolumns].rename(columns=lambda x: 'STD_'+x)
    data_mins = data_by_id.apply(pd.expanding_min)[timecolumns].rename(columns=lambda x: 'MIN_'+x)
    data_maxs = data_by_id.apply(pd.expanding_max)[timecolumns].rename(columns=lambda x: 'MAX_'+x)
    data_counts = data_by_id.apply(pd.expanding_count)[timecolumns].rename(columns=lambda x: 'COUNT_'+x)
    data_with_stats = pd.concat([data_medians, data_stds, data_mins, data_maxs, data_counts], axis = 1)
    data_subset_full = pd.concat([data_p[['ID','TIME','AGE']], data_with_stats,data_p[['ICU']]], axis=1)
    data_subset = data_subset_full[k_features]
    
    return data_subset

In [None]:
final_test_preds = predict(test_feat_filled)

In [None]:
import csv
with open('output.csv', 'w') as f:
    writer = csv.writer(f, delimiter=',')
    for pred in final_test_preds:
        writer.writerow([pred[0], pred[1], pred[2]])
print "...done"

In [None]:
test_labels = pd.read_csv('output.csv',header = None, names = ['ID','TIME','LABEL'])

In [None]:
grouped_labels = test_labels.groupby('ID').sum().reset_index()
len(grouped_labels[grouped_labels.LABEL>0])

In [None]:
def predict(data_test):
    train_stats = data_stats_full[triple[1]].as_matrix()
    val_stats = val_stats_full[triple[2]]
    random = [41,np.random.randint(1,150),np.random.randint(1,150),np.random.randint(1,150)]
    print random
    clf1 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary3)
    clf2 = SGDClassifier(loss = 'squared_hinge',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary4)
    clf3 = SGDClassifier(loss = 'log',warm_start=True, random_state = random[0],penalty='elasticnet',n_iter=30, l1_ratio=0.20, class_weight=class_weight_dictionary)
    clf4 = MultinomialNB(fit_prior = True, class_prior= [0.3,0.7])
    clfs = [clf1,clf2,clf3,clf4]
    clf4.partial_fit(train_stats[:,2:-2],data_stats_full.as_matrix()[:,-1],[0,1])
    for clf in clfs[:-1]:
        clf.fit(train_stats[:,2:-2],data_stats_full.as_matrix()[:,-1])
    test_feats_grouped = data_test.set_index('ID')
    test_ids = test_feats_grouped.index.unique()
    final_predictions = []
    for id in test_ids:
        test_id = test_feats_grouped.ix[id]
        test_id_np = np.atleast_2d(test_id.as_matrix())
        icu_indices = np.nonzero(test_id_np[:,-1])[0]
        prev_prediction = [0,0,0,0]
        icui=0
        for ind in icu_indices:
            partial_data_feats = test_id_np[:ind+1,1:-1]
            if icui==0 and ind>0:
                for clf in clfs:
                    clf.partial_fit(partial_data_feats[:-1,:], np.zeros((partial_data_feats[:-1,:].shape[0],)))
            if icui>0:
                for i in range(0,len(clfs)): 
                    if ind - icu_indices[icui-1]>1:
                        results = np.empty((partial_data_feats[icu_indices[icui-1]:ind,:].shape[0],))
                        results.fill(int(prev_prediction[i]))
                        clfs[i].partial_fit(partial_data_feats[icu_indices[icui-1]:ind,:], results)
                    else:
                        prev_icu = np.atleast_2d(partial_data_feats[icu_indices[icui-1],:])
                        clfs[i].partial_fit(prev_icu, np.atleast_1d(prev_prediction[i]))
            bagged_prediction = 0
            for i in range(0,len(clfs)-1):
                prediction = int(clfs[i].predict(partial_data_feats[-1,:])[0])
                prev_prediction[i] = prediction
                bagged_prediction += prediction
            prediction = int(clf4.predict(partial_data_feats[-1,:])[0])
            prev_prediction[3]=prediction
            bagged_prediction += 2*prediction
            if bagged_prediction >2:
                bagged_final_prediction = 1
            else:
                bagged_final_prediction = 0
            final_predictions.append((int(id), int(test_id_np[ind, 0]), int(bagged_final_prediction)))
            icui+=1
    return final_predictions

In [None]:
a = np.array([0])

In [None]:
a