In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

In [2]:
# Load the four training tables and build one row-per-measurement frame `data`
# carrying a per-row LABEL column.
data_age = pd.read_csv("id_age_train.csv", sep=",")
data_vitals = pd.read_csv("id_time_vitals_train.csv", sep=",")
data_labels = pd.read_csv("id_label_train.csv", sep=",").set_index('ID')
data_labs = pd.read_csv("id_time_labs_train.csv", sep=",")
# No `on=` is given, so pandas joins labs and vitals on every column name the
# two tables share.
data_timeseries = pd.merge(data_labs, data_vitals)
patients = data_timeseries['ID']
patient_ids = list(data_timeseries['ID'].unique())

# Every patient in the time series must have a label; fail loudly (KeyError,
# as the per-patient lookup used to) instead of silently labelling them 0.
unlabelled_ids = [p for p in patient_ids if p not in data_labels.index]
if unlabelled_ids:
    raise KeyError("no LABEL found for patient IDs: %r" % (unlabelled_ids,))

# A row is positive when its patient's LABEL is 1 AND the row's ICU flag is 1.
# Computed as one boolean mask aligned on the frame itself, so it does not
# depend on the frame having a default 0..n-1 index and does not rescan the
# frame once per patient.
positive_ids = list(data_labels.index[data_labels['LABEL'] == 1])
positive_icu_rows = patients.isin(positive_ids) & (data_timeseries['ICU'] == 1)
labels = positive_icu_rows.astype(float).values
data_timeseries['LABEL'] = labels
data = pd.merge(data_age, data_timeseries, on='ID')

In [10]:
# Drop the raw training tables from the kernel namespace; none of these four
# names is referenced again in the cells visible below (the merged `data`
# frame is what the later cells read).
%xdel data_timeseries
%xdel data_age
%xdel data_vitals
%xdel data_labs

In [3]:
# Sanitise the raw training measurements one column at a time. Each rule is an
# element-wise function: depending on the column it clips to a bound, rescales
# large readings, flips the sign of non-positive values, or replaces
# out-of-range readings with NaN.
# NOTE(review): for L7, L8, L11, L15, L18 and L19 a reading exactly equal to
# the cap fails both strict comparisons and therefore becomes NaN -- confirm
# that this is intended.
column_rules = [
    ('V6', lambda v: 80 if v < 80 else (112 if v > 112 else v)),
    ('V5', lambda v: 100 if v > 100 else (v if v > 0 else np.nan)),
    ('V4', lambda v: v if v > 0 else np.nan),
    ('V3', lambda v: v if 30 < v < 220 else np.nan),
    ('V2', lambda v: v if 15 < v < 200 else np.nan),
    ('V1', lambda v: v if 30 < v < 300 else np.nan),
    ('L1', lambda v: v if 0 < v < 14 else np.nan),
    ('L2', lambda v: 132 if v > 132 else (v if v > 0 else np.nan)),
    ('L7', lambda v: v if v < 700 else (700 if v > 700 else np.nan)),
    ('L8', lambda v: v if v < 200 else (200 if v > 200 else np.nan)),
    ('L9', lambda v: v if v < 100 else v/1000),
    ('L10', lambda v: v if 0 < v < 100 else np.nan),
    ('L11', lambda v: v if 0 < v < 2000 else (2000 if v > 2000 else np.nan)),
    ('L12', lambda v: v if v < 5 else 5+(v-5)/10),
    ('L15', lambda v: v if 0 < v < 20 else (20 if v > 20 else np.nan)),
    ('L18', lambda v: v if v < 1000 else (1000 if v > 1000 else np.nan)),
    ('L19', lambda v: v if v < 800 else (800 if v > 800 else np.nan)),
    ('L20', lambda v: v/100 if v > 1000 else (v if v > 0 else np.nan)),
    ('L23', lambda v: 3000 if v > 3000 else (v if v > 0 else -1*v)),
]
for column, rule in column_rules:
    data[column] = data[column].apply(rule)

# These columns share a single rule: keep strictly positive readings only.
positive_only_columns = ['L3', 'L4', 'L5', 'L6', 'L16', 'L17',
                         'L13', 'L14', 'L21', 'L22', 'L24', 'L25']
data[positive_only_columns] = data[positive_only_columns].applymap(
    lambda v: v if v > 0 else np.nan)



In [4]:
print("Preprocessing")
# Per-column mean over every raw row of `data`; used below as the default
# value for measurements missing at a patient's TIME == 0 row.
normal = dict((attribute, data[attribute].mean()) for attribute in data.columns)
# Average rows that share the same (ID, TIME) pair into a single row.
data_grouped = data.groupby(['ID', 'TIME']).mean()
patient_ids = data['ID'].unique()
# NOTE(review): assumes every patient has a row at TIME == 0 -- confirm.
for patient in patient_ids:
    row_at_time_zero = data_grouped.loc[patient, 0]
    data_grouped.loc[patient, 0] = row_at_time_zero.fillna(normal)
# A forward fill (`.fillna(method='ffill')`) used to follow reset_index() and
# is currently commented out, so NaNs after TIME == 0 are left in place.
data_filled = data_grouped.reset_index()
data_np = data_filled.as_matrix()
print("...done")

Preprocessing
...done


In [9]:
# `data_filled` (built from data_grouped.reset_index()) is what the following
# cells read, so the grouped frame can be dropped from the namespace.
%xdel data_grouped

In [5]:
# Expanding (running) statistics per patient over the training rows.
# The last two columns of data_filled are dropped before grouping, and
# `timecolumns` skips the first three columns as well.
data_without_labels = data_filled[data_filled.columns[:-2]]
timecolumns = data_filled.columns[3:-2]
data_by_id = data_without_labels.groupby('ID')

# Running median of each measurement column.
data_medians = data_by_id.apply(pd.expanding_median)[timecolumns]
data_medians = data_medians.rename(columns=lambda name: 'MEDIAN_' + name)

# Running standard deviation; undefined entries are replaced with 0.
data_stds = data_by_id.apply(pd.expanding_std).fillna(0)[timecolumns]
data_stds = data_stds.rename(columns=lambda name: 'STD_' + name)

# Running minimum; undefined entries are replaced with 0.
data_mins = data_by_id.apply(pd.expanding_min).fillna(0)[timecolumns]
data_mins = data_mins.rename(columns=lambda name: 'MIN_' + name)

# Running maximum.
data_maxs = data_by_id.apply(pd.expanding_max)[timecolumns]
data_maxs = data_maxs.rename(columns=lambda name: 'MAX_' + name)

# Running count of non-missing observations.
data_counts = data_by_id.apply(pd.expanding_count)[timecolumns]
data_counts = data_counts.rename(columns=lambda name: 'COUNT_' + name)

In [8]:
# Put the identifying columns, the five groups of expanding statistics and the
# ICU / LABEL columns side by side, then keep a plain NumPy copy of the result.
data_stats_full = pd.concat(
    [
        data_without_labels[['ID', 'TIME', 'AGE']],
        data_medians,
        data_stds,
        data_mins,
        data_maxs,
        data_counts,
        data_filled[['ICU', 'LABEL']],
    ],
    axis=1
)
data_stats_np = data_stats_full.values

In [None]:
# The per-statistic frames were concatenated into `data_stats_full` in the
# previous cell; drop the intermediates from the kernel namespace.
%xdel data_medians
%xdel data_stds
%xdel data_mins
%xdel data_maxs
%xdel data_counts
%xdel data_by_id
%xdel data_without_labels

In [6]:
# Load the validation tables. The label file is read with header=None and
# explicit column names, unlike the training label file.
data_age_val = pd.read_csv("id_age_val.csv")
data_vitals_val = pd.read_csv("id_time_vitals_val.csv")
data_labels_val = pd.read_csv(
    "id_label_val.csv", names=['ID', 'LABEL'], header=None
).set_index('ID')
data_labs_val = pd.read_csv("id_time_labs_val.csv")

# Join labs with vitals on the columns they share, then attach age by ID.
data_timeseries_val = data_labs_val.merge(data_vitals_val)
patient_ids_val = list(data_timeseries_val['ID'].unique())
data_val = data_age_val.merge(data_timeseries_val, on='ID')

In [11]:
# Sanitise the raw validation measurements with the same per-column rules that
# the training cleaning cell applies: depending on the column, clip to a
# bound, rescale large readings, flip the sign of non-positive values, or
# replace out-of-range readings with NaN.
# NOTE(review): for L7, L8, L11, L15, L18 and L19 a reading exactly equal to
# the cap fails both strict comparisons and therefore becomes NaN -- confirm
# that this is intended.
column_rules = [
    ('V6', lambda v: 80 if v < 80 else (112 if v > 112 else v)),
    ('V5', lambda v: 100 if v > 100 else (v if v > 0 else np.nan)),
    ('V4', lambda v: v if v > 0 else np.nan),
    ('V3', lambda v: v if 30 < v < 220 else np.nan),
    ('V2', lambda v: v if 15 < v < 200 else np.nan),
    ('V1', lambda v: v if 30 < v < 300 else np.nan),
    ('L1', lambda v: v if 0 < v < 14 else np.nan),
    ('L2', lambda v: 132 if v > 132 else (v if v > 0 else np.nan)),
    ('L7', lambda v: v if v < 700 else (700 if v > 700 else np.nan)),
    ('L8', lambda v: v if v < 200 else (200 if v > 200 else np.nan)),
    ('L9', lambda v: v if v < 100 else v/1000),
    ('L10', lambda v: v if 0 < v < 100 else np.nan),
    ('L11', lambda v: v if 0 < v < 2000 else (2000 if v > 2000 else np.nan)),
    ('L12', lambda v: v if v < 5 else 5+(v-5)/10),
    ('L15', lambda v: v if 0 < v < 20 else (20 if v > 20 else np.nan)),
    ('L18', lambda v: v if v < 1000 else (1000 if v > 1000 else np.nan)),
    ('L19', lambda v: v if v < 800 else (800 if v > 800 else np.nan)),
    ('L20', lambda v: v/100 if v > 1000 else (v if v > 0 else np.nan)),
    ('L23', lambda v: 3000 if v > 3000 else (v if v > 0 else -1*v)),
]
for column, rule in column_rules:
    data_val[column] = data_val[column].apply(rule)

# These columns share a single rule: keep strictly positive readings only.
positive_only_columns = ['L3', 'L4', 'L5', 'L6', 'L16', 'L17',
                         'L13', 'L14', 'L21', 'L22', 'L24', 'L25']
data_val[positive_only_columns] = data_val[positive_only_columns].applymap(
    lambda v: v if v > 0 else np.nan)

In [None]:
# Drop the raw validation tables from the kernel namespace; the merged
# `data_val` frame is what the following cells read.
%xdel data_timeseries_val
%xdel data_age_val
%xdel data_vitals_val
%xdel data_labs_val

In [None]:
print("Preprocessing")
# Average rows that share the same (ID, TIME) pair into a single row.
data_grouped_val = data_val.groupby(['ID', 'TIME']).mean()
patient_ids_val = data_val['ID'].unique()
# Missing values at each patient's TIME == 0 row are filled from `normal`,
# the per-column means computed on the training data.
# NOTE(review): assumes every patient has a row at TIME == 0 -- confirm.
for patient in patient_ids_val:
    row_at_time_zero = data_grouped_val.loc[patient, 0]
    data_grouped_val.loc[patient, 0] = row_at_time_zero.fillna(normal)
# A forward fill (`.fillna(method='ffill')`) used to follow reset_index() and
# is currently commented out, so NaNs after TIME == 0 are left in place.
data_filled_val = data_grouped_val.reset_index()
print("...done")

In [None]:
# `data_filled_val` (built from data_grouped_val.reset_index()) is what the
# following cells read, so the grouped frame can be dropped.
%xdel data_grouped_val

In [None]:
# Expanding (running) statistics per patient over the validation rows, built
# with the same column list (`timecolumns`) as the training features.
# The last column of data_filled_val is dropped before grouping
# (presumably ICU, since the validation frame has no LABEL column -- confirm).
data_without_labels_val = data_filled_val[data_filled_val.columns[:-1]]
data_by_id_val = data_without_labels_val.groupby('ID')
data_medians_val = data_by_id_val.apply(pd.expanding_median)[timecolumns].rename(columns=lambda x: 'MEDIAN_'+x)
# Undefined running standard deviations are replaced with 0.
data_stds_val = data_by_id_val.apply(pd.expanding_std).fillna(0)[timecolumns].rename(columns=lambda x: 'STD_'+x)
# The training features replace undefined running minima with 0; apply the
# same fill here so the MIN_* columns are constructed identically for both
# data sets.
data_mins_val = data_by_id_val.apply(pd.expanding_min).fillna(0)[timecolumns].rename(columns=lambda x: 'MIN_'+x)
data_maxs_val = data_by_id_val.apply(pd.expanding_max)[timecolumns].rename(columns=lambda x: 'MAX_'+x)
data_counts_val = data_by_id_val.apply(pd.expanding_count)[timecolumns].rename(columns=lambda x: 'COUNT_'+x)

In [None]:
# Put the identifying columns, the five groups of expanding statistics and the
# ICU column side by side (the validation frame carries no LABEL column).
data_stats_val = pd.concat(
    [
        data_filled_val[['ID', 'TIME', 'AGE']],
        data_medians_val,
        data_stds_val,
        data_mins_val,
        data_maxs_val,
        data_counts_val,
        data_filled_val[['ICU']],
    ],
    axis=1
)

In [None]:
# The per-statistic frames were concatenated into `data_stats_val` in the
# previous cell; drop the intermediates from the kernel namespace.
%xdel data_medians_val
%xdel data_stds_val
%xdel data_mins_val
%xdel data_maxs_val
%xdel data_counts_val
%xdel data_by_id_val
%xdel data_without_labels_val

# The preceding cells only need to be run once. Re-run the cells below for each value of k you want to try.

In [1]:
# Candidate numbers of features to keep: 30, 38, ..., 94.
# `arange` is not a builtin; it has to be taken from the NumPy namespace.
ks = np.arange(30, 100, 8)

In [14]:
# Number of features to keep. It was never assigned in any cell, so this cell
# failed on a fresh kernel; set it explicitly here and edit the value (the
# candidates start at 30) before re-running this cell and the ones below.
k = 30
sel = SelectKBest(k=k)
# Features are every column between the first two and the last two columns of
# the training matrix; the target is its last column.
train_feats = data_stats_np[:, 2:-2]
sel.fit(train_feats, data_stats_np[:, -1])

In [None]:
# Column names covering the same column slice (2:-2) that was used to build
# the feature matrix passed to the selector.
features = data_stats_full.columns[2:-2]
# Selector support over those columns, used to index the column names.
indices_of_best = sel.get_support()
kfeatures = features[indices_of_best]

## Check that these column lists are flat (plain lists of column names, not nested)

In [None]:
# `kfeatures` is a pandas Index, and `list + Index` is not list concatenation,
# so it is converted to a plain list first. Both results are then flat lists
# of column names.
kfeaturesfull = ['ID', 'TIME'] + list(kfeatures) + ['ICU', 'LABEL']
kfeaturesfullval = ['ID', 'TIME'] + list(kfeatures) + ['ICU']

In [None]:
# Restrict both feature frames to the identifying columns plus the selected
# features. The validation column list is named `kfeaturesfullval`; the
# previous `kfeaturesval` was never defined and raised a NameError.
train_stats = data_stats_full[kfeaturesfull]
val_stats = data_stats_val[kfeaturesfullval]

In [None]:

# Walk through every validation patient, predicting at each ICU row while
# updating the classifier incrementally with pseudo-labels, then score the
# patient-level predictions against the validation labels.
# NOTE(review): `clf` is not created in any cell visible here; it must already
# be a classifier that supports partial_fit/predict before this cell runs --
# confirm where it is built and trained.
print('Testing')
test_feats_grouped = val_stats.set_index('ID')
test_ids = test_feats_grouped.index.unique()
final_preds = []
final_answers = []
for patient_id in test_ids:
    # All rows of this patient as a 2-D array (a single-row patient comes back
    # as a Series, hence atleast_2d). The last column is used as the ICU flag.
    test_id = test_feats_grouped.ix[patient_id]
    test_id_np = np.atleast_2d(test_id.as_matrix())
    icu_indices = np.nonzero(test_id_np[:, -1])[0]
    final_answers.append(data_labels_val.ix[int(patient_id)]['LABEL'])
    prev_prediction = 0
    final = 0
    for icui, ind in enumerate(icu_indices):
        # Rows up to and including the current ICU row, without the first and
        # last columns.
        partial_data_feats = test_id_np[:ind+1, 1:-1]
        if icui == 0:
            if ind > 0:
                # Rows before the first ICU row are fed to the model as class 0.
                rows_before_icu = partial_data_feats[:-1, :]
                clf.partial_fit(rows_before_icu,
                                np.zeros((rows_before_icu.shape[0],)))
        else:
            prev_ind = icu_indices[icui-1]
            if ind - prev_ind > 1:
                # Gap between consecutive ICU rows: label every row from the
                # previous ICU row up to (not including) the current one with
                # the previous prediction.
                gap_rows = partial_data_feats[prev_ind:ind, :]
                results = np.empty((gap_rows.shape[0],))
                results.fill(int(prev_prediction[0]))
                clf.partial_fit(gap_rows, results)
            else:
                # Adjacent ICU rows: update on the previous ICU row only.
                prev_icu = np.atleast_2d(partial_data_feats[prev_ind, :])
                clf.partial_fit(prev_icu, prev_prediction)
        # predict() expects a 2-D (n_samples, n_features) array, so the single
        # current row is wrapped explicitly instead of being passed as 1-D.
        prediction = clf.predict(np.atleast_2d(partial_data_feats[-1, :]))
        prev_prediction = prediction
        # The patient is flagged positive as soon as any ICU row predicts 1.
        if int(prediction[0]) == 1:
            final = 1
    final_preds.append(final)
print('...done')
print('Scoring')
# Fix the label order so the matrix is always 2x2, even when only one class
# occurs in the answers and predictions.
cm = confusion_matrix(final_answers, final_preds, labels=[0, 1])
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TP = cm[1][1]
# A rate is undefined (NaN) when its class has no true members.
specificity = float(TN)/(TN+FP) if (TN+FP) else float('nan')
sensitivity = float(TP)/(TP+FN) if (TP+FN) else float('nan')
print('...done')

In [None]:
# Display TN / (TN + FP) as computed in the scoring cell.
specificity

In [None]:
# Display TP / (TP + FN) as computed in the scoring cell.
sensitivity

In [None]:
# Display the confusion matrix of final_answers vs. final_preds.
cm