In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest

##Reading training data
Here, I'm giving a label of 1 for all ICU time stamps of patients who've died. I've only fixed one outlier. Could you guys add your outlier fixing code here?

In [2]:
data_age = pd.read_csv("id_age_train.csv", sep=",")
data_vitals = pd.read_csv("id_time_vitals_train.csv", sep=",")
data_labels = pd.read_csv("id_label_train.csv", sep=",").set_index('ID')
data_labs = pd.read_csv("id_time_labs_train.csv", sep = ",")
data_timeseries = pd.merge(data_labs, data_vitals)
patient_ids = list(data_timeseries['ID'].unique())
patients = data_timeseries.ID
labels = np.zeros(len(patients))
for p in patient_ids:
    if int(data_labels.ix[p]['LABEL'])==1:
        data_id = data_timeseries[data_timeseries['ID']==p]
        indices = data_id[data_id['ICU']==1].index
        num = len(indices)
        for i in indices:
            labels[i]=1
data_timeseries['LABEL'] = labels
data = pd.merge( data_age, data_timeseries, on='ID')
data.loc[445378,'L20'] = 20.5267

##Reading Validation data


In [3]:
data_age_val = pd.read_csv("id_age_val.csv", sep=",")
data_vitals_val = pd.read_csv("id_time_vitals_val.csv", sep=",")
data_labels_val = pd.read_csv("id_label_val.csv", sep=",", header=None, names=['ID', 'LABEL']).set_index('ID')
data_labs_val = pd.read_csv("id_time_labs_val.csv", sep = ",")
data_timeseries_val = pd.merge(data_labs_val, data_vitals_val)
patient_ids_val = list(data_timeseries_val['ID'].unique())
data_val = pd.merge( data_age_val, data_timeseries_val, on='ID')

##Preprocessing training data
Finding means of all attrs and filling NAs of 0th timestamp of each with that. Then ffill.

In [4]:
print "Preprocessing"
data_grouped = data.groupby(['ID','TIME']).mean()
normal = {attribute: data[attribute].mean() for attribute in data.columns}
patient_ids = data['ID'].unique()
for patient in patient_ids:
    data_grouped.loc[patient, 0] = data_grouped.loc[patient, 0].fillna(normal)
data_filled = data_grouped.reset_index().fillna(method='ffill')
data_np = data_filled.as_matrix()
print "...done"

Preprocessing
...done


##Adding new features
data_without_labels has all columns except the last 2 (ICU and LABEL). 
time_columns has the column names of all the time series variables
Then you apply whatever stats thing you want (median here) after grouping by ID so that it applies it for each ID.
Only take the measure for the time series variables

In [5]:
data_without_labels = data_filled[data_filled.columns[:-2]]

In [6]:
time_columns = data_without_labels.columns[3:]
data_medians = data_without_labels.groupby('ID').apply(pd.expanding_median)

Only take the measure for the time series variables. Then rename the columns by prefixing your measure name

In [7]:
data_medians = data_medians[time_columns].rename(columns=lambda x: 'MEDIAN_'+x)

Add these new columns

In [8]:
data_with_medians = pd.concat([data_without_labels, data_medians], axis = 1)

Add the ICU and LABEL back

In [9]:
data_medians_labels = pd.concat([data_with_medians, data_filled[data_filled.columns[-2:]]], axis=1)

In [10]:
data_medians_labels.head()

Unnamed: 0,ID,TIME,AGE,L1,L2,L3,L4,L5,L6,L7,...,MEDIAN_L24,MEDIAN_L25,MEDIAN_V1,MEDIAN_V2,MEDIAN_V3,MEDIAN_V4,MEDIAN_V5,MEDIAN_V6,ICU,LABEL
0,1,0,42,7.367832,43.743643,121.660822,137.729164,4.053076,25.617869,34.883442,...,37.891432,1.970802,86.0,49.0,70,20.426097,87,98.24268,0,0
1,1,4320,42,6.60759,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,...,37.891432,1.970802,86.0,49.0,70,20.426097,87,98.24268,0,0
2,1,5646,42,6.60759,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,...,37.891432,1.970802,86.0,49.0,70,20.426097,87,98.24268,1,1
3,1,5703,42,6.60759,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,...,37.891432,1.970802,88.5,53.5,70,20.426097,87,97.42134,1,1
4,1,6342,42,6.60759,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,...,37.891432,1.970802,91.0,58.0,70,20.426097,87,96.6,1,1


In [11]:
data_median_np = data_medians_labels.as_matrix()

##Training

Select K best features based on the amount they affect the labels (I think)

In [12]:
k=50

In [14]:
from sklearn.feature_selection import chi2

In [15]:
sel = SelectKBest(chi2, k=k)
train_feats = data_median_np[:,2:-2]
sel.fit(train_feats, data_median_np[:,-1])

SelectKBest(k=50, score_func=<function chi2 at 0x7f4cd796b668>)

get_support() returns a boolean array with True if that feature is one of the k best and False otherwise. So we filter out the selected features from the dataframe.

In [16]:
indices_of_best = sel.get_support()
features = data_medians_labels.columns[2:-2]
kfeatures = features[indices_of_best]
data_subset = data_medians_labels[kfeatures]

In [17]:
data_subset.head()

Unnamed: 0,AGE,L2,L3,L4,L5,L6,L7,L8,L9,L10,...,MEDIAN_L18,MEDIAN_L20,MEDIAN_L21,MEDIAN_L22,MEDIAN_L23,MEDIAN_V1,MEDIAN_V2,MEDIAN_V3,MEDIAN_V4,MEDIAN_V5
0,42,43.743643,121.660822,137.729164,4.053076,25.617869,34.883442,1.908826,169.829407,31.298237,...,148.042571,56.253642,2.870504,98.839073,64.684401,86.0,49.0,70,20.426097,87
1,42,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,1.908826,169.829407,31.298237,...,148.042571,78.229965,2.870504,98.839073,64.684401,86.0,49.0,70,20.426097,87
2,42,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,1.908826,169.829407,31.298237,...,148.042571,100.206288,2.870504,98.839073,64.684401,86.0,49.0,70,20.426097,87
3,42,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,1.908826,169.829407,31.298237,...,148.042571,100.206288,2.870504,98.839073,64.684401,88.5,53.5,70,20.426097,87
4,42,114.044116,128.393872,137.729164,4.053076,11.327002,34.883442,1.908826,169.829407,31.298237,...,148.042571,100.206288,2.870504,98.839073,64.684401,91.0,58.0,70,20.426097,87


In [18]:
train_subset = data_subset.as_matrix()
print 'Training'
clf = SGDClassifier(loss='modified_huber', penalty = 'elasticnet', l1_ratio=0.25, n_iter= 30, warm_start=True)
#train_feats = data_np_medians[:,2:-2]
clf.fit(train_subset, data_median_np[:,-1])
print '...done'

Training
...done


##Preprocessing Validation data
THIS NEEDS TO BE DONE IN THE TESTING FOR LOOP!

In [19]:
print "Preprocessing"
data_grouped_val = data_val.groupby(['ID','TIME']).mean()
patient_ids_val = data_val['ID'].unique()
for patient in patient_ids_val:
    data_grouped_val.loc[patient, 0] = data_grouped_val.loc[patient, 0].fillna(normal)
data_filled_val = data_grouped_val.reset_index().fillna(method='ffill')
print "...done"

Preprocessing
...done


##Testing

Get data for each ID    
Find indices of rows when patient is in the ICU   
For each index in these indices do:   
* get the data above it.
* make it into a dataframe (probably can optimize these steps by not doing pd->np->pd)
* find the stat measures for the dataframe like we did before
* concatenate and make np again
* take only the k best columns
* if we're on the first icu index and that's not the 0th timestamp -> fit the classifier on all the nonICU rows above it, setting outputs as 0s.
* if there's a gap of nonICUs between two ICU indices -> fit the classifier on these nonICUs giving the ouput as the previous ICU prediction. i.e if we'd predicted 1 on the previous ICU then he's liekly to die and shouldn't have been taken out of ICU.
* fit it on the previous ICU stamp using the prediction you made for that stamp
* predict for this ICU stamp 
* if it predicts 1 at any time, the label for the guy is 1 (this doesn't make the csv output)    
Then score



In [33]:
data_filled_val

Unnamed: 0,ID,TIME,AGE,L1,L2,L3,L4,L5,L6,L7,...,L23,L24,L25,V1,V2,V3,V4,V5,V6,ICU
0,3595,0,44,7.367832,43.743643,121.660822,137.729164,4.053076,25.617869,34.883442,...,64.684401,37.891432,1.970802,102.000000,55.000000,106.00000,19.000000,99.000000,96.30000,1
1,3595,2700,44,7.367832,43.743643,121.660822,127.166699,4.073447,25.617869,13.553313,...,64.684401,37.891432,1.833126,102.000000,55.000000,106.00000,19.000000,99.000000,96.30000,1
2,3595,12510,44,7.367832,43.743643,121.660822,127.166699,4.073447,25.617869,13.553313,...,64.684401,37.891432,1.833126,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000,1
3,3595,81116,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000,1
4,3595,101563,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000,1
5,3595,117236,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,98.000000,39.000000,96.00000,19.000000,101.000000,98.80000,0
6,3595,131662,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,100.000000,73.000000,88.00000,19.000000,101.000000,98.80000,0
7,3595,147872,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,108.000000,61.000000,83.00000,19.000000,99.000000,98.50000,0
8,3595,159453,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,64.684401,37.891432,1.708936,130.000000,56.000000,82.00000,18.000000,99.000000,97.90000,0
9,3595,170611,44,7.367832,43.743643,121.660822,129.734407,3.947013,25.617869,9.925039,...,64.684401,37.891432,1.772833,130.000000,56.000000,82.00000,18.000000,99.000000,97.90000,0


In [20]:
col = data_filled_val.columns.tolist()
df_medians = data_filled_val[col[:-1]].groupby('ID').apply(pd.expanding_median)


In [21]:
data_filled_df = pd.concat([data_filled_val, df_medians], axis=1)
data_filled_df.set_index('ID',inplace=True)


In [38]:
df_medians

Unnamed: 0,ID,TIME,AGE,L1,L2,L3,L4,L5,L6,L7,...,L22,L23,L24,L25,V1,V2,V3,V4,V5,V6
0,3595,0.0,44,7.367832,43.743643,121.660822,137.729164,4.053076,25.617869,34.883442,...,98.839073,64.684401,37.891432,1.970802,102.000000,55.000000,106.00000,19.000000,99.000000,96.30000
1,3595,1350.0,44,7.367832,43.743643,121.660822,132.447932,4.063262,25.617869,24.218378,...,82.366804,64.684401,37.891432,1.901964,102.000000,55.000000,106.00000,19.000000,99.000000,96.30000
2,3595,2700.0,44,7.367832,43.743643,121.660822,127.166699,4.073447,25.617869,13.553313,...,65.894536,64.684401,37.891432,1.833126,102.000000,55.000000,106.00000,19.000000,99.000000,96.30000
3,3595,7605.0,44,7.367832,43.743643,121.660822,128.008490,4.063262,25.617869,15.243646,...,65.894536,64.684401,37.891432,1.833126,102.000000,55.000000,102.50000,19.000000,99.000000,96.30000
4,3595,12510.0,44,7.367832,43.743643,121.660822,128.850281,4.053076,25.617869,16.933978,...,65.894536,64.684401,37.891432,1.833126,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000
5,3595,46813.0,44,7.367832,43.743643,121.660822,128.850281,4.044154,25.617869,16.933978,...,61.370596,64.684401,37.891432,1.771031,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000
6,3595,81116.0,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,56.846656,64.684401,37.891432,1.708936,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000
7,3595,91339.5,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,56.846656,64.684401,37.891432,1.708936,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000
8,3595,101563.0,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,56.846656,64.684401,37.891432,1.708936,102.000000,55.000000,99.00000,19.000000,99.000000,96.30000
9,3595,109399.5,44,7.367832,43.743643,121.660822,128.850281,4.035232,25.617869,16.933978,...,56.846656,64.684401,37.891432,1.708936,102.000000,55.000000,97.50000,19.000000,99.000000,97.10000


In [22]:
print 'Testing'
col = data_filled_val.columns.tolist()
df_medians = data_filled_val[col[:-1]].groupby('ID').apply(pd.expanding_median)[partial_df.columns[2:]].rename(columns=lambda x:'MEDIAN_'+x)
#data_filled_val.set_index('ID', inplace=True)
data_filled_df = pd.concat([data_filled_val, df_medians], axis=1)
data_filled_df.set_index('ID',inplace=True)
test_ids = set(data_filled_val.index)
final_preds = []
final_answers =[]
for id in test_ids:
    if id%1000==0:
        print id
    test_id = data_filled_df.ix[id]
    test_id_np = np.atleast_2d(test_id.as_matrix())
    icu_indices = np.nonzero(test_id_np[:,-2])[0]
    prev_prediction = 0
    final_answers.append(data_labels_val.ix[int(id)]['LABEL'])
    flag = 0
    final=0
    icui=0
    for ind in icu_indices:
        partial_data_feats = test_id_np[:ind+1,1:-1]
        partial_df = pd.DataFrame(partial_data_feats, columns=test_id.columns[1:-1])
        partial_data_feats = partial_df.as_matrix()
        #partial_data_feats = partial_data_feats[:,indices_of_best]
        if icui==0 and ind>0:
            clf.partial_fit(partial_data_feats[:-1,:], np.zeros((partial_data_feats[:-1].shape[0],)))
        if icui>0:
            if ind - icu_indices[icui-1]>1 :
                results = np.empty((partial_data_feats[icu_indices[icui-1]+1:ind,:].shape[0],))
                results.fill(int(prev_prediction[0]))
                clf.partial_fit(partial_data_feats[icu_indices[icui-1]+1:ind,:], results)
            prev_icu = np.atleast_2d(partial_data_feats[icu_indices[icui-1],:])
            clf.partial_fit(prev_icu, prev_prediction)
        prediction = clf.predict(partial_data_feats[-1,:])
        prev_prediction = prediction
        if flag==0 and int(prediction[0])==1:
            final=1
            flag+=1
        icui+=1
    final_preds.append(final)
print '...done'
print'Scoring'
cm = confusion_matrix(final_answers, final_preds)
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
TP = cm[1][1]
specificity = float(TN)/(TN+FP)
sensitivity = float(TP)/(TP+FN)
print '...done'

Testing


NameError: name 'partial_df' is not defined

In [304]:
specificity

0.9919354838709677

In [305]:
sensitivity

0.10975609756097561

In [63]:
cm = np.array([[1107,15],[79,9]])

In [80]:
def xerox_scorer(clf,test, y):
    ###INSERT THE CODE TO GET THE FINAL PREDICTIONS HERE
    ###X is the preprocessed Validation Data
    ###Y are final labels
    test = test.set_index('ID')
    test_ids = set(test.index)
    final_predictions = []
    for id in test_ids:
        #if id%100==0:
        #	print id
        test_df = test.ix[id]
        test_np = np.atleast_2d(test_df.as_matrix())
        icu_indices = np.nonzero(test_np[:,-1])[0]
        previous_prediction = 0
        flag = 0
        final = 0
        icuindex = 0
        for ind in icu_indices:
            partial_data = test_np[:ind+1,:-1]
            partial_data[:,3:] = scale(partial_data[:,3:])
            partial_data_feats = partial_data[:,3:]
            if icuindex==0 and ind>0:
                clf.partial_fit(partial_data_feats[:-1,:], np.zeros((partial_data_feats[:-1].shape[0],)))
            if icuindex>0:
                if ind - icu_indices[icuindex-1]>1:	
                    clf.partial_fit(partial_data_feats[icu_indices[icuindex-1]+1:-1,:], np.zeros((partial_data_feats[icu_indices[icuindex-1]+1:ind,:].shape[0],)))
                previous_icu = np.atleast_2d(partial_data_feats[icu_indices[icui-1],:])
                clf.partial_fit(previous_icu, previous_prediction)
            prediction = clf.predict(partial_data_feats[-1,:])
            previous_prediction = prediction
            final_predictions.append((id, int(partial_data[ind,0]),int(prediction[0])))
    final_pred_df = pd.DataFrame(np.array(final_predictions),columns = ['ID','TIME','LABEL'])
    final_pred_df_gp = final_pred_df.groupby('ID').sum()
    predictions = [1 if x>0 else 0 for x in final_pred_df_gp['LABEL']]
    cm = confusion_matrix(truth,predictions)
    cm = cm.astype('float')/cm.sum(axis=1)
    if cm[0][0]<0.99: return 0
    return (cm[0][0]-.99)*5 + (cm[1][1])*.75
 

In [75]:
xerox_scorer([1,0,0,0,1],[1,0,0,0,1])

0.80000000000000004