In [1]:
# Import packages
import numpy as np
import pandas as pd

# Exploratory Analysis

Before we begin exploring our dataset and fitting models, let's explore our data set a bit first
1. Event count for each patient. This calculates the number of diagnosis, drugs taken, and lab test of living patients and dead patients.
2. Encounter count calculates how many visits a patient had to the ICU.
3. Record length records how long the patient has been tracked in our dataset, the difference between the last day recorded and first day recorded for a patient

## Event count

In [2]:
events = pd.read_csv("../data/train/events.csv")
mortality = pd.read_csv("../data/train/mortality_events.csv")

morts = mortality['patient_id']
dead = events[events['patient_id'].isin(morts)]
alive = events[~events['patient_id'].isin(morts)]
display(dead.describe(), events.describe(), alive.describe())


Unnamed: 0,patient_id,value
count,491007.0,444238.0
mean,10050.837397,72.05522
std,6091.513865,2575.69
min,12.0,-47.0
25%,5072.0,3.01
50%,9967.0,15.0
75%,15038.0,47.0
max,25075.0,1115000.0


Unnamed: 0,patient_id,value
count,740066.0,667517.0
mean,9415.493002,70.50876
std,5808.988279,2171.407
min,12.0,-47.0
25%,4683.0,3.11
50%,8904.0,14.9
75%,14180.0,46.0
max,25075.0,1115000.0


Unnamed: 0,patient_id,value
count,249059.0,223279.0
mean,8162.944226,67.431897
std,4975.07612,946.895486
min,80.0,-32.0
25%,3917.0,3.3
50%,7299.0,14.4
75%,12171.0,44.0
max,18480.0,117500.0


In [3]:
alive_group = alive.groupby(['patient_id']).size().reset_index(name='count')
dead_group = dead.groupby(['patient_id']).size().reset_index(name='count')

display(alive_group.head())
display(dead_group.head())

Unnamed: 0,patient_id,count
0,80,185
1,99,238
2,197,332
3,198,944
4,306,233


Unnamed: 0,patient_id,count
0,12,868
1,19,177
2,41,1092
3,106,622
4,112,556


## Encounter count

In [4]:
alive_group = alive.groupby(['patient_id', 'timestamp']).size().reset_index(name='count')
dead_group = dead.groupby(['patient_id', 'timestamp']).size().reset_index(name='count')

alive_encounter_count = alive_group.groupby(['patient_id']).agg('count').reset_index()
alive_encounter_count.drop('timestamp', axis=1, inplace=True)
dead_encounter_count = dead_group.groupby(['patient_id']).agg('count').reset_index()
dead_encounter_count.drop('timestamp', axis=1, inplace=True)

alive_encounter_count.head()

Unnamed: 0,patient_id,count
0,80,5
1,99,10
2,197,9
3,198,22
4,306,7


## Record length

In [5]:
alive['timestamp'] = pd.to_datetime(alive['timestamp'])
a_g1 = alive.groupby(['patient_id'])
a_g2 = a_g1.agg(lambda x: x['timestamp'].max() - x['timestamp'].min()).reset_index()
a_g2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,80,8 days,8 days,8 days,8 days
1,99,1267 days,1267 days,1267 days,1267 days
2,197,9 days,9 days,9 days,9 days
3,198,300 days,300 days,300 days,300 days
4,306,6 days,6 days,6 days,6 days


In [6]:
alive_record_length = a_g2[['patient_id', 'value']]

In [7]:
alive_record_length.head()

Unnamed: 0,patient_id,value
0,80,8 days
1,99,1267 days
2,197,9 days
3,198,300 days
4,306,6 days


In [8]:
dead.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


In [9]:
dead['timestamp'] = pd.to_datetime(dead['timestamp'])
m_g1 = dead.groupby(['patient_id'])
m_g2 = m_g1.agg(lambda x: x['timestamp'].max() - x['timestamp'].min()).reset_index()
m_g2['value'] = m_g2['value'].dt.days
dead_record_length = m_g2[['patient_id', 'value']]
dead_record_length.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,patient_id,value
0,12,15
1,19,4
2,41,43
3,106,6
4,112,839


## Calculate index date

In [10]:
mortality.head()

Unnamed: 0,patient_id,timestamp,label
0,19,2014-03-04,1
1,12,2011-12-19,1
2,41,2014-02-15,1
3,106,2015-08-11,1
4,112,2011-04-23,1


In [11]:
morts = mortality['patient_id']

alive = events[~events.patient_id.isin(morts)]

In [12]:
mortality['timestamp'] = pd.to_datetime(mortality['timestamp']).dt.date
mortality.head()

Unnamed: 0,patient_id,timestamp,label
0,19,2014-03-04,1
1,12,2011-12-19,1
2,41,2014-02-15,1
3,106,2015-08-11,1
4,112,2011-04-23,1


In [13]:
mortality['indx_date'] = mortality['timestamp'] - pd.to_timedelta(30, unit='d')
mortality.head()
dead = mortality[['patient_id', 'indx_date']]
dead.head()

Unnamed: 0,patient_id,indx_date
0,19,2014-02-02
1,12,2011-11-19
2,41,2014-01-16
3,106,2015-07-12
4,112,2011-03-24


In [14]:
a_g1 = alive.groupby(['patient_id']).agg({'timestamp': {'indx_date': 'max'}}).reset_index()
a_g1.columns = ['patient_id', 'indx_date']
a_g1.tail()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,patient_id,indx_date
495,18326,2010-11-22
496,18367,2013-08-04
497,18452,2014-06-20
498,18455,2012-10-24
499,18480,2012-12-22


In [15]:
indx_date = dead.append(a_g1)
indx_date.describe()
indx_date[indx_date['patient_id'] == 24581]

Unnamed: 0,patient_id,indx_date
49,24581,2013-11-07


## Filter event

In [16]:
display(events.head())
display(indx_date.head())

evts = pd.merge(events, indx_date, on='patient_id')
evts.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0


Unnamed: 0,patient_id,indx_date
0,19,2014-02-02
1,12,2011-11-19
2,41,2014-01-16
3,106,2015-07-12
4,112,2011-03-24


Unnamed: 0,patient_id,event_id,event_description,timestamp,value,indx_date
0,12,DIAG440649,Primary malignant neoplasm of head of pancreas,2011-12-06,1.0,2011-11-19
1,12,DIAG201070,Cholelithiasis AND cholecystitis without obstr...,2011-12-06,1.0,2011-11-19
2,12,DIAG321462,Cardiac complication,2011-12-06,1.0,2011-11-19
3,12,DIAG321042,Cardiac arrest,2011-12-06,1.0,2011-11-19
4,12,DIAG435141,Hemorrhage AND/OR hematoma complicating procedure,2011-12-06,1.0,2011-11-19


In [17]:
evts['timestamp'] = pd.to_datetime(evts['timestamp']).dt.date
evts['indx_date'] = pd.to_datetime(evts['indx_date']).dt.date
filt = evts[(evts['timestamp'] <= evts['indx_date']) & (evts['timestamp'] >= evts['indx_date'] - pd.to_timedelta(2000, unit='d'))]
filt.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value,indx_date
868,19,DIAG433329,Closed fracture of second cervical vertebra,2013-02-19,1.0,2014-02-02
869,19,DIAG81902,Urinary tract infectious disease,2013-02-19,1.0,2014-02-02
870,19,DIAG74082,Acontractile detrusor,2013-02-19,1.0,2014-02-02
871,19,DIAG436539,Closed fracture of patella,2013-02-19,1.0,2014-02-02
872,19,DIAG80502,Osteoporosis,2013-02-19,1.0,2014-02-02


In [18]:
display(filt[(filt['patient_id'] == 2293) & (filt['event_id'] == 'LAB3019550')])
filt.info()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value,indx_date
106962,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-02,140.0,2010-06-02
107034,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-04,139.0,2010-06-02
107065,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-04,139.0,2010-06-02
107105,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-05,138.0,2010-06-02
107136,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-06,140.0,2010-06-02
107162,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-07,143.0,2010-06-02
107172,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-08,143.0,2010-06-02
107215,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-09,141.0,2010-06-02
107247,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-17,141.0,2010-06-02
107322,2293,LAB3019550,Sodium [Moles/volume] in Serum or Plasma,2007-11-18,142.0,2010-06-02


<class 'pandas.core.frame.DataFrame'>
Int64Index: 556347 entries, 868 to 740064
Data columns (total 6 columns):
patient_id           556347 non-null int64
event_id             556347 non-null object
event_description    556347 non-null object
timestamp            556347 non-null object
value                500684 non-null float64
indx_date            556347 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 29.7+ MB


## Aggregate events

In [19]:
feature_map = pd.read_csv("../data/train/event_feature_map.csv")
feature_map.head()

Unnamed: 0,idx,event_id
0,1,DIAG132397
1,2,DIAG132408
2,3,DIAG132446
3,4,DIAG132583
4,5,DIAG132643


In [20]:
agg = pd.merge(filt, feature_map, on='event_id')
agg.dropna(inplace=True)
agg.head()

Unnamed: 0,patient_id,event_id,event_description,timestamp,value,indx_date,idx
0,19,DIAG433329,Closed fracture of second cervical vertebra,2013-02-19,1.0,2014-02-02,1202
1,19,DIAG81902,Urinary tract infectious disease,2013-02-19,1.0,2014-02-02,2028
2,3009,DIAG81902,Urinary tract infectious disease,2013-12-26,1.0,2015-04-09,2028
3,6006,DIAG81902,Urinary tract infectious disease,2013-04-20,1.0,2014-04-24,2028
4,157,DIAG81902,Urinary tract infectious disease,2013-01-18,1.0,2013-02-26,2028


In [21]:
agg1 = agg.groupby(['patient_id', 'event_id', 'idx']).agg({'value': 'count'}).reset_index()
agg1.head()
feature_pairs = agg1[['patient_id','idx', 'value']]
feature_pairs.rename(columns={'idx': 'feature_id', 'value': 'feature_value'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


In [22]:
feature_pairs.head()

Unnamed: 0,patient_id,feature_id,feature_value
0,19,719,1
1,19,744,1
2,19,784,1
3,19,786,1
4,19,1202,1


In [23]:
feature_pairs['max_feature_value'] = feature_pairs.groupby(['feature_id'])['feature_value'].transform(np.max)

In [24]:
feature_pairs['min_feature_value'] = feature_pairs.groupby(['feature_id'])['feature_value'].transform(np.min)

In [25]:
feature_pairs.head(10)

Unnamed: 0,patient_id,feature_id,feature_value,max_feature_value,min_feature_value
0,19,719,1,4,1
1,19,744,1,1,1
2,19,784,1,3,1
3,19,786,1,3,1
4,19,1202,1,1,1
5,19,1411,1,1,1
6,19,1895,1,1,1
7,19,2001,1,2,1
8,19,2028,1,2,1
9,19,2040,2,22,1


In [26]:
feature_pairs['feature_value'] = (feature_pairs['feature_value']) / (feature_pairs['max_feature_value'])

In [27]:
feature_pairs.head(20)

Unnamed: 0,patient_id,feature_id,feature_value,max_feature_value,min_feature_value
0,19,719,0.25,4,1
1,19,744,1.0,1,1
2,19,784,0.333333,3,1
3,19,786,0.333333,3,1
4,19,1202,1.0,1,1
5,19,1411,1.0,1,1
6,19,1895,1.0,1,1
7,19,2001,0.5,2,1
8,19,2028,0.5,2,1
9,19,2040,0.090909,22,1


In [28]:
feature_pairs.describe()

Unnamed: 0,patient_id,feature_id,feature_value,max_feature_value,min_feature_value
count,64209.0,64209.0,64209.0,64209.0,64209.0
mean,9397.951424,2528.773287,0.200134,133.299771,1.003847
std,5709.579119,668.648141,0.282694,161.729717,0.117869
min,19.0,1.0,0.001898,1.0,1.0
25%,4692.0,2432.0,0.026634,9.0,1.0
50%,8904.0,2766.0,0.077482,55.0,1.0
75%,13653.0,2914.0,0.244755,188.0,1.0
max,25075.0,3179.0,1.0,527.0,14.0


In [29]:
feature_pairs['feature_value'].max(), feature_pairs['feature_value'].min()

(1.0, 0.0018975332068311196)

In [30]:
feature_pairs.describe()
#feature_pairs.head()

Unnamed: 0,patient_id,feature_id,feature_value,max_feature_value,min_feature_value
count,64209.0,64209.0,64209.0,64209.0,64209.0
mean,9397.951424,2528.773287,0.200134,133.299771,1.003847
std,5709.579119,668.648141,0.282694,161.729717,0.117869
min,19.0,1.0,0.001898,1.0,1.0
25%,4692.0,2432.0,0.026634,9.0,1.0
50%,8904.0,2766.0,0.077482,55.0,1.0
75%,13653.0,2914.0,0.244755,188.0,1.0
max,25075.0,3179.0,1.0,527.0,14.0


In [31]:
set(feature_pairs['patient_id'])

{19,
 41,
 80,
 99,
 112,
 157,
 177,
 197,
 198,
 224,
 284,
 294,
 306,
 308,
 329,
 388,
 397,
 402,
 407,
 421,
 424,
 462,
 465,
 512,
 528,
 618,
 634,
 635,
 657,
 679,
 701,
 704,
 734,
 751,
 801,
 818,
 862,
 869,
 871,
 894,
 905,
 921,
 928,
 940,
 1017,
 1065,
 1069,
 1070,
 1132,
 1139,
 1219,
 1227,
 1249,
 1252,
 1272,
 1275,
 1301,
 1306,
 1324,
 1372,
 1422,
 1438,
 1451,
 1518,
 1619,
 1656,
 1665,
 1675,
 1688,
 1718,
 1727,
 1761,
 1816,
 1820,
 1925,
 1934,
 1944,
 1999,
 2077,
 2144,
 2145,
 2160,
 2206,
 2258,
 2274,
 2289,
 2293,
 2352,
 2358,
 2361,
 2379,
 2408,
 2409,
 2468,
 2492,
 2501,
 2517,
 2521,
 2544,
 2553,
 2565,
 2604,
 2631,
 2639,
 2663,
 2670,
 2676,
 2704,
 2722,
 2732,
 2734,
 2766,
 2814,
 2820,
 2827,
 2828,
 2842,
 2857,
 2934,
 2940,
 2998,
 3009,
 3014,
 3026,
 3034,
 3038,
 3055,
 3082,
 3097,
 3100,
 3122,
 3127,
 3144,
 3147,
 3173,
 3191,
 3199,
 3259,
 3267,
 3294,
 3317,
 3323,
 3332,
 3350,
 3367,
 3370,
 3403,
 3419,
 3433,
 3445

## Create features

In [32]:
patient_features = {}
for patient in set(feature_pairs['patient_id']):
    for index, row in feature_pairs[feature_pairs['patient_id'] == patient].iterrows():
        if patient in patient_features.keys():
            patient_features[patient].append((row['feature_id'], row['feature_value']))
        else:
            patient_features[patient] = [(row['feature_id'], row['feature_value'])]

In [33]:
patient_features[19]

[(719.0, 0.25),
 (744.0, 1.0),
 (784.0, 0.33333333333333331),
 (786.0, 0.33333333333333331),
 (1202.0, 1.0),
 (1411.0, 1.0),
 (1895.0, 1.0),
 (2001.0, 0.5),
 (2028.0, 0.5),
 (2040.0, 0.090909090909090912),
 (2045.0, 0.16666666666666666),
 (2075.0, 0.034482758620689655),
 (2221.0, 0.20000000000000001),
 (2260.0, 0.33333333333333331),
 (2340.0, 0.14285714285714285),
 (2378.0, 0.18181818181818182),
 (2432.0, 0.14285714285714285),
 (2435.0, 0.033333333333333333),
 (2447.0, 0.012658227848101266),
 (2448.0, 0.016666666666666666),
 (2503.0, 0.14285714285714285),
 (2593.0, 0.037037037037037035),
 (2598.0, 0.15384615384615385),
 (2599.0, 0.090909090909090912),
 (2640.0, 0.22222222222222221),
 (2646.0, 0.20000000000000001),
 (2682.0, 0.041666666666666664),
 (2684.0, 0.03125),
 (2688.0, 0.0079365079365079361),
 (2696.0, 0.009433962264150943),
 (2697.0, 0.0075187969924812026),
 (2702.0, 0.0081967213114754103),
 (2713.0, 0.010752688172043012),
 (2721.0, 0.0072639225181598066),
 (2723.0, 0.017964071

In [34]:
mortality.head()

Unnamed: 0,patient_id,timestamp,label,indx_date
0,19,2014-03-04,1,2014-02-02
1,12,2011-12-19,1,2011-11-19
2,41,2014-02-15,1,2014-01-16
3,106,2015-08-11,1,2015-07-12
4,112,2011-04-23,1,2011-03-24


In [35]:
print(len(mortality))
len(set(mortality['patient_id']))

500


500

In [36]:
mortality_dict = {}
for index, row in mortality.iterrows():
    mortality_dict[row['patient_id']] = row['label']

In [37]:
mortality_dict

{12: 1,
 19: 1,
 41: 1,
 106: 1,
 112: 1,
 149: 1,
 157: 1,
 177: 1,
 181: 1,
 224: 1,
 279: 1,
 281: 1,
 284: 1,
 294: 1,
 308: 1,
 329: 1,
 407: 1,
 453: 1,
 471: 1,
 472: 1,
 499: 1,
 559: 1,
 618: 1,
 634: 1,
 679: 1,
 704: 1,
 726: 1,
 801: 1,
 871: 1,
 894: 1,
 905: 1,
 921: 1,
 940: 1,
 977: 1,
 1054: 1,
 1139: 1,
 1227: 1,
 1242: 1,
 1297: 1,
 1299: 1,
 1301: 1,
 1371: 1,
 1422: 1,
 1456: 1,
 1599: 1,
 1619: 1,
 1656: 1,
 1663: 1,
 1727: 1,
 1752: 1,
 1924: 1,
 1944: 1,
 1979: 1,
 1999: 1,
 2144: 1,
 2191: 1,
 2207: 1,
 2289: 1,
 2293: 1,
 2346: 1,
 2358: 1,
 2379: 1,
 2402: 1,
 2501: 1,
 2532: 1,
 2544: 1,
 2553: 1,
 2670: 1,
 2704: 1,
 2732: 1,
 2734: 1,
 2827: 1,
 2828: 1,
 2842: 1,
 2934: 1,
 2998: 1,
 3009: 1,
 3038: 1,
 3050: 1,
 3055: 1,
 3122: 1,
 3127: 1,
 3156: 1,
 3189: 1,
 3191: 1,
 3245: 1,
 3294: 1,
 3419: 1,
 3433: 1,
 3449: 1,
 3462: 1,
 3471: 1,
 3537: 1,
 3705: 1,
 3773: 1,
 3794: 1,
 3810: 1,
 3826: 1,
 3911: 1,
 3983: 1,
 4073: 1,
 4122: 1,
 4178: 1,
 4211: 

## Save svmlight

In [38]:
aggregated_events = patient_features.copy()
features = aggregated_events.copy()

In [64]:
len(features)*0.25

209.0

In [39]:
features[19]

[(719.0, 0.25),
 (744.0, 1.0),
 (784.0, 0.33333333333333331),
 (786.0, 0.33333333333333331),
 (1202.0, 1.0),
 (1411.0, 1.0),
 (1895.0, 1.0),
 (2001.0, 0.5),
 (2028.0, 0.5),
 (2040.0, 0.090909090909090912),
 (2045.0, 0.16666666666666666),
 (2075.0, 0.034482758620689655),
 (2221.0, 0.20000000000000001),
 (2260.0, 0.33333333333333331),
 (2340.0, 0.14285714285714285),
 (2378.0, 0.18181818181818182),
 (2432.0, 0.14285714285714285),
 (2435.0, 0.033333333333333333),
 (2447.0, 0.012658227848101266),
 (2448.0, 0.016666666666666666),
 (2503.0, 0.14285714285714285),
 (2593.0, 0.037037037037037035),
 (2598.0, 0.15384615384615385),
 (2599.0, 0.090909090909090912),
 (2640.0, 0.22222222222222221),
 (2646.0, 0.20000000000000001),
 (2682.0, 0.041666666666666664),
 (2684.0, 0.03125),
 (2688.0, 0.0079365079365079361),
 (2696.0, 0.009433962264150943),
 (2697.0, 0.0075187969924812026),
 (2702.0, 0.0081967213114754103),
 (2713.0, 0.010752688172043012),
 (2721.0, 0.0072639225181598066),
 (2723.0, 0.017964071

In [40]:
for key in aggregated_events.keys():
    if key in mortality_dict.keys():
        features[key].insert(0,1)
    else:
        features[key].insert(0,0)

In [41]:
features[19]

[1,
 (719.0, 0.25),
 (744.0, 1.0),
 (784.0, 0.33333333333333331),
 (786.0, 0.33333333333333331),
 (1202.0, 1.0),
 (1411.0, 1.0),
 (1895.0, 1.0),
 (2001.0, 0.5),
 (2028.0, 0.5),
 (2040.0, 0.090909090909090912),
 (2045.0, 0.16666666666666666),
 (2075.0, 0.034482758620689655),
 (2221.0, 0.20000000000000001),
 (2260.0, 0.33333333333333331),
 (2340.0, 0.14285714285714285),
 (2378.0, 0.18181818181818182),
 (2432.0, 0.14285714285714285),
 (2435.0, 0.033333333333333333),
 (2447.0, 0.012658227848101266),
 (2448.0, 0.016666666666666666),
 (2503.0, 0.14285714285714285),
 (2593.0, 0.037037037037037035),
 (2598.0, 0.15384615384615385),
 (2599.0, 0.090909090909090912),
 (2640.0, 0.22222222222222221),
 (2646.0, 0.20000000000000001),
 (2682.0, 0.041666666666666664),
 (2684.0, 0.03125),
 (2688.0, 0.0079365079365079361),
 (2696.0, 0.009433962264150943),
 (2697.0, 0.0075187969924812026),
 (2702.0, 0.0081967213114754103),
 (2713.0, 0.010752688172043012),
 (2721.0, 0.0072639225181598066),
 (2723.0, 0.01796

In [42]:
deliverable1 = open('../deliverables/features_svmlight_test.train', 'wb')
deliverable2 = open('../deliverables/features_test.train', 'wb')

In [43]:
features.items()

[(8193,
  [1,
   (170.0, 0.25),
   (348.0, 1.0),
   (482.0, 1.0),
   (573.0, 1.0),
   (615.0, 0.25),
   (681.0, 0.25),
   (713.0, 0.16666666666666666),
   (1100.0, 0.33333333333333331),
   (1266.0, 1.0),
   (1724.0, 0.5),
   (2684.0, 0.125),
   (2688.0, 0.015873015873015872),
   (2696.0, 0.025943396226415096),
   (2702.0, 0.032786885245901641),
   (2713.0, 0.021505376344086023),
   (2721.0, 0.019370460048426151),
   (2723.0, 0.035928143712574849),
   (2736.0, 0.021791767554479417),
   (2748.0, 0.090909090909090912),
   (2751.0, 0.026506024096385541),
   (2766.0, 0.026315789473684209),
   (2785.0, 0.023809523809523808),
   (2787.0, 0.12234042553191489),
   (2789.0, 0.033033033033033031),
   (2790.0, 0.007462686567164179),
   (2795.0, 0.026178010471204188),
   (2799.0, 0.024213075060532687),
   (2807.0, 0.16666666666666666),
   (2814.0, 0.035714285714285712),
   (2824.0, 0.028462998102466792),
   (2839.0, 0.11794871794871795),
   (2841.0, 0.021791767554479417),
   (2859.0, 0.029940119760

In [44]:
for key, values in sorted(features.items()):
    first_part = str(values[0]) + " "
    sec_part = " ".join(str(int(val[0])) + ":" + format(val[1], '.6f') for val in sorted(features[key][1:])) 
    deliverable1.write(first_part + sec_part + '\n')
    deliverable2.write(str(int(key)) + " " + first_part + sec_part + '\n')
    

# Predictive Modeling!

Now that the data's clean and we have our features in sparse vector representation, let's try our data set on a few classifiers

In [45]:
from sklearn.datasets import load_svmlight_file
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score
import utils

## Logistic Regression

In [46]:
X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")

In [47]:
X_train[1]

<1x3190 sparse matrix of type '<type 'numpy.float64'>'
	with 128 stored elements in Compressed Sparse Row format>

In [48]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

Y_pred_lr = logreg.predict(X_train)

In [49]:
Y_pred_lr

array([ 1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0

## SVM

In [50]:
svm = LinearSVC()
svm.fit(X_train, Y_train)

Y_pred_svm = svm.predict(X_train)

In [51]:
Y_pred_svm

array([ 1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0

## Decision Tree

In [52]:
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, Y_train)

Y_pred_tree = tree.predict(X_train)

In [53]:
Y_pred_tree

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,
        0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0

## Classification Metrics

In [54]:
def classification_metrics(Y_pred, Y_true):
    #NOTE: It is important to provide the output in the same order
    acc = accuracy_score(Y_true, Y_pred)
    auc = roc_auc_score(Y_true, Y_pred)
    prec = precision_score(Y_true, Y_pred)
    rec = recall_score(Y_true, Y_pred)
    f1score = f1_score(Y_true, Y_pred)

    return acc, auc, prec, rec, f1score

In [55]:
#input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName,Y_pred,Y_true):
    print "______________________________________________"
    print "Classifier: "+classifierName
    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred,Y_true)
    print "Accuracy: "+str(acc)
    print "AUC: "+str(auc_)
    print "Precision: "+str(precision)
    print "Recall: "+str(recall)
    print "F1-score: "+str(f1score)
    print "______________________________________________"
    print ""

In [56]:
display_metrics("Logistic Regression",Y_pred_lr,Y_train)
display_metrics("SVM",Y_pred_svm,Y_train)
display_metrics("Decision Tree",Y_pred_tree,Y_train)

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.954545454545
AUC: 0.945404761905
Precision: 0.986928104575
Recall: 0.89880952381
F1-score: 0.940809968847
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.994019138756
AUC: 0.994511904762
Precision: 0.988200589971
Recall: 0.997023809524
F1-score: 0.992592592593
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.776315789474
AUC: 0.747595238095
Precision: 0.792156862745
Recall: 0.60119047619
F1-score: 0.68358714044
______________________________________________



Classifiers performs very well on the training set. Let's see their performance the test data set.

In [57]:
X_test, Y_test = utils.get_data_from_svmlight("../data/features_svmlight.validate")

In [58]:
# Logistic Regression
Y_pred_lr = logreg.predict(X_test)
# SVM
Y_pred_svm = svm.predict(X_test)
# Decision tree
Y_pred_tree = tree.predict(X_test)

In [59]:
display_metrics("Logistic Regression",Y_pred_lr,Y_test)
display_metrics("SVM",Y_pred_svm,Y_test)
display_metrics("Decision Tree",Y_pred_tree,Y_test)

______________________________________________
Classifier: Logistic Regression
Accuracy: 0.738095238095
AUC: 0.7375
Precision: 0.680412371134
Recall: 0.733333333333
F1-score: 0.705882352941
______________________________________________

______________________________________________
Classifier: SVM
Accuracy: 0.738095238095
AUC: 0.738888888889
Precision: 0.676767676768
Recall: 0.744444444444
F1-score: 0.708994708995
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.671428571429
AUC: 0.656944444444
Precision: 0.632911392405
Recall: 0.555555555556
F1-score: 0.591715976331
______________________________________________



Our classifiers seem to perform pretty well on unknown data, on our test data set with accuracy of about 73%.

It seems a bit old that the Logistic Regression classifier and the SVM classifier had the same accuracy score. Let's take a look at the prediction labels for both classifiers to see if they actually gave the same labels to every patient.

In [60]:
print("Logistic Regression Y_pred")
display(Y_pred_lr)
print("SVM Y_pred")
display(Y_pred_svm)

Logistic Regression Y_pred


array([ 0.,  0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
        1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  0

SVM Y_pred


array([ 0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,
        0.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  0

It appears that both classifiers gave the same number of positives and thus ended up with the same accuracy score. However, they did not assign the same labels to every patient.

Next, we should explore using Cross fold validation to evaluate and improve our predictive model

## K-fold cross validation

In [61]:
from sklearn.cross_validation import KFold, ShuffleSplit
from numpy import mean



In [67]:
X,Y = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
kf = KFold(X.shape[0], n_folds=5, random_state= 545510477)

accuracy_scores = []
auc_scores = []

for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    accuracy_scores.append(acc)
    auc_scores.append(auc)

print("KFold mean accuracy: ", np.mean(accuracy_scores), "KFold mean AUC: ", np.mean(auc_scores))

('KFold mean accuracy: ', 0.72132164242942687, 'KFold mean AUC: ', 0.70757733030284675)


In [68]:
accuracy_scores
auc_scores

[0.68611111111111112,
 0.69493608652900685,
 0.74677419354838714,
 0.7236559139784946,
 0.68640934634723449]

In [69]:
## KFold for SVM
accuracy_scores = []
auc_scores = []

for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    svm = LinearSVC()
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    accuracy_scores.append(acc)
    auc_scores.append(auc)

print("KFold mean accuracy: ", np.mean(accuracy_scores), "KFold mean AUC: ", np.mean(auc_scores))

('KFold mean accuracy: ', 0.71298118049615056, 'KFold mean AUC: ', 0.71111108854860894)


## Shuffle split cross validation

In [None]:
ss = ShuffleSplit(X.shape[0], n_iter=5, test_size=0.25, random_state=545510477)

accuracy_scores = []
auc_scores = []

for train_index, test_index in ss:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    
    accuracy_scores.append(acc)
    auc_scores.append(auc)

print("ShuffleSplit mean accuracy: ", np.mean(accuracy_scores), "ShuffleSplit mean AUC: ", np.mean(auc_scores))

## Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier
import etl
import models_partc

In [66]:
def my_features():
	#TODO: complete this
    
    ## Creat pandas df from csv
    events, mortality, feature_map = etl.read_csv('../data/train/')
    
    
    patient_features, mortality = etl.create_features(events, mortality, feature_map)
    
    features = patient_features.copy()

    
    feature_length = len(features)
    
    # Training set = 75% of patients, Test set = 25% of patients
    features_train = dict(features.items()[(feature_length/4):])
    features_test = dict(features.items()[:(feature_length/4)])
    
    etl.save_svmlight(features_train, mortality, "../data/train_features_svmlight.train", "../data/train_features.train")
    etl.save_svmlight(features_test, mortality, "../data/test_features_svmlight.train", "../deliverables/test_features.txt")

        
    X_train, Y_train = utils.get_data_from_svmlight("../data/train_features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/test_features_svmlight.train")
    
    return X_train, Y_train, X_test