In [296]:
%matplotlib inline
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.metrics import r2_score
from sklearn import ensemble
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import featuretools as ft

In [262]:
# ignore warnings from pandas
import warnings
warnings.filterwarnings('ignore')

In [228]:
#!pip install -U featuretools

In [273]:
df_train_raw = pd.read_csv("./data/train.csv")
df_test_raw = pd.read_csv("./data/data_for_test.csv")

In [274]:
df_train = df_train_raw.groupby(['key']).mean().reset_index()

In [275]:
df_train.head()

Unnamed: 0,key,patient_id,gender,age,x1,x2,x3,x4,x5,x6,xx1,xx2,xx3,xx4,xx5,y_mean_MAP,y_mean_HR
0,0-28,0,0,72,39,5,1,0,0,0,80.166667,99.733333,113.433333,69.566667,87.0,86.426667,79.13
1,1-10,1,1,64,55,9,5,1,0,1,72.1,99.8,113.9,59.6,76.466667,73.033333,72.92
2,1-11,1,1,64,55,9,5,1,0,1,71.5,99.966667,114.066667,59.5,75.4,77.56,72.493333
3,1-14,1,1,64,55,9,5,1,0,1,99.133333,100.0,133.433333,69.833333,84.966667,82.523333,118.613333
4,1-15,1,1,64,55,9,5,1,0,1,150.2,100.0,123.733333,77.1,89.233333,98.706667,189.72


In [276]:
stats = df_train_raw.groupby(['patient_id','key']).agg({'xx1':['last','mean'],'xx2':['last','mean'],
                                                        'xx3':['last','mean'],'xx4':['last','mean'],
                                                        'xx5':['last','mean']
                                                       }).reset_index()
stats.columns = ['patient_id','key','xx1_last', 
                 'xx1_mean','xx2_last','xx2_mean',
                 'xx3_last','xx3_mean','xx4_last',
                'xx4_mean','xx5_last','xx5_mean']
stats.drop(['patient_id'], axis = 1, inplace = True)
df_train = df_train.merge(stats, on = 'key', how = 'left')

In [277]:
df_train.head()

Unnamed: 0,key,patient_id,gender,age,x1,x2,x3,x4,x5,x6,...,xx1_last,xx1_mean,xx2_last,xx2_mean,xx3_last,xx3_mean,xx4_last,xx4_mean,xx5_last,xx5_mean
0,0-28,0,0,72,39,5,1,0,0,0,...,79.0,80.166667,100.0,99.733333,112.0,113.433333,71.0,69.566667,88.0,87.0
1,1-10,1,1,64,55,9,5,1,0,1,...,71.0,72.1,100.0,99.8,109.0,113.9,58.0,59.6,73.0,76.466667
2,1-11,1,1,64,55,9,5,1,0,1,...,73.0,71.5,100.0,99.966667,117.0,114.066667,61.0,59.5,77.0,75.4
3,1-14,1,1,64,55,9,5,1,0,1,...,119.0,99.133333,100.0,100.0,116.0,133.433333,70.0,69.833333,82.0,84.966667
4,1-15,1,1,64,55,9,5,1,0,1,...,190.0,150.2,100.0,100.0,133.0,123.733333,85.0,77.1,99.0,89.233333


In [278]:
#drop target column
df_train.drop(['y_mean_HR','y_mean_MAP'], axis = 1, inplace = True)

In [280]:
valid_idx = range(int(df_train.shape[0]*80/100), df_train.shape[0])
train_idx = range(int(df_train.shape[0]*80/100))

In [281]:
#specify Target
y_mean_MAP = df_train_raw[['y_mean_MAP','key']].groupby('key').mean().reset_index().drop(['key'], axis = 1)
y_mean_HR = df_train_raw[['y_mean_HR','key']].groupby('key').mean().reset_index().drop(['key'], axis = 1)

In [286]:
#Train Test Split
X_map_train = df_train.loc[train_idx,:]
X_map_val = df_train.loc[valid_idx,:]
y_map_train = y_mean_MAP.loc[train_idx]
y_map_val = y_mean_MAP.loc[valid_idx]

In [289]:
X_hr_train = df_train.loc[train_idx,:]
X_hr_val = df_train.loc[valid_idx,:]
y_hr_train = y_mean_HR.loc[train_idx]
y_hr_val = y_mean_HR.loc[valid_idx]

In [293]:
#Sanity Check
print(X_map_train.key.isin(X_map_val.key).all())
print(X_hr_train.key.isin(X_hr_val.key).all())

False
False


### XGBoost on y_mean_MAP

In [298]:
#feature selection
X_map_train.columns

Index(['key', 'patient_id', 'gender', 'age', 'x1', 'x2', 'x3', 'x4', 'x5',
       'x6', 'xx1', 'xx2', 'xx3', 'xx4', 'xx5', 'xx1_last', 'xx1_mean',
       'xx2_last', 'xx2_mean', 'xx3_last', 'xx3_mean', 'xx4_last', 'xx4_mean',
       'xx5_last', 'xx5_mean'],
      dtype='object')

In [299]:
X_map_train = X_map_train[['gender','age','x1','x2','xx3','xx4','xx5','xx3_mean','xx3_last',
                   'xx4_mean','xx4_last','xx5_mean','xx5_last']]
X_map_val = X_map_val[['gender','age','x1','x2','xx3','xx4','xx5','xx3_mean','xx3_last',
                   'xx4_mean','xx4_last','xx5_mean','xx5_last']]

In [311]:
xgb_map = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
xgb_map.fit(X_map_train,y_map_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.6, verbosity=1)

In [312]:
y_map_val_pred = xgb_map.predict(X_map_val)
r2_score(y_map_val,y_map_val_pred)

0.8845700041397508

### XGBoost on y_mean_HR

In [308]:
#feature selection
X_hr_train.columns

Index(['key', 'patient_id', 'gender', 'age', 'x1', 'x2', 'x3', 'x4', 'x5',
       'x6', 'xx1', 'xx2', 'xx3', 'xx4', 'xx5', 'xx1_last', 'xx1_mean',
       'xx2_last', 'xx2_mean', 'xx3_last', 'xx3_mean', 'xx4_last', 'xx4_mean',
       'xx5_last', 'xx5_mean'],
      dtype='object')

In [313]:
X_hr_train = X_hr_train[['age','xx4','xx5','xx4_mean','xx4_last',
                          'xx5_mean','xx5_last','xx1','xx1_mean','xx1_last']]
X_hr_val = X_hr_val[['age','xx4','xx5','xx4_mean','xx4_last',
                          'xx5_mean','xx5_last','xx1','xx1_mean','xx1_last']]

In [314]:
xgb_hr = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
xgb_hr.fit(X_hr_train,y_hr_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.6, verbosity=1)

In [315]:
y_hr_val_pred = xgb_hr.predict(X_hr_val)
r2_score(y_hr_val,y_hr_val_pred)

0.9497421690921983

### Test

In [316]:
df_test = df_test_raw.groupby(['key']).mean().reset_index()

In [317]:
df_test.head()

Unnamed: 0,key,patient_id,gender,age,x1,x2,x3,x4,x5,x6,xx1,xx2,xx3,xx4,xx5
0,1005-1,1005,0,80,41,4,2,1,0,0,57.866667,98.433333,126.566667,42.8,71.466667
1,1005-10,1005,0,80,41,4,2,0,0,0,75.766667,97.333333,126.933333,48.4,71.866667
2,1005-11,1005,0,80,41,4,2,1,0,0,65.666667,92.166667,109.0,48.633333,65.6
3,1005-12,1005,0,80,41,4,2,1,0,0,64.6,94.933333,124.2,55.4,75.133333
4,1005-13,1005,0,80,41,4,2,0,0,0,73.8,97.4,134.733333,52.366667,77.2


In [318]:
stats_test = df_test_raw.groupby(['patient_id','key']).agg({'xx1':['last','mean'],'xx2':['last','mean'],
                                                        'xx3':['last','mean'],'xx4':['last','mean'],
                                                        'xx5':['last','mean']
                                                       }).reset_index()
stats_test.columns = ['patient_id','key','xx1_last', 
                 'xx1_mean','xx2_last','xx2_mean',
                 'xx3_last','xx3_mean','xx4_last',
                'xx4_mean','xx5_last','xx5_mean']
stats_test.drop(['patient_id'], axis = 1, inplace = True)
df_test = df_test.merge(stats_test, on = 'key', how = 'left')

In [319]:
df_test.head()

Unnamed: 0,key,patient_id,gender,age,x1,x2,x3,x4,x5,x6,...,xx1_last,xx1_mean,xx2_last,xx2_mean,xx3_last,xx3_mean,xx4_last,xx4_mean,xx5_last,xx5_mean
0,1005-1,1005,0,80,41,4,2,1,0,0,...,55.0,57.866667,99.0,98.433333,116.0,126.566667,36.0,42.8,61.0,71.466667
1,1005-10,1005,0,80,41,4,2,0,0,0,...,71.0,75.766667,100.0,97.333333,124.0,126.933333,46.0,48.4,68.0,71.866667
2,1005-11,1005,0,80,41,4,2,1,0,0,...,65.0,65.666667,89.0,92.166667,105.0,109.0,46.0,48.633333,62.0,65.6
3,1005-12,1005,0,80,41,4,2,1,0,0,...,68.0,64.6,97.0,94.933333,108.0,124.2,47.0,55.4,68.0,75.133333
4,1005-13,1005,0,80,41,4,2,0,0,0,...,76.0,73.8,99.0,97.4,132.0,134.733333,52.0,52.366667,76.0,77.2


In [320]:
#feature selection for test 
df_test_map =df_test[['gender','age','x1','x2','xx3','xx4','xx5','xx3_mean','xx3_last',
                   'xx4_mean','xx4_last','xx5_mean','xx5_last']]
df_test_hr = df_test[['age','xx4','xx5','xx4_mean','xx4_last',
                          'xx5_mean','xx5_last','xx1','xx1_mean','xx1_last']]

In [321]:
df_test_map.head()

Unnamed: 0,gender,age,x1,x2,xx3,xx4,xx5,xx3_mean,xx3_last,xx4_mean,xx4_last,xx5_mean,xx5_last
0,0,80,41,4,126.566667,42.8,71.466667,126.566667,116.0,42.8,36.0,71.466667,61.0
1,0,80,41,4,126.933333,48.4,71.866667,126.933333,124.0,48.4,46.0,71.866667,68.0
2,0,80,41,4,109.0,48.633333,65.6,109.0,105.0,48.633333,46.0,65.6,62.0
3,0,80,41,4,124.2,55.4,75.133333,124.2,108.0,55.4,47.0,75.133333,68.0
4,0,80,41,4,134.733333,52.366667,77.2,134.733333,132.0,52.366667,52.0,77.2,76.0


In [322]:
df_test_hr.head()

Unnamed: 0,age,xx4,xx5,xx4_mean,xx4_last,xx5_mean,xx5_last,xx1,xx1_mean,xx1_last
0,80,42.8,71.466667,42.8,36.0,71.466667,61.0,57.866667,57.866667,55.0
1,80,48.4,71.866667,48.4,46.0,71.866667,68.0,75.766667,75.766667,71.0
2,80,48.633333,65.6,48.633333,46.0,65.6,62.0,65.666667,65.666667,65.0
3,80,55.4,75.133333,55.4,47.0,75.133333,68.0,64.6,64.6,68.0
4,80,52.366667,77.2,52.366667,52.0,77.2,76.0,73.8,73.8,76.0


In [325]:
test_hr_pred = xgb_hr.predict(df_test_hr)

In [326]:
test_map_pred = xgb_map.predict(df_test_map)

In [327]:
df_test['y_mean_MAP'] = test_map_pred
df_test['y_mean_HR'] = test_hr_pred

In [328]:
df_submit = df_test[['key','y_mean_MAP','y_mean_HR']]

In [329]:
df_submit.head()

Unnamed: 0,key,y_mean_MAP,y_mean_HR
0,1005-1,64.651085,56.20916
1,1005-10,69.766434,73.415321
2,1005-11,64.477036,65.145874
3,1005-12,70.389214,66.327324
4,1005-13,76.479774,74.878159


In [330]:
df_submit = df_submit.groupby('key').mean()

In [331]:
df_submit.head()

Unnamed: 0_level_0,y_mean_MAP,y_mean_HR
key,Unnamed: 1_level_1,Unnamed: 2_level_1
1005-1,64.651085,56.20916
1005-10,69.766434,73.415321
1005-11,64.477036,65.145874
1005-12,70.389214,66.327324
1005-13,76.479774,74.878159


In [332]:
df_submit.to_csv("no_one_4.csv")