In [177]:
%matplotlib inline
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.metrics import r2_score
from sklearn import ensemble
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

### loading data

In [228]:
df_train_raw = pd.read_csv("train.csv")
df_test_raw = pd.read_csv("data_for_test.csv")

### dropping non predictive column

In [229]:
df_train_raw = df_train_raw.drop('patient_id', axis=1)
df_test_raw = df_test_raw.drop('patient_id', axis=1)

### row to column tranformation for each key, around 150 new features generated in this operation

In [230]:
col = ['xx1','xx2','xx3','xx4','xx5']
df_train_raw['key_row_count'] = df_train_raw.groupby('key').cumcount().astype(str)
train_pivoted = pd.pivot(df_train_raw,'key','key_row_count',col)
train_pivoted.columns = ['_'.join(col) for col in train_pivoted.columns]

In [231]:
col = ['xx1','xx2','xx3','xx4','xx5']
df_test_raw['key_row_count'] = df_test_raw.groupby('key').cumcount().astype(str)
test_pivoted = pd.pivot(df_test_raw,'key','key_row_count',col)
test_pivoted.columns = ['_'.join(col) for col in test_pivoted.columns]

In [232]:
train_pivoted.head(2)

Unnamed: 0_level_0,xx1_0,xx1_1,xx1_10,xx1_11,xx1_12,xx1_13,xx1_14,xx1_15,xx1_16,xx1_17,...,xx5_27,xx5_28,xx5_29,xx5_3,xx5_4,xx5_5,xx5_6,xx5_7,xx5_8,xx5_9
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-28,82.0,82.0,79.0,77.0,78.0,76.0,78.0,83.0,83.0,82.0,...,84.0,91.0,88.0,85.0,85.0,83.0,83.0,83.0,82.0,84.0
1-10,64.0,66.0,70.0,69.0,70.0,69.0,77.0,100.0,76.0,69.0,...,72.0,73.0,73.0,77.0,77.0,77.0,77.0,76.0,76.0,75.0


In [233]:
test_pivoted.head(2)

Unnamed: 0_level_0,xx1_0,xx1_1,xx1_10,xx1_11,xx1_12,xx1_13,xx1_14,xx1_15,xx1_16,xx1_17,...,xx5_27,xx5_28,xx5_29,xx5_3,xx5_4,xx5_5,xx5_6,xx5_7,xx5_8,xx5_9
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1005-1,59.0,60.0,68.0,67.0,58.0,62.0,58.0,57.0,56.0,55.0,...,67.0,60.0,61.0,69.0,78.0,75.0,72.0,76.0,72.0,73.0
1005-10,77.0,76.0,79.0,76.0,71.0,76.0,78.0,76.0,78.0,70.0,...,73.0,72.0,68.0,71.0,70.0,76.0,68.0,70.0,70.0,70.0


### aggregation over key on the originally loaded dataset. 
NOTE this is a new df which will be later merged to the pivoted ones

In [234]:
df_train = df_train_raw.groupby(['key']).mean().reset_index()

In [235]:
df_test = df_test_raw.groupby(['key']).mean().reset_index()

In [236]:
df_train.head(2)

Unnamed: 0,key,gender,age,x1,x2,x3,x4,x5,x6,xx1,xx2,xx3,xx4,xx5,y_mean_MAP,y_mean_HR
0,0-28,0,72,39,5,1,0,0,0,80.166667,99.733333,113.433333,69.566667,87.0,86.426667,79.13
1,1-10,1,64,55,9,5,1,0,1,72.1,99.8,113.9,59.6,76.466667,73.033333,72.92


### adding more non-linear features like min, max and median

In [237]:
stats_train = df_train_raw.groupby(['key']).agg({'xx1':['min','median','max'],'xx2':['min','median','max'],
                                                        'xx3':['min','median','max'],'xx4':['min','median','max'],
                                                        'xx5':['min','median','max']
                                                       }).reset_index()
stats_train.columns = ['key','xx1_min','xx1_median','xx1_max', 'xx2_min','xx2_median','xx2_max',
                 'xx3_min','xx3_median','xx3_max','xx4_min','xx4_median','xx4_max',
                 'xx5_min','xx5_median','xx5_max']
stats_train.head(3)
# df_train = df_train.merge(stats, on = 'key', how = 'left')

Unnamed: 0,key,xx1_min,xx1_median,xx1_max,xx2_min,xx2_median,xx2_max,xx3_min,xx3_median,xx3_max,xx4_min,xx4_median,xx4_max,xx5_min,xx5_median,xx5_max
0,0-28,76.0,80.0,84.0,96.0,100.0,100.0,102.0,113.0,128.0,65.0,68.0,78.0,82.0,85.0,98.0
1,1-10,64.0,70.5,100.0,99.0,100.0,100.0,106.0,114.0,130.0,53.0,59.0,77.0,72.0,76.0,88.0
2,1-11,68.0,71.5,74.0,99.0,100.0,100.0,108.0,113.0,124.0,58.0,59.0,62.0,73.0,75.0,78.0


In [238]:
stats_test = df_test_raw.groupby(['key']).agg({'xx1':['min','median','max'],'xx2':['min','median','max'],
                                                        'xx3':['min','median','max'],'xx4':['min','median','max'],
                                                        'xx5':['min','median','max']
                                                       }).reset_index()
stats_test.columns = ['key','xx1_min','xx1_median','xx1_max', 'xx2_min','xx2_median','xx2_max',
                 'xx3_min','xx3_median','xx3_max','xx4_min','xx4_median','xx4_max',
                 'xx5_min','xx5_median','xx5_max']
stats_test.head(3)
# df_test = df_test.merge(stats, on = 'key', how = 'left')

Unnamed: 0,key,xx1_min,xx1_median,xx1_max,xx2_min,xx2_median,xx2_max,xx3_min,xx3_median,xx3_max,xx4_min,xx4_median,xx4_max,xx5_min,xx5_median,xx5_max
0,1005-1,53.0,57.5,68.0,96.0,99.0,100.0,110.0,123.5,152.0,35.0,42.5,60.0,60.0,71.5,94.0
1,1005-10,70.0,76.0,94.0,95.0,97.0,100.0,119.0,126.0,141.0,45.0,48.0,58.0,68.0,71.0,86.0
2,1005-11,57.0,65.0,100.0,89.0,92.0,97.0,102.0,108.0,120.0,45.0,48.5,55.0,61.0,65.0,73.0


### merging the three data frames that we created above

In [239]:
df_train = df_train.merge(stats_train, on = 'key', how = 'left')
df_train = df_train.merge(train_pivoted, on='key', how = 'left')

In [240]:
df_test = df_test.merge(stats_test, on = 'key', how = 'left')
df_test = df_test.merge(test_pivoted, on='key', how = 'left')

In [241]:
df_train.head(2)

Unnamed: 0,key,gender,age,x1,x2,x3,x4,x5,x6,xx1,...,xx5_27,xx5_28,xx5_29,xx5_3,xx5_4,xx5_5,xx5_6,xx5_7,xx5_8,xx5_9
0,0-28,0,72,39,5,1,0,0,0,80.166667,...,84.0,91.0,88.0,85.0,85.0,83.0,83.0,83.0,82.0,84.0
1,1-10,1,64,55,9,5,1,0,1,72.1,...,72.0,73.0,73.0,77.0,77.0,77.0,77.0,76.0,76.0,75.0


In [242]:
df_test.head(2)

Unnamed: 0,key,gender,age,x1,x2,x3,x4,x5,x6,xx1,...,xx5_27,xx5_28,xx5_29,xx5_3,xx5_4,xx5_5,xx5_6,xx5_7,xx5_8,xx5_9
0,1005-1,0,80,41,4,2,1,0,0,57.866667,...,67.0,60.0,61.0,69.0,78.0,75.0,72.0,76.0,72.0,73.0
1,1005-10,0,80,41,4,2,0,0,0,75.766667,...,73.0,72.0,68.0,71.0,70.0,76.0,68.0,70.0,70.0,70.0


### defining training and validation data
#### train test split

In [155]:
train = df_train.drop(['y_mean_HR','y_mean_MAP','key'], axis=1)

In [156]:
y_mean_MAP=df_train['y_mean_MAP']
y_mean_HR=df_train['y_mean_HR']

In [201]:
valid_idx = range(int(df_train.shape[0]*80/100), df_train.shape[0])
train_idx = range(int(df_train.shape[0]*80/100))

In [202]:
X_map_train = train.loc[train_idx,:]
X_map_val = train.loc[valid_idx,:]
y_map_train = y_mean_MAP.loc[train_idx]
y_map_val = y_mean_MAP.loc[valid_idx]

In [203]:
X_hr_train = train.loc[train_idx,:]
X_hr_val = train.loc[valid_idx,:]
y_hr_train = y_mean_HR.loc[train_idx]
y_hr_val = y_mean_HR.loc[valid_idx]

In [204]:
X_map_train.shape, X_map_val.shape , X_hr_train.shape, X_hr_val.shape

((35959, 178), (8990, 178), (35959, 178), (8990, 178))

In [205]:
y_map_train.shape, y_map_val.shape

((35959,), (8990,))

### XGBoost on y_mean_MAP (Model 2)

In [206]:
#feature selection
X_map_train.columns

Index(['gender', 'age', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'xx1', 'xx2',
       ...
       'xx5_27', 'xx5_28', 'xx5_29', 'xx5_3', 'xx5_4', 'xx5_5', 'xx5_6',
       'xx5_7', 'xx5_8', 'xx5_9'],
      dtype='object', length=178)

In [207]:
xgb_map = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=2,
                 min_child_weight=1.5,
                 n_estimators=1200,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
xgb_map.fit(X_map_train,y_map_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=2, min_child_weight=1.5, missing=None, n_estimators=1200,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.6, verbosity=1)

In [208]:
y_map_val_pred = xgb_map.predict(X_map_val)
r2_score(y_map_val,y_map_val_pred)

0.8904210975550623

### XGBoost on y_mean_HR (Model 2)

In [189]:
#feature selection
X_hr_train.columns

Index(['gender', 'age', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'xx1', 'xx2',
       ...
       'xx5_27', 'xx5_28', 'xx5_29', 'xx5_3', 'xx5_4', 'xx5_5', 'xx5_6',
       'xx5_7', 'xx5_8', 'xx5_9'],
      dtype='object', length=178)

In [191]:
xgb_hr = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
xgb_hr.fit(X_hr_train,y_hr_train)

  if getattr(data, 'base', None) is not None and \




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, gamma=0,
             importance_type='gain', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1.5, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.75, reg_lambda=0.45, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.6, verbosity=1)

In [192]:
y_hr_val_pred = xgb_hr.predict(X_hr_val)
r2_score(y_hr_val,y_hr_val_pred)

0.9538499947016639

## Stacking multiple regressors (Model 3 - the selected one)

In [219]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn import linear_model

In [2]:
### For MAP

In [220]:
r1 = LinearRegression(normalize=True, n_jobs=-1, copy_X=True)
r2 = RandomForestRegressor(
    n_estimators=500,
    criterion='mse',
    min_samples_leaf=3,
    max_features='auto',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
    n_jobs= -1,
    random_state=42,
    verbose=1,
)
r3 = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
er = VotingRegressor([('lr', r1), ('rf', r2),('xgb', r3)], weights=[0.2, 0.3, 0.5], n_jobs=-1 )
er.fit(X_map_train,y_map_train)

VotingRegressor(estimators=[('lr',
                             LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=-1, normalize=True)),
                            ('rf',
                             RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=3,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n

In [221]:
er_pred = er.predict(X_map_val)
r2_score(y_map_val,er_pred)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.2s finished


0.8920121356611059

In [None]:
### For HR

In [222]:
r1_1 = LinearRegression(normalize=True, n_jobs=-1, copy_X=True)
r2_1 = RandomForestRegressor(
    n_estimators=500,
    criterion='mse',
    min_samples_leaf=3,
    max_features='auto',
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
    n_jobs= -1,
    random_state=42,
    verbose=1,
)
r3_1 = xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0,                 
                 learning_rate=0.01,
                 max_depth=3,
                 min_child_weight=1.5,
                 n_estimators=1000,                                                                    
                 reg_alpha=0.75,
                 reg_lambda=0.45,
                 subsample=0.6,
                 seed=42)
er_1 = VotingRegressor([('lr', r1_1), ('rf', r2_1),('xgb', r3_1)], weights=[0.2, 0.2, 0.6], n_jobs=-1 )
er_1.fit(X_hr_train,y_hr_train)

VotingRegressor(estimators=[('lr',
                             LinearRegression(copy_X=True, fit_intercept=True,
                                              n_jobs=-1, normalize=True)),
                            ('rf',
                             RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=3,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n

In [223]:
er_1_pred = er_1.predict(X_hr_val)
r2_score(y_hr_val,er_1_pred)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.1s finished


0.9543744950486026

In [225]:
X_hr_train.shape

(35959, 178)

### Predictions on Test

In [243]:
test_hr_pred = er_1.predict(df_test.drop('key', axis=1))

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.2s finished


In [244]:
test_map_pred = er.predict(df_test.drop('key', axis=1))

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.2s finished


In [245]:
df_test['y_mean_MAP'] = test_map_pred
df_test['y_mean_HR'] = test_hr_pred

In [246]:
df_submit = df_test[['key','y_mean_MAP','y_mean_HR']]

In [247]:
df_submit.head()

Unnamed: 0,key,y_mean_MAP,y_mean_HR
0,1005-1,64.046467,55.22282
1,1005-10,71.324177,74.91352
2,1005-11,64.458873,66.226725
3,1005-12,68.622625,62.417755
4,1005-13,76.887255,75.127072


In [248]:
df_submit.shape

(13019, 3)

In [249]:
df_submit = df_submit.groupby('key').mean()

In [251]:
# df_submit.to_csv("no_one_6.csv")