In [199]:
%matplotlib inline
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split,ShuffleSplit
from sklearn.metrics import r2_score
from sklearn import ensemble
from sklearn.decomposition import PCA
import xgboost as xgb

In [56]:
df_train_raw = pd.read_csv("./data/train.csv")

In [None]:
df_test = pd.read_csv("./data/data_for_test.csv")

In [57]:
df_train_cat = df_train_raw[['key','gender','age','x1','x2','x3','x4','x5','x6']]
df_train_cat = df_train_cat.groupby('key').agg(lambda x: x.value_counts().index[0]).reset_index()

In [59]:
df_train_cat.shape

(44949, 9)

In [58]:
df_train_num = df_train_raw[['xx1','xx2','xx3','xx4','key']]
df_train_num = df_train_num.groupby('key').mean().reset_index()

In [60]:
df_train_num.shape

(44949, 5)

In [61]:
df_train = pd.merge(df_train_num, df_train_cat, how = 'inner', on = 'key')

In [62]:
df_train.shape

(44949, 13)

In [108]:
stats = df_train_raw.groupby('key')['xx3'].agg(['max', 'min','std','sem'])
stats.columns = ['max_xx3_amount', 'min_xx3_amount','std_xx3_amount','sem_xx3_amount']
df_train = df_train.merge(stats, left_on = 'key', right_index=True, how = 'left')

In [109]:
stats = df_train_raw.groupby('key')['xx4'].agg(['max', 'min','std','sem'])
stats.columns = [ 'max_xx4_amount', 'min_xx4_amount','std_xx4_amount','sem_xx4_amount']
df_train = df_train.merge(stats, left_on = 'key', right_index=True, how = 'left')

In [110]:
stats = df_train_raw.groupby('key')['xx5'].agg(['max', 'min','std','sem'])
stats.columns = [ 'max_xx5_amount', 'min_xx5_amount','std_xx5_amount','sem_xx5_amount']
df_train = df_train.merge(stats, left_on = 'key', right_index=True, how = 'left')

In [312]:
X_map = df_train.drop(['key'],axis = 1)
X_hr = df_train.drop(['key'],axis = 1)

### Feature selection for y_mean_MAP
1. high correlation column with target xx3,xx4,xx5
2. high correlation column between features: x1-x2,x5-x6

#### train_valid split

In [138]:
y_mean_MAP = df_train_raw[['y_mean_MAP','key']].groupby('key').mean().reset_index().drop(['key'], axis = 1)

In [139]:
y_mean_MAP.index

RangeIndex(start=0, stop=44949, step=1)

In [234]:
y_mean_HR = df_train_raw[['y_mean_HR','key']].groupby('key').mean().reset_index().drop(['key'], axis = 1)

In [148]:
X_map_train, X_map_val, y_map_train, y_map_val = train_test_split(X_map, y_mean_MAP, test_size=0.33, random_state=42)

In [313]:
X_hr_train, X_hr_val, y_hr_train, y_hr_val = train_test_split(X_hr, y_mean_HR, test_size=0.33, random_state=42)

### XGBoost on y_mean_MAP

In [236]:
regr_model = ensemble.GradientBoostingRegressor(learning_rate=0.01, subsample = 0.7, max_depth=7, 
                                                n_estimators=1000, min_samples_leaf=10, max_features=10)

In [237]:
regr_model.fit(X_map_train,y_map_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.01, loss='ls', max_depth=7,
                          max_features=10, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=10, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1000,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=0.7, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [238]:
regr_model.score(X_map_val, y_map_val)

0.8135857255026723

### XGBoost on y_mean_HR

In [326]:
regr_model_hr = ensemble.GradientBoostingRegressor(learning_rate=0.015, subsample = 0.70, max_depth=5, 
                                                n_estimators=700, min_samples_leaf=5, max_features=10)

In [327]:
regr_model_hr.fit(X_hr_train,y_hr_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.015, loss='ls', max_depth=5,
                          max_features=10, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=5, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=700,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=0.7, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [328]:
regr_model_hr.score(X_hr_val, y_hr_val)

0.9116928991023437