## Stacking primer, based on script by Faron


In [1]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error



Some parameter definitions

In [2]:
# Column names: the row identifier (dropped before modelling) and the label.
ID, TARGET = 'id', 'loss'

# Cross-validation fold count and the seed shared by the splitter and models.
NFOLDS, SEED = 4, 0

# Row cap handed to pd.read_csv; None reads the files completely.
NROWS = None

# Directory holding the input CSV files.
DATA_DIR = "data"

# Input paths, all built from DATA_DIR.
TRAIN_FILE, TEST_FILE, SUBMISSION_FILE = (
    template.format(DATA_DIR)
    for template in (
        "{0}/train.csv",
        "{0}/test.csv",
        "{0}/sample_submission.csv",
    )
)

In [3]:
# Read the raw train/test tables; NROWS=None loads every row.
train = pd.read_csv(TRAIN_FILE, nrows=NROWS)
test = pd.read_csv(TEST_FILE, nrows=NROWS)

# Extract the label as a 1-D NumPy array before its column is dropped.
# `.values` is used instead of `Series.ravel()`, which recent pandas deprecates.
y_train = train[TARGET].values

# Re-bind rather than drop(inplace=True): same resulting frames, but the
# lineage stays explicit.
train = train.drop([ID, TARGET], axis=1)
test = test.drop([ID], axis=1)

print("{},{}".format(train.shape, test.shape))

(188318, 130),(125546, 130)


In [4]:
# Remember how many rows each part has so the combined frame can be split again.
ntrain, ntest = len(train), len(test)

# Train rows first, test rows after, under a fresh 0..n-1 index.
train_test = pd.concat([train, test], ignore_index=True)

# Feature names (ID and TARGET were already removed from `train`).
features = train.columns

In [5]:
# A column is treated as categorical when its name contains 'cat'.
cats = [name for name in features if 'cat' in name]

# Integer-encode every categorical column over train+test together, so both
# parts share a single mapping; sort=True orders the codes by sorted value.
for feat in cats:
    codes, _uniques = pd.factorize(train_test[feat], sort=True)
    train_test[feat] = codes

print(train_test.head())

# Cut the encoded frame back into its train / test parts as plain arrays.
x_train = np.array(train_test.iloc[:ntrain])
x_test = np.array(train_test.iloc[ntrain:])

   cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  cat10    ...     \
0     0     1     0     1     0     0     0     0     1      0    ...      
1     0     1     0     0     0     0     0     0     1      1    ...      
2     0     1     0     0     1     0     0     0     1      1    ...      
3     1     1     0     1     0     0     0     0     1      0    ...      
4     0     1     0     1     0     0     0     0     1      1    ...      

      cont5     cont6     cont7    cont8    cont9   cont10    cont11  \
0  0.310061  0.718367  0.335060  0.30260  0.67135  0.83510  0.569745   
1  0.885834  0.438917  0.436585  0.60087  0.35127  0.43919  0.338312   
2  0.397069  0.289648  0.315545  0.27320  0.26076  0.32446  0.381398   
3  0.422268  0.440945  0.391128  0.31796  0.32128  0.44467  0.327915   
4  0.704268  0.178193  0.247408  0.24564  0.22089  0.21230  0.204687   

     cont12    cont13    cont14  
0  0.594646  0.822493  0.714843  
1  0.366307  0.611431  0.304496  
2  0.373

In [6]:
# Single seeded, shuffled splitter over the training rows; get_oof iterates
# this same object for every base model.
kf = KFold(
    n=ntrain,
    n_folds=NFOLDS,
    shuffle=True,
    random_state=SEED,
)

In [7]:
class SklearnWrapper(object):
    """Uniform ``train`` / ``predict`` facade over a scikit-learn style estimator.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is invoked as ``clf(**params)``.
    seed : int
        Value passed to the estimator as ``random_state``; it overrides any
        ``random_state`` entry already present in ``params``.
    params : dict or None
        Constructor keyword arguments. The dict is copied, so the caller's
        object is never modified. ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a private copy: writing 'random_state' into the caller's dict
        # would leak state between cells, and the default None is not subscriptable.
        kwargs = dict(params) if params is not None else {}
        kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [8]:
class XgbWrapper(object):
    """Uniform ``train`` / ``predict`` facade over the native xgboost booster API.

    Parameters
    ----------
    seed : int
        Stored under the ``'seed'`` key of the booster parameters, replacing
        any value already present in ``params``.
    params : dict or None
        Booster parameters. The optional ``'nrounds'`` entry is not handed to
        xgboost as a parameter; it is taken out and used as the number of
        boosting rounds (250 when absent). The dict is copied, so the caller's
        object keeps all of its entries. ``None`` means "no extra parameters".
    """

    def __init__(self, seed=0, params=None):
        # Private copy: the seed override and the 'nrounds' pop must not alter
        # the dict owned by the caller, and the default None has no .pop().
        settings = dict(params) if params is not None else {}
        settings['seed'] = seed
        self.nrounds = settings.pop('nrounds', 250)
        self.param = settings

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return booster predictions for ``x``; ``train`` must have run first."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [9]:
def get_oof(clf, features=None, target=None, holdout=None, folds=None):
    """Compute out-of-fold predictions for one model.

    For every fold the model is trained on the fold's training rows, then used
    to predict (a) the fold's held-out rows and (b) the complete hold-out set.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    features, target, holdout : array-like or None
        Training matrix, training labels and hold-out matrix. Each falls back
        to the notebook globals ``x_train`` / ``y_train`` / ``x_test`` when
        left as None. Pass them explicitly to avoid depending on whatever
        those names are bound to at call time.
    folds : iterable or splitter or None
        Either an iterable of ``(train_index, test_index)`` pairs or an object
        with a ``split(X)`` method producing such pairs. Defaults to the
        global ``kf``.

    Returns
    -------
    (oof_train, oof_test)
        Column vectors of shape ``(n_train, 1)`` and ``(n_holdout, 1)``:
        the held-out predictions for each training row, and the hold-out
        predictions averaged over the folds.
    """
    # Resolve defaults lazily so the globals are only touched when needed.
    features = x_train if features is None else features
    target = y_train if target is None else target
    holdout = x_test if holdout is None else holdout
    folds = kf if folds is None else folds

    # Accept splitter objects that expose .split(X) as well as plain iterables.
    if hasattr(folds, 'split'):
        folds = folds.split(features)
    # Materialise once: the fold count sizes the per-fold buffer below, so no
    # row of it can be left uninitialised.
    folds = list(folds)

    oof_train = np.zeros((features.shape[0],))
    oof_test_skf = np.empty((len(folds), holdout.shape[0]))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(features[train_index], target[train_index])

        oof_train[test_index] = clf.predict(features[test_index])
        oof_test_skf[i, :] = clf.predict(holdout)

    # Average the per-fold hold-out predictions into one vector.
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [17]:
# Extra-trees, first variant: deeper trees, half of the features per split.
et_params = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
)

# Extra-trees, second variant: shallower trees with the 'mae' criterion.
et_params2 = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.25,
    max_depth=6,
    min_samples_leaf=2,
    criterion='mae',
)

# Random forest, first variant.
rf_params = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.25,
    max_depth=8,
    min_samples_leaf=2,
)

# Random forest, second variant: shallower trees with the 'mae' criterion.
rf_params2 = dict(
    n_jobs=-1,
    n_estimators=200,
    max_features=0.5,
    max_depth=6,
    min_samples_leaf=2,
    criterion='mae',
)

# Level-1 xgboost settings. 'nrounds' is consumed by XgbWrapper as the number
# of boosting rounds, and XgbWrapper replaces 'seed' with its own seed argument.
xgb_params = dict(
    n_jobs=-1,
    seed=0,
    colsample_bytree=0.7,
    silent=1,
    subsample=0.7,
    learning_rate=0.04,
    objective='reg:linear',
    max_depth=7,
    num_parallel_tree=1,
    min_child_weight=1,
    eval_metric='mae',
    nrounds=350,
)



In [18]:
# Level-1 models, all built with the same seed.
xg = XgbWrapper(params=xgb_params, seed=SEED)

# Two extra-trees and two random-forest variants, one per parameter set.
et, et2 = (
    SklearnWrapper(ExtraTreesRegressor, SEED, cfg)
    for cfg in (et_params, et_params2)
)
rf, rf2 = (
    SklearnWrapper(RandomForestRegressor, SEED, cfg)
    for cfg in (rf_params, rf_params2)
)

In [None]:
# Out-of-fold predictions for each base model, one statement per model so that
# results already computed stay bound if a later model fails.
xg_oof_train, xg_oof_test = get_oof(clf=xg)
et_oof_train, et_oof_test = get_oof(clf=et)
rf_oof_train, rf_oof_test = get_oof(clf=rf)
et2_oof_train, et2_oof_test = get_oof(clf=et2)
rf2_oof_train, rf2_oof_test = get_oof(clf=rf2)

In [None]:
# Mean absolute error of each model's out-of-fold predictions on the training labels.
for template, oof_pred in (
    ("XG-CV: {}", xg_oof_train),
    ("ET-CV: {}", et_oof_train),
    ("RF-CV: {}", rf_oof_train),
    ("ET2-CV: {}", et2_oof_train),
    ("RF2-CV: {}", rf2_oof_train),
):
    print(template.format(mean_absolute_error(y_train, oof_pred)))

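Clearly, the xg, et, and rf regressors are doing a solid job. Re-running them every time is a waste of time and energy, so their out-of-fold predictions should be saved to CSV and simply read back in whenever we want to ensemble. (This notebook does not write them out yet; the next cell stacks the in-memory predictions.)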

In [14]:
# Level-2 inputs: one column per base model (xg, et, rf). The et2 / rf2
# predictions are not part of the stack.
train_blocks = [xg_oof_train, et_oof_train, rf_oof_train]
test_blocks = [xg_oof_test, et_oof_test, rf_oof_test]

# NOTE: this rebinds x_train / x_test, the names get_oof reads by default, so
# from here on they hold the stacked predictions, not the original features.
x_train = np.concatenate(train_blocks, axis=1)
x_test = np.concatenate(test_blocks, axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

(188318, 3),(125546, 3)


In [15]:
# Level-2 data: stacked base-model predictions as features.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Parameters of the level-2 (stacker) booster.
# NOTE: this rebinds xgb_params, the name previously used for the level-1 model.
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}

# Cross-validate the stacker with the same fold count as the base models.
res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=NFOLDS, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

# The history has one row per boosting round that was kept, so its length is
# the number of rounds to train (subtracting one would drop the last round).
best_nrounds = res.shape[0]

# Select the held-out metric by column name; positional access depends on the
# column order of the returned frame, which is not something to rely on.
metric = xgb_params['eval_metric']
final_round = res.iloc[-1]
cv_mean = final_round['test-{}-mean'.format(metric)]
cv_std = final_round['test-{}-std'.format(metric)]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

[0]	train-mae:3006.49+4.19014	test-mae:3006.49+12.7324
[10]	train-mae:2719.85+3.77514	test-mae:2719.83+12.8933
[20]	train-mae:2463.51+3.38504	test-mae:2463.48+13.267
[30]	train-mae:2236.35+3.02003	test-mae:2236.39+13.4033
[40]	train-mae:2038.71+2.56257	test-mae:2038.82+13.3325
[50]	train-mae:1869.83+2.27367	test-mae:1870.09+13.0377
[60]	train-mae:1727.93+1.78197	test-mae:1728.35+12.6914
[70]	train-mae:1610.35+1.51575	test-mae:1611.01+12.2194
[80]	train-mae:1514.36+1.25937	test-mae:1515.24+11.9594
[90]	train-mae:1436.48+1.34783	test-mae:1437.57+11.3221
[100]	train-mae:1374.2+1.21987	test-mae:1375.56+10.7053
[110]	train-mae:1324.33+1.07363	test-mae:1325.89+10.3493
[120]	train-mae:1285.15+0.949783	test-mae:1286.91+9.9801
[130]	train-mae:1254.08+1.17367	test-mae:1255.98+9.20725
[140]	train-mae:1229.98+1.16754	test-mae:1232.08+8.74768
[150]	train-mae:1211.32+0.970857	test-mae:1213.62+8.41304
[160]	train-mae:1196.97+0.891936	test-mae:1199.48+8.01031
[170]	train-mae:1186+0.888229	test-mae:118

In [None]:
# Fit the final level-2 booster on all stacked training rows.
gbdt = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=best_nrounds,
)

In [None]:
# Start from the sample submission and overwrite its second column with the
# stacker's predictions.
# NOTE(review): assumes column 1 of the sample file is the prediction column — confirm.
submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)

# Create the output directory first, so a missing folder cannot make the write
# fail after all the training has finished.
out_path = 'submissions/xgstacker_xg_et_rf_ad.sub.20161012.csv'
out_dir = os.path.dirname(out_path)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# index=False: do not write the row index as an extra column.
submission.to_csv(out_path, index=False)