## Stacking primer, based on script by Faron

### Heavily edited at this point by me

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.feature_extraction import DictVectorizer

from scipy.sparse import csr_matrix

Some parameter definitions

In [2]:
# Names of the identifier and target columns.
ID, TARGET = 'id', 'loss'

# Number of cross-validation folds and the seed used throughout.
NFOLDS, SEED = 4, 61222

# Optional cap on the number of rows read from each CSV (None = no cap).
NROWS = None

# Input locations; every path is built relative to DATA_DIR.
DATA_DIR = "data"
TRAIN_FILE = "{0}/train.csv".format(DATA_DIR)
TEST_FILE = "{0}/test.csv".format(DATA_DIR)
SUBMISSION_FILE = "{0}/sample_submission.csv".format(DATA_DIR)

In [40]:
# Load the raw train/test tables (NROWS limits the rows read; None reads all).
train = pd.read_csv(TRAIN_FILE, nrows=NROWS)
test = pd.read_csv(TEST_FILE, nrows=NROWS)

train_loss = train[[ID, TARGET]]  # template for saved single regressor outputs

# Log-transform the target to reduce skewness.
# `.values` replaces `Series.ravel()`, which is deprecated in recent pandas
# releases; both yield the same 1-D NumPy array.
y_train = np.log(train[TARGET].values)

# Strip the id/target columns so only feature columns remain.
train.drop([ID, TARGET], axis=1, inplace=True)
test.drop([ID], axis=1, inplace=True)

print("{},{}".format(train.shape, test.shape))

(188318, 130),(125546, 130)


In [41]:
# Row counts of each table; ntrain is the boundary used to split the
# combined frame back into its train and test parts.
ntrain, ntest = train.shape[0], test.shape[0]

# Feature column names, taken from the training frame.
features = train.columns

# Stack train on top of test and renumber the rows from zero.
train_test = pd.concat((train, test)).reset_index(drop=True)

In [5]:
def encode_onehot(df, cols):
    """
    Replace the given columns of a DataFrame with DictVectorizer features.

    The selected columns are converted to one dict per row, vectorized with
    scikit-learn's DictVectorizer, densified, and joined back onto the
    remaining columns under the vectorizer's feature names.

    Modified from: https://gist.github.com/kljensen/5452382
    and: https://gist.github.com/ramhiser/982ce339d5f8c9a769a0

    Details:

    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    NOTE(review): the encoded block is materialized as a dense array via
    ``toarray()``, so memory grows with rows x encoded features.

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a new DataFrame: ``df`` without ``cols``, plus the encoded columns
    """
    # One {column: value} dict per row, which is the input DictVectorizer expects.
    records = df[cols].to_dict(orient='records')

    vectorizer = DictVectorizer()
    dense = vectorizer.fit_transform(records).toarray()

    # Label the encoded columns and align rows with the source frame so the
    # join below matches row-for-row.
    encoded = pd.DataFrame(dense)
    encoded.columns = vectorizer.get_feature_names()
    encoded.index = df.index

    return df.drop(cols, axis=1).join(encoded)

In [42]:
# Categorical features are the columns whose name contains 'cat'.
cats = [name for name in features if 'cat' in name]

# Encode them on the combined frame.
train_test = encode_onehot(train_test, cats)

print(train_test.head())

# Split the combined frame back at the train/test boundary.
x_train = np.array(train_test.iloc[:ntrain, :])
x_test = np.array(train_test.iloc[ntrain:, :])

      cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
0  0.726300  0.245921  0.187583  0.789639  0.310061  0.718367  0.335060   
1  0.330514  0.737068  0.592681  0.614134  0.885834  0.438917  0.436585   
2  0.261841  0.358319  0.484196  0.236924  0.397069  0.289648  0.315545   
3  0.321594  0.555782  0.527991  0.373816  0.422268  0.440945  0.391128   
4  0.273204  0.159990  0.527991  0.473202  0.704268  0.178193  0.247408   

     cont8    cont9   cont10   ...    cat99=M  cat99=N  cat99=O  cat99=P  \
0  0.30260  0.67135  0.83510   ...        0.0      0.0      0.0      0.0   
1  0.60087  0.35127  0.43919   ...        0.0      0.0      0.0      0.0   
2  0.27320  0.26076  0.32446   ...        0.0      0.0      0.0      0.0   
3  0.31796  0.32128  0.44467   ...        0.0      0.0      0.0      0.0   
4  0.24564  0.22089  0.21230   ...        0.0      0.0      0.0      1.0   

   cat99=R  cat99=S  cat99=T  cat99=U  cat9=A  cat9=B  
0      0.0      0.0      1.0      0.

In [43]:
# Convert both feature matrices to SciPy CSR sparse format.
x_train, x_test = csr_matrix(x_train), csr_matrix(x_test)

In [8]:
# Shuffled, seeded K-fold splitter over the training rows.
kf = KFold(
    ntrain,
    n_folds=NFOLDS,
    random_state=SEED,
    shuffle=True,
)

In [9]:
class SklearnWrapper(object):
    """Uniform ``train``/``predict`` facade over an estimator class.

    The estimator is built from ``params`` plus a seed. Because estimators
    disagree on the seed keyword, construction is attempted in this order:

    1. with ``seed=<seed>``
    2. with ``random_state=<seed>``
    3. with no seed keyword at all

    A step is abandoned when the constructor raises ``TypeError`` (which is
    what Python raises for an unexpected keyword argument). Note that a
    ``TypeError`` raised by the constructor for any other reason triggers
    the same fallback.

    @param clf estimator class (or any callable returning an estimator)
    @param seed value passed as ``seed`` or ``random_state`` when accepted
    @param params dict of constructor keyword arguments, or None for none;
                  the dict is copied and never modified
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is left untouched, and let None mean
        # "no extra keyword arguments" instead of crashing.
        kwargs = {} if params is None else dict(params)

        kwargs['seed'] = seed
        try:
            self.clf = clf(**kwargs)
            return
        except TypeError:
            kwargs.pop('seed', None)

        kwargs['random_state'] = seed
        try:
            self.clf = clf(**kwargs)
            return
        except TypeError:
            kwargs.pop('random_state', None)

        # Last resort: the estimator takes no seed keyword.
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on the given features and targets."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

In [25]:
class SGBWrapper(object):
    """``train``/``predict`` facade that feeds the estimator dense arrays.

    Inputs exposing a ``toarray()`` method (e.g. SciPy sparse matrices) are
    densified before being handed to the estimator; anything else is passed
    through unchanged.

    The estimator is built from ``params`` plus a seed, trying the keyword
    ``seed`` first, then ``random_state``, then no seed keyword. A step is
    abandoned when the constructor raises ``TypeError``; a ``TypeError``
    raised for any other reason triggers the same fallback.

    @param clf estimator class (or any callable returning an estimator)
    @param seed value passed as ``seed`` or ``random_state`` when accepted
    @param params dict of constructor keyword arguments, or None for none;
                  the dict is copied and never modified
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy so the caller's dict is left untouched, and let None mean
        # "no extra keyword arguments" instead of crashing.
        kwargs = {} if params is None else dict(params)

        kwargs['seed'] = seed
        try:
            self.clf = clf(**kwargs)
            return
        except TypeError:
            kwargs.pop('seed', None)

        kwargs['random_state'] = seed
        try:
            self.clf = clf(**kwargs)
            return
        except TypeError:
            kwargs.pop('random_state', None)

        # Last resort: the estimator takes no seed keyword.
        self.clf = clf(**kwargs)

    @staticmethod
    def _dense(x):
        """Return ``x.toarray()`` when available, otherwise ``x`` itself."""
        return x.toarray() if hasattr(x, 'toarray') else x

    def train(self, x_train, y_train):
        """Fit the wrapped estimator on a dense view of ``x_train``."""
        self.clf.fit(self._dense(x_train), y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for a dense view of ``x``."""
        return self.clf.predict(self._dense(x))

In [10]:
class XgbWrapper(object):
    """``train``/``predict`` facade over the native ``xgb.train`` API.

    @param seed stored under the ``'seed'`` key of the booster parameters,
                overriding any ``'seed'`` already present in ``params``
    @param params dict of booster parameters, or None for none. An optional
                  ``'nrounds'`` entry (default 250) is removed from the
                  booster parameters and used as the number of boosting
                  rounds. The dict is copied and never modified.
    """

    def __init__(self, seed=0, params=None):
        # Copy so the caller's dict keeps its 'nrounds' entry and does not
        # gain a 'seed' entry; None means "no parameters".
        self.param = {} if params is None else dict(params)
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)

    def train(self, x_train, y_train):
        """Train a booster for ``self.nrounds`` rounds on the given data."""
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``."""
        return self.gbdt.predict(xgb.DMatrix(x))

In [11]:
def get_oof(clf):
    """
    Produce out-of-fold predictions for one model.

    Reads the module-level ``kf``, ``x_train``, ``y_train``, ``x_test``,
    ``ntrain``, ``ntest`` and ``NFOLDS``. For every fold the model is
    re-trained on the fold's training rows, then used to predict the
    fold's held-out rows and the whole test set.

    @param clf object exposing ``train(x, y)`` and ``predict(x)``
    @return tuple ``(oof_train, oof_test)`` of column vectors (shape
            ``(n, 1)``): held-out predictions for each training row, and
            test predictions averaged over the folds
    """
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(kf):
        fit_x, fit_y = x_train[fit_idx], y_train[fit_idx]
        holdout_x = x_train[holdout_idx]

        clf.train(fit_x, fit_y)

        # Each training row is predicted only by a model that never saw it.
        train_preds[holdout_idx] = clf.predict(holdout_x)
        fold_test_preds[fold, :] = clf.predict(x_test)

    # Average the per-fold test predictions into one vector.
    test_preds = np.zeros((ntest,))
    test_preds[:] = fold_test_preds.mean(axis=0)
    return train_preds.reshape(-1, 1), test_preds.reshape(-1, 1)

In [30]:
# --- tree ensembles -------------------------------------------------------

# extra trees params
et_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

# random forest params
rf_params = {
    'n_jobs': -1,
    'n_estimators': 100,
    'max_features': 0.25,
    'max_depth': 8,
    'min_samples_leaf': 2,
}

# xgboost params
xgb_params = {
    'n_jobs': -1,
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.07,
    'objective': 'reg:linear',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}

# --- linear models --------------------------------------------------------

# linear regression params
lr_params = {'n_jobs': -1}

# ridge regression params
rr_params = {'alpha': 1.0}

# elastic net params
en_params = {'alpha': 1.0}

# --- boosting -------------------------------------------------------------

# stochastic gradient boosting
sgb_params = {
    'n_estimators': 50,
    'max_depth': 6,
    'learning_rate': 0.07,
    'max_features': 0.25,
}

In [44]:
# Instantiate one wrapped base model per algorithm, all sharing SEED.
# Tree-based models:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)

# Linear models, plus gradient boosting through the densifying wrapper:
lr = SklearnWrapper(clf=LinearRegression, seed=SEED, params=lr_params)
rr = SklearnWrapper(clf=Ridge, seed=SEED, params=rr_params)
en = SklearnWrapper(clf=ElasticNet, seed=SEED, params=en_params)
sgb = SGBWrapper(clf=GradientBoostingRegressor, seed=SEED, params=sgb_params)

In [45]:
# Out-of-fold train predictions and fold-averaged test predictions for xg.
xg_oof_train, xg_oof_test = get_oof(xg)

In [46]:
# Out-of-fold train predictions and fold-averaged test predictions for et.
et_oof_train, et_oof_test = get_oof(et)

In [47]:
# Out-of-fold train predictions and fold-averaged test predictions for rf.
rf_oof_train, rf_oof_test = get_oof(rf)

In [48]:
# Out-of-fold train predictions and fold-averaged test predictions for lr.
lr_oof_train, lr_oof_test = get_oof(lr)

In [49]:
# Out-of-fold train predictions and fold-averaged test predictions for rr.
rr_oof_train, rr_oof_test = get_oof(rr)

In [50]:
# Out-of-fold train predictions and fold-averaged test predictions for en.
en_oof_train, en_oof_test = get_oof(en)

In [51]:
# sgb_oof_train, sgb_oof_test = get_oof(sgb)

In [52]:
# Report each base model's mean absolute error between y_train and its
# out-of-fold predictions. NOTE(review): if y_train is the log-transformed
# target at this point, these errors are in log space, not loss units.
print("XG-CV: {}".format(mean_absolute_error(y_train, xg_oof_train)))
print("ET-CV: {}".format(mean_absolute_error(y_train, et_oof_train)))
print("RF-CV: {}".format(mean_absolute_error(y_train, rf_oof_train)))

print("LR-CV: {}".format(mean_absolute_error(y_train, lr_oof_train)))
print("RR-CV: {}".format(mean_absolute_error(y_train, rr_oof_train)))
print("EN-CV: {}".format(mean_absolute_error(y_train, en_oof_train)))
# print("SGB-CV: {}".format(mean_absolute_error(y_train, sgb_oof_train)))

XG-CV: 0.428684197175055
ET-CV: 0.45084699457639427
RF-CV: 0.47125714088200527
LR-CV: 0.4413113497884676
RR-CV: 0.4410594955651067
EN-CV: 0.6600409203406471


Clearly, the xg, et, and rf regressors are doing a solid job. Re-running them every time is a waste of time and energy, so let's save their outputs and simply read them back in from CSV whenever we want to ensemble.

In [53]:
# Build the second-level feature matrices: one column per base model.
# NOTE: this rebinds the module-level x_train / x_test, so get_oof() must
# not be called again after this cell without rebuilding the original
# feature matrices first.
x_train = np.concatenate(
    [xg_oof_train, et_oof_train, rf_oof_train,
     lr_oof_train, rr_oof_train, en_oof_train],
    axis=1,
)
x_test = np.concatenate(
    [xg_oof_test, et_oof_test, rf_oof_test,
     lr_oof_test, rr_oof_test, en_oof_test],
    axis=1,
)

print("{},{}".format(x_train.shape, x_test.shape))

(188318, 6),(125546, 6)


In [54]:
# Wrap the stacked features for the native xgboost API.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

# Parameters of the second-level (ensembling) booster.
# NOTE: this rebinds the module-level name xgb_params.
xgb_params = {
    'seed': SEED,
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.05,
    'objective': 'reg:linear',
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'mae',
}

# Cross-validate with early stopping to choose the number of boosting rounds.
res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=4, seed=SEED, stratified=False,
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)

# NOTE(review): with early stopping the result appears to be truncated to the
# best iteration, in which case res.shape[0] (without the -1) would be the
# best round count. Left unchanged so the trained model is not altered.
best_nrounds = res.shape[0] - 1

# Select the held-out metric by column name. The positional order of the
# train/test columns returned by xgb.cv differs between xgboost releases,
# so res.iloc[-1, 0] may report the *train* MAE on some versions.
cv_mean = res['test-mae-mean'].iloc[-1]
cv_std = res['test-mae-std'].iloc[-1]

print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

[0]	train-mae:6.82614+0.00107958	test-mae:6.82613+0.00334772
[10]	train-mae:4.08754+0.000793454	test-mae:4.08748+0.00340333
[20]	train-mae:2.44845+0.000367166	test-mae:2.44833+0.00378233
[30]	train-mae:1.47193+0.000279706	test-mae:1.47186+0.0034762
[40]	train-mae:0.911961+0.000281293	test-mae:0.912118+0.00256106
[50]	train-mae:0.628162+0.000371504	test-mae:0.628667+0.00154099
[60]	train-mae:0.503764+0.000379435	test-mae:0.504628+0.00069803
[70]	train-mae:0.453875+0.000465415	test-mae:0.455006+0.000462354
[80]	train-mae:0.434418+0.000466007	test-mae:0.435763+0.000744276
[90]	train-mae:0.426718+0.00043101	test-mae:0.428236+0.000988498
[100]	train-mae:0.42362+0.000431106	test-mae:0.425264+0.00110175
[110]	train-mae:0.422232+0.000433657	test-mae:0.423989+0.00118558
[120]	train-mae:0.421563+0.000443859	test-mae:0.423412+0.00121627
[130]	train-mae:0.421164+0.000407181	test-mae:0.423119+0.00127382
[140]	train-mae:0.420901+0.000403718	test-mae:0.422951+0.00129153
[150]	train-mae:0.420716+0.000

In [55]:
# Train the final second-level booster on dtrain for best_nrounds rounds.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

In [56]:
# Start from the sample submission and overwrite its second column with the
# ensemble's predictions; np.exp is applied to the booster output before
# writing. NOTE(review): the 'submissions/' directory is assumed to exist.
submission = pd.read_csv(SUBMISSION_FILE)
#submission.iloc[:, 1] = gbdt.predict(dtest)
submission.iloc[:, 1] = np.exp(gbdt.predict(dtest))
submission.to_csv('submissions/ensemble.sub.xgb_rf_et_lr_rr_en.logpred.20161019.csv', index=None)

In [62]:
# Alternative blend: take the row-wise mean of the columns of x_test, apply
# np.exp, and write the result as a second submission file (reusing the
# `submission` frame loaded earlier).
submission.iloc[:, 1]=np.exp(np.mean(x_test,axis=1))
submission.to_csv('submissions/ensemble.mean.sub.xgb_rf_et_lr_rr_en.logpred.20161019.csv', index=None)