In [6]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import xgboost as xgb

In [7]:
#read in constants
alpha0 = 8.9615;
seed = 42;
n_folds =5;

In [8]:
#evaluation metric with cross validation
def rmse_cv(model):
    rmse = np.sqrt( -cross_val_score(model, X_tr, y_tr, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [9]:
def rmse(y_predicted, y_actual):
    return(np.sqrt(mean_squared_error(y_actual, y_predicted)))

In [10]:
def stacking_regression(models, meta_model, X_train, y_train, X_test,
             metric=None, n_folds=3, average_fold=True,
             shuffle=False, random_state=0, verbose=1):
    '''
    Function 'stacking' takes train data, test data, list of 1-st level
    models, meta_model for the 2-nd level and returns stacking predictions.

    Parameters
    ----------
    models : list
        List of 1-st level models. You can use any models that follow sklearn
        convention i.e. accept numpy arrays and have methods 'fit' and 'predict'.

    meta_model: model
        2-nd level model. You can use any model that follow sklearn convention

    X_train : numpy array or sparse matrix of shape [n_train_samples, n_features]
        Training data

    y_train : numpy 1d array
        Target values

    X_test : numpy array or sparse matrix of shape [n_test_samples, n_features]
        Test data

    metric : callable, default None
        Evaluation metric (score function) which is used to calculate
        results of cross-validation.
        If None, then by default:
            sklearn.metrics.mean_absolute_error - for regression

    n_folds : int, default 3
        Number of folds in cross-validation

    average_fold: boolean, default True
        Whether to take the average of the predictions on test set from each fold.
        Refit the model using the whole training set and predict test set if False

    shuffle : boolean, default False
        Whether to perform a shuffle before cross-validation split

    random_state : int, default 0
        Random seed for shuffle

    verbose : int, default 1
        Level of verbosity.
        0 - show no messages
        1 - for each model show single mean score
        2 - for each model show score for each fold and mean score

        Caution. To calculate MEAN score across all folds
        full train set prediction and full true target are used.
        So for some metrics (e.g. rmse) this value may not be equal
        to mean of score values calculated for each fold.

    Returns
    -------
    stacking_prediction : numpy array of shape n_test_samples
        Stacking prediction
    '''

    # Specify default metric for cross-validation
    if metric is None:
        metric = mean_squared_error

    # Print metric
    if verbose > 0:
        print('metric: [%s]\n' % metric.__name__)

    # Split indices to get folds
    kf = KFold(n_splits = n_folds, shuffle = shuffle, random_state = random_state)

    if X_train.__class__.__name__ == "DataFrame":
    	X_train = X_train.as_matrix()
    	X_test = X_test.as_matrix()

    # Create empty numpy arrays for stacking features
    S_train = np.zeros((X_train.shape[0], len(models)))
    S_test = np.zeros((X_test.shape[0], len(models)))

    # Loop across models
    for model_counter, model in enumerate(models):
        if verbose > 0:
            print('model %d: [%s]' % (model_counter, model.__class__.__name__))

        # Create empty numpy array, which will contain temporary predictions for test set made in each fold
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Loop across folds
        for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]
            # Clone the model because fit will mutate the model.
            instance = clone(model)
            # Fit 1-st level model
            instance.fit(X_tr, y_tr)
            # Predict out-of-fold part of train set
            S_train[te_index, model_counter] = np.array(instance.predict(X_te))
            # Predict full test set
            S_test_temp[:, fold_counter] = np.array(instance.predict(X_test))

            # Delete temperatory model
            del instance

            if verbose > 1:
                print('    fold %d: [%.8f]' % (fold_counter, metric(y_te, S_train[te_index, model_counter])))

        # Compute mean or mode of predictions for test set
        if average_fold:
            S_test[:, model_counter] = np.mean(S_test_temp, axis = 1)
        else:
            model.fit(X_train, y_train)
            S_test[:, model_counter] = model.predict(X_test)

        if verbose > 0:
            print('    ----')
            print('    MEAN:   [%.8f]\n' % (metric(y_train, S_train[:, model_counter])))

    # Fit our second layer meta model
    meta_model.fit(S_train, y_train)
    # Make our final prediction
    stacking_prediction = meta_model.predict(S_test)

    return stacking_prediction

In [11]:
#instantiate Ridge model
ridge = Ridge(alpha0);

In [12]:
#instantiate xgb model
xgb_mod = xgb.XGBRegressor(
                max_depth=4,
                learning_rate=0.05,
                n_estimators=200,
                gamma=0.0,
                min_child_weight=1.5,
                subsample=0.2,
                colsample_bytree=0.2,
                reg_alpha=0.1,
                reg_lambda=0.9,
                seed=seed);

In [13]:
#instantiate Random Forest Model
rf = RandomForestRegressor(n_estimators = 500,
                                max_features = 37,
                                bootstrap = True,
                                oob_score = True,
                                random_state = seed);

In [35]:
X_train = pd.read_csv('Datasets/X_train.csv');
X_test = pd.read_csv('Datasets/X_test.csv');
y_tr = pd.read_csv('Datasets/y_train.csv');
y_tr = [item for sublist in y_tr.values for item in sublist]
y_train = np.asarray(y_tr)
X_train.drop('Unnamed: 0', axis=1);
X_test.drop('Unnamed: 0', axis=1);

In [36]:
ntrain = X_train.shape[0]
ntrain

1460

In [37]:
ntest = X_test.shape[0]
ntest

1459

In [39]:
y_train

array([12.24769912, 12.10901644, 12.31717117, ..., 12.49313327,
       11.86446927, 11.90159023])

In [22]:
models = [ridge, rf];
meta_model = xgb_mod;

In [40]:
stacking_regression(models, meta_model, X_train, y_train, X_test,
             metric=None, n_folds=3, average_fold=True,
             shuffle=False, random_state=seed, verbose=1)

metric: [mean_squared_error]

model 0: [Ridge]
    ----
    MEAN:   [0.03646164]

model 1: [RandomForestRegressor]
    ----
    MEAN:   [0.04055207]



array([11.764228, 12.056521, 11.904654, ..., 11.786586, 11.862917,
       12.176709], dtype=float32)