Read in necessary libraries

In [24]:
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split

In [27]:
# constants for regularized regression
alphaL = 0.000811;
alphaR = 8.9022;

In [28]:
# Obtaining data from Wenchang
X_train = pd.read_csv('Datasets/stack_trainx.csv');
X_test = pd.read_csv('Datasets/stack_testx.csv');
y_tr = pd.read_csv('Datasets/stack_trainy.csv');
y_train = y_tr.SalePrice.tolist()

In [29]:
# log x-forming
y_train = np.array(y_train)
y_train = np.log1p(y_train)

In [30]:
#stacking function
def stacking_regression(models, meta_model, X_train, y_train, X_test,
             metric=None, n_folds=3, average_fold=True,
             shuffle=False, random_state=seed, verbose=1):
    '''
    Function 'stacking' takes train data, test data, list of 1-st level
    models, meta_model for the 2-nd level and returns stacking predictions.

    Parameters
    ----------
    models : list
        List of 1-st level models. You can use any models that follow sklearn
        convention i.e. accept numpy arrays and have methods 'fit' and 'predict'.

    meta_model: model
        2-nd level model. You can use any model that follow sklearn convention

    X_train : numpy array or sparse matrix of shape [n_train_samples, n_features]
        Training data

    y_train : numpy 1d array
        Target values

    X_test : numpy array or sparse matrix of shape [n_test_samples, n_features]
        Test data

    metric : callable, default None
        Evaluation metric (score function) which is used to calculate
        results of cross-validation.
        If None, then by default:
            sklearn.metrics.mean_absolute_error - for regression

    n_folds : int, default 3
        Number of folds in cross-validation

    average_fold: boolean, default True
        Whether to take the average of the predictions on test set from each fold.
        Refit the model using the whole training set and predict test set if False

    shuffle : boolean, default False
        Whether to perform a shuffle before cross-validation split

    random_state : int, default 0
        Random seed for shuffle

    verbose : int, default 1
        Level of verbosity.
        0 - show no messages
        1 - for each model show single mean score
        2 - for each model show score for each fold and mean score

        Caution. To calculate MEAN score across all folds
        full train set prediction and full true target are used.
        So for some metrics (e.g. rmse) this value may not be equal
        to mean of score values calculated for each fold.

    Returns
    -------
    stacking_prediction : numpy array of shape n_test_samples
        Stacking prediction
    '''

    # Specify default metric for cross-validation
    if metric is None:
        metric = mean_squared_error

    # Print metric
    if verbose > 0:
        print('metric: (sqrt)[%s]\n' % metric.__name__)

    # Split indices to get folds
    kf = KFold(n_splits = n_folds, shuffle = shuffle, random_state = random_state)

    if X_train.__class__.__name__ == "DataFrame":
        X_train = X_train.as_matrix()
        X_test = X_test.as_matrix()

    # Create empty numpy arrays for stacking features
    S_train = np.zeros((X_train.shape[0], len(models)))
    S_test = np.zeros((X_test.shape[0], len(models)))

    # Loop across models
    for model_counter, model in enumerate(models):
        if verbose > 0:
            print('model %d: [%s]' % (model_counter, model.__class__.__name__))

        # Create empty numpy array, which will contain temporary predictions for test set made in each fold
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Loop across folds
        for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]
            # Clone the model because fit will mutate the model.
            instance = clone(model)
            # Fit 1-st level model
            instance.fit(X_tr, y_tr)
            # Predict out-of-fold part of train set
            S_train[te_index, model_counter] = np.array(instance.predict(X_te))
            # Predict full test set
            S_test_temp[:, fold_counter] = np.array(instance.predict(X_test))

            # Delete temporary model
            del instance

            if verbose > 1:
                print('    fold %d: [%.8f]' % (fold_counter, metric(y_te, S_train[te_index, model_counter])))

        # Compute mean or mode of predictions for test set
        if average_fold:
            S_test[:, model_counter] = np.mean(S_test_temp, axis = 1)
        else:
            model.fit(X_train, y_train)
            S_test[:, model_counter] = model.predict(X_test)

        if verbose > 0:
            print('    ----')
            print('    MEAN RMSE:   [%.8f]\n' % np.sqrt((metric(y_train, S_train[:, model_counter]))))

    # Fit our second layer meta model
    meta_model.fit(S_train, y_train)
    # Make our final prediction
    stacking_prediction = meta_model.predict(S_test)


    return stacking_prediction

In [31]:
#instantiate Lasso model
lasso = Lasso(alphaL);

In [32]:
#instantiate Ridge model
ridge = Ridge(alphaR);

In [33]:
# Gradient Boosted Trees
gb_tree = GradientBoostingRegressor(max_depth = 3, 
                                    learning_rate = 0.04, 
                                    n_estimators = 800,
                                    subsample= 2/3, 
                                    random_state=0)

In [34]:
import xgboost as xgb
xgb_mod = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate = 0.05,
    max_depth=3,  
    silent=True, 
    objective='reg:linear', 
    booster='gbtree', 
    n_jobs=4, 
    nthread=None, 
    gamma=0.08, 
    min_child_weight=5, 
    max_delta_step=0, 
    subsample=0.3, 
    colsample_bytree=0.3, 
    colsample_bylevel=1, 
    reg_alpha = 0,
    reg_lambda=1.5, 
    scale_pos_weight=1, 
    base_score=0.5, 
    random_state=0, 
    seed=seed, 
    missing=None)

In [69]:
models = [xgb_mod,gb_tree];
meta_model = ridge;

In [70]:
mS_time = int(round(time.time() * 1000));
seed = random.seed(mS_time);
y_predicted = stacking_regression(models, meta_model, X_train, y_train, X_test,
             metric=None, n_folds=5, average_fold=True,
             shuffle=False, random_state=seed, verbose=2)
y_predicted

metric: (sqrt)[mean_squared_error]

model 0: [XGBRegressor]
    fold 0: [0.01324025]
    fold 1: [0.01903506]
    fold 2: [0.01772861]
    fold 3: [0.01456553]
    fold 4: [0.01745592]
    ----
    MEAN RMSE:   [0.12808230]

model 1: [GradientBoostingRegressor]
    fold 0: [0.01232076]
    fold 1: [0.01867321]
    fold 2: [0.01773828]
    fold 3: [0.01222267]
    fold 4: [0.01605198]
    ----
    MEAN RMSE:   [0.12410229]



array([11.74063521, 11.97665118, 12.13198862, ..., 12.0096689 ,
       11.68910022, 12.31959998])

In [71]:
#define index for submission
testid = np.arange(len(X_train) + 1, len(X_train) + len(X_test) + 1)
testid

array([1461, 1462, 1463, ..., 2917, 2918, 2919])

In [72]:
#transform data back from log
y_predicted = np.expm1(y_predicted)

In [73]:
submission = pd.DataFrame({'Id': testid, 'SalePrice': y_predicted})
submission.to_csv('./Datasets/stack_ridge_prediction.csv', index = False)