# Stacking

In [1]:
import os
os.chdir("D:/ML_Projects/MercedesBenz-Kaggle/")
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import GridSearchCV, ShuffleSplit, KFold, train_test_split
import sklearn.metrics as mt
from sklearn import svm, ensemble
from sklearn.linear_model import LinearRegression, BayesianRidge, ElasticNet



In [2]:
def read_data():
    train_data = pd.read_csv("./data/train.csv")
    test_data = pd.read_csv("./data/test.csv")
    return train_data, test_data


def process_data(train_data, test_data):
    binary_cols, all_zero_cols, all_one_cols = [],[],[]
    for col in train_data.iloc[:,10:]:
        unique_vals = train_data[col].unique()
        if np.array_equal(unique_vals, [1,0]) or np.array_equal(unique_vals, [0,1]):
            binary_cols.append(col)
        elif np.array_equal(unique_vals, [0]):
            all_zero_cols.append(col)
        elif np.array_equal(unique_vals, [1]):
            all_one_cols.append(col)
        else:
            print(unique_vals)

    # Drop columns with only zeros
    train_data = train_data.drop(all_zero_cols, axis=1)
    test_data = test_data.drop(all_zero_cols, axis=1)
    
    train_cat_cols = train_data.iloc[:,2:10]
    test_cat_cols = test_data.iloc[:,1:9]
    freq=[]
    col_names = []
    cat_mismatch = []
    
    for train_col, test_col in zip(train_cat_cols, test_cat_cols):
        col_names.append(train_col)
        train_freq = len(train_cat_cols[train_col].unique())
        test_freq = len(test_cat_cols[test_col].unique())
        
        if train_freq!=test_freq:
            cat_mismatch.append(train_col)
            
        freq.append([train_freq, test_freq])
    freq = pd.DataFrame(freq, columns=['Train_Freq', 'Test_Freq'], index=col_names)
    
    train_data = train_data.drop(cat_mismatch, axis=1)
    test_data = test_data.drop(cat_mismatch, axis=1)
    return train_data, test_data


def prepare_data_ml(train_data, test_data):
    X_train = pd.get_dummies(train_data)
    X_train = X_train.drop(['ID','y'], axis=1).values
    y_train = train_data.y.values
    
    X_test = pd.get_dummies(test_data)
    y_test_id = test_data.ID.values
    X_test = X_test.drop(['ID'], axis=1).values
    
    return X_train, y_train, X_test, y_test_id

def make_submission(reg_estimator, X_test, ID, fname='FinalSubmission'):
    y_pred = reg_estimator.predict(X_test)
    final_submission = pd.DataFrame(np.hstack([ID[:,np.newaxis], y_pred[:,np.newaxis]]), columns=['ID','y'])
    final_submission.ID = final_submission.ID.astype(int)
    final_submission.to_csv('./results/'+fname, index=False)
    return final_submission

In [3]:
def build_regressor(regressor_obj, X_train, y_train):
    print(regressor_obj)
    regressor_obj.fit(X_train, y_train)
    y_train_pred = regressor_obj.predict(X_train)
    
    r_2 = mt.r2_score(y_train, y_train_pred) # Coefficient of determination
    mse = mt.mean_squared_error(y_train, y_train_pred) # Mean squared error
    
    print("Coefficient of Determination: ", r_2)
    print("Mean Square Error: ", mse)
    
    return regressor_obj, y_train_pred

def perf_regressor(regressor_obj, x, y):
    print(regressor_obj)
    pred = regressor_obj.predict(x)
    
    r_2 = mt.r2_score(y, pred) # Coefficient of determination
    mse = mt.mean_squared_error(y, pred) # Mean squared error
    
    print("Test Performance")
    print("Coefficient of Determination: ", r_2)
    print("Mean Square Error: ", mse)
    
    return pred

In [4]:
train_data, test_data = read_data()
train_data, test_data = process_data(train_data, test_data)

X, y, X_test, y_test_id = prepare_data_ml(train_data, test_data)

print("Training Samples: ", X.shape)
print("Test Sample: ", X_test.shape)

Training Samples:  (4209, 431)
Test Sample:  (4209, 431)


# Stack GBM's - add white noise to output

In [9]:
cv_kfold = KFold(n_splits=2, shuffle=True)
i = 0
for train_idx, val_idx in cv_kfold.split(X, y):
    print("++++++++++++++ Fold %d +++++++++++++++" %(i))
    x_train, x_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    print ("==============Regression Model 1==============")
    
    regressor1 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
    regressor1.fit(x_train, y_train*1.25)
    pred1 = regressor1.predict(x_val).reshape(x_val.shape[0],1)
    print("R2 : %0.2f" %(mt.r2_score(y_val, pred1)))
    
    print ("==============Regression Model 2==============")
    
    regressor2 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
    regressor2.fit(x_train, y_train*0.75)
    pred2 = regressor2.predict(x_val).reshape(x_val.shape[0],1)
    print("R2 : %0.2f" %(mt.r2_score(y_val, pred2)))
    
    print ("==============Regression Model 3==============")
    
    regressor3 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
    regressor3.fit(x_train, y_train)
    pred3 = regressor3.predict(x_val).reshape(x_val.shape[0],1)
    print("R2 : %0.2f" %(mt.r2_score(y_val, pred3)))
    
    if i==0:
        x_train_meta = np.empty(shape=(x_val.shape[0], x_val.shape[1]))
        x_train_meta = np.hstack((x_val, pred1, pred2, pred3))
    else:
        x_train_fold = np.hstack((x_val, pred1, pred2, pred3))
        x_train_meta = np.vstack((x_train_meta, x_train_fold))
    i+=1

++++++++++++++ Fold 0 +++++++++++++++
R2 : -3.51
R2 : -3.58
R2 : 0.58
++++++++++++++ Fold 1 +++++++++++++++
R2 : -3.30
R2 : -3.30
R2 : 0.53


In [10]:
regressor1 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
regressor1.fit(X, y*1.25)
test_pred1 = regressor1.predict(X_test).reshape(X_test.shape[0],1)

regressor2 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
regressor2.fit(X, y*0.75)
test_pred2 = regressor2.predict(X_test).reshape(X_test.shape[0],1)

regressor3 = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                                   learning_rate=0.01, loss='ls', max_depth=3,
                                                   max_features='sqrt',
                                                   min_samples_split=2, min_weight_fraction_leaf=0.0,
                                                   n_estimators=1000, random_state=122,
                                                   subsample=1, verbose=0)
regressor3.fit(X, y)
test_pred3 = regressor3.predict(X_test).reshape(X_test.shape[0],1)

x_test_meta = np.hstack((X_test, test_pred1, test_pred2, test_pred3))

In [11]:
stacked_ridge = Ridge(alpha=0.0001, fit_intercept=True, normalize=True)
stacked_ridge, stacked_pred = build_regressor(stacked_ridge, x_test_meta, y)

Ridge(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)
Coefficient of Determination:  0.116325325958
Mean Square Error:  142.031716991
