In [1]:
# Global Setting
#-----------------------------------------------------

# Rand Seed
RandSeed0 = 123;

# Set the threshold for being over-skewed
# (can tune later)
skew0 = 0.7;

# Set the number of folds in the K-fold CV
num_of_folds = 5;
#-----------------------------------------------------


# Setting of Flags
#-----------------------------------------------------

# For Section 1: 
# Data Loading
#whichDataSet = 0;
whichDataSet = 'w';

# For Section 2: 
# Data Pre-Processing 
use_2_CookedUp_Features = 1;
# Use Bath/Bed and Garage/Bed or not
# 1 means Yes; other numbers mean No. 

use_2_CookedUp_Features_instead = 0;
# Use Bath/Bed and Garage/Bed instead of Bath (Full, Half), Garage
# 1 means Yes; other numbers mean No. 

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(font_scale=1.2)

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

import pylab 
import scipy.stats as stats

from scipy.stats import skew

from copy import deepcopy

import sklearn.model_selection as ms
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge

import xgboost 

## 1. Data Loading

In [3]:
if whichDataSet == 0: # Using the original data set 
    train = pd.read_csv('Datasets/train.csv')
    test = pd.read_csv('Datasets/test.csv')
    # Concat. train[no ID column, ... (all columns) ..., no SalePrice column]
    #     with test[no ID column, ... (all columns) ..., no SalePrice column]
    # (Test data has no SalePrice column anyway)
    train_test = pd.concat([train.loc[:,'MSSubClass':'SaleCondition'],
                             test.loc[:,'MSSubClass':'SaleCondition']]);
    print('Train-Test mega dataset shape:',train_test.shape)
##
##
if whichDataSet == 'w': # Using Wenchang's modified data set
    train = pd.read_csv('Datasets/train_Wenchang.csv')
    test = pd.read_csv('Datasets/test_Wenchang.csv')
    train.drop(['Unnamed: 0'], axis=1, inplace=True)
    test.drop(['Unnamed: 0'], axis=1, inplace=True)
    train_test = pd.concat([train.loc[:,'1stFlrSF':'RemodYearDiff'],
                             test.loc[:,'1stFlrSF':'RemodYearDiff']]);
    print('Train-Test mega dataset shape:',train_test.shape)

Train-Test mega dataset shape: (2919, 72)


## 2. Data Pre-Processing
* 2.0 (Optional) Add Cooked_up Features: Bath_Capacity and Parking_Capacity
* 2.1 Some Preliminary Examination and Symmetrization
* 2.2 Fill in NAs
* 2.3 Encode Categorical Features
* 2.4 Set up training and test data matrices

### 2.0 (Optional) Add Cooked_up Features: Bath_Capacitance and Parking_Capacitance

In [4]:
if use_2_CookedUp_Features == 1:
    #
    # Compute Total Bathrooms in a house
    # and set those with zero bathroom with median bathroom number
    train['TotBath'] = train.FullBath + 0.5*train.HalfBath;
    test['TotBath'] = test.FullBath + 0.5*test.HalfBath;
    train_test['TotBath'] = train_test.FullBath + 0.5*train_test.HalfBath;
    #
    train['TotBath'].replace(0, train['TotBath'].median(), inplace=True);
    test['TotBath'].replace(0, test['TotBath'].median(), inplace=True);
    train_test['TotBath'].replace(0, train_test['TotBath'].median(), inplace=True);
    #------------------------------------------------------------------------------
    #
    # Set those with zero bedroom with median bedfroom number 
    # (or set them to one to be conservative)
    train['BedroomAbvGr'].replace(0, train['BedroomAbvGr'].median(), inplace=True);
    test['BedroomAbvGr'].replace(0, test['BedroomAbvGr'].median(), inplace=True);
    train_test['BedroomAbvGr'].replace(0, train_test['BedroomAbvGr'].median(), inplace=True);
    #------------------------------------------------------------------------------
    #
    # Cook-up Feature 1:
    # Bath_Capacitance = TotBath / BedroomAbvGr
    train['Bath_Capacitance'] = train.TotBath / train.BedroomAbvGr
    test['Bath_Capacitance'] = train.TotBath / train.BedroomAbvGr
    train_test['Bath_Capacitance'] = train.TotBath / train.BedroomAbvGr
    #------------------------------------------------------------------------------
    #
    # Cook-up Feature 2: 
    # Parking_Capacitance = TotBath / BedroomAbvGr
    train['Parking_Capacitance'] = train.GarageCars / train.BedroomAbvGr
    test['Parking_Capacitance'] = train.GarageCars / train.BedroomAbvGr
    train_test['Parking_Capacitance'] = train.GarageCars / train.BedroomAbvGr

In [5]:
if use_2_CookedUp_Features_instead == 1:
    #
    # Reasonable dropping ---------------------------------------- 
    #
    train.drop(train[['FullBath', 
                      'HalfBath']], axis=1, inplace=True)
    test.drop(test[['FullBath', 
                    'HalfBath']], axis=1, inplace=True)
    train_test.drop(train_test[['FullBath', 
                                'HalfBath']], axis=1, inplace=True)
    #
    # Dropping that needs more scrutiny ---------------------------------------- 
    #
    #train.drop(train[['TotBath']], axis=1, inplace=True)
    #test.drop(test[['TotBath']], axis=1, inplace=True)
    #train_test.drop(train_test[['TotBath']], axis=1, inplace=True)
    # Dropping that needs more scrutiny ---------------------------------------- 
    #
    #train.drop(train[['GarageCars']], axis=1, inplace=True)
    #test.drop(test[['GarageCars']], axis=1, inplace=True)
    #train_test.drop(train_test[['GarageCars']], axis=1, inplace=True)

### 2.1 (a) Preliminary Examination

In [6]:
# As observed, symmetrize SalePrice via log(1 + ***)
train.SalePrice = np.log(1 + train.SalePrice)

### 2.1 (b) Symmetrization

In [7]:
numeric_features = train_test.dtypes[train_test.dtypes != "object"].index

skewed_features = train[numeric_features].apply(lambda x: skew(x.dropna())) 
skewed_features = skewed_features[abs( skewed_features ) > skew0]
skewed_features = skewed_features.index

In [8]:
train_test[skewed_features] = np.log(1 + train_test[skewed_features])

### 2.2 Encode Categorical Features
#### Using plain and simple one-hot encoding

In [9]:
train_test = pd.get_dummies(train_test)

In [10]:
#train_test.head().T

### 2.3 Fill in NAs

#### Interpolating NAs with the median of each field

In [11]:
if whichDataSet == 0:
    train_test = train_test.fillna(train_test.median())

### 2.4 Set up training and test data matrices

In [12]:
X_train = train_test[:train.shape[0]]
X_test  = train_test[train.shape[0]:]
y_train = train.SalePrice

### 2.5 Create Artificial Train-Test Data Sets via Train-Test-Split
### For model validation purposes

In [13]:
# [X_train_v, X_test_v, y_train_v, y_test_v] 
# all come from the original [X_train, y_train] 
##
## Naming convention: 
## The "_v" in names such as "X_train_v" is 
## to indicate such set is for validation purposes.

X_train_v, X_test_v, y_train_v, y_test_v =\
    ms.train_test_split(deepcopy(X_train),\
                        deepcopy(y_train),\
                        test_size = 1/8,\
                        random_state = RandSeed0)

In [14]:
print(-1 + np.exp(X_train_v.GrLivArea.max()))
print(-1 + np.exp(X_train_v.GrLivArea.min()))

5642.0
334.0


## 3. Modeling

In [15]:
# Using the [X_train_v, y_train_v] to search for the optimal hyper-parameter(s)
def rmse_cv(model):
    kf = KFold(num_of_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    rmse = np.sqrt( -cross_val_score(model, X_train, y_train, 
                                     scoring="neg_mean_squared_error",
                                     cv=kf))
    return(rmse)


# Use the built-in MSE calculator
def rmse(y_predicted, y_actual):
    return( np.sqrt( mean_squared_error(y_actual, y_predicted) ) )


def R2(y_predicted, y_actual):
    # R^2 = 1 - SS_residual / SS_total
    SS_residual = sum((y_predicted - y_actual)**2)
    SS_total = sum((y_actual - y_actual.mean())**2)
    R2 = 1 - SS_residual / SS_total
    return(R2)

### 3.1 Linear Regression with Lasso Regularizations where $L(\ \vec{\beta} \ ) = MSE + \alpha \cdot ||\ \vec{\beta}\ ||_{L_{1}}$

In [16]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import ensemble
import xgboost as xgb

In [17]:
from sklearn.linear_model import Lasso

# supply several alphas to do CV
alpha_array = np.linspace(10,0.00025,64)

cv_Lasso = [rmse_cv( Lasso(alpha = Alpha) ).mean() for Alpha in alpha_array]

#--------------------------------------------------------------------------------

cv_Lasso = pd.Series(cv_Lasso, index = alpha_array)

alpha0_Lasso = cv_Lasso[cv_Lasso == cv_Lasso.min()].index[0];
rmse0 = cv_Lasso.min();

print('*'*50)
print('By CV, Lasso alpha set to {}'.format(alpha0_Lasso))
#--------------------------------------------------------------------------------

# Training the model
model_Lasso = Lasso(alpha0_Lasso).fit(X_train, y_train);

#--------------------------------------------------------------------------------

# Training Performance
predictions_Lasso_train = pd.DataFrame({"Predicted":model_Lasso.predict(X_train), 
                                        "Actual":y_train});
predictions_Lasso_train["Residual"] = predictions_Lasso_train.Actual - predictions_Lasso_train.Predicted;

R2_Lasso_train = model_Lasso.score(X_train, y_train);

RMSE_Lasso_train = rmse(predictions_Lasso_train.Actual, predictions_Lasso_train.Predicted);

print('*'*50)
print('Lasso Training Performace: R^2 = {:.4f}'.format(R2_Lasso_train))
print('*'*50)
print('Lasso Training Performace: RMSE = {:.4f}'.format(RMSE_Lasso_train))

#--------------------------------------------------------------------------------

# Test Performance
predictions_Lasso_test = pd.DataFrame({"Predicted":model_Lasso.predict(X_test_v), 
                                       "Actual":y_test_v});
predictions_Lasso_test["Residual"] = predictions_Lasso_test.Actual - predictions_Lasso_test.Predicted;

R2_Lasso_test = model_Lasso.score(X_test_v, y_test_v);

RMSE_Lasso_test = rmse(predictions_Lasso_test.Actual, predictions_Lasso_test.Predicted);

print('*'*50)
print('Lasso Test Performace: R^2 = {:.4f}'.format(R2_Lasso_test))
print('*'*50)
print('Lasso Test Performace: RMSE = {:.4f}'.format(RMSE_Lasso_test))
print('*'*50)

#--------------------------------------------------------------------------------

# Generate Test Vector (Ridge)

output_Lasso_test = model_Lasso.predict(X_test);

# de-Logarithm
output_Lasso_test = -1 + np.exp(output_Lasso_test); 

#print(len(output_Lasso_test))

#output_Lasso_test

**************************************************
By CV, Lasso alpha set to 0.00025
**************************************************
Lasso Training Performace: R^2 = 0.9285
**************************************************
Lasso Training Performace: RMSE = 0.1068
**************************************************
Lasso Test Performace: R^2 = 0.9503
**************************************************
Lasso Test Performace: RMSE = 0.0874
**************************************************


### 3.2 XGBoost

In [18]:
#xgb_paramters = {'learning_rate':[0.005, 0.01, 0.05, 0.1, 0.5],
#                 'n_estimators':[50, 100, 200, 500]}

#xgb_GridSearch = GridSearchCV(xgboost.XGBRegressor(), param_grid=xgb_paramters)

#xgb_GridSearch.fit(X_train, y_train)

#xgb_GridSearch.best_score_

#--------------------------------------------------------------------------------

#learning_rate0_xgb = xgb_GridSearch.best_params_['learning_rate'];
#n_estimators0_xgb = xgb_GridSearch.best_params_['n_estimators'];

learning_rate0_xgb = 0.1;
n_estimators0_xgb = 500;
min_child_weight0 = 2;

print('*'*50)
print('By CV, parameters of XGBooster:')
print('learning_rate set to {}'.format(learning_rate0_xgb))
print('n_estimators set to {}'.format(n_estimators0_xgb))
print('min_child_weight set to {}'.format(min_child_weight0))


#--------------------------------------------------------------------------------



'''
xgboost.XGBRegressor(
    max_depth=3, 
    learning_rate=0.1, 
    n_estimators=100, 
    silent=True, 
    objective='reg:linear', 
    booster='gbtree', 
    n_jobs=1, 
    nthread=None, 
    gamma=0, 
    min_child_weight=1, 
    max_delta_step=0, 
    subsample=1, 
    colsample_bytree=1, 
    colsample_bylevel=1, 
    reg_alpha=0, 
    reg_lambda=1, 
    scale_pos_weight=1, 
    base_score=0.5, 
    random_state=0, 
    seed=None, 
    missing=None, 
    **kwargs)
'''


model_xgb = xgboost.XGBRegressor(learning_rate=learning_rate0_xgb,
                                 n_estimators=n_estimators0_xgb,
                                 min_child_weight=min_child_weight0);

model_xgb = model_xgb.fit(X_train, y_train);

#--------------------------------------------------------------------------------

# Training Performance
# The results are put in dataframe "predictions_xgb_train"

predictions_xgb_train = pd.DataFrame({"Predicted":model_xgb.predict(X_train), 
                                      "Actual":y_train});
predictions_xgb_train["Residual"] = predictions_xgb_train.Actual - predictions_xgb_train.Predicted;

R2_xgb_train = R2(predictions_xgb_train.Predicted, predictions_xgb_train.Actual);

RMSE_xgb_train = rmse(predictions_xgb_train.Actual, predictions_xgb_train.Predicted);

print('*'*50)
print('XGBooster Training Performace: R^2 = {:.4f}'.format(R2_xgb_train))
print('*'*50)
print('XGBooster Training Performace: RMSE = {:.4f}'.format(RMSE_xgb_train))

#--------------------------------------------------------------------------------

# Test Performance
# The results are put in dataframe "predictions_xgb_test"

predictions_xgb_test = pd.DataFrame({"Predicted":model_xgb.predict(X_test_v), 
                                      "Actual":y_test_v});
predictions_xgb_test["Residual"] = predictions_xgb_test.Actual - predictions_xgb_test.Predicted;


R2_xgb_test = R2(predictions_xgb_test.Predicted, predictions_xgb_test.Actual);

RMSE_xgb_test = rmse(predictions_xgb_test.Actual, predictions_xgb_test.Predicted);

print('*'*50)
print('XGBooster Test Performace: R^2 = {:.4f}'.format(R2_xgb_test));
print('*'*50)
print('XGBooster Test Performace: RMSE = {:.4f}'.format(RMSE_xgb_test));
print('*'*50)

#--------------------------------------------------------------------------------

# Generate Test Vector (XGB)

output_xgb_test = model_xgb.predict(X_test);

# de-Logarithm
output_xgb_test = -1 + np.exp(output_xgb_test); 

#print(len(output_xgb_test))

#output_xgb_test

**************************************************
By CV, parameters of XGBooster:
learning_rate set to 0.1
n_estimators set to 500
min_child_weight set to 2
**************************************************
XGBooster Training Performace: R^2 = 0.9883
**************************************************
XGBooster Training Performace: RMSE = 0.0433
**************************************************
XGBooster Test Performace: R^2 = 0.9898
**************************************************
XGBooster Test Performace: RMSE = 0.0396
**************************************************


### 3.3 Random Forest

In [19]:
from sklearn.ensemble import RandomForestRegressor
import math

#rf_tree_cv = RandomForestRegressor(
#                                bootstrap = True,
#                                oob_score = True,
#                                random_state = 0)

#rf_param_grid = [{
#        'n_estimators' : [300, 400, 500, 600], #Test[100,250,500,750,1000]
#        'max_features' : np.arange(35,38), # First try: np.arange(17,40)
#}]

#rf_grid_search = GridSearchCV(rf_tree_cv, param_grid = rf_param_grid, cv = 5);

#rf_grid_search.fit(X_train_v, y_train_v);

#--------------------------------------------------------------------------------

#max_features0_RForest = rf_grid_search.best_params_['max_features'];
#n_estimators0_RForest = rf_grid_search.best_params_['n_estimators'];

max_features0_RForest = 37;
n_estimators0_RForest = 600;

print('*'*50)
print('By CV, parameters of RForest:')
print('max_features set to {}'.format(max_features0_RForest))
print('n_estimators set to {}'.format(n_estimators0_RForest))

#--------------------------------------------------------------------------------

#rf_tree = RandomForestRegressor(n_estimators = n_estimators0_RForest,
#                                max_features = max_features0_RForest,
#                                bootstrap = True,
#                                oob_score = True,
#                                random_state = 0)

#rf_tree.fit(X_train_v, y_train_v)

model_RForest = RandomForestRegressor(n_estimators = n_estimators0_RForest,
                                      max_features = max_features0_RForest,
                                      bootstrap = True,
                                      oob_score = True,
                                      random_state = 0)

model_RForest = model_RForest.fit(X_train_v, y_train_v)
#--------------------------------------------------------------------------------

#R2_RForest_train = rf_tree.score(X_train_v, y_train_v);
#RMSE_RForest_train = np.sqrt(mean_squared_error(y_train_v, rf_tree.predict(X_train_v)));

R2_RForest_train = model_RForest.score(X_train_v, y_train_v);
RMSE_RForest_train = np.sqrt(mean_squared_error(y_train_v, 
                                                model_RForest.predict(X_train_v)));

print('*'*50)
print('Random Forest Training Performace: R^2 = {:.4f}'.format(R2_RForest_train))
print('*'*50)
print('Random Forest Training Performace: RMSE = {:.4f}'.format(RMSE_RForest_train))

#--------------------------------------------------------------------------------

#R2_RForest_test = rf_tree.score(X_train_v, y_train_v);
#RMSE_RForest_test = np.sqrt(mean_squared_error(y_test_v, rf_tree.predict(X_test_v)));

R2_RForest_test = model_RForest.score(X_train_v, y_train_v);
RMSE_RForest_test = np.sqrt(mean_squared_error(y_test_v, 
                                               model_RForest.predict(X_test_v)));

print('*'*50)
print('Random Forest Test Performace: R^2 = {:.4f}'.format(R2_RForest_test))
print('*'*50)
print('Random Forest Test Performace: RMSE = {:.4f}'.format(RMSE_RForest_test))
print('*'*50)

#--------------------------------------------------------------------------------

#output_RForest_test = -1 + np.exp(rf_tree.predict(X_test));
output_RForest_test = -1 + np.exp(model_RForest.predict(X_test));

**************************************************
By CV, parameters of RForest:
max_features set to 37
n_estimators set to 600
**************************************************
Random Forest Training Performace: R^2 = 0.9830
**************************************************
Random Forest Training Performace: RMSE = 0.0521
**************************************************
Random Forest Test Performace: R^2 = 0.9830
**************************************************
Random Forest Test Performace: RMSE = 0.1289
**************************************************


### 4. Final Output

### 4.1 Simple Stacking

In [20]:
# use RMSE or use RMSE_to_R2

use_RMSE = 0;
# Use RMSE: use_RMSE = 1
# Use RMSE_to_R2: use_RMSE = any number other than 1

if use_RMSE == 1:
    rho_Lasso = RMSE_Lasso_test;
    rho_xgb = RMSE_Lasso_test;
    rho_RForest = RMSE_Lasso_test;
else:
    rho_Lasso = RMSE_Lasso_test / R2_Lasso_test;
    rho_xgb = RMSE_xgb_test / R2_xgb_test;
    rho_RForest = RMSE_RForest_test / R2_RForest_test;
    
s = rho_Lasso + rho_xgb + rho_RForest;

w_Lasso = 1 - rho_Lasso / s;
w_xgb = 1 - rho_xgb / s;
w_RForest = 1 - rho_RForest / s;

output_final_SimpleStacking = 1/2 * (w_Lasso * output_Lasso_test + 
                                     w_xgb * output_xgb_test + 
                                     w_RForest * output_RForest_test)


print('*'*50)
print('Simple Stacking:')
print('*'*50)
print('Weight_Lasso: {:.4f}'.format(w_Lasso/2))
print('*'*50)
print('Weight_XGB: {:.4f}'.format(w_xgb/2))
print('*'*50)
print('Weight_RForest: {:.4f}'.format(w_RForest/2))
print('*'*50)

print('\n')
print('Final Predicted SalePrices:')
print(len(output_final_SimpleStacking))
output_final_SimpleStacking

**************************************************
Simple Stacking:
**************************************************
Weight_Lasso: 0.3253
**************************************************
Weight_XGB: 0.4239
**************************************************
Weight_RForest: 0.2508
**************************************************


Final Predicted SalePrices:
1459


array([ 123724.69765036,  157999.72229538,  184744.92356571, ...,
        159573.54661891,  116957.57815469,  220335.00004869])

### 4.2 Fancy Stacking

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

In [22]:
#print('*'*70)
#print(model_Lasso)
#print('*'*70)
#print(model_xgb)
#print('*'*70)
#print(model_RForest)
#print('*'*70)

In [23]:
class stacked_meta_model():
    
    def __init__(self, base_models, meta_model, k_folds=num_of_folds):
        self.base_models = base_models
        self.meta_model = meta_model
        self.k_folds = k_folds
   

    # -------------------------------------------------------------
    # The class method .fit() defined below is
    # the core of the stacked_meta_model!!
    # -------------------------------------------------------------
    def fit(self, X, y):
        
        # -----------------------------------------
        # Stage 1: Re-train/fit base models
        # -----------------------------------------
    
        # Initialize list of re-fitted base_models:
        self.base_models_Beta = [list() for i in self.base_models]
        
        # Prepare splitting for k-fold CV
        k_f = KFold(n_splits=self.k_folds, shuffle=True, random_state=RandSeed0)
        
        # Re-Train base-models and then generate out-of-fold predictions.
        oof_predictions = np.zeros((X.shape[0], len(self.base_models)))
        ## "oof" stands for out-of-fold
        
        for Model_Counter, Model in enumerate(self.base_models):
            
            for Folder_Counter, (tr_index, te_index) in enumerate(k_f.split(X, y)):
                # X splits to X[tr_index] and X[te_indx]; likewise for y

                # "Deep-copy" each original base model;
                # Otherwise, doing fit will mutate the original model
                instance = clone(Model)
                                
                ## For debugging purpose:
                ##print(X.loc[tr_index])
                ##print(y[tr_index])
                ##print(instance) 
                ##print(type(tr_index))
                
                instance.fit(X.loc[tr_index], y[tr_index])
                self.base_models_Beta[Model_Counter].append(instance)
                
                oof_predictions[te_index, Model_Counter] = \
                    instance.predict(X.loc[te_index])

        # -----------------------------------------
        # Stage 2: Re-train/fit the meta model
        # -----------------------------------------
                
        # "Deep-copy" the original meta model;
        # Otherwise, doing fit will mutate the original meta model
        self.meta_model_Beta = clone(self.meta_model)
        
        self.meta_model_Beta.fit(oof_predictions, y)
        
        #print(self.base_models_Beta)
        #print('*'*50)
        #print(self.meta_model_Beta)
        
        return self
    
    
    #-------------------------------------------------------------
   

    # Use the Meta Model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([Model.predict(X) for Model in Base_Models]).mean(axis=1)
            for Base_Models in self.base_models_Beta])
        return self.meta_model_Beta.predict(meta_features)

In [24]:
model_meta = stacked_meta_model(base_models=[model_Lasso, model_xgb, model_RForest], 
                                meta_model=model_xgb);

model_meta.fit(X_train, y_train);


predictions_meta_train = pd.DataFrame({"Predicted":model_meta.predict(X_train), 
                                       "Actual":y_train});
predictions_meta_train["Residual"] = predictions_meta_train.Actual - predictions_meta_train.Predicted;


R2_meta_train = R2(predictions_meta_train.Predicted, predictions_meta_train.Actual);
RMSE_meta_train = rmse(predictions_meta_train.Actual, predictions_meta_train.Predicted);

print('*'*60)
print('Stacked Meta Model Training Performace: R^2 = {:.4f}'.format(R2_meta_train))
print('*'*60)
print('Stacked Meta Model Training Performace: RMSE = {:.4f}'.format(RMSE_meta_train))

#--------------------------------------------------------------------------------

# Test Performance
predictions_meta_test = pd.DataFrame({"Predicted":model_meta.predict(X_test_v), 
                                      "Actual":y_test_v});
predictions_meta_test["Residual"] = predictions_meta_test.Actual - predictions_meta_test.Predicted;


R2_meta_test = R2(predictions_meta_test.Predicted, predictions_meta_test.Actual);
RMSE_meta_test = rmse(predictions_meta_test.Actual, predictions_meta_test.Predicted);

print('*'*60)
print('Stacked Meta Test Performace: R^2 = {:.4f}'.format(R2_meta_test));
print('*'*60)
print('Stacked Meta Test Performace: RMSE = {:.4f}'.format(RMSE_meta_test));
print('*'*60)

#--------------------------------------------------------------------------------


output_final_MetaStacking = model_meta.predict(X_test);

# de-Logarithm
output_final_MetaStacking = -1 + np.exp(output_final_MetaStacking); 

print('\n')
print('Final Predicted SalePrices')
print(len(output_final_MetaStacking))
output_final_MetaStacking

************************************************************
Stacked Meta Model Training Performace: R^2 = 0.9311
************************************************************
Stacked Meta Model Training Performace: RMSE = 0.1048
************************************************************
Stacked Meta Test Performace: R^2 = 0.9560
************************************************************
Stacked Meta Test Performace: RMSE = 0.0823
************************************************************


Final Predicted SalePrices
1459


array([ 116715.7421875,  164863.21875  ,  160508.71875  , ...,
        159756.4375   ,  120283.2265625,  212363.59375  ], dtype=float32)

### 5. Generate CSV File for Kaggle Submission

In [25]:
# Simple Stacking Predictions
Predictions_simple = pd.read_csv('Datasets/test.csv');

Predictions_simple['SalePrice'] = output_final_SimpleStacking;
Predictions_simple = Predictions_simple[['Id','SalePrice']];

print('*'*50)
print('Predictions by simple stacking:')
print('*'*50)
print(Predictions_simple.head())

# ------------------------------------------------------------

# Meta Stacking Predictions
Predictions_meta = pd.read_csv('Datasets/test.csv');

Predictions_meta['SalePrice'] = output_final_MetaStacking;
Predictions_meta = Predictions_meta[['Id','SalePrice']];

print('*'*50)
print('Predictions by meta stacking:')
print('*'*50)
print(Predictions_meta.head())

**************************************************
Predictions by simple stacking:
**************************************************
     Id      SalePrice
0  1461  123724.697650
1  1462  157999.722295
2  1463  184744.923566
3  1464  191859.466300
4  1465  194154.741696
**************************************************
Predictions by meta stacking:
**************************************************
     Id      SalePrice
0  1461  116715.742188
1  1462  164863.218750
2  1463  160508.718750
3  1464  176721.234375
4  1465  209795.625000


In [26]:
import time
timestr = time.strftime("%m%d-%H%M%S")

# Save predictions by simple stacking ---------------------------------------------
fileName_str = 'Datasets/Predictions_Simple_Stacking_'+timestr+'.csv';

print('*'*70)
print('Write as:', fileName_str)
Predictions_simple.to_csv(fileName_str,index = False);

# Double Check whether the CSV file is properly saved and ready for submission
print('Read in:', fileName_str)
print(pd.read_csv(fileName_str).head())

print('\n')

# Save predictions by meta stacking ---------------------------------------------
fileName_str = 'Datasets/Predictions_Meta_Stacking_'+timestr+'.csv';

print('*'*70)
print('Write as:', fileName_str)
Predictions_meta.to_csv(fileName_str,index = False);

# Double Check whether the CSV file is properly saved and ready for submission
print('Read in:', fileName_str)
print(pd.read_csv(fileName_str).head())

**********************************************************************
Write as: Datasets/Predictions_Simple_Stacking_0312-104843.csv
Read in: Datasets/Predictions_Simple_Stacking_0312-104843.csv
     Id      SalePrice
0  1461  123724.697650
1  1462  157999.722295
2  1463  184744.923566
3  1464  191859.466300
4  1465  194154.741696


**********************************************************************
Write as: Datasets/Predictions_Meta_Stacking_0312-104843.csv
Read in: Datasets/Predictions_Meta_Stacking_0312-104843.csv
     Id      SalePrice
0  1461  116715.742188
1  1462  164863.218750
2  1463  160508.718750
3  1464  176721.234375
4  1465  209795.625000
