# Model Stacking

In the previous notebook we used Optuna to find hyperparameters that minimized the mean absolute error, averaged over 3-fold cross-validation on the training set.

In this notebook we generate predictions with the three optimized model types (XGBoost, LightGBM, and CatBoost, two parameter sets each) and then stack them to see whether the combination improves on the individual models.

In [1]:
# Global variables for testing changes to this notebook quickly
FOLD_SEED = 0      # random_state of the shuffled StratifiedKFold split
NUM_FOLDS = 3      # number of cross-validation folds (also the number of fold models averaged for test predictions)
EARLY_STOP = 50    # early-stopping rounds handed to each boosting model
TRIALS = 100       # not referenced in this notebook; presumably the Optuna trial budget from the tuning notebook -- TODO confirm
SUBMIT = False     # when True, write one submission CSV per base model
STACK = True       # passed as `submit` to stacking(): when True, write the stacked submission CSV

In [2]:
# Essentials
import inspect
import os
import time
import warnings

import numpy as np
import pandas as pd

# Preprocessing
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from functools import partial 
from sklearn.impute import SimpleImputer, KNNImputer
from category_encoders import OrdinalEncoder, OneHotEncoder

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from category_encoders import MEstimateEncoder

# Models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.linear_model import Lasso

# Hyperparameter Tuning
import optuna
from optuna.samplers import GridSampler
from optuna.visualization import plot_parallel_coordinate

# Mute warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [3]:
# Load the training and test data
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Remove rows with missing target. The index is rebuilt afterwards because
# later cells address training rows by position (fold indices from the
# splitter, zero-based out-of-fold frames); gaps left by dropped rows would
# silently misalign them.
train = train.dropna(axis=0, subset=['SalePrice']).reset_index(drop=True)

# Clean data, static transformations
def clean_data(*data):
    """Apply the static, data-independent clean-up steps to each frame in place.

    Every frame passed in is modified directly: a few category labels are
    renamed, GarageYrBlt values that are not <= 2010 (which includes missing
    values) are replaced by YearBuilt, and an MSClass column is derived from
    MSZoning.

    Returns the same tuple of frames that was passed in.
    """
    # optional feature: A - agriculture, C - commercial, R - residential, I - industrial
    zoning_to_class = {'A': 'A','C': 'C',"FV": 'R','I': 'I',
                       "RH": 'R',"RL": 'R',"RP": 'R',"RM": 'R', np.nan:np.nan}
    # label fixes so the values match the data documentation
    label_fixes = [
        ('MSZoning', {'C (all)': 'C'}),
        ("Exterior2nd", {"Brk Cmn":"BrkComm","Wd Shng": "WdShing"}),
        ('Neighborhood', {'NAmes':'Names'}),
    ]

    for frame in data:
        for column, renames in label_fixes:
            frame[column] = frame[column].replace(renames)

        # Keep garage years up to 2010; everything else falls back to YearBuilt
        plausible_year = frame.GarageYrBlt <= 2010
        frame["GarageYrBlt"] = frame["GarageYrBlt"].where(plausible_year, frame.YearBuilt)

        frame["MSClass"] = frame['MSZoning'].map(zoning_to_class)

    return data
    
train, test = clean_data(train, test)

# List of categorical/numerical feature columns (identifier and derived class excluded)
columns = [col for col in test.columns if col not in ["Id","MSClass"]]
object_cols = [col for col in columns if train[col].dtype == "object"]
number_cols = [col for col in columns if train[col].dtype != "object"]

# Bin the continuous target so the stratified splitter has discrete classes to balance
binner = KBinsDiscretizer(n_bins = 45, encode = 'ordinal', strategy = 'uniform')
y_bins = binner.fit_transform(pd.DataFrame(data=train['SalePrice']))

# Define folds. The splitter yields *positional* row indices, so the fold ids
# are collected in a positional array and attached as a column afterwards
# (label-based .loc would hit the wrong rows if the index is not 0..n-1).
fold_ids = np.full(len(train), -1)
kf = StratifiedKFold(NUM_FOLDS, shuffle = True, random_state = FOLD_SEED)
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, y_bins)):
    fold_ids[valid_idx] = fold
train["kfold"] = fold_ids

In [4]:
def preprocessing(X_train, X_valid, X_test):
    """Impute and encode the three splits, fitting every transformer on X_train only.

    Steps, in order: mean-impute numeric columns, fill missing values in
    object columns with the string 'None', ordinal-encode rating and ordered
    columns through explicit mappings, then ordinal-encode the remaining
    object columns with at least 10 distinct training values and one-hot
    encode the rest.

    The two imputation steps assign into the frames that were passed in.

    Returns the processed (X_train, X_valid, X_test).
    """

    def fit_and_apply(encoder, train_df, valid_df, test_df):
        # Fit on the training split, then reuse the fitted encoder on the others
        return (encoder.fit_transform(train_df),
                encoder.transform(valid_df),
                encoder.transform(test_df))

    # 1. impute numerical data with the training mean
    numeric_cols = [c for c in X_train.columns if X_train[c].dtype != "object"]
    if numeric_cols:
        mean_imputer = SimpleImputer(strategy='mean')
        X_train[numeric_cols] = mean_imputer.fit_transform(X_train[numeric_cols])
        X_valid[numeric_cols] = mean_imputer.transform(X_valid[numeric_cols])
        X_test[numeric_cols] = mean_imputer.transform(X_test[numeric_cols])

    # 2. impute categorical data with a constant placeholder
    text_cols = [c for c in X_train.columns if X_train[c].dtype == "object"]
    if text_cols:
        none_imputer = SimpleImputer(strategy='constant', fill_value = 'None')
        X_train[text_cols] = none_imputer.fit_transform(X_train[text_cols])
        X_valid[text_cols] = none_imputer.transform(X_valid[text_cols])
        X_test[text_cols] = none_imputer.transform(X_test[text_cols])

    # 3. encode 1-10 ratings as 0-9 (keys are floats: 1.0 -> 0, ..., 10.0 -> 9)
    score_cols = [c for c in ["OverallQual","OverallCond"] if c in X_train.columns]
    score_levels = {float(score): score - 1 for score in range(1, 11)}
    score_encoder = OrdinalEncoder(
        cols = score_cols,
        mapping = [{'col': c, 'mapping': score_levels} for c in score_cols])
    X_train, X_valid, X_test = fit_and_apply(score_encoder, X_train, X_valid, X_test)

    # 4. encode Poor, Fair, Avg, Good, Ex ratings
    quality_cols = [c for c in ["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC",
                                "KitchenQual","FireplaceQu","GarageQual","GarageCond",'PoolQC']
                    if c in X_train.columns]
    quality_levels = {"Po":0, "Fa":1, "TA":2, "Gd":3, "Ex":4}
    quality_encoder = OrdinalEncoder(
        cols = quality_cols,
        mapping = [{'col': c, 'mapping': quality_levels} for c in quality_cols])
    X_train, X_valid, X_test = fit_and_apply(quality_encoder, X_train, X_valid, X_test)

    # 5. encode remaining ordinal data (dict order defines the column order)
    ordered_levels = {
        "LotShape": {"Reg":0, "IR1":1, "IR2":2, "IR3":3},
        "LandSlope": {"Sev":0, "Mod":1, "Gtl":2},
        "BsmtExposure": {"No":0, "Mn":1, "Av":2, "Gd":3},
        "BsmtFinType1": {"Unf":0, "LwQ":1, "Rec":2, "BLQ":3, "ALQ":4, "GLQ":5},
        "BsmtFinType2": {"Unf":0, "LwQ":1, "Rec":2, "BLQ":3, "ALQ":4, "GLQ":5},
        "Functional": {"Sal":0, "Sev":1, "Maj1":2, "Maj2":3, "Mod":4, "Min2":5, "Min1":6, "Typ":7},
        "GarageFinish": {"Unf":0, "RFn":1, "Fin":2},
        "PavedDrive": {"N":0, "P":1, "Y":2},
        "Utilities": {"NoSeWa":0, "NoSewr":1, "AllPub":2},
        "CentralAir": {"N":0, "Y":1},
        "Electrical": {"Mix":0, "FuseP":1, "FuseF":2, "FuseA":3, "SBrkr":4},
        "Fence": {"MnWw":0, "GdWo":1, "MnPrv":2, "GdPrv":3},
    }
    ordered_cols = [c for c in ordered_levels if c in X_train.columns]
    ordered_encoder = OrdinalEncoder(
        cols = ordered_cols,
        mapping = [{'col': c, 'mapping': ordered_levels[c]} for c in ordered_cols])
    X_train, X_valid, X_test = fit_and_apply(ordered_encoder, X_train, X_valid, X_test)

    # Split whatever is still an object column by its training cardinality
    high_cols, low_cols = [], []
    for c in X_train.columns:
        if X_train[c].dtype == 'object':
            (high_cols if X_train[c].nunique() >= 10 else low_cols).append(c)

    # 6. ordinal encode high cardinality data
    if high_cols:
        X_train, X_valid, X_test = fit_and_apply(
            OrdinalEncoder(cols = high_cols), X_train, X_valid, X_test)

    # 7. one-hot encode low cardinality data
    if low_cols:
        X_train, X_valid, X_test = fit_and_apply(
            OneHotEncoder(cols = low_cols, use_cat_names = True), X_train, X_valid, X_test)

    return X_train, X_valid, X_test

# Feature Engineering

In [5]:
def mathematical_transformations(X_train, X_valid, X_test):
    """Add ratio and sum features to each split in place.

    New columns, in order: LivLotRatio, Spaciousness, TotalOutsideSF,
    TotalLot, TotalBsmtFin, TotalSF, TotalBath, TotalPorch. Sums use plain
    `+`, so a missing operand makes the result missing.

    Returns the same three frames that were passed in.
    """
    for frame in (X_train, X_valid, X_test):
        frame["LivLotRatio"] = frame["GrLivArea"] / frame["LotArea"]

        above_ground_sf = frame["1stFlrSF"]+frame["2ndFlrSF"]
        frame["Spaciousness"] = above_ground_sf / frame["TotRmsAbvGrd"]

        # all five outdoor areas, including the three-season porch
        frame["TotalOutsideSF"] = (frame["WoodDeckSF"] + frame["OpenPorchSF"]
                                   + frame["EnclosedPorch"] + frame["3SsnPorch"]
                                   + frame["ScreenPorch"])

        frame['TotalLot'] = frame['LotFrontage'] + frame['LotArea']
        frame['TotalBsmtFin'] = frame['BsmtFinSF1'] + frame['BsmtFinSF2']
        frame['TotalSF'] = frame['TotalBsmtSF'] + frame['2ndFlrSF'] + frame['1stFlrSF']

        # half baths count as 0.5
        frame['TotalBath'] = (frame['FullBath'] + frame['HalfBath'] * 0.5
                              + frame['BsmtFullBath'] + frame['BsmtHalfBath'] * 0.5)

        # same as TotalOutsideSF but without the three-season porch
        frame['TotalPorch'] = (frame['OpenPorchSF'] + frame['EnclosedPorch']
                               + frame['ScreenPorch'] + frame['WoodDeckSF'])

    return X_train, X_valid, X_test

def count_porch_types(X_train, X_valid, X_test):
    """Add a PorchTypes column counting how many outdoor-area columns are > 0.

    The column is written into each frame in place; the same three frames
    are returned.
    """
    outdoor_cols = ["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]

    for frame in (X_train, X_valid, X_test):
        has_area = frame[outdoor_cols].gt(0)
        frame["PorchTypes"] = has_area.sum(axis=1)

    return X_train, X_valid, X_test

def generate_cluster_distances(X_train, X_valid, X_test, name = "Area", features = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF','GrLivArea'], n_clusters = 10):
    """Append k-means cluster-distance features to each split.

    The scaler and the k-means model are fitted on X_train[features] only and
    then applied to the validation and test splits.

    Parameters
    ----------
    X_train, X_valid, X_test : DataFrame
        Splits containing every column listed in `features`.
    name : str
        Prefix of the generated columns ("<name>_Centroid_<i>").
    features : list of str
        Columns to cluster on. The default list is only read, never mutated.
    n_clusters : int
        Number of clusters, i.e. number of distance columns added.

    Returns
    -------
    (X_train, X_valid, X_test) as new frames with the distance columns joined on.
    """
    sources = (X_train, X_valid, X_test)

    # 1. normalize based on training data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train[features])
    X_valid_scaled = scaler.transform(X_valid[features])
    X_test_scaled = scaler.transform(X_test[features])

    # 2. generate cluster distances (transform returns the distance to every centroid)
    kmeans = KMeans(n_clusters = n_clusters, n_init = 10, random_state=0)
    distances = (kmeans.fit_transform(X_scaled),
                 kmeans.transform(X_valid_scaled),
                 kmeans.transform(X_test_scaled))

    # 3. label the columns and reuse each source frame's index, so the join
    #    below lines rows up even when that index is not 0..n-1
    joined = []
    for source, dist in zip(sources, distances):
        labels = [name + "_Centroid_" + str(i) for i in range(dist.shape[1])]
        joined.append(source.join(pd.DataFrame(dist, columns=labels, index=source.index)))

    return joined[0], joined[1], joined[2]

def pca_transform(X_train, X_valid, X_test, 
                  features = ["GarageArea","YearRemodAdd","TotalBsmtSF","GrLivArea"], 
                  n_components = 2):
    """Append principal-component features (PC1, PC2, ...) to each split.

    The scaler and the PCA are fitted on X_train[features] only and then
    applied to the validation and test splits.

    Parameters
    ----------
    X_train, X_valid, X_test : DataFrame
        Splits containing every column listed in `features`.
    features : list of str
        Columns fed to the PCA. The default list is only read, never mutated.
    n_components : int
        Passed to PCA; determines how many PC columns are added.

    Returns
    -------
    (X_train, X_valid, X_test) as new frames with the component columns joined on.
    """
    sources = (X_train, X_valid, X_test)

    # Normalize based on training data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train[features])
    X_valid_scaled = scaler.transform(X_valid[features])
    X_test_scaled = scaler.transform(X_test[features])

    # Create principal components
    pca = PCA(n_components)
    components = (pca.fit_transform(X_scaled),
                  pca.transform(X_valid_scaled),
                  pca.transform(X_test_scaled))

    # Convert to dataframes that reuse each source frame's index, so the join
    # lines rows up even when that index is not 0..n-1
    component_names = [f"PC{i+1}" for i in range(components[0].shape[1])]
    joined = [
        source.join(pd.DataFrame(comp, columns=component_names, index=source.index))
        for source, comp in zip(sources, components)
    ]

    return joined[0], joined[1], joined[2]

class CrossFoldEncoder:
    """Out-of-fold wrapper around an encoder class.

    `encoder` is a class that is instantiated as `encoder(cols=..., **kwargs)`
    and must offer `fit(X, y)` and `transform(X)`. Splits come from an
    unshuffled 5-fold KFold.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder   # encoder class, instantiated once per split
        self.kwargs_ = kwargs     # keyword arguments forwarded to the encoder
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode `cols` of X so that each row is encoded by an encoder that never saw it.

        For every split a fresh encoder is fitted on the larger part and used
        to transform the held-out part; the held-out pieces are concatenated.
        The fitted encoders are kept for `transform`.

        Returns a frame holding only the encoded columns, renamed "<col>_encoded".
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        held_out_pieces = []
        for fit_rows, apply_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            piece = fold_encoder.transform(X.iloc[apply_rows, :])
            held_out_pieces.append(piece[cols])
            self.fitted_encoders_.append(fold_encoder)
        encoded = pd.concat(held_out_pieces)
        encoded.columns = [name + "_encoded" for name in encoded.columns]
        return encoded

    def transform(self, X):
        """Encode unseen data by averaging the encodings of all per-split encoders.

        Returns a frame holding only the encoded columns, renamed "<col>_encoded".
        """
        from functools import reduce

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        # Element-wise sum across encoders, then divide by their count
        summed = reduce(lambda acc, nxt: acc.add(nxt, fill_value=0), per_fold)
        averaged = summed / len(per_fold)
        averaged.columns = [name + "_encoded" for name in averaged.columns]
        return averaged
    
def encode_neighborhood(X_train, X_valid, X_test, y_train):
    """Join a target encoding of the Neighborhood column onto each split.

    A CrossFoldEncoder wrapping MEstimateEncoder (with m=1) is fitted on the
    training split and target; the validation and test splits are encoded
    with the fitted per-split encoders.

    Returns new (X_train, X_valid, X_test) frames with the encoded column joined on.
    """
    target_encoder = CrossFoldEncoder(MEstimateEncoder, m=1)

    train_encoded = target_encoder.fit_transform(X_train, y_train, cols=["Neighborhood"])
    valid_encoded = target_encoder.transform(X_valid)
    test_encoded = target_encoder.transform(X_test)

    return (X_train.join(train_encoded),
            X_valid.join(valid_encoded),
            X_test.join(test_encoded))


# XGBoost

In [6]:
# Two XGBoost hyperparameter sets; each one is trained as its own base model
xgb_params = [
    {
        'learning_rate': 0.0103,
        'max_depth': 3,
        'min_child_weight': 0.0608,
        'colsample_bytree': 0.133,
        'subsample': 0.6866000000000001,
        'reg_lambda': 0.0806,
        'reg_alpha': 0.1,
    },
    {
        'learning_rate': 0.0098,
        'max_depth': 3,
        'min_child_weight': 0.6463,
        'colsample_bytree': 0.14620000000000002,
        'subsample': 0.6332,
        'reg_lambda': 0.066,
        'reg_alpha': 0.0,
    },
]

In [7]:
# Containers: one out-of-fold column (training rows) and one fold-averaged
# test-prediction column per XGBoost parameter set.
out_of_fold = pd.DataFrame({"XGB"+str(i): np.zeros((train.shape[0],)) for i in range(len(xgb_params))})
# Copy the fold ids by position: out_of_fold has its own 0..n-1 index, so
# label alignment with train's index must not be relied on.
out_of_fold['kfold'] = train.kfold.to_numpy()
predictions = pd.DataFrame({"XGB"+str(i): np.zeros((test.shape[0],)) for i in range(len(xgb_params))})

for i, params in enumerate(xgb_params):
    model_name = "XGB"+str(i)
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing, mathematical_transformations,
                  count_porch_types, generate_cluster_distances,
                  pca_transform, encode_neighborhood]

    for j in range(NUM_FOLDS):
        # Positional mask of the rows held out in this fold
        is_valid = (X.kfold == j).to_numpy()
        X_train = X[X.kfold != j][columns].reset_index(drop=True)
        X_valid = X[X.kfold == j][columns].reset_index(drop=True)
        y_train = X[X.kfold != j]['SalePrice'].reset_index(drop=True)
        y_valid = X[X.kfold == j]['SalePrice'].reset_index(drop=True)
        X_test = test[columns].copy()

        # Apply the feature pipeline. Only target-aware transforms declare a
        # y_train parameter; checking the signature (instead of catching every
        # exception and retrying) lets genuine errors inside a transform surface.
        for transform in transforms:
            if "y_train" in inspect.signature(transform).parameters:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            else:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)

        # Tuned parameters override the defaults listed first
        model = XGBRegressor(**{**{'random_state':0, 
                                   'n_estimators': 3000},
                                **params})

        # NOTE(review): recent xgboost releases expect eval_metric and
        # early_stopping_rounds on the constructor rather than fit() --
        # confirm against the installed version.
        model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_valid, y_valid)],
                  eval_metric = "mae",
                  early_stopping_rounds = EARLY_STOP)

        # Test predictions are averaged over the fold models
        predictions[model_name] += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_absolute_error(y_valid, preds_valid)
        # Single .loc write; chained indexing may assign to a temporary copy
        out_of_fold.loc[is_valid, model_name] = preds_valid
        print("Model", i ," Fold",j ,"(MAE):", scores[j])

    print("Model", i, "Average (MAE):", scores.mean())
    print("Model", i, "Worst (MAE):", scores.max())

Model 0  Fold 0 (MAE): 14307.284049345482
Model 0  Fold 1 (MAE): 13135.857554222279
Model 0  Fold 2 (MAE): 14317.888020833334
Model 0 Average (MAE): 13920.3432081337
Model 0 Worst (MAE): 14317.888020833334
Model 1  Fold 0 (MAE): 14285.347343429157
Model 1  Fold 1 (MAE): 13605.928380069301
Model 1  Fold 2 (MAE): 14317.391042952675
Model 1 Average (MAE): 14069.555588817044
Model 1 Worst (MAE): 14317.391042952675


# LightGBM

In [8]:
# Two LightGBM hyperparameter sets; each one is trained as its own base model
lgbm_params = [
    {
        'max_depth': 4,
        'num_leaves': 11,
        'min_child_samples': 3,
        'learning_rate': 0.0153,
        'min_child_weight': 0.89,
        'colsample_bytree': 0.215,
        'subsample': 0.613,
        'reg_lambda': 0.2906,
        'reg_alpha': 88.21,
    },
    {
        'learning_rate': 0.020700000000000003,
        'max_depth': 3,
        'num_leaves': 8,
        'min_child_samples': 2,
        'min_child_weight': 1.3800000000000001,
        'colsample_bytree': 0.12890000000000001,
        'subsample': 0.6342,
        'reg_lambda': 1.589,
        'reg_alpha': 86.924,
    },
]

In [9]:
# Add one out-of-fold column and one test-prediction column per LightGBM parameter set
for i in range(len(lgbm_params)):
    out_of_fold["LGBM"+str(i)] = np.zeros((train.shape[0],))
    predictions["LGBM"+str(i)] = np.zeros((test.shape[0],))

for i, params in enumerate(lgbm_params):
    model_name = "LGBM"+str(i)
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing, mathematical_transformations,
                  count_porch_types, generate_cluster_distances,
                  pca_transform, encode_neighborhood]

    for j in range(NUM_FOLDS):
        # Positional mask of the rows held out in this fold
        is_valid = (X.kfold == j).to_numpy()
        X_train = X[X.kfold != j][columns].reset_index(drop=True)
        X_valid = X[X.kfold == j][columns].reset_index(drop=True)
        y_train = X[X.kfold != j]['SalePrice'].reset_index(drop=True)
        y_valid = X[X.kfold == j]['SalePrice'].reset_index(drop=True)
        X_test = test[columns].copy()

        # Apply the feature pipeline. Only target-aware transforms declare a
        # y_train parameter; checking the signature (instead of catching every
        # exception and retrying) lets genuine errors inside a transform surface.
        for transform in transforms:
            if "y_train" in inspect.signature(transform).parameters:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            else:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)

        # Originally-categorical columns that still exist under their own name
        # after preprocessing are declared categorical to LightGBM
        cat_cols = [x for x in X_train.columns if x in object_cols]

        # Tuned parameters override the defaults listed first
        model = LGBMRegressor(**{**{'random_state':0, 'n_estimators': 4000, 
                                    'max_depth': 4,'num_leaves': 11, 
                                    'min_child_samples': 3,},
                                 **params})
        # NOTE(review): recent lightgbm releases replace the verbose /
        # early_stopping_rounds fit() arguments with callbacks -- confirm
        # against the installed version.
        model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_valid, y_valid)],
                  eval_metric = "mae",
                  categorical_feature = cat_cols,
                  early_stopping_rounds = EARLY_STOP)

        # Test predictions are averaged over the fold models
        predictions[model_name] += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_absolute_error(y_valid, preds_valid)
        # Single .loc write; chained indexing may assign to a temporary copy
        out_of_fold.loc[is_valid, model_name] = preds_valid
        print("Model", i ," Fold",j ,"(MAE):", scores[j])

    print("Model", i, "Average (MAE):", scores.mean())
    print("Model", i, "Worst (MAE):", scores.max())

Model 0  Fold 0 (MAE): 14747.866185848075
Model 0  Fold 1 (MAE): 13717.18127967525
Model 0  Fold 2 (MAE): 14469.271574308286
Model 0 Average (MAE): 14311.439679943871
Model 0 Worst (MAE): 14747.866185848075
Model 1  Fold 0 (MAE): 14611.328682975776
Model 1  Fold 1 (MAE): 13976.570776280563
Model 1  Fold 2 (MAE): 14461.357889782365
Model 1 Average (MAE): 14349.752449679567
Model 1 Worst (MAE): 14611.328682975776


# CatBoost

In [10]:
# Two CatBoost hyperparameter sets; each one is trained as its own base model
cat_params = [
    {
        'learning_rate': 0.0147,
        'subsample': 0.812,
        'reg_lambda': 0.972,
    },
    {
        'learning_rate': 0.0227,
        'subsample': 0.749,
        'reg_lambda': 1.463,
    },
]

In [11]:
# Add one out-of-fold column and one test-prediction column per CatBoost parameter set
for i in range(len(cat_params)):
    out_of_fold["CAT"+str(i)] = np.zeros((train.shape[0],))
    predictions["CAT"+str(i)] = np.zeros((test.shape[0],))

for i, params in enumerate(cat_params):
    model_name = "CAT"+str(i)
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing, mathematical_transformations,
                  count_porch_types, generate_cluster_distances,
                  pca_transform, encode_neighborhood]

    for j in range(NUM_FOLDS):
        # Positional mask of the rows held out in this fold
        is_valid = (X.kfold == j).to_numpy()
        X_train = X[X.kfold != j][columns].reset_index(drop=True)
        X_valid = X[X.kfold == j][columns].reset_index(drop=True)
        y_train = X[X.kfold != j]['SalePrice'].reset_index(drop=True)
        y_valid = X[X.kfold == j]['SalePrice'].reset_index(drop=True)
        X_test = test[columns].copy()

        # Apply the feature pipeline. Only target-aware transforms declare a
        # y_train parameter; checking the signature (instead of catching every
        # exception and retrying) lets genuine errors inside a transform surface.
        for transform in transforms:
            if "y_train" in inspect.signature(transform).parameters:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            else:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)

        # Tuned parameters override the defaults listed first
        model = CatBoostRegressor(**{**{'random_state':0, 
                                        'n_estimators': 3000,
                                        'eval_metric':"MAE",
                                        'early_stopping_rounds': EARLY_STOP,
                                        'verbose': False}, 
                                     **params})
        model.fit(X_train, y_train,
                  eval_set = (X_valid, y_valid),
                  use_best_model=True)

        # Test predictions are averaged over the fold models
        predictions[model_name] += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_absolute_error(y_valid, preds_valid)
        # Single .loc write; chained indexing may assign to a temporary copy
        out_of_fold.loc[is_valid, model_name] = preds_valid
        print("Model", i ," Fold",j ,"(MAE):", scores[j])

    print("Model", i, "Average (MAE):", scores.mean())
    print("Model", i, "Worst (MAE):", scores.max())

Model 0  Fold 0 (MAE): 14172.634905996987
Model 0  Fold 1 (MAE): 12774.724306184993
Model 0  Fold 2 (MAE): 14312.381667259804
Model 0 Average (MAE): 13753.246959813929
Model 0 Worst (MAE): 14312.381667259804
Model 1  Fold 0 (MAE): 14260.626857891964
Model 1  Fold 1 (MAE): 13018.354806356869
Model 1  Fold 2 (MAE): 14292.017565886106
Model 1 Average (MAE): 13856.999743378314
Model 1 Worst (MAE): 14292.017565886106


# Save Output

In [12]:
# Write one timestamped submission file per base model, only when SUBMIT is
# enabled. A truthiness check is used so that any falsy setting (False, 0,
# None) disables writing.
if SUBMIT:
    for col in predictions.columns:
        output = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions[col]})
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output.to_csv('../submissions/submission_'+col+'_'+timestr+'.csv', index=False)

# Stacking

In [13]:
def stacking(stack_model, submit = False, fit_params = None):
    """Cross-validate a meta-model on the base models' out-of-fold predictions.

    For each fold a clone of `stack_model` is fitted on the out-of-fold
    predictions of the other folds, scored (MAE) on the held-out fold, and
    used to predict from the base models' test predictions; the test
    predictions are averaged over folds. Fold scores and their mean/max are
    printed.

    Reads the notebook-level `out_of_fold`, `predictions`, `train`, `test`
    and `NUM_FOLDS`. Fold membership comes from `out_of_fold['kfold']`, so
    the function does not depend on variables left over from a training cell.

    Parameters
    ----------
    stack_model : estimator
        Unfitted meta-model; it is cloned for every fold.
    submit : bool
        When true, write the averaged test predictions to a timestamped CSV.
    fit_params : dict, optional
        Extra keyword arguments forwarded to `model.fit`.

    Returns
    -------
    None
    """
    fit_params = {} if fit_params is None else fit_params

    # Positional arrays/masks avoid any dependence on index alignment
    fold_ids = out_of_fold['kfold'].to_numpy()
    meta_features = out_of_fold.drop('kfold', axis = 1)
    target = train['SalePrice']

    preds = np.zeros((test.shape[0],))
    scores = np.zeros(NUM_FOLDS)

    for j in range(NUM_FOLDS):
        is_valid = fold_ids == j
        X_train = meta_features[~is_valid]
        X_valid = meta_features[is_valid]
        y_train = target[~is_valid]
        y_valid = target[is_valid]
        X_test = predictions.copy()

        model = clone(stack_model)
        model.fit(X_train, y_train, **fit_params)

        preds += model.predict(X_test) / NUM_FOLDS 
        preds_valid = model.predict(X_valid)
        scores[j] = mean_absolute_error(y_valid, preds_valid)
        print("Fold", j ,"(MAE):", scores[j])

    # The scores are mean absolute errors, so label them as such
    print("Avg (MAE):", round(scores.mean(),6))
    print("Max (MAE):", round(scores.max(),6))

    if submit:
        output = pd.DataFrame({'Id': test.Id,'SalePrice': preds})
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output.to_csv('../submissions/submission_stack_'+timestr+'.csv', index=False)

In [14]:
# Stack the base models with a default-configured Lasso as the meta-model;
# STACK is passed as `submit` and decides whether a submission file is written.
stacking(stack_model = Lasso(), submit = STACK)

Fold 0 (MAE): 14698.520677439861
Fold 1 (MAE): 13355.342565399234
Fold 2 (MAE): 14643.656248590694
Avg (RMSE): 14232.506497
Max (RMSE): 14698.520677
