# Feature Engineering

These techniques are adapted from the ones covered in the [Feature Engineering](https://www.kaggle.com/learn/feature-engineering) course on Kaggle.

In [1]:
# Global variables for testing changes to this notebook quickly
FOLD_SEED = 0
NUM_FOLDS = 3
EARLY_STOP = 50
TRIALS = 16

In [2]:
# Essentials
import os
import warnings
import numpy as np
import pandas as pd
import time

# Preprocessing
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold, KFold
from functools import partial 
from sklearn.impute import SimpleImputer, KNNImputer
from category_encoders import OrdinalEncoder, OneHotEncoder

# Feature Engineering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from category_encoders import MEstimateEncoder

# Models and Hyperparameter Tuning
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Mute warnings
warnings.filterwarnings('ignore')

# Create Folds and Preprocessing

In [3]:
# Load the training data
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Remove rows with missing target
train.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Clean data, static transformations
def clean_data(*data):
    for df in data:
        # fix typos to match documentation
        df['MSZoning'] =  df['MSZoning'].replace({'C (all)': 'C'})
        df["Exterior2nd"] = df["Exterior2nd"].replace({"Brk Cmn":"BrkComm","Wd Shng": "WdShing"})
        df['Neighborhood'] = df['Neighborhood'].replace({'NAmes':'Names'})

        # Some values of GarageYrBlt are corrupt, replace them with YearBuilt
        df["GarageYrBlt"] = df["GarageYrBlt"].where(df.GarageYrBlt <= 2010, df.YearBuilt)
        
        # optional feature: A - agriculture, C - commercial, R - residential, I - industrial
        df["MSClass"] = df['MSZoning'].map({'A': 'A','C': 'C',"FV": 'R','I': 'I',
                                            "RH": 'R',"RL": 'R',"RP": 'R',"RM": 'R', np.nan:np.nan})
    return data
    
train, test = clean_data(train, test)

# List of categorical/numerical columns
columns = [col for col in test.columns if col not in ["Id","MSClass"]]
object_cols = [col for col in columns if train[col].dtype == "object"]
number_cols = [col for col in columns if train[col].dtype != "object"]

# Define bins
binner = KBinsDiscretizer(n_bins = 45, encode = 'ordinal', strategy = 'uniform')
y_bins = binner.fit_transform(pd.DataFrame(data=train['SalePrice']))

# Define folds
train["kfold"] = -1
kf = StratifiedKFold(NUM_FOLDS, shuffle = True, random_state = FOLD_SEED) 
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, y_bins)):
    train.loc[valid_idx,"kfold"] = fold

In [4]:
def preprocessing(X_train, X_valid, X_test):
    
    # 1. find columns in X_train with high ratio of NAs
    temp = pd.DataFrame(X_train.isnull().sum().sort_values(ascending = False), 
                        columns = ['NAs'])
    temp['ratio'] = temp['NAs'].apply(lambda x: round(x/X_train.shape[0],2))
    cols = list(temp[temp['ratio'] >= 0.8].index.values)
    
    # 2. drop the offending columns
    X_train.drop(cols, inplace = True, axis = 1)
    X_valid.drop(cols, inplace = True, axis = 1)
    X_test.drop(cols, inplace = True, axis = 1)
    
    # 3. impute numerical data
    columns = [col for col in X_train.columns if X_train[col].dtype != "object"]
    if columns:
        imputer = SimpleImputer(strategy='median')
        X_train[columns] = imputer.fit_transform(X_train[columns])
        X_valid[columns] = imputer.transform(X_valid[columns])
        X_test[columns] = imputer.transform(X_test[columns])
    
    # 4. impute categorical data
    columns = [col for col in X_train.columns if X_train[col].dtype == "object"]
    if columns:
        imputer = SimpleImputer(strategy='constant', fill_value = 'None')
        X_train[columns] = imputer.fit_transform(X_train[columns])
        X_valid[columns] = imputer.transform(X_valid[columns])
        X_test[columns] = imputer.transform(X_test[columns])
    
    # 5. encode 1-10 ratings
    cols = ["OverallQual","OverallCond"]
    cols = [x for x in cols if x in X_train.columns]
    ratings = {float(a):b for b,a in enumerate(range(1,11))}
    mapping = [{'col':x, 'mapping': ratings} for x in cols]
    
    encoder = OrdinalEncoder(cols = cols, mapping = mapping, handle_missing = 'return_nan')
    X_train = encoder.fit_transform(X_train)
    X_valid = encoder.transform(X_valid)
    X_test = encoder.transform(X_test)
    
    # 6. encode Poor, Fair, Avg, Good, Ex ratings
    cols = ["ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC", "KitchenQual","FireplaceQu","GarageQual","GarageCond",'PoolQC']
    cols = [x for x in cols if x in X_train.columns]
    ratings = {"Po":0, "Fa":1, "TA":2, "Gd":3, "Ex":4}
    mapping = [{'col':x, 'mapping': ratings} for x in cols]
    
    encoder = OrdinalEncoder(cols = cols, mapping = mapping, handle_missing = 'return_nan')
    X_train = encoder.fit_transform(X_train)
    X_valid = encoder.transform(X_valid)
    X_test = encoder.transform(X_test)
    
    # 7. encode remaining ordinal data
    cols = ["LotShape","LandSlope","BsmtExposure","BsmtFinType1","BsmtFinType2",
    "Functional","GarageFinish","PavedDrive","Utilities","CentralAir","Electrical",
    "Fence"]
    cols = [x for x in cols if x in X_train.columns]
    mapping = [{'col':"LotShape",
                'mapping': {"Reg":0, "IR1":1, "IR2":2, "IR3":3}},
               {'col':"LandSlope",
                'mapping': {"Sev":0, "Mod":1, "Gtl":2}},
               {'col':"BsmtExposure",
                'mapping': {"No":0, "Mn":1, "Av":2, "Gd":3}},
               {'col':"BsmtFinType1",
                'mapping': {"Unf":0, "LwQ":1, "Rec":2, "BLQ":3, "ALQ":4, "GLQ":5}},
               {'col':"BsmtFinType2",
                'mapping': {"Unf":0, "LwQ":1, "Rec":2, "BLQ":3, "ALQ":4, "GLQ":5}},
               {'col':"Functional",
                'mapping': {"Sal":0, "Sev":1, "Maj1":2, "Maj2":3, "Mod":4, "Min2":5, "Min1":6, "Typ":7}},
               {'col':"GarageFinish",
                'mapping': {"Unf":0, "RFn":1, "Fin":2}},
               {'col':"PavedDrive",
                'mapping': {"N":0, "P":1, "Y":2}},
               {'col':"Utilities",
                'mapping': {"NoSeWa":0, "NoSewr":1, "AllPub":2}},
               {'col':"CentralAir",
                'mapping': {"N":0, "Y":1}},
               {'col':"Electrical",
                'mapping': {"Mix":0, "FuseP":1, "FuseF":2, "FuseA":3, "SBrkr":4}},
               {'col':"Fence",
                'mapping': {"MnWw":0, "GdWo":1, "MnPrv":2, "GdPrv":3}}]
    mapping = [x for x in mapping if x['col'] in X_train.columns]
    
    encoder = OrdinalEncoder(cols = cols, mapping = mapping, handle_missing = 'return_nan')
    X_train = encoder.fit_transform(X_train)
    X_valid = encoder.transform(X_valid)
    X_test = encoder.transform(X_test)
    
    # 8. encode remaining columns
    columns = [col for col in X_train.columns if X_train[col].dtype == 'object']
    encoder = OrdinalEncoder(cols = columns, handle_missing = 'return_nan')
    X_train = encoder.fit_transform(X_train)
    X_valid = encoder.transform(X_valid)
    X_test = encoder.transform(X_test)
        
    return X_train, X_valid, X_test

# Scoring Functions

In [5]:
def score_xgboost(transforms = [], params = {}, cols = columns, verbose = True):
    start = time.time()
    
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing] + transforms
    
    for i in range(NUM_FOLDS):
        X_train = X[X.kfold != i][cols].copy()
        X_valid = X[X.kfold == i][cols].copy()
        y_train = X[X.kfold != i]['SalePrice'].copy()
        y_valid = X[X.kfold == i]['SalePrice'].copy()
        X_test = test[cols].copy()
        
        # loop for applying the transformations
        for transform in transforms:
            try:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            except:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)
        
        model = XGBRegressor(**{**{'random_state':0, 'n_estimators': 3000},**params})
        model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_valid, y_valid)],
                  eval_metric = "mae",
                  early_stopping_rounds = EARLY_STOP)

        preds = model.predict(X_valid)
        scores[i] = mean_absolute_error(y_valid, preds)
    end = time.time()
    if verbose:
        print("XGBoost  (3-fold Avg):", 
              round(scores.mean(), 4))
        print("XGBoost  (3-fold Max):", 
              round(scores.max(), 4), "\t",
              str(round(end-start, 3))+"s")

    return round(scores.mean(), 4), round(scores.max(), 4)

In [6]:
def score_lightgbm(transforms = [], params = {}, cols = columns, verbose = True):
    start = time.time()
    
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing] + transforms
    
    for i in range(NUM_FOLDS):
        X_train = X[X.kfold != i][cols].copy()
        X_valid = X[X.kfold == i][cols].copy()
        y_train = X[X.kfold != i]['SalePrice'].copy()
        y_valid = X[X.kfold == i]['SalePrice'].copy()
        X_test = test[cols].copy()
        
        for transform in transforms:
            try:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            except:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)
                
        cat_cols = [x for x in X_train.columns if x in object_cols]
                
        model = LGBMRegressor(**{**{'random_state':0, 'n_estimators': 3000},**params})
        model.fit(X_train, y_train,
                  verbose = False,
                  eval_set = [(X_valid, y_valid)],
                  eval_metric = "mae",
                  categorical_feature = cat_cols,
                  early_stopping_rounds = EARLY_STOP)

        valid_preds = model.predict(X_valid)
        scores[i] = mean_absolute_error(y_valid, valid_preds)
    end = time.time()
    if verbose:
        print("LightGBM (3-fold Avg):", 
              round(scores.mean(), 4))
        print("LightGBM (3-fold Max):", 
              round(scores.max(), 4), "\t",
              str(round(end-start, 3))+"s")

    return round(scores.mean(), 4), round(scores.max(), 4)

In [7]:
def score_catboost(transforms = [], params = {}, cols = columns, verbose = True):
    start = time.time()
    
    X = train.copy()
    scores = np.zeros(NUM_FOLDS)
    transforms = [preprocessing] + transforms
    
    for i in range(NUM_FOLDS):
        X_train = X[X.kfold != i][cols].copy()
        X_valid = X[X.kfold == i][cols].copy()
        y_train = X[X.kfold != i]['SalePrice'].copy()
        y_valid = X[X.kfold == i]['SalePrice'].copy()
        X_test = test[cols].copy()
        
        # loop for applying the transformations
        for transform in transforms:
            try:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test, y_train = y_train)
            except:
                X_train, X_valid, X_test = transform(X_train, X_valid, X_test)
        
        model = CatBoostRegressor(**{**{'random_state':0, 
                                        'n_estimators': 3000,
                                        'eval_metric':"MAE",
                                        'early_stopping_rounds': EARLY_STOP,
                                        'verbose': False}, **params})
        model.fit(X_train, y_train,
                  eval_set = (X_valid, y_valid),
                  use_best_model=True)

        valid_preds = model.predict(X_valid)
        scores[i] = mean_absolute_error(y_valid, valid_preds)
    end = time.time()
    if verbose:
        print("CatBoost (3-fold Avg):", 
              round(scores.mean(), 4))
        print("CatBoost (3-fold Max):", 
              round(scores.max(), 4), "\t",
              str(round(end-start, 3))+"s")

    return round(scores.mean(), 4), round(scores.max(), 4)

In [8]:
def get_baseline():
    
    print("\nBaseline\n")
    xgb_avg, xgb_max = score_xgboost()
    lgbm_avg, lgbm_max = score_lightgbm()
    cat_avg, cat_max = score_catboost()
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
    
    return round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4), round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4)
    
BASELINE_AVG, BASELINE_MAX = get_baseline()


Baseline

XGBoost  (3-fold Avg): 17750.9292
XGBoost  (3-fold Max): 18307.5135 	 1.638s
LightGBM (3-fold Avg): 16623.8913
LightGBM (3-fold Max): 17442.5085 	 1.565s
CatBoost (3-fold Avg): 15306.813
CatBoost (3-fold Max): 15716.4108 	 8.733s

Overall (Avg):         16560.5445
Overall (Max):         17750.9292


# Mutual Information

In [9]:
def remove_uninformative(X_train, X_valid, X_test, y_train, verbose = False):
    
    # 1. Determine uninformative columns
    scores =  mutual_info_regression(X_train, y_train)
    cols = [x for i, x in enumerate(X_train.columns) if scores[i] == 0]
    
    # 2. Drop the uninformative columns
    X_train.drop(cols, axis = 1, inplace = True)
    X_valid.drop(cols, axis = 1, inplace = True)
    X_test.drop(cols, axis = 1, inplace = True)
    
    if verbose:
        print("Dropped columns:", *cols)
    
    return X_train, X_valid, X_test

def test_uninformative():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    print("\nDrop Uninformative\n")
    transforms = [remove_uninformative]
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
    
test_uninformative()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Drop Uninformative

XGBoost  (3-fold Avg): 17553.6623
XGBoost  (3-fold Max): 18282.7652 	 2.573s
LightGBM (3-fold Avg): 16586.4857
LightGBM (3-fold Max): 17202.7019 	 2.482s
CatBoost (3-fold Avg): 15185.9457
CatBoost (3-fold Max): 15747.9589 	 8.829s

Overall (Avg):         16442.0312
Overall (Max):         17553.6623


# Mathematical Transformations

In [10]:
def mathematical_transformations(X_train, X_valid, X_test):
    
    X_train["LivLotRatio"] = X_train["GrLivArea"] / X_train["LotArea"]
    X_valid["LivLotRatio"] = X_valid["GrLivArea"] / X_valid["LotArea"]
    X_test["LivLotRatio"] = X_test["GrLivArea"] / X_test["LotArea"]
    
    X_train["Spaciousness"] = (X_train["1stFlrSF"]+X_train["2ndFlrSF"]) / X_train["TotRmsAbvGrd"]
    X_valid["Spaciousness"] = (X_valid["1stFlrSF"]+X_valid["2ndFlrSF"]) / X_valid["TotRmsAbvGrd"]
    X_test["Spaciousness"] = (X_test["1stFlrSF"]+X_test["2ndFlrSF"]) / X_test["TotRmsAbvGrd"]
    
    X_train["TotalOutsideSF"] = X_train["WoodDeckSF"] + X_train["OpenPorchSF"] + X_train["EnclosedPorch"] + X_train["3SsnPorch"] + X_train["ScreenPorch"]
    X_valid["TotalOutsideSF"] = X_valid["WoodDeckSF"] + X_valid["OpenPorchSF"] + X_valid["EnclosedPorch"] + X_valid["3SsnPorch"] + X_valid["ScreenPorch"]
    X_test["TotalOutsideSF"] = X_test["WoodDeckSF"] + X_test["OpenPorchSF"] + X_test["EnclosedPorch"] + X_test["3SsnPorch"] + X_test["ScreenPorch"]
    
    X_train['TotalLot'] = X_train['LotFrontage'] + X_train['LotArea']
    X_valid['TotalLot'] = X_valid['LotFrontage'] + X_valid['LotArea']
    X_test['TotalLot'] = X_test['LotFrontage'] + X_test['LotArea']
    
    X_train['TotalBsmtFin'] = X_train['BsmtFinSF1'] + X_train['BsmtFinSF2']
    X_valid['TotalBsmtFin'] = X_valid['BsmtFinSF1'] + X_valid['BsmtFinSF2']
    X_test['TotalBsmtFin'] = X_test['BsmtFinSF1'] + X_test['BsmtFinSF2']
    
    X_train['TotalSF'] = X_train['TotalBsmtSF'] + X_train['2ndFlrSF'] + X_train['1stFlrSF']
    X_valid['TotalSF'] = X_valid['TotalBsmtSF'] + X_valid['2ndFlrSF'] + X_valid['1stFlrSF']
    X_test['TotalSF'] = X_test['TotalBsmtSF'] + X_test['2ndFlrSF'] + X_test['1stFlrSF']
    
    X_train['TotalBath'] = X_train['FullBath'] + X_train['HalfBath'] * 0.5 + X_train['BsmtFullBath'] + X_train['BsmtHalfBath'] * 0.5
    X_valid['TotalBath'] = X_valid['FullBath'] + X_valid['HalfBath'] * 0.5 + X_valid['BsmtFullBath'] + X_valid['BsmtHalfBath'] * 0.5
    X_test['TotalBath'] = X_test['FullBath'] + X_test['HalfBath'] * 0.5 + X_test['BsmtFullBath'] + X_test['BsmtHalfBath'] * 0.5
    
    X_train['TotalPorch'] = X_train['OpenPorchSF'] + X_train['EnclosedPorch'] + X_train['ScreenPorch'] + X_train['WoodDeckSF']
    X_valid['TotalPorch'] = X_valid['OpenPorchSF'] + X_valid['EnclosedPorch'] + X_valid['ScreenPorch'] + X_valid['WoodDeckSF']
    X_test['TotalPorch'] = X_test['OpenPorchSF'] + X_test['EnclosedPorch'] + X_test['ScreenPorch'] + X_test['WoodDeckSF']
    
    return X_train, X_valid, X_test

def test_transformations():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    print("\nMathematical Transformations\n")
    transforms = [mathematical_transformations]
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
    
test_transformations()
    


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Mathematical Transformations

XGBoost  (3-fold Avg): 17431.0216
XGBoost  (3-fold Max): 17721.1326 	 1.664s
LightGBM (3-fold Avg): 16562.6011
LightGBM (3-fold Max): 17653.9238 	 1.632s
CatBoost (3-fold Avg): 15109.5324
CatBoost (3-fold Max): 15656.9734 	 8.905s

Overall (Avg):         16367.7184
Overall (Max):         17431.0216


# Encoding Interactions

In [11]:
def encode_interaction(X_train, X_valid, X_test, cat_col = "BldgType", num_col = "GrLivArea"):

    X_1 = pd.get_dummies(X_train[cat_col], prefix=cat_col)
    X_2 = pd.get_dummies(X_valid[cat_col], prefix=cat_col)
    X_3 = pd.get_dummies(X_test[cat_col], prefix=cat_col)
        
    for col in X_1.columns:
        X_train[col+"_"+num_col] = X_1[col]*X_train[num_col]
        X_valid[col+"_"+num_col] = X_2[col]*X_valid[num_col]
        X_test[col+"_"+num_col] = X_3[col]*X_test[num_col]
    
    return X_train, X_valid, X_test

def test_interaction():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)  
    
    transforms = [encode_interaction]
    print("\nInteraction Term:\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
    
test_interaction()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Interaction Term:

XGBoost  (3-fold Avg): 17910.0346
XGBoost  (3-fold Max): 18361.0122 	 1.65s
LightGBM (3-fold Avg): 16832.9489
LightGBM (3-fold Max): 17885.9157 	 1.521s
CatBoost (3-fold Avg): 15236.8674
CatBoost (3-fold Max): 15634.9627 	 8.328s

Overall (Avg):         16659.9503
Overall (Max):         17910.0346


# Count Features

In [12]:
def count_porch_types(X_train, X_valid, X_test):
    
    X_train["PorchTypes"] = X_train[["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]].gt(0).sum(axis=1)
    X_valid["PorchTypes"] = X_valid[["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]].gt(0).sum(axis=1)
    X_test["PorchTypes"] = X_test[["WoodDeckSF","OpenPorchSF","EnclosedPorch","3SsnPorch","ScreenPorch"]].gt(0).sum(axis=1)
        
    return X_train, X_valid, X_test

def test_count_feature():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    transforms = [count_porch_types]
    print("\nCount Feature:\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
      
test_count_feature()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Count Feature:

XGBoost  (3-fold Avg): 17914.7202
XGBoost  (3-fold Max): 18285.0698 	 1.867s
LightGBM (3-fold Avg): 16680.4994
LightGBM (3-fold Max): 17505.6324 	 1.574s
CatBoost (3-fold Avg): 15193.0855
CatBoost (3-fold Max): 15788.8947 	 9.847s

Overall (Avg):         16596.1017
Overall (Max):         17914.7202


# Break Down Feature

In [13]:
def test_breakdown():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    print("\nBreak Down:\n")
    xgb_avg, xgb_max = score_xgboost(cols = columns + ["MSClass"])
    lgbm_avg, lgbm_max = score_lightgbm(cols = columns + ["MSClass"])
    cat_avg, cat_max = score_catboost(cols = columns + ["MSClass"])
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
      
    
test_breakdown()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Break Down:

XGBoost  (3-fold Avg): 17722.4002
XGBoost  (3-fold Max): 18074.0544 	 1.62s
LightGBM (3-fold Avg): 16623.8913
LightGBM (3-fold Max): 17442.5085 	 1.513s
CatBoost (3-fold Avg): 15181.7648
CatBoost (3-fold Max): 15845.3079 	 10.877s

Overall (Avg):         16509.3521
Overall (Max):         17722.4002


# Grouped Transform

In [14]:
def group_transformation(X_train, X_valid, X_test):
    
    X_train["MedNhbdLvArea"] = X_train.groupby("Neighborhood")["GrLivArea"].transform('median')
    
    # we use the medians from the training data to impute the test data
    mapping = dict()
    for x,y in zip(X_train["MedNhbdLvArea"].iteritems(), X_train['Neighborhood'].iteritems()):
        _,median_area = x
        _,nbhr = y
        if nbhr not in mapping: mapping[nbhr] = median_area
    
    X_valid["MedNhbdLvArea"] = X_valid['Neighborhood'].map(mapping)
    X_test["MedNhbdLvArea"] = X_test['Neighborhood'].map(mapping)
    
    return X_train, X_valid, X_test

def test_group():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    transforms = [group_transformation]
    print("\nGroup Transformation:\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
      
    
test_group()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Group Transformation:

XGBoost  (3-fold Avg): 17974.0845
XGBoost  (3-fold Max): 18608.1193 	 1.569s
LightGBM (3-fold Avg): 16718.6506
LightGBM (3-fold Max): 17527.5949 	 1.52s
CatBoost (3-fold Avg): 15001.0547
CatBoost (3-fold Max): 15451.3731 	 8.001s

Overall (Avg):         16564.5966
Overall (Max):         17974.0845


# Clustering (Labels)

In [15]:
def generate_cluster_labels(X_train, X_valid, X_test, name = "Area", features = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF','GrLivArea']):
    
    # 1. normalize based on training data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train[features])
    X_valid_scaled = scaler.transform(X_valid[features])
    X_test_scaled = scaler.transform(X_test[features])
    
    # 2. create cluster labels (use predict)
    X_1, X_2 = pd.DataFrame(), pd.DataFrame()
    kmeans = KMeans(n_clusters = 10, n_init = 10, random_state=0)
    X_train[name + "_Cluster"] = kmeans.fit_predict(X_scaled)
    X_valid[name + "_Cluster"] = kmeans.predict(X_valid_scaled)
    X_test[name + "_Cluster"] = kmeans.predict(X_test_scaled)
         
    return X_train, X_valid, X_test

def test_cluster_labels():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    transforms = [generate_cluster_labels]
    print("\nCluster Labels:\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
     
test_cluster_labels()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Cluster Labels:

XGBoost  (3-fold Avg): 17917.9605
XGBoost  (3-fold Max): 18428.5234 	 1.81s
LightGBM (3-fold Avg): 16574.1062
LightGBM (3-fold Max): 17495.0389 	 1.671s
CatBoost (3-fold Avg): 15351.4654
CatBoost (3-fold Max): 15897.7019 	 8.678s

Overall (Avg):         16614.5107
Overall (Max):         17917.9605


# Clustering (Distances)

In [16]:
def generate_cluster_distances(X_train, X_valid, X_test = None, name = "Area", features = ['LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF','GrLivArea']):
    
    # 1. normalize based on training data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train[features])
    X_valid_scaled = scaler.transform(X_valid[features])
    X_test_scaled = scaler.transform(X_test[features])
    
    # 2. generate cluster distances (use transform)
    kmeans = KMeans(n_clusters = 10, n_init = 10, random_state=0)
    X_cd = kmeans.fit_transform(X_scaled)
    X_valid_cd = kmeans.transform(X_valid_scaled)
    X_test_cd = kmeans.transform(X_test_scaled)
    
    # 3. column labels
    X_cd = pd.DataFrame(X_cd, columns=[name + "_Centroid_" + str(i) for i in range(X_cd.shape[1])])
    X_valid_cd = pd.DataFrame(X_valid_cd, columns=[name + "_Centroid_" + str(i) for i in range(X_valid_cd.shape[1])])
    X_test_cd = pd.DataFrame(X_test_cd, columns=[name + "_Centroid_" + str(i) for i in range(X_valid_cd.shape[1])])
    
    return X_train.join(X_cd), X_valid.join(X_valid_cd), X_test.join(X_test_cd)

def test_cluster_distances():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    transforms = [generate_cluster_distances]
    print("\nCluster Distances:\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
     
    
test_cluster_distances()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Cluster Distances:

XGBoost  (3-fold Avg): 18070.1014
XGBoost  (3-fold Max): 18889.9244 	 1.903s
LightGBM (3-fold Avg): 17106.0203
LightGBM (3-fold Max): 17812.7573 	 1.893s
CatBoost (3-fold Avg): 15537.4439
CatBoost (3-fold Max): 16187.6359 	 10.731s

Overall (Avg):         16904.5219
Overall (Max):         18070.1014


# Principal Component Analysis

In [17]:
# Performs PCA on the whole dataframe
def pca_transform(X_train, X_valid, X_test, 
                  features = ["GarageArea","YearRemodAdd","TotalBsmtSF","GrLivArea"], 
                  n_components = None):
    
    # Normalize based on training data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train[features])
    X_valid_scaled = scaler.transform(X_valid[features])
    X_test_scaled = scaler.transform(X_test[features])
    
    # Create principal components
    pca = PCA(n_components)
    X_pca = pca.fit_transform(X_scaled)
    X_valid_pca = pca.transform(X_valid_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    
    # Convert to dataframe
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names)
    X_valid_pca = pd.DataFrame(X_valid_pca, columns=component_names)
    X_test_pca = pd.DataFrame(X_test_pca, columns=component_names)
    
    return X_train.join(X_pca), X_valid.join(X_valid_pca), X_test.join(X_test_pca)

def test_pca_features():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)
    
    transforms = [partial(pca_transform, n_components = 1)]
    print("\nPCA (1 components):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
   
    
    transforms = [partial(pca_transform, n_components = 2)]
    print("\nPCA (2 components):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
   
    
    transforms = [partial(pca_transform, n_components = 3)]
    print("\nPCA (3 components):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
   
    
test_pca_features()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

PCA (1 components):

XGBoost  (3-fold Avg): 17974.0797
XGBoost  (3-fold Max): 18334.2837 	 1.921s
LightGBM (3-fold Avg): 16767.0964
LightGBM (3-fold Max): 17324.6553 	 1.704s
CatBoost (3-fold Avg): 15433.9841
CatBoost (3-fold Max): 15851.3625 	 8.049s

Overall (Avg):         16725.0534
Overall (Max):         17974.0797

PCA (2 components):

XGBoost  (3-fold Avg): 18067.0729
XGBoost  (3-fold Max): 18441.1984 	 2.312s
LightGBM (3-fold Avg): 16920.6678
LightGBM (3-fold Max): 17892.484 	 1.689s
CatBoost (3-fold Avg): 15441.937
CatBoost (3-fold Max): 16010.6142 	 9.294s

Overall (Avg):         16809.8926
Overall (Max):         18067.0729

PCA (3 components):

XGBoost  (3-fold Avg): 18036.673
XGBoost  (3-fold Max): 18392.6344 	 2.074s
LightGBM (3-fold Avg): 16716.2553
LightGBM (3-fold Max): 17289.0693 	 1.689s
CatBoost (3-fold Avg): 15487.882
CatBoost (3-fold Max): 16253.2187 	 11.07s

Overall (Avg):         1674

# Target Encoding

In [18]:
class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

In [19]:
def encode_neighborhood(X_train, X_valid, X_test, y_train):
    encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    X1_train = encoder.fit_transform(X_train, y_train, cols=["Neighborhood"])
    X1_valid = encoder.transform(X_valid)
    X1_test = encoder.transform(X_test)
        
    return X_train.join(X1_train), X_valid.join(X1_valid), X_test.join(X1_test)

def test_neighborhood_encoding():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX) 
    
    transforms = [encode_neighborhood]
    print("\nTarget Encoding ('Neighborhood'):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
   
    
test_neighborhood_encoding()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Target Encoding ('Neighborhood'):

XGBoost  (3-fold Avg): 17103.0649
XGBoost  (3-fold Max): 17470.149 	 1.792s
LightGBM (3-fold Avg): 16613.4951
LightGBM (3-fold Max): 17245.4656 	 1.771s
CatBoost (3-fold Avg): 14871.7069
CatBoost (3-fold Max): 15294.9878 	 9.506s

Overall (Avg):         16196.089
Overall (Max):         17103.0649


In [20]:
def encode_subclass(X_train, X_valid, X_test, y_train):
    encoder = CrossFoldEncoder(MEstimateEncoder, m=1)
    X1_train = encoder.fit_transform(X_train, y_train, cols=["MSSubClass"])
    X1_valid = encoder.transform(X_valid)
    X1_test = encoder.transform(X_test)
        
    return X_train.join(X1_train), X_valid.join(X1_valid), X_test.join(X1_test)

def test_subclass_encoding():
    
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)    
    
    transforms = [encode_subclass]
    print("\nTarget Encoding ('SubClass'):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))
   
    
test_subclass_encoding()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Target Encoding ('SubClass'):

XGBoost  (3-fold Avg): 18346.5279
XGBoost  (3-fold Max): 19315.441 	 2.036s
LightGBM (3-fold Avg): 16895.9533
LightGBM (3-fold Max): 17523.6162 	 1.749s
CatBoost (3-fold Avg): 15345.1554
CatBoost (3-fold Max): 15833.9808 	 9.932s

Overall (Avg):         16862.5455
Overall (Max):         18346.5279


# Test Strategies

We pick all the strategies which resulted in significant gains on average, namely:

1. Mathematical Transformations
2. Group Transformation
3. Target Encoding
4. Drop Columns (Mutual Information)


In [21]:
def test_features():
    print("\nBaseline\n")
    print("Overall (Avg):".ljust(22), BASELINE_AVG)
    print("Overall (Max):".ljust(22), BASELINE_MAX)    
    
    transforms = [mathematical_transformations, group_transformation, encode_neighborhood, remove_uninformative]
    print("\nTarget Encoding ('SubClass'):\n")
    xgb_avg, xgb_max = score_xgboost(transforms)
    lgbm_avg, lgbm_max = score_lightgbm(transforms)
    cat_avg, cat_max = score_catboost(transforms)
    print("\nOverall (Avg):".ljust(23), round(np.mean([xgb_avg, lgbm_avg, cat_avg]), 4))
    print("Overall (Max):".ljust(22),round(np.max([xgb_avg, lgbm_avg, cat_avg]), 4))

test_features()


Baseline

Overall (Avg):         16560.5445
Overall (Max):         17750.9292

Target Encoding ('SubClass'):

XGBoost  (3-fold Avg): 17269.873
XGBoost  (3-fold Max): 18108.4344 	 3.101s
LightGBM (3-fold Avg): 16629.224
LightGBM (3-fold Max): 17103.4907 	 2.86s
CatBoost (3-fold Avg): 14472.6178
CatBoost (3-fold Max): 14847.5023 	 11.908s

Overall (Avg):         16123.9049
Overall (Max):         17269.873
