In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, \
    RobustScaler, FunctionTransformer, PowerTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn import set_config

import eli5

In [2]:
np.random.seed(42)
set_config(display='diagram')

plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style='whitegrid')

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

full_df = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)

In [4]:
all_df = pd.read_csv('data/test_labels.csv')
test_y = all_df['SalePrice']
test_y

0       105000.0
1       172000.0
2       189900.0
3       195500.0
4       191500.0
          ...   
1454     90500.0
1455     71000.0
1456    131000.0
1457    132000.0
1458    188000.0
Name: SalePrice, Length: 1459, dtype: float64

## Misc Functions

In [5]:
def plot_learning_curve(estimator, X_train, y_train, cv, train_sizes=np.linspace(0.1, 1, 10)):
    plt.style.use('seaborn-darkgrid')
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_train, y_train, 
        scoring='neg_mean_squared_error',
        cv=cv, 
        n_jobs=-1, 
        train_sizes=train_sizes,
        shuffle=True,
        random_state=42
        )
    train_mean_scores = np.mean(train_scores, axis=1)
    test_mean_scores = np.mean(test_scores, axis=1)

    plt.title('Learning curve')
    plt.plot(train_sizes, train_mean_scores, 'y', label='Train Learning curve')
    plt.plot(train_sizes, test_mean_scores, 'b', label='Test Learning curve')
    plt.legend()

In [6]:
def neg_rmsle(y_true, y_pred):
    y_pred = np.abs(y_pred)
    
    return -1 * np.sqrt(mean_squared_log_error(y_true, y_pred))

In [7]:
def score_model(model, X, Y):
    
    scores = cross_validate(
        model, X, Y, 
        scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], cv=2,
        n_jobs=-1, verbose=0)

    rmsle_score = cross_val_score(model, X, Y, cv=2, scoring=make_scorer(neg_rmsle))

    mse_score = np.sqrt(-1 * scores['test_neg_mean_squared_error'].mean())
    mse_std = np.sqrt(scores['test_neg_mean_squared_error'].std())

    mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
    mae_std = scores['test_neg_mean_absolute_error'].std()

    r2_score_mean = scores['test_r2'].mean()
    r2_std = scores['test_r2'].std()

    print('[CV] RMSE: %.4f (%.4f)' % (mse_score, mse_std))
    print('[CV] MAE: %.4f (%.4f)' % (mae_score, mae_std))
    print('[CV] R^2: %.4f (%.4f)' % (r2_score_mean, r2_std))
    print('[CV] RMSLE: %.6f (%.4f)' % (-1 * rmsle_score.mean(), rmsle_score.std()))

In [8]:
def score_predictions(y_true, y_predicted):
    rmsle_score = neg_rmsle(y_true, y_predicted)
    mae_score = mean_absolute_error(y_true, y_predicted)

    print('RMSLE: %.8f' % (-1 * rmsle_score))
    print('MAE: %.8f' % (mae_score))

In [9]:
def get_columns_from_transformer(column_transformer, input_colums):    
    col_name = []

    for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names(raw_col_name)
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)

    [_, _, reminder_columns] = column_transformer.transformers_[-1]

    for col_idx in reminder_columns:
        col_name.append(input_colums[col_idx])

    return col_name

## Data Cleaning and Preparation

In [10]:
for dataframe in [train_df, test_df]:
    dataframe['YrSold'] = dataframe['YrSold'].astype(str)
    dataframe['MoSold'] = dataframe['MoSold'].astype(str)

In [11]:
num_features = [f for f in train_df.columns if train_df.dtypes[f] != 'object']
num_features.remove('Id')
num_features.remove('SalePrice')

cat_features = [f for f in train_df.columns if train_df.dtypes[f] == 'object']

In [12]:
for feature in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
):
    train_df[feature] = train_df[feature].fillna('None')
    test_df[feature] = test_df[feature].fillna('None')
    full_df[feature] = full_df[feature].fillna('None')

for feature in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    train_df[feature] = train_df[feature].fillna(0)
    test_df[feature] = test_df[feature].fillna(0)
    full_df[feature] = full_df[feature].fillna(0)

for feature in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'Utilities',
):
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])
    full_df[feature] = full_df[feature].fillna(test_df[feature].mode()[0])

for dataframe in [train_df, test_df, full_df]:
    dataframe['MSZoning'] = dataframe.groupby(['Neighborhood'])['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
    dataframe['MSSubClass'] = dataframe.groupby(['HouseStyle'])['MSSubClass'].transform(lambda x: x.fillna(x.mode()[0]))
    dataframe['LotFrontage'] = dataframe.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    dataframe['Functional'] = dataframe['Functional'].fillna('Typ')

In [13]:
train_df['LotFrontage'] = train_df.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
test_df['LotFrontage'] = test_df.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [14]:
for dataframe in [train_df, test_df]:
    dataframe['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
    dataframe['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)

In [15]:
ordinal_feature_mapping = {
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, 
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1},
    'Alley': {'None': 0, 'Pave': 1, 'Grvl': 2},
    'Street': {'Pave': 0, 'Grvl': 1},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
}

non_ordinal_cat_features = list(set(cat_features) - set(ordinal_feature_mapping.keys()))

for cat_feature in non_ordinal_cat_features:
    train_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(train_df[cat_feature])
    test_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(test_df[cat_feature])

for ordinal_feature, feature_mapping in ordinal_feature_mapping.items():
    train_df[ordinal_feature + 'Enc'] = train_df[ordinal_feature].map(feature_mapping)
    test_df[ordinal_feature + 'Enc'] = test_df[ordinal_feature].map(feature_mapping)

### Num Feature Scaling

In [16]:
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

## Feature Engineering

In [17]:
for dataframe in [train_df, test_df]:
    dataframe['Has2ndFloor'] = dataframe['2ndFlrSF'].apply(lambda x: int(x > 0))
    dataframe['HasBsmnt'] = dataframe['TotalBsmtSF'].apply(lambda x: int(x > 0))
    dataframe['HasGarage'] = dataframe['GarageArea'].apply(lambda x: int(x > 0))
    dataframe['HasPool'] = dataframe['PoolArea'].apply(lambda x: int(x > 0))
    dataframe['HasFence'] = dataframe['Fence'].apply(lambda x: int(x != 'None'))
    dataframe['HasFireplace'] = dataframe['Fireplaces'].apply(lambda x: int(x > 0))
    dataframe['HasMasVnr'] = dataframe['MasVnrType'].apply(lambda x: int(x != 'None'))
    dataframe['HasShed'] = (dataframe['MiscFeature'] == 'Shed') * 1

    dataframe['HouseAge'] = dataframe['YrSold'].astype('int') - dataframe['YearBuilt'].astype('int')
    dataframe['HouseAgeSinRemod'] = dataframe['YrSold'].astype('int') - dataframe['YearRemodAdd'].astype('int')

In [18]:
# Remove outliers
#train_df.drop(
#    train_df[(train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 700000)].index
#);

In [19]:
subclassCategories = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190] #removed 150 class as useless during predictions

exteriorCategories = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

neighborhoodCategories = full_df['Neighborhood'].unique()
saleCondCategories = full_df['SaleCondition'].unique()
garageTypeCategories = full_df['GarageType'].unique()
lotConfigCategories = full_df['LotConfig'].unique() # feature was removed from the model
lotShapeCategories = full_df['LotShape'].unique()
landSlopeCategories = full_df['LandSlope'].unique()
roof_style_categories = full_df['RoofStyle'].unique()

In [20]:
year_built_bins = np.linspace(1871, 2010, 10) # 10 bins

In [21]:
# Reference: https://www.kaggle.com/cerberus4229/voting-regressor-with-pipelines

for dataframe in [train_df, test_df]:
    dataframe['TotalBathrooms'] = (dataframe['FullBath'] + (0.5 * dataframe['HalfBath']) +
                                dataframe['BsmtFullBath'] + (0.5 * dataframe['BsmtHalfBath']))

    dataframe['OverallHouseQC'] = dataframe['OverallQual'] + dataframe['OverallCond']
    dataframe['OverallHouseQCBin'] = pd.qcut(dataframe['OverallQual'] + dataframe['OverallCond'], q=3, labels=[0, 1, 2])
    
    dataframe['IsStreetGraved'] = (dataframe['Street'] == 'Grvl') * 1
    dataframe['IsPavedDrive'] = (dataframe['PavedDrive'] == 'Y') * 1

    dataframe['IsNeighborhoodElite'] = (dataframe['Neighborhood'].isin(['NridgHt', 'CollgeCr', 'Crawfor', 'StoreBr', 'Timber'])) * 1 

    dataframe['YearBuiltBin'] = pd.cut(dataframe['YearBuilt'], bins=year_built_bins, labels=range(1, 10)) 
    dataframe['YearBuiltBin2'] = pd.cut(dataframe['YearBuilt'], bins=[1871, 1910, 1955, 1980, 2000, 2011], labels=range(0, 5)) 
    
    dataframe['IsRecentlyBuilt'] = (full_df['YearBuilt'] == full_df['YrSold'].astype(int)) * 1
    dataframe['IsRecentlyRemod'] = (full_df['YearRemodAdd'] == full_df['YrSold'].astype(int)) * 1

    dataframe['KitchenQCBin'] = pd.cut(dataframe['KitchenQualEnc'] * dataframe['KitchenAbvGr'], [-1, 2, 7], labels=[0, 1])

    dataframe['IsFunctional'] = (dataframe['Functional'] == 'Typ') * 1
    dataframe['FunctionalGroup'] = dataframe['Functional'].map({
        'Typ': 2,
        'Min1': 1,
        'Min2': 1,
        'Mod': 1,
        'Maj1': 0,
        'Maj2': 0,
        'Sev': 0,
        'Sav': 0,
    })

    dataframe['IsModernHouseType'] = dataframe['MSSubClass'].map(
        {20: 1, 30: 0, 40: 0, 45: 0, 50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0, 90: 0, 120: 1, 150: 0, 160: 1, 180: 0, 190: 0}
    )
    dataframe['IsLandFlat'] = (dataframe['LandContour'] == 'Lvl') * 1

    dataframe['IsExterCondGood'] = dataframe['ExterCond'].map({'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['IsGasHeating'] = dataframe['Heating'].map({'GasA': 1, 'GasW': 1, 'Grav': 0, 'Wall': 0, 'OthW': 0, 'Floor': 0})
    dataframe['IsHeatingGood'] = dataframe['HeatingQC'].map({'Po': 0, 'Fa': 0, 'TA': 0, 'Gd': 1, 'Ex': 1})

    dataframe['IsFireplaceGood'] = dataframe['FireplaceQu'].map({'Po': 0, 'Fa': 0, 'TA': 0, 'Gd': 1, 'Ex': 1})

    dataframe['IsNewElectrBreakers'] = dataframe['Electrical'].map({'SBrkr': 1, 'FuseF': 0, 'FuseA': 0, 'FuseP': 0, 'Mix': 0})

    dataframe['IsGarageCondGood'] = dataframe['GarageCond'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})
    dataframe['IsGarageQualGood'] = dataframe['GarageQual'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['RoofMatlCost'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 1,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 1,
        'Membran': 0,
        'Tar&Grv': 0,
    })
    
    dataframe['IsWoodenRoof'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 0,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 0,
        'Membran': 0,
        'Tar&Grv': 0,
    })

    dataframe['IsCompShngl'] = dataframe['RoofMatl'].map({
        'CompShg': 1,
        'WdShake': 0,
        'ClyTile': 0,
        'WdShngl': 0,
        'Roll': 0,
        'Metal': 0,
        'Membran': 0,
        'Tar&Grv': 0,
    })

    dataframe['IsAdjArterialStreat'] = ((dataframe['Condition1'] == 'Artery') | (dataframe['Condition2'] == 'Artery')) * 1
    dataframe['IsAdjFeederStreat'] = ((dataframe['Condition1'] == 'Feedr') | (dataframe['Condition2'] == 'Feedr')) * 1
    dataframe['IsNormalCondition'] = ((dataframe['Condition1'] == 'Norm') | (dataframe['Condition2'] == 'Norm')) * 1
    dataframe['IsAjdOffSiteFeature'] = ((dataframe['Condition1'] == 'PosA') | (dataframe['Condition2'] == 'PosA')) * 1
    dataframe['IsNearOffSiteFeature'] = ((dataframe['Condition1'] == 'PosN') | (dataframe['Condition2'] == 'PosN')) * 1
    dataframe['IsNearRailroad'] = ((dataframe['Condition1'].isin(['RRNn', 'RRNe'])) | (dataframe['Condition2'].isin(['RRNn', 'RRNe']))) * 1
    dataframe['IsAdjRailroad'] = ((dataframe['Condition1'].isin(['RRAn', 'RRAe'])) | (dataframe['Condition2'].isin(['RRAn', 'RRAe']))) * 1

    dataframe['TotalPorchArea'] = dataframe['OpenPorchSF'] + dataframe['EnclosedPorch'] + dataframe['3SsnPorch'] + dataframe['ScreenPorch']
    dataframe['TotalPorchNumber'] = (dataframe['OpenPorchSF'] > 0)*1 + (dataframe['EnclosedPorch'] > 0)*1 + (dataframe['3SsnPorch'] > 0)*1 + (dataframe['ScreenPorch'] > 0)*1
    dataframe['HasMultiplePorches'] = (dataframe['TotalPorchNumber'] > 1) * 1
    dataframe['HasPorch'] = (dataframe['TotalPorchArea'] > 0) * 1
    dataframe['HasWoodDeck'] = (dataframe['WoodDeckSF'] > 0) * 1

    dataframe['WoodDeckGroups'] = pd.cut(dataframe['WoodDeckSF'], bins=[-1, 1, 200, 500, 2000], labels=[0, 1, 2, 3])
    dataframe['OpenPorchGroups'] = pd.cut(dataframe['OpenPorchSF'], bins=[-1, 1, 40, 70, 1000], labels=[0, 1, 2, 3])
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1
    dataframe['HasScreenPorch'] = (dataframe['ScreenPorch'] > 0) * 1
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1

    dataframe['AvgRoomSF'] = dataframe['GrLivArea'] / dataframe['TotRmsAbvGrd']

    dataframe['Shed'] = (dataframe['MiscFeature'] == 'Shed') * 1 * full_df['MiscVal']
    dataframe['OtherFeaturePriceAdj'] = (dataframe['MiscFeature'].isin(['TenC', 'Othr', 'Gar2'])) * 1 * full_df['MiscVal']

    dataframe['Season'] = dataframe['MoSold'].map({
        '12': 0, '1': 0, '2': 0,
        '3': 1, '4': 1, '5': 1, 
        '6': 2, '7': 2, '8': 2, 
        '9': 3, '10': 3, '11': 3,
    })

    dataframe['IsSplitHouse'] = dataframe['MSSubClass'].map({
        '20': 0, 
        '30': 0, 
        '40': 0, 
        '45': 0, 
        '50': 0, 
        '60': 0, 
        '70': 0, 
        '75': 0,
        '80': 1, 
        '85': 1, 
        '90': 0, 
        '120': 0, 
        '150': 0, 
        '160': 0, 
        '180': 1, 
        '190': 0,
    })

    dataframe['IsPlannedUnitDevelopment'] = dataframe['MSSubClass'].map({
        '20': 0, 
        '30': 0, 
        '40': 0, 
        '45': 0, 
        '50': 0, 
        '60': 0, 
        '70': 0, 
        '75': 0,
        '80': 0, 
        '85': 0, 
        '90': 0, 
        '120': 1, 
        '150': 1, 
        '160': 1, 
        '180': 1, 
        '190': 0,
    })

    dataframe['NeighborhoodGroups'] = dataframe['Neighborhood'].map({
        'MeadowV': 0,
        'IDOTRR': 1,
        'BrDale': 1,
        'OldTown': 1,
        'Edwards': 1,
        'BrkSide': 1,
        'Sawyer': 1,
        'Blueste': 1,
        'SWISU': 2,
        'NAmes': 2,
        'NPkVill': 2,
        'Mitchel': 2,
        'SawyerW': 2,
        'Gilbert': 2,  
        'NWAmes': 2,   
        'Blmngtn': 2,  
        'CollgCr': 2,  
        'ClearCr': 3,  
        'Crawfor': 3,  
        'Veenker': 3,  
        'Somerst': 3,  
        'Timber': 3,   
        'StoneBr': 4, 
        'NoRidge': 4, 
        'NridgHt': 4,
    })

    dataframe['IsPopularRoofStyle'] = dataframe['RoofStyle'].isin(['Hip', 'Gable']) * 1
    dataframe['LowQualLivAreaPart'] = dataframe['LowQualFinSF'] / full_df['GrLivArea']

    dataframe['IsHouseFinished'] = (dataframe['HouseStyle'].isin(['2Story', '1Story', '1.5Fin', 'SFoyer', 'SLvl', '2.5Fin'])) * 1

    dataframe['HasBsmtExposure'] = dataframe['BsmtExposure'].map({
        'None': 0,
        'No': 0,
        'Mn': 1,
        'Av': 1,
        'Gd': 1
    })
    
    dataframe['BsmtExposureSimpl'] = dataframe['BsmtExposure'].map({
        'None': 0,
        'No': 1,
        'Mn': 2,
        'Av': 2,
        'Gd': 3,
    })

    dataframe['ExteriorIntersection'] = dataframe['Exterior1st'] + dataframe['Exterior2nd']
    dataframe['IsBuiltInXX'] = dataframe['YearBuilt'].apply(lambda x: 1 if x > 2000 else 0)
    dataframe['InflactionFactor'] = 2010 - dataframe['YrSold'].astype(int)

    dataframe['MasVnrAreaGroups'] = pd.cut(dataframe['MasVnrArea'], bins=[-1, 1, 200, 400, 800, 1700], labels=[0, 1, 2, 3, 4])

    dataframe['ExterQC'] = dataframe['ExterQualEnc'] + dataframe['ExterCondEnc']
    dataframe['BsmtQC'] = dataframe['BsmtQualEnc'] + dataframe['BsmtCondEnc']

    dataframe['GarageAreaPerCar'] = dataframe['GarageArea'] / dataframe['GarageCars']

    dataframe['SaleTypeGroup'] = dataframe['SaleType'].map({
        'WD': 'WD',
        'CWD': 'WD',
        'New': 'New',
        'COD': 'COD',
        'Con': 'Contract',
        'ConLD': 'Contract',
        'ConLw': 'Contract',
        'ConLI': 'Contract',
        'Oth': 'Other'
    })


In [22]:
# https://www.kaggle.com/humananalog/xgboost-lasso

In [23]:
features = [
    'GrLivArea',
    '1stFlrSF',
    '2ndFlrSF',
    'LotArea',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtUnfSF', 
    'BsmtFinType1Enc',
    'BsmtFinType2Enc',
    'GarageCars',
    'OverallCond', 
    'Neighborhood',
    'LotShape',
    'LandSlope',
    'BsmtCondEnc',
    'BsmtQualEnc', 
    'SaleCondition',
    'CentralAirEnc',
    'IsAdjArterialStreat',
    'IsAdjFeederStreat',
    'IsNormalCondition',
    'IsNearOffSiteFeature',
    'IsAjdOffSiteFeature',
    'IsAdjRailroad',
    'TotalBathrooms',
    'GarageFinishEnc',
    'KitchenQualEnc',
    'BedroomAbvGr',
    'MSZoning',
    'IsRecentlyBuilt',
    'LandContour',
    'HasFireplace',
    'FunctionalGroup',
    'HouseAge',
    'FenceEnc',
    'IsGasHeating',
    'IsHeatingGood',
    'IsNewElectrBreakers',
    'IsGarageCondGood',
    'IsWoodenRoof',
    'RoofMatlCost',
    'BldgType',
    'HouseStyle',
    'MasVnrType',
    'TotRmsAbvGrd',
    'WoodDeckGroups',
    'HasEnclosedPorch',
    'YearBuiltBin',
    'HasScreenPorch',
    'AvgRoomSF',
    'BsmtExposureEnc',
    'Shed',
    'Season',
    'NeighborhoodGroups',
    'OverallHouseQC',
    'ExterQC',
    'InflactionFactor',
    'IsPavedDrive',
]

X = train_df[features]
Y = train_df['SalePrice']

x_test = test_df[features]

## Modeling

In [24]:
# Build feature transformer

logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

featureTransformer = ColumnTransformer([
        ('basement_area_log', logTransformer, ['GrLivArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LotArea', 'AvgRoomSF', 'Shed', 'TotRmsAbvGrd']),
        ('neighborhood_onehot', OneHotEncoder(categories=[neighborhoodCategories]), ['Neighborhood']),
        ('neighborhood_grp_onehot', OneHotEncoder(), ['NeighborhoodGroups']),
        ('lot_shape_onehot', OneHotEncoder(categories=[lotShapeCategories]), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(categories=[landSlopeCategories]), ['LandSlope']),
        ('sale_condtion_onehot', OneHotEncoder(categories=[saleCondCategories]), ['SaleCondition']),
        ('land_contour_onehot', OneHotEncoder(), ['LandContour']),
        ('zoning_onehot', OneHotEncoder(), ['MSZoning']),
        ('bldg_type_onehot', OneHotEncoder(), ['BldgType']),
        ('masvrn_type_onehot', OneHotEncoder(), ['MasVnrType']),
        ('house_style_onehot', OneHotEncoder(), ['HouseStyle']),
        ('season_onehot', OneHotEncoder(), ['Season']),
    ],
    remainder='passthrough'
)

### XGBoostRegressor

In [25]:
%%time

from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    max_depth=6,
    n_estimators=8000,
    learning_rate=0.01,
    min_child_weight=1.5,
    subsample=0.2,
    gamma=0.01,
    reg_alpha=1,
    reg_lambda=0.325,
    objective='reg:gamma',
    booster='gbtree'
)

xgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('xgb_regressor', xgb_model),
])

# Measure Performance
print('XGB Regressor:')
score_model(xgb_pipeline, X, Y)

## XGB Regressor:
# [CV] RMSE: 26724.8634 (5441.7765)
# [CV] MAE: 15203.3724 (13.3533)
# [CV] R^2: 0.8859 (0.0118)
# [CV] RMSLE: 0.122810 (0.0023)

XGB Regressor:
[CV] RMSE: 26979.5421 (2428.5517)
[CV] MAE: 15240.8392 (278.0163)
[CV] R^2: 0.8841 (0.0063)
[CV] RMSLE: 0.123613 (0.0029)
CPU times: user 3min, sys: 7.52 s, total: 3min 8s
Wall time: 29.4 s


In [26]:
xgb_pipeline.fit(X, Y)

y_test_predicted = xgb_pipeline.predict(x_test)
y_test_predicted = np.rint(y_test_predicted).astype(int)

# RMSLE: 0.11959549 (NeighborhoodGroups)
# RMSLE: 0.11951315 (OverralHouseQC)
# RMSLE: 0.11925251 (ExterQC)
# RMSLE: 0.11919112 (ExterQC && no ExternCondEnc)
# RMSLE: 0.11711295 (No OverralQual)
# RMSLE: 0.11643312 (InflationFactor)
# RMSLE: 0.11624550 (IsPavedDrive)
# RMSLE: 0.11620483 (Changed process of filling missing values)
# RMSLE: 0.11617889

score_predictions(test_y, y_test_predicted)

RMSLE: 0.11617889
MAE: 13012.90678547


In [27]:
test_df['SalePriceErrValue'] = np.abs(test_y - y_test_predicted)
test_df['SalePriceErrPerc'] = test_df['SalePriceErrValue'] / test_y

test_df[test_df['SalePriceErrPerc'] > 0.5]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,BsmtExposureSimpl,ExteriorIntersection,IsBuiltInXX,InflactionFactor,MasVnrAreaGroups,ExterQC,BsmtQC,GarageAreaPerCar,SalePriceErrValue,SalePriceErrPerc
53,1514,90,RL,98.0,13260,Pave,,IR1,Lvl,AllPub,...,1,HdBoardHdBoard,0,0,1,4,6,,61972.0,0.729941
76,1537,30,RM,68.0,9656,Pave,,Reg,Lvl,AllPub,...,1,AsbShngAsbShng,0,0,0,3,4,390.0,44264.0,3.461099
142,1603,30,C (all),66.0,8712,Grvl,,Reg,Lvl,AllPub,...,1,Wd SdngWd Sdng,0,0,0,2,6,330.0,33382.0,0.665802
189,1650,20,RL,60.0,7038,Pave,,Reg,Lvl,AllPub,...,1,VinylSdVinylSd,0,1,0,4,6,288.0,43194.0,0.564627
579,2040,60,RL,,24572,Pave,,IR1,Lvl,AllPub,...,1,Wd SdngWd Sdng,0,2,4,6,7,288.0,136413.0,0.90942
756,2217,20,RM,80.0,14584,Pave,,Reg,Low,AllPub,...,0,AsbShngVinylSd,0,2,0,1,0,243.5,43482.0,3.319237
1089,2550,20,RL,128.0,39290,Pave,,IR1,Bnk,AllPub,...,3,CemntBdCemntBd,1,3,4,6,8,384.666667,183478.0,0.997977
1299,2760,50,RL,60.0,10818,Pave,Grvl,Reg,Lvl,AllPub,...,1,VinylSdVinylSd,0,4,0,4,5,324.0,40258.0,0.503225


In [28]:
xgb_pipeline.fit(X, Y)
X_columns = get_columns_from_transformer(xgb_pipeline.named_steps['preprocessing'], list(X.columns))

In [29]:
features_list = sorted(zip(xgb_pipeline.named_steps['xgb_regressor'].feature_importances_, X_columns), reverse=True)
features_list

[(0.07669324, 'GarageCars'),
 (0.05976424, 'KitchenQualEnc'),
 (0.050573662, 'GrLivArea'),
 (0.039795697, 'BsmtQualEnc'),
 (0.039587017, 'NeighborhoodGroups_4'),
 (0.033879694, 'TotalBathrooms'),
 (0.030140286, 'MSZoning_C (all)'),
 (0.02975239, 'NeighborhoodGroups_1'),
 (0.028083896, 'OverallHouseQC'),
 (0.026175134, 'IsGasHeating'),
 (0.025397973, 'GarageFinishEnc'),
 (0.02405259, 'CentralAirEnc'),
 (0.01974732, 'IsGarageCondGood'),
 (0.018848225, 'HasFireplace'),
 (0.015064411, 'HouseAge'),
 (0.014854352, '1stFlrSF'),
 (0.013723822, 'ExterQC'),
 (0.011702661, 'SaleCondition_Alloca'),
 (0.01057349, 'SaleCondition_Family'),
 (0.010484252, 'RoofMatlCost'),
 (0.010112458, 'Neighborhood_MeadowV'),
 (0.010094301, 'Neighborhood_IDOTRR'),
 (0.009020462, 'BldgType_Duplex'),
 (0.008974923, 'BldgType_2fmCon'),
 (0.008958503, 'LotShape_IR3'),
 (0.008437211, 'MSZoning_RM'),
 (0.008108573, 'IsNormalCondition'),
 (0.008073493, 'SaleCondition_Abnorml'),
 (0.0072961804, 'IsAdjArterialStreat'),
 (0.0

In [30]:
from eli5.sklearn import PermutationImportance

transformed_X = xgb_pipeline.named_steps['preprocessing'].transform(X)

permutation_importance = PermutationImportance(
    xgb_model, 
    scoring=make_scorer(neg_rmsle),
    cv=2,
    random_state=42,
).fit(transformed_X, Y)

eli5.show_weights(permutation_importance, feature_names=X_columns, top=125)

Weight,Feature
0.0890  ± 0.0067,GrLivArea
0.0341  ± 0.0055,OverallHouseQC
0.0168  ± 0.0061,1stFlrSF
0.0140  ± 0.0068,HouseAge
0.0116  ± 0.0022,LotArea
0.0098  ± 0.0038,GarageCars
0.0065  ± 0.0021,TotalBathrooms
0.0054  ± 0.0022,KitchenQualEnc
0.0051  ± 0.0026,BsmtFinSF1
0.0034  ± 0.0032,NeighborhoodGroups_1


In [31]:
%%time

parameters = {
    'xgb_regressor__objective': ['reg:gamma'], # 'reg:squarederror', 'reg:squaredlogerror'
    'xgb_regressor__learning_rate': [0.01],
    'xgb_regressor__n_estimators': [7900, 8000, 8100],
    'xgb_regressor__max_depth': [11, 12, 13],
    'xgb_regressor__booster': ['gbtree'],
    'xgb_regressor__min_child_weight': [1.5],
    'xgb_regressor__gamma': [0],
    'xgb_regressor__subsample': [0.2],
    'xgb_regressor__reg_alpha': [0, 0.9, 1],
    'xgb_regressor__reg_lambda': [1, 0.3],
}

paramSearch = GridSearchCV(
   estimator=xgb_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

CPU times: user 1.19 ms, sys: 129 µs, total: 1.32 ms
Wall time: 95.1 µs


## Predict Submissions

In [32]:
xgb_pipeline.fit(X, Y)

y_test_predicted = xgb_pipeline.predict(x_test)
y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_xgb.csv', index=False)