In [1]:
import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV, cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer

In [2]:
np.random.seed(42)

# House Prices: Regression Modelling (Part II)

<img style="height: 450px" src="https://livability.com/sites/default/files/151SUBAME031.jpg" />

After we went through the EDA part, now it's time to turn our insights into a workable regression model 🧪

This notebook will be covering all aspects of modelling process I went through in order to get to **Top 4%** (Dec 2020) with a Kaggle RMLE **0.11617**.

**Feel free to upvote this notebook if you find it helpful** 💫 

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

full_df = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)

## Misc Functions

In [4]:
# define a new scoring that is used during Kaggle submission

def neg_rmsle(y_true, y_pred):
    y_pred = np.abs(y_pred)
    
    return -1 * np.sqrt(mean_squared_log_error(y_true, y_pred))

In [5]:
# function that cross-validate model performance based on RMSE, MAE, R^2 and RMSLE metrics 

def score_model(model, X, Y):
    
    scores = cross_validate(
        model, X, Y, 
        scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], cv=2,
        n_jobs=-1, verbose=0)

    rmsle_score = cross_val_score(model, X, Y, cv=2, scoring=make_scorer(neg_rmsle))

    mse_score = np.sqrt(-1 * scores['test_neg_mean_squared_error'].mean())
    mse_std = np.sqrt(scores['test_neg_mean_squared_error'].std())

    mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
    mae_std = scores['test_neg_mean_absolute_error'].std()

    r2_score_mean = scores['test_r2'].mean()
    r2_std = scores['test_r2'].std()

    print('[CV] RMSE: %.4f (%.4f)' % (mse_score, mse_std))
    print('[CV] MAE: %.4f (%.4f)' % (mae_score, mae_std))
    print('[CV] R^2: %.4f (%.4f)' % (r2_score_mean, r2_std))
    print('[CV] RMSLE: %.6f (%.4f)' % (-1 * rmsle_score.mean(), rmsle_score.std()))

In [6]:
# sklearn's pipeline API is limited at this point and doesn't provide a way to get columns of transformed X array
# This snippet will cover our back 

def get_columns_from_transformer(column_transformer, input_colums):    
    col_name = []

    for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names(raw_col_name)
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)

    [_, _, reminder_columns] = column_transformer.transformers_[-1]

    for col_idx in reminder_columns:
        col_name.append(input_colums[col_idx])

    return col_name

# Data Cleaning and Preparation

In [7]:
for feature in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
):
    train_df[feature] = train_df[feature].fillna('None')
    test_df[feature] = test_df[feature].fillna('None')
    full_df[feature] = full_df[feature].fillna('None')

for feature in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    train_df[feature] = train_df[feature].fillna(0)
    test_df[feature] = test_df[feature].fillna(0)
    full_df[feature] = full_df[feature].fillna(0)

for feature in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'Utilities',
):
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])
    full_df[feature] = full_df[feature].fillna(full_df[feature].mode()[0])

for dataframe in [train_df, test_df, full_df]:
    dataframe['MSZoning'] = dataframe.groupby(['Neighborhood'])['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
    dataframe['MSSubClass'] = dataframe.groupby(['HouseStyle'])['MSSubClass'].transform(lambda x: x.fillna(x.mode()[0]))
    dataframe['LotFrontage'] = dataframe.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    dataframe['Functional'] = dataframe['Functional'].fillna('Typ')

# Feature Engineering

For the sake of feature engineering we would like to encode some of the ordinal features that represents quality or conditions:

In [8]:
num_features = [f for f in train_df.columns if train_df.dtypes[f] != 'object']
num_features.remove('Id')
num_features.remove('SalePrice')

cat_features = [f for f in train_df.columns if train_df.dtypes[f] == 'object']

In [9]:
ordinal_feature_mapping = {
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, 
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1},
    'Alley': {'None': 0, 'Pave': 1, 'Grvl': 2},
    'Street': {'Pave': 0, 'Grvl': 1},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
}

non_ordinal_cat_features = list(set(cat_features) - set(ordinal_feature_mapping.keys()))

for cat_feature in non_ordinal_cat_features:
    train_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(train_df[cat_feature])
    test_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(test_df[cat_feature])
    full_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(full_df[cat_feature])

for ordinal_feature, feature_mapping in ordinal_feature_mapping.items():
    train_df[ordinal_feature + 'Enc'] = train_df[ordinal_feature].map(feature_mapping)
    test_df[ordinal_feature + 'Enc'] = test_df[ordinal_feature].map(feature_mapping)
    full_df[ordinal_feature + 'Enc'] = full_df[ordinal_feature].map(feature_mapping)

In [10]:
neighborhoodCategories = full_df['Neighborhood'].unique()
saleCondCategories = full_df['SaleCondition'].unique()
garageTypeCategories = full_df['GarageType'].unique()
lotShapeCategories = full_df['LotShape'].unique()
landSlopeCategories = full_df['LandSlope'].unique()

Now we can engineer new features. During expiriments there were much more features, but I have removed those that did not help me to improve model generalization. Here is a list of residuals that I found to be useful during modelling:

In [11]:
for dataframe in [train_df, test_df, full_df]:
    dataframe['AvgRoomSF'] = dataframe['GrLivArea'] / dataframe['TotRmsAbvGrd']

    dataframe['OverallHouseQC'] = dataframe['OverallQual'] + dataframe['OverallCond']

    dataframe['IsNeighborhoodElite'] = (dataframe['Neighborhood'].isin(['NridgHt', 'CollgeCr', 'Crawfor', 'StoreBr', 'Timber'])) * 1 

    dataframe['NeighborhoodGroups'] = dataframe['Neighborhood'].map({
        'MeadowV': 0,
        'IDOTRR': 1,
        'BrDale': 1,
        'OldTown': 1,
        'Edwards': 1,
        'BrkSide': 1,
        'Sawyer': 1,
        'Blueste': 1,
        'SWISU': 2,
        'NAmes': 2,
        'NPkVill': 2,
        'Mitchel': 2,
        'SawyerW': 2,
        'Gilbert': 2,  
        'NWAmes': 2,   
        'Blmngtn': 2,  
        'CollgCr': 2,  
        'ClearCr': 3,  
        'Crawfor': 3,  
        'Veenker': 3,  
        'Somerst': 3,  
        'Timber': 3,   
        'StoneBr': 4, 
        'NoRidge': 4, 
        'NridgHt': 4,
    })

    dataframe['HasFireplace'] = dataframe['Fireplaces'].apply(lambda x: int(x > 0))

    dataframe['TotalBathrooms'] = (dataframe['FullBath'] + (0.5 * dataframe['HalfBath']) +
                                dataframe['BsmtFullBath'] + (0.5 * dataframe['BsmtHalfBath']))

    dataframe['IsPavedDrive'] = (dataframe['PavedDrive'] == 'Y') * 1

    year_built_bins = np.linspace(1871, 2010, 10) 
    dataframe['YearBuiltBin'] = pd.cut(dataframe['YearBuilt'], bins=year_built_bins, labels=range(1, 10)) 

    dataframe['HouseAge'] = dataframe['YrSold'] - dataframe['YearBuilt'].astype('int')
    dataframe['IsRecentlyBuilt'] = (full_df['YearBuilt'] == full_df['YrSold'].astype(int)) * 1
    dataframe['IsRecentlyRemod'] = (full_df['YearRemodAdd'] == full_df['YrSold'].astype(int)) * 1 # check it with a new model

    dataframe['ExterQC'] = dataframe['ExterQualEnc'] + dataframe['ExterCondEnc']

    dataframe['FunctionalGroup'] = dataframe['Functional'].map({
        'Typ': 2,
        'Min1': 1,
        'Min2': 1,
        'Mod': 1,
        'Maj1': 0,
        'Maj2': 0,
        'Sev': 0,
        'Sav': 0,
    })

    dataframe['IsGasHeating'] = dataframe['Heating'].map({'GasA': 1, 'GasW': 1, 'Grav': 0, 'Wall': 0, 'OthW': 0, 'Floor': 0})
    dataframe['IsHeatingGood'] = dataframe['HeatingQC'].map({'Po': 0, 'Fa': 0, 'TA': 0, 'Gd': 1, 'Ex': 1})

    dataframe['IsNewElectrBreakers'] = dataframe['Electrical'].map({'SBrkr': 1, 'FuseF': 0, 'FuseA': 0, 'FuseP': 0, 'Mix': 0})

    dataframe['IsGarageCondGood'] = dataframe['GarageCond'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['RoofMatlCost'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 1,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 1,
        'Membran': 0,
        'Tar&Grv': 0,
    })
    
    dataframe['IsWoodenRoof'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 0,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 0,
        'Membran': 0,
        'Tar&Grv': 0,
    })

    dataframe['IsAdjArterialStreat'] = ((dataframe['Condition1'] == 'Artery') | (dataframe['Condition2'] == 'Artery')) * 1
    dataframe['IsAdjFeederStreat'] = ((dataframe['Condition1'] == 'Feedr') | (dataframe['Condition2'] == 'Feedr')) * 1
    dataframe['IsNormalCondition'] = ((dataframe['Condition1'] == 'Norm') | (dataframe['Condition2'] == 'Norm')) * 1
    dataframe['IsAjdOffSiteFeature'] = ((dataframe['Condition1'] == 'PosA') | (dataframe['Condition2'] == 'PosA')) * 1
    dataframe['IsNearOffSiteFeature'] = ((dataframe['Condition1'] == 'PosN') | (dataframe['Condition2'] == 'PosN')) * 1
    dataframe['IsNearRailroad'] = ((dataframe['Condition1'].isin(['RRNn', 'RRNe'])) | (dataframe['Condition2'].isin(['RRNn', 'RRNe']))) * 1
    dataframe['IsAdjRailroad'] = ((dataframe['Condition1'].isin(['RRAn', 'RRAe'])) | (dataframe['Condition2'].isin(['RRAn', 'RRAe']))) * 1

    dataframe['WoodDeckGroups'] = pd.cut(dataframe['WoodDeckSF'], bins=[-1, 1, 200, 500, 2000], labels=[0, 1, 2, 3])
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1
    dataframe['HasScreenPorch'] = (dataframe['ScreenPorch'] > 0) * 1
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1

    dataframe['Shed'] = (dataframe['MiscFeature'] == 'Shed') * 1 * full_df['MiscVal']

    dataframe['Season'] = dataframe['MoSold'].map({
        12: 0, 1: 0, 2: 0,
        3: 1, 4: 1, 5: 1, 
        6: 2, 7: 2, 8: 2, 
        9: 3, 10: 3, 11: 3,
    })

    dataframe['InflationFactor'] = dataframe['YrSold'] - 2006

I had to keep feature engineering and preceding steps outside of sklearn pipelines. This is due to the current limitations of the pipeline API which would made the code less readable and clean.

I have not used any automatic technics for feature engineering and selection, but manually selected features based on my understanding of the domain and EDA:

In [12]:
features = [
    'GrLivArea',
    '1stFlrSF',
    '2ndFlrSF',
    'LotArea',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtUnfSF', 
    'BsmtFinType1Enc',
    'BsmtFinType2Enc',
    'GarageCars',
    'OverallCond', 
    'Neighborhood',
    'LotShape',
    'LandSlope',
    'BsmtCondEnc',
    'BsmtQualEnc', 
    'SaleCondition',
    'CentralAirEnc',
    'IsAdjArterialStreat',
    'IsAdjFeederStreat',
    'IsNormalCondition',
    'IsNearOffSiteFeature',
    'IsAjdOffSiteFeature',
    'IsAdjRailroad',
    'TotalBathrooms',
    'GarageFinishEnc',
    'KitchenQualEnc',
    'BedroomAbvGr',
    'MSZoning',
    'IsRecentlyBuilt',
    'LandContour',
    'HasFireplace',
    'FunctionalGroup',
    'HouseAge',
    'FenceEnc',
    'IsGasHeating',
    'IsHeatingGood',
    'IsNewElectrBreakers',
    'IsGarageCondGood',
    'IsWoodenRoof',
    'RoofMatlCost',
    'BldgType',
    'HouseStyle',
    'MasVnrType',
    'TotRmsAbvGrd',
    'WoodDeckGroups',
    'HasEnclosedPorch',
    'YearBuiltBin',
    'HasScreenPorch',
    'AvgRoomSF',
    'BsmtExposureEnc',
    'Shed',
    'Season',
    'NeighborhoodGroups',
    'OverallHouseQC',
    'ExterQC',
    'InflationFactor',
    'IsPavedDrive',
]

X = train_df[features]
Y = train_df['SalePrice']

x_test = test_df[features]

# Modeling

During experiments, I tried different models: RandomForest, Lasso, Ridge, GradientBoosting, HistGradientBoosting, VotingRegressor, StackRegressor. So far XGBoost have showed the best results for my feature set:

In [13]:
logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

featureTransformer = ColumnTransformer([
        ('log_scaling', logTransformer, ['GrLivArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LotArea', 'AvgRoomSF', 'Shed', 'TotRmsAbvGrd']),
        ('neighborhood_onehot', OneHotEncoder(categories=[neighborhoodCategories]), ['Neighborhood']),
        ('neighborhood_grp_onehot', OneHotEncoder(), ['NeighborhoodGroups']),
        ('lot_shape_onehot', OneHotEncoder(categories=[lotShapeCategories]), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(categories=[landSlopeCategories]), ['LandSlope']),
        ('sale_condition_onehot', OneHotEncoder(categories=[saleCondCategories]), ['SaleCondition']),
        ('land_contour_onehot', OneHotEncoder(), ['LandContour']),
        ('zoning_onehot', OneHotEncoder(), ['MSZoning']),
        ('bldg_type_onehot', OneHotEncoder(), ['BldgType']),
        ('masvrn_type_onehot', OneHotEncoder(), ['MasVnrType']),
        ('house_style_onehot', OneHotEncoder(), ['HouseStyle']),
        ('season_onehot', OneHotEncoder(), ['Season']),
    ],
    remainder='passthrough'
)

In [14]:
%%time

from xgboost import XGBRegressor

xgb_model = XGBRegressor(
    max_depth=6,
    n_estimators=8000,
    learning_rate=0.01,
    min_child_weight=1.5,
    subsample=0.2,
    gamma=0.01,
    reg_alpha=1,
    reg_lambda=0.325,
    objective='reg:gamma',
    booster='gbtree'
)

xgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('xgb_regressor', xgb_model),
])

print('XGB Regressor:')
score_model(xgb_pipeline, X, Y)

XGB Regressor:
[CV] RMSE: 26969.3207 (2539.5470)
[CV] MAE: 15251.8815 (266.9740)
[CV] R^2: 0.8842 (0.0062)
[CV] RMSLE: 0.123592 (0.0029)
CPU times: user 2min 6s, sys: 2.69 s, total: 2min 9s
Wall time: 24.4 s


Ames dataset is pretty small for the amount of features it contains. I have used 2-folds cross-validation to evaluate the model and find hyperparams. Two folds should help to ensure that we have enough samples in validation fold to get more-less meaningfull feedback about model changes.

## Debugging

To inspect model results and affect of different adjustments, I have collected information about model feature importance and used ELI5 permutation importance as an additional source of information:

In [15]:
xgb_pipeline.fit(X, Y)
X_columns = get_columns_from_transformer(xgb_pipeline.named_steps['preprocessing'], list(X.columns))

In [16]:
features_list = sorted(zip(xgb_pipeline.named_steps['xgb_regressor'].feature_importances_, X_columns), reverse=True)
features_list

[(0.07669324, 'GarageCars'),
 (0.05976424, 'KitchenQualEnc'),
 (0.050573662, 'GrLivArea'),
 (0.039795697, 'BsmtQualEnc'),
 (0.039587017, 'NeighborhoodGroups_4'),
 (0.033879694, 'TotalBathrooms'),
 (0.030140286, 'MSZoning_C (all)'),
 (0.02975239, 'NeighborhoodGroups_1'),
 (0.028083896, 'OverallHouseQC'),
 (0.026175134, 'IsGasHeating'),
 (0.025397973, 'GarageFinishEnc'),
 (0.02405259, 'CentralAirEnc'),
 (0.01974732, 'IsGarageCondGood'),
 (0.018848225, 'HasFireplace'),
 (0.015064411, 'HouseAge'),
 (0.014854352, '1stFlrSF'),
 (0.013723822, 'ExterQC'),
 (0.011702661, 'SaleCondition_Alloca'),
 (0.01057349, 'SaleCondition_Family'),
 (0.010484252, 'RoofMatlCost'),
 (0.010112458, 'Neighborhood_MeadowV'),
 (0.010094301, 'Neighborhood_IDOTRR'),
 (0.009020462, 'BldgType_Duplex'),
 (0.008974923, 'BldgType_2fmCon'),
 (0.008958503, 'LotShape_IR3'),
 (0.008437211, 'MSZoning_RM'),
 (0.008108573, 'IsNormalCondition'),
 (0.008073493, 'SaleCondition_Abnorml'),
 (0.0072961804, 'IsAdjArterialStreat'),
 (0.0

In [17]:
from eli5.sklearn import PermutationImportance

transformed_X = xgb_pipeline.named_steps['preprocessing'].transform(X)

permutation_importance = PermutationImportance(
    xgb_model, 
    scoring=make_scorer(neg_rmsle),
    cv=2,
    random_state=42,
).fit(transformed_X, Y)

eli5.show_weights(permutation_importance, feature_names=X_columns, top=125)

Weight,Feature
0.0888  ± 0.0066,GrLivArea
0.0341  ± 0.0055,OverallHouseQC
0.0169  ± 0.0062,1stFlrSF
0.0140  ± 0.0068,HouseAge
0.0116  ± 0.0022,LotArea
0.0098  ± 0.0038,GarageCars
0.0065  ± 0.0020,TotalBathrooms
0.0054  ± 0.0022,KitchenQualEnc
0.0050  ± 0.0027,BsmtFinSF1
0.0034  ± 0.0032,NeighborhoodGroups_1


## Hypetuning

Parameter fine-tuning is quite time-consuming process. I have used GrigSearch approach to find parameter baseline set and than kind of manually optimized them from that point. My best shot is used in the XGB params above.

In [18]:
%%time

parameters = {
    'xgb_regressor__objective': ['reg:gamma'], # 'reg:squarederror', 'reg:squaredlogerror'
    'xgb_regressor__learning_rate': [0.01],
    'xgb_regressor__n_estimators': [7900, 8000, 8100],
    'xgb_regressor__max_depth': [11, 12, 13],
    'xgb_regressor__booster': ['gbtree'],
    'xgb_regressor__min_child_weight': [1.5],
    'xgb_regressor__gamma': [0],
    'xgb_regressor__subsample': [0.2],
    'xgb_regressor__reg_alpha': [0, 0.9, 1],
    'xgb_regressor__reg_lambda': [1, 0.3],
}

paramSearch = GridSearchCV(
   estimator=xgb_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

CPU times: user 1.08 ms, sys: 210 µs, total: 1.29 ms
Wall time: 102 µs


## Predict Submissions

Finally, it's time to check model generalization with Kaggle submissions. I was able to get to **Top 4%** (Dec 2020) with a Kaggle RMLE **0.11617**:

In [19]:
xgb_pipeline.fit(X, Y)

y_test_predicted = xgb_pipeline.predict(x_test)
y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_xgb.csv', index=False)

# Summary

I have spent a few weeks on this task as part of my learn by doing plan and it was definetly worth it.

It's easy to create a baseline model and it's hard to go from that point to model that would be presice enough to be useful. For this task, XGBoost model was super helpul and allowed me to cross Top 25%. The rest of the improvements was brought by extensive feature engineering (and some by hypetuning).

**If you find this notebook helpfull, feel free to upvote** 💫

## Reference:
- https://www.kaggle.com/humananalog/xgboost-lasso
- https://www.kaggle.com/cerberus4229/voting-regressor-with-pipelines