In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, \
    RobustScaler, FunctionTransformer, PowerTransformer
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [2]:
np.random.seed(42)
set_config(display='diagram')

plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style='whitegrid')

In [3]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

full_df = pd.concat([train_df, test_df], sort=True).reset_index(drop=True)

In [4]:
all_df = pd.read_csv('data/test_labels.csv')
test_y = all_df['SalePrice']
test_y

0       105000.0
1       172000.0
2       189900.0
3       195500.0
4       191500.0
          ...   
1454     90500.0
1455     71000.0
1456    131000.0
1457    132000.0
1458    188000.0
Name: SalePrice, Length: 1459, dtype: float64

## Misc Functions

In [5]:
def plot_learning_curve(estimator, X_train, y_train, cv, train_sizes=np.linspace(0.1, 1, 10)):
    plt.style.use('seaborn-darkgrid')
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_train, y_train, 
        scoring='neg_mean_squared_error',
        cv=cv, 
        n_jobs=-1, 
        train_sizes=train_sizes,
        shuffle=True,
        random_state=42
        )
    train_mean_scores = np.mean(train_scores, axis=1)
    test_mean_scores = np.mean(test_scores, axis=1)

    plt.title('Learning curve')
    plt.plot(train_sizes, train_mean_scores, 'y', label='Train Learning curve')
    plt.plot(train_sizes, test_mean_scores, 'b', label='Test Learning curve')
    plt.legend()

In [6]:
def neg_rmsle(y_true, y_pred):
    y_pred = np.abs(y_pred)
    
    return -1 * np.sqrt(mean_squared_log_error(y_true, y_pred))

In [7]:
def score_model(model, X, Y):
    
    scores = cross_validate(
        model, X, Y, 
        scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], cv=2,
        n_jobs=-1, verbose=0)

    rmsle_score = cross_val_score(model, X, Y, cv=2, scoring=make_scorer(neg_rmsle))

    mse_score = -1 * scores['test_neg_mean_squared_error'].mean()
    mse_std = scores['test_neg_mean_squared_error'].std()

    mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
    mae_std = scores['test_neg_mean_absolute_error'].std()

    r2_score_mean = scores['test_r2'].mean()
    r2_std = scores['test_r2'].std()

    print('[CV] MSE: %.4f (%.4f)' % (mse_score, mse_std))
    print('[CV] MAE: %.4f (%.4f)' % (mae_score, mae_std))
    print('[CV] R^2: %.4f (%.4f)' % (r2_score_mean, r2_std))
    print('[CV] RMSLE: %.6f (%.4f)' % (-1 * rmsle_score.mean(), rmsle_score.std()))

In [8]:
def score_predictions(y_true, y_predicted):
    rmsle_score = neg_rmsle(y_true, y_predicted)
    print('RMSLE: %.8f' % (-1 * rmsle_score))

In [9]:
def get_columns_from_transformer(column_transformer, input_colums):    
    col_name = []

    for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names(raw_col_name)
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)

    [_, _, reminder_columns] = column_transformer.transformers_[-1]

    for col_idx in reminder_columns:
        col_name.append(input_colums[col_idx])

    return col_name

## Data Cleaning and Preparation

In [10]:
for dataframe in [train_df, test_df]:
    dataframe['YrSold'] = dataframe['YrSold'].astype(str)
    dataframe['MoSold'] = dataframe['MoSold'].astype(str)

In [11]:
num_features = [f for f in train_df.columns if train_df.dtypes[f] != 'object']
num_features.remove('Id')
num_features.remove('SalePrice')

cat_features = [f for f in train_df.columns if train_df.dtypes[f] == 'object']

In [12]:
for feature in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
    'MSSubClass',
):
    train_df[feature] = train_df[feature].fillna('None')
    test_df[feature] = test_df[feature].fillna('None')
    full_df[feature] = full_df[feature].fillna('None')

for feature in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    train_df[feature] = train_df[feature].fillna(0)
    test_df[feature] = test_df[feature].fillna(0)
    full_df[feature] = full_df[feature].fillna(0)

for feature in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'MSZoning',
    'Utilities',
):
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])
    full_df[feature] = full_df[feature].fillna(test_df[feature].mode()[0])

train_df['Functional'] = train_df['Functional'].fillna('Typ')
test_df['Functional'] = test_df['Functional'].fillna('Typ')
full_df['Functional'] = full_df['Functional'].fillna('Typ')

In [13]:
train_df['LotFrontage'] = train_df.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
test_df['LotFrontage'] = test_df.groupby(['Neighborhood', 'MSSubClass'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [14]:
for dataframe in [train_df, test_df]:
    dataframe['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
    dataframe['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)

In [15]:
ordinal_feature_mapping = {
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, 
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1},
    'Alley': {'None': 0, 'Pave': 1, 'Grvl': 2},
    'Street': {'Pave': 0, 'Grvl': 1},
    'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
}

non_ordinal_cat_features = list(set(cat_features) - set(ordinal_feature_mapping.keys()))

for cat_feature in non_ordinal_cat_features:
    train_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(train_df[cat_feature])
    test_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(test_df[cat_feature])

for ordinal_feature, feature_mapping in ordinal_feature_mapping.items():
    train_df[ordinal_feature + 'Enc'] = train_df[ordinal_feature].map(feature_mapping)
    test_df[ordinal_feature + 'Enc'] = test_df[ordinal_feature].map(feature_mapping)

### Num Feature Scaling

In [16]:
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

## Feature Engineering

In [17]:
for dataframe in [train_df, test_df]:
    dataframe['Has2ndFloor'] = dataframe['2ndFlrSF'].apply(lambda x: int(x > 0))
    dataframe['HasBsmnt'] = dataframe['TotalBsmtSF'].apply(lambda x: int(x > 0))
    dataframe['HasGarage'] = dataframe['GarageArea'].apply(lambda x: int(x > 0))
    dataframe['HasPool'] = dataframe['PoolArea'].apply(lambda x: int(x > 0))
    dataframe['HasFence'] = dataframe['Fence'].apply(lambda x: int(x != 'None'))
    dataframe['HasFireplace'] = dataframe['Fireplaces'].apply(lambda x: int(x > 0))
    dataframe['HasMasVnr'] = dataframe['MasVnrType'].apply(lambda x: int(x != 'None'))
    dataframe['HasShed'] = (dataframe['MiscFeature'] == 'Shed') * 1

    dataframe['HouseAge'] = dataframe['YrSold'].astype('int') - dataframe['YearBuilt'].astype('int')
    dataframe['HouseAgeSinRemod'] = dataframe['YrSold'].astype('int') - dataframe['YearRemodAdd'].astype('int')

In [18]:
# Remove outliers
train_df.drop(
    train_df[(train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 700000)].index
);

In [19]:
subclassCategories = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190] #removed 150 class as useless during predictions

exteriorCategories = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

neighborhoodCategories = full_df['Neighborhood'].unique()
saleCondCategories = full_df['SaleCondition'].unique()
garageTypeCategories = full_df['GarageType'].unique()
lotConfigCategories = full_df['LotConfig'].unique() # feature was removed from the model
lotShapeCategories = full_df['LotShape'].unique()
landSlopeCategories = full_df['LandSlope'].unique()

In [20]:
year_built_bins = np.linspace(1871, 2010, 10) # 10 bins

In [21]:
# Reference: https://www.kaggle.com/cerberus4229/voting-regressor-with-pipelines

for dataframe in [train_df, test_df]:
    dataframe['TotalBathrooms'] = (dataframe['FullBath'] + (0.5 * dataframe['HalfBath']) +
                                dataframe['BsmtFullBath'] + (0.5 * dataframe['BsmtHalfBath']))

    dataframe['OverallHouseQCBin'] = pd.qcut(dataframe['OverallQual'] + dataframe['OverallCond'], q=3, labels=[0, 1, 2])
    
    dataframe['IsStreetGraved'] = (dataframe['Street'] == 'Grvl') * 1
    dataframe['IsPavedDrive'] = (dataframe['PavedDrive'] == 'Y') * 1

    dataframe['IsNeighborhoodElite'] = (dataframe['Neighborhood'].isin(['NridgHt', 'CollgeCr', 'Crawfor', 'StoreBr', 'Timber'])) * 1 

    dataframe['YearBuiltBin'] = pd.cut(dataframe['YearBuilt'], bins=year_built_bins, labels=range(1, 10)) 
    dataframe['YearBuiltBin2'] = pd.cut(dataframe['YearBuilt'], bins=[1871, 1910, 1955, 1980, 2000, 2011], labels=range(0, 5)) 
    dataframe['IsNewHouseSold'] = (full_df['YearBuilt'] == full_df['YrSold'].astype(int)) * 1

    dataframe['KitchenQCBin'] = pd.cut(dataframe['KitchenQualEnc'] * dataframe['KitchenAbvGr'], [-1, 2, 7], labels=[0, 1])

    dataframe['IsFunctional'] = (dataframe['Functional'] == 'Typ') * 1
    dataframe['FunctionalGroup'] = dataframe['Functional'].map({
        'Typ': 2,
        'Min1': 1,
        'Min2': 1,
        'Mod': 1,
        'Maj1': 0,
        'Maj2': 0,
        'Sev': 0,
        'Sav': 0,
    })

    dataframe['IsModernHouseType'] = dataframe['MSSubClass'].map(
        {20: 1, 30: 0, 40: 0, 45: 0, 50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0, 90: 0, 120: 1, 150: 0, 160: 0, 180: 0, 190: 0}
    )
    dataframe['IsLandFlat'] = (dataframe['LandContour'] == 'Lvl') * 1

    dataframe['IsExterCondGood'] = dataframe['ExterCond'].map({'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['IsGasHeating'] = dataframe['Heating'].map({'GasA': 1, 'GasW': 1, 'Grav': 0, 'Wall': 0, 'OthW': 0, 'Floor': 0})
    dataframe['IsHeatingGood'] = dataframe['HeatingQC'].map({'Po': 0, 'Fa': 0, 'TA': 0, 'Gd': 1, 'Ex': 1})

    dataframe['IsNewElectrBreakers'] = dataframe['Electrical'].map({'SBrkr': 1, 'FuseF': 0, 'FuseA': 0, 'FuseP': 0, 'Mix': 0})

    dataframe['IsGarageCondGood'] = dataframe['GarageCond'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})
    dataframe['IsGarageQualGood'] = dataframe['GarageQual'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['RoofMatlCost'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 1,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 1,
        'Membran': 0,
        'Tar&Grv': 0,
    })
    
    dataframe['IsWoodenRoof'] = dataframe['RoofMatl'].map({
        'CompShg': 0,
        'WdShake': 1,
        'ClyTile': 0,
        'WdShngl': 1,
        'Roll': 0,
        'Metal': 0,
        'Membran': 0,
        'Tar&Grv': 0,
    })

    dataframe['IsCompShngl'] = dataframe['RoofMatl'].map({
        'CompShg': 1,
        'WdShake': 0,
        'ClyTile': 0,
        'WdShngl': 0,
        'Roll': 0,
        'Metal': 0,
        'Membran': 0,
        'Tar&Grv': 0,
    })

    dataframe['IsAdjArterialStreat'] = ((dataframe['Condition1'] == 'Artery') | (dataframe['Condition2'] == 'Artery')) * 1
    dataframe['IsAdjFeederStreat'] = ((dataframe['Condition1'] == 'Feedr') | (dataframe['Condition2'] == 'Feedr')) * 1
    dataframe['IsNormalCondition'] = ((dataframe['Condition1'] == 'Norm') | (dataframe['Condition2'] == 'Norm')) * 1
    dataframe['IsAjdOffSiteFeature'] = ((dataframe['Condition1'] == 'PosA') | (dataframe['Condition2'] == 'PosA')) * 1
    dataframe['IsNearOffSiteFeature'] = ((dataframe['Condition1'] == 'PosN') | (dataframe['Condition2'] == 'PosN')) * 1
    dataframe['IsNearRailroad'] = ((dataframe['Condition1'].isin(['RRNn', 'RRNe'])) | (dataframe['Condition2'].isin(['RRNn', 'RRNe']))) * 1
    dataframe['IsAdjRailroad'] = ((dataframe['Condition1'].isin(['RRAn', 'RRAe'])) | (dataframe['Condition2'].isin(['RRAn', 'RRAe']))) * 1

    dataframe['TotalPorchArea'] = dataframe['OpenPorchSF'] + dataframe['EnclosedPorch'] + dataframe['3SsnPorch'] + dataframe['ScreenPorch']
    dataframe['HasPorch'] = (dataframe['TotalPorchArea'] > 0) * 1
    dataframe['HasWoodDeck'] = (dataframe['WoodDeckSF'] > 0) * 1

    dataframe['WoodDeckGroups'] = pd.cut(dataframe['WoodDeckSF'], bins=[-1, 1, 200, 500, 2000], labels=[0, 1, 2, 3])
    dataframe['OpenPorchGroups'] = pd.cut(dataframe['OpenPorchSF'], bins=[-1, 1, 40, 70, 1000], labels=[0, 1, 2, 3])
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1
    dataframe['HasScreenPorch'] = (dataframe['ScreenPorch'] > 0) * 1
    dataframe['HasEnclosedPorch'] = (dataframe['EnclosedPorch'] > 0) * 1

    dataframe['AvgRoomSF'] = dataframe['GrLivArea'] / dataframe['TotRmsAbvGrd']

    dataframe['Shed'] = (dataframe['MiscFeature'] == 'Shed') * 1 * full_df['MiscVal']
    dataframe['OtherFeature'] = (dataframe['MiscFeature'].isin(['TenC', 'Othr'])) * 1 * full_df['MiscVal']

    dataframe['Season'] = dataframe['MoSold'].map({
        '12': 0, '1': 0, '2': 0,
        '3': 1, '4': 1, '5': 1, 
        '6': 2, '7': 2, '8': 2, 
        '9': 3, '10': 3, '11': 3,
    })


In [22]:
# https://www.kaggle.com/humananalog/xgboost-lasso

In [23]:
features = [
    'GrLivArea',
    '1stFlrSF',
    '2ndFlrSF',
    'LotArea',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtUnfSF', 
    'BsmtFinType1Enc',
    'BsmtFinType2Enc',
    'OverallQual',
    'GarageCars',
    'OverallCond', 
    'Neighborhood',
    #'MSSubClass', 
    'LotShape',
    'LandSlope',
    'BsmtCondEnc',
    'BsmtQualEnc', 
    'SaleCondition',
    'CentralAirEnc',
    'IsAdjArterialStreat',
    'IsAdjFeederStreat',
    'IsNormalCondition',
    'IsNearOffSiteFeature',
    'IsAjdOffSiteFeature',
    'IsNearRailroad',
    'IsAdjRailroad',
    'TotalBathrooms',
    'GarageFinishEnc',
    'KitchenQualEnc',
    'BedroomAbvGr',
    'MSZoning',
    'ExterQualEnc',
    'IsNewHouseSold',
    'LandContour',
    'HasFireplace',
    'FunctionalGroup',
    'HouseAge',
    'FenceEnc',
    #'IsModernHouseType',
    'IsGasHeating',
    'IsHeatingGood',
    'IsNewElectrBreakers',
    'IsGarageCondGood',
    'IsWoodenRoof',
    'RoofMatlCost',
    'BldgType',
    'HouseStyle',
    'HasPool',
    'MasVnrType',
    'TotRmsAbvGrd',
    'WoodDeckGroups',
    'HasEnclosedPorch',
    'YearBuiltBin',
    'HasScreenPorch',
    'AvgRoomSF',
    'BsmtExposureEnc',
    'Shed',
    'Season'
    #'Exterior1st'
]

X = train_df[features]
Y = train_df['SalePrice']

x_test = test_df[features]

## Modeling

In [24]:
# Build feature transformer

logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

featureTransformer = ColumnTransformer([
        ('basement_area_log', logTransformer, ['GrLivArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'LotArea', 'AvgRoomSF', 'Shed', 'TotRmsAbvGrd']),
        ('neighborhood_onehot', OneHotEncoder(categories=[neighborhoodCategories]), ['Neighborhood']),
        #('subclass_onehot', OneHotEncoder(categories=[subclassCategories], handle_unknown='ignore'), ['MSSubClass']),
        ('lot_shape_onehot', OneHotEncoder(categories=[lotShapeCategories]), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(categories=[landSlopeCategories]), ['LandSlope']),
        ('sale_condtion_onehot', OneHotEncoder(categories=[saleCondCategories]), ['SaleCondition']),
        ('land_contour_onehot', OneHotEncoder(), ['LandContour']),
        ('zoning_onehot', OneHotEncoder(), ['MSZoning']),
        ('bldg_type_onehot', OneHotEncoder(), ['BldgType']),
        ('masvrn_type_onehot', OneHotEncoder(), ['MasVnrType']),
        ('house_style_onehot', OneHotEncoder(), ['HouseStyle']),
        ('season_onehot', OneHotEncoder(), ['Season']),
        #('ext_onehot', OneHotEncoder(categories=[exteriorCategories]), ['Exterior1st']),
    ],
    remainder='passthrough'
)

### GradientBoostingRegressor

In [25]:
gb_model = GradientBoostingRegressor(
    loss='lad',
    max_depth=3,
    max_features=None,
    n_estimators=1500,
    subsample=0.65,
    min_samples_split=2,
    min_samples_leaf=2,
    random_state=42,
)

gb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('gb_regressor', gb_model),
])

print('GradientBoosting Regressor:')
score_model(gb_pipeline, X, Y)

GradientBoosting Regressor:
[CV] MSE: 753338961.9998 (10320415.2553)
[CV] MAE: 16186.1894 (101.8192)
[CV] R^2: 0.8799 (0.0091)
[CV] RMSLE: 0.125813 (0.0042)


In [26]:
parameters = {
    'gb_regressor__loss': ['ls', 'lad', 'huber', 'quantile'],
    'gb_regressor__n_estimators': [900, 1000, 1500, 2000],
    'gb_regressor__max_depth': [3, 4, 5, 6],
}

paramSearch = GridSearchCV(
  estimator=gb_pipeline,
  scoring=make_scorer(neg_rmsle),
  param_grid=parameters, 
  cv=2,
  n_jobs=-1, 
  verbose=3
)


#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### HistGradientBoostingRegressor

In [27]:
hgb_model = HistGradientBoostingRegressor(
    loss='poisson',
    max_depth=4,
    max_iter=400,
    learning_rate=0.1,
    l2_regularization=0.325,
    random_state=42,
)

hgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('hgb_regressor', hgb_model),
])

print('HistGradientBoosting Regressor:')
score_model(hgb_pipeline, X, Y)

HistGradientBoosting Regressor:
[CV] MSE: 726548734.2341 (35983985.7462)
[CV] MAE: 16615.7606 (734.4582)
[CV] R^2: 0.8846 (0.0015)
[CV] RMSLE: 0.130161 (0.0067)


In [28]:
parameters = {
    'hgb_regressor__max_iter': [500, 600, 800, 1000, 2000], # 100
    'hgb_regressor__max_depth': [2, 3, 4, 5, 10], # None
    'hgb_regressor__loss': ['poisson', 'least_squares'], # 'least_squares',
    'hgb_regressor__learning_rate': [0.1, 0.001, 0.2, 0.25],
    'hgb_regressor__l2_regularization': [0.26, 0.3, 0.35, 0.4], # 0
    'hgb_regressor__min_samples_leaf': [20, 22, 25, 15],
    'hgb_regressor__max_leaf_nodes': [31, 20, 40], 
}

paramSearch = RandomizedSearchCV(
  estimator=hgb_pipeline,
  scoring=make_scorer(neg_rmsle),
  param_distributions=parameters, 
  n_iter=100,
  cv=2,
  n_jobs=-1, 
  verbose=3
)


#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### Random Forest

In [29]:
%%time

random_forest_model = RandomForestRegressor(
        bootstrap=True,
        max_depth=20,
        max_features='auto',
        max_leaf_nodes=None,
        max_samples=None,
        min_samples_leaf=5,
        min_samples_split=6,
        n_estimators=3000,
        random_state=42,
    )

random_forest_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('random_forest', random_forest_model),
])

# Measure performance

print('RandomForest Regressor:')
score_model(random_forest_pipeline, X, Y)

## RF(max_depth=25, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=1000)
## CV=2, 0.15007
#[CV] MSE: 889831012.8445 (46061331.0130)
#[CV] MAE: 17721.0043 (817.3113)
#[CV] R^2: 0.8587 (0.0015)

RandomForest Regressor:
[CV] MSE: 956333644.7460 (24433391.1496)
[CV] MAE: 18390.3192 (371.2980)
[CV] R^2: 0.8474 (0.0134)
[CV] RMSLE: 0.150478 (0.0050)
CPU times: user 28.8 s, sys: 396 ms, total: 29.2 s
Wall time: 42 s


In [30]:
#random_forest_pipeline.fit(X, Y)

#X_columns = get_columns_from_transformer(random_forest_pipeline.named_steps['preprocessing'], list(X.columns))

#features_list = sorted(zip(random_forest_pipeline.named_steps['random_forest'].feature_importances_, X_columns), reverse=True)
#random_forest_pipeline.named_steps['preprocessing'].transformers_

#features_list

In [31]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

parameters = {
    'random_forest__max_depth': [None, 15,  20, 25],
    'random_forest__max_features': ['sqrt'],
    'random_forest__max_leaf_nodes': [None, 5, 15, 17, 20],
    'random_forest__min_samples_leaf': [1, 2, 3],
    'random_forest__min_samples_split': [2, 1, 3],
    'random_forest__max_samples': [None, 2, 5],
    'random_forest__n_estimators': [1500, 2000, 2500]
}

paramSearch = GridSearchCV(
  estimator=random_forest_pipeline,
  scoring=make_scorer(neg_rmsle),
  param_grid=parameters, 
  cv=2,
  n_jobs=-1, 
  verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

In [32]:
#plot_learning_curve(random_forest_pipeline, X, Y, cv=3)

### XGBoostRegressor

In [33]:
%%time

from xgboost import XGBRegressor

xgb_model = XGBRegressor(
        max_depth=6,
        n_estimators=8000,
        learning_rate=0.01,
        min_child_weight=1.5,
        subsample=0.2,
        gamma=0.01,
        reg_alpha=1,
        reg_lambda=0.325,
        objective='reg:gamma',
        booster='gbtree'
    )

xgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('xgb_regressor', xgb_model),
])

# Measure Performance

print('XGB Regressor:')
score_model(xgb_pipeline, X, Y)

XGB Regressor:
[CV] MSE: 760155640.7229 (32703302.8413)
[CV] MAE: 15452.5650 (167.7466)
[CV] R^2: 0.8786 (0.0128)
[CV] RMSLE: 0.124407 (0.0032)
CPU times: user 3min 27s, sys: 11.2 s, total: 3min 38s
Wall time: 31.9 s


In [34]:
xgb_pipeline.fit(X, Y)

y_test_predicted = xgb_pipeline.predict(x_test)
y_test_predicted = np.rint(y_test_predicted).astype(int)

# RMSLE: 0.12007407 (ShedLog)
# RMSLE: 0.12003497 (TotalRoomLog)
# RMSLE: 0.11978800 (Season & ?)
# RMSLE: 0.11999545 (no Season & no MSSubClass & no IsModernHouseType)
# RMSLE: 0.11978800 (Season & no MSSubClass & no IsModernHouseType)
# RMSLE: 0.11988965 (Season & no MSSubClass & IsModernHouseType)

# RMSLE: 0.12138565 (IsStreetGraved)
# RMSLE: 0.12146436 (AlleyEnc)
# RMSLE: 0.12129823 (OpenPorchGroups)

score_predictions(test_y, y_test_predicted)

RMSLE: 0.11978800


In [35]:
xgb_pipeline.fit(X, Y)
X_columns = get_columns_from_transformer(xgb_pipeline.named_steps['preprocessing'], list(X.columns))

features_list = sorted(zip(xgb_pipeline.named_steps['xgb_regressor'].feature_importances_, X_columns), reverse=True)
features_list

[(0.08710987, 'OverallQual'),
 (0.06561573, 'ExterQualEnc'),
 (0.052686792, 'GarageCars'),
 (0.04581558, 'GrLivArea'),
 (0.04082287, 'KitchenQualEnc'),
 (0.03253535, 'CentralAirEnc'),
 (0.03250852, 'TotalBathrooms'),
 (0.025706265, 'IsGarageCondGood'),
 (0.023370061, 'BsmtQualEnc'),
 (0.01908602, 'MSZoning_C (all)'),
 (0.01830629, 'RoofMatlCost'),
 (0.01655926, 'IsGasHeating'),
 (0.01584205, 'SaleCondition_AdjLand'),
 (0.01583943, 'GarageFinishEnc'),
 (0.015127808, 'LotShape_IR3'),
 (0.0136772925, 'HasFireplace'),
 (0.012414295, '1stFlrSF'),
 (0.012386328, 'SaleCondition_Family'),
 (0.01130566, 'HouseStyle_2.5Fin'),
 (0.010633394, 'MSZoning_RM'),
 (0.010397398, 'Neighborhood_MeadowV'),
 (0.010184376, 'Neighborhood_IDOTRR'),
 (0.0095345825, 'HouseStyle_2.5Unf'),
 (0.0093012145, 'BldgType_Duplex'),
 (0.009163137, 'IsNormalCondition'),
 (0.0085189175, 'SaleCondition_Abnorml'),
 (0.0077706235, 'MasVnrType_BrkCmn'),
 (0.007627485, 'Neighborhood_OldTown'),
 (0.007409643, 'OverallCond'),
 (0.

In [36]:
%%time

parameters = {
    'xgb_regressor__objective': ['reg:gamma'], # 'reg:squarederror', 'reg:squaredlogerror'
    'xgb_regressor__learning_rate': [0.01],
    'xgb_regressor__n_estimators': [7900, 8000, 8100],
    'xgb_regressor__max_depth': [11, 12, 13],
    'xgb_regressor__booster': ['gbtree'],
    'xgb_regressor__min_child_weight': [1.5],
    'xgb_regressor__gamma': [0],
    'xgb_regressor__subsample': [0.2],
    'xgb_regressor__reg_alpha': [0, 0.9, 1],
    'xgb_regressor__reg_lambda': [1, 0.3],
}

paramSearch = GridSearchCV(
   estimator=xgb_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

CPU times: user 71 µs, sys: 1e+03 ns, total: 72 µs
Wall time: 77 µs


### StackingRegressor

In [37]:
# https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition

In [38]:
%%time

stacking_model = StackingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('random_forest', random_forest_model),
        ('GB', gb_model),
        ('HGB', hgb_model),
    ], 
    #final_estimator=LassoCV(),
    cv=2,
    n_jobs=-1,
)

stacking_regressor_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('stacking_regressor', stacking_model)
])

print('Stacking Regressor:')
#score_model(stacking_regressor_pipeline, X, Y)

Stacking Regressor:
CPU times: user 356 µs, sys: 101 µs, total: 457 µs
Wall time: 424 µs


### VotingRegressor

In [39]:
%%time

# https://www.kaggle.com/marktsvirko/votingregressor-xgb-svm-top-10

voting_model = VotingRegressor(
    estimators=[
        ('xgb', xgb_model), 
        ('random_forest', random_forest_model),
        ('GB', gb_model),
        ('HGB', hgb_model),
    ],
    weights=[0.8, 0.1, 0.05, 0.05],
    n_jobs=-1,
)

voting_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('voting_regressor', voting_model)
])

print('Voting Regressor:')
#score_model(voting_pipeline, X, Y)

# RMSLE: 0.12062175

Voting Regressor:
CPU times: user 407 µs, sys: 83 µs, total: 490 µs
Wall time: 447 µs


In [40]:
parameters = {
    'voting_regressor__weights': [None, [0.6, 0.3, 0.1]],
}

paramSearch = GridSearchCV(
   estimator=voting_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

## Predict Submissions

In [41]:
xgb_pipeline.fit(X, Y)

y_test_predicted = xgb_pipeline.predict(x_test)
y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_xgb.csv', index=False)

In [42]:
#stacking_regressor_pipeline.fit(X, Y)

#y_test_predicted = stacking_regressor_pipeline.predict(x_test)
#y_test_predicted = np.rint(y_test_predicted).astype(int)

#submission_df = pd.DataFrame({
#    'Id': test_df['Id'],
#    'SalePrice': y_test_predicted,
#})

#submission_df.to_csv('./data/submission_stacking.csv', index=False)

In [43]:
#voting_pipeline.fit(X, Y)

#y_test_predicted = voting_pipeline.predict(x_test)
#y_test_predicted = np.rint(y_test_predicted).astype(int)

#submission_df = pd.DataFrame({
#    'Id': test_df['Id'],
#    'SalePrice': y_test_predicted,
#})

#submission_df.to_csv('./data/submission_voting.csv', index=False)