In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, \
    RobustScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, LassoCV, Ridge, ElasticNetCV, ElasticNet
from sklearn.svm import SVR
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [2]:
np.random.seed(42)
set_config(display='diagram')

plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style='whitegrid')

In [3]:
train_df = trainDf = pd.read_csv('data/train.csv')
test_df = testDf = pd.read_csv('data/test.csv')

full_df = fullDf = pd.concat([trainDf, testDf], sort=True).reset_index(drop=True)

## Misc Functions

In [4]:
def plot_learning_curve(estimator, X_train, y_train, cv, train_sizes=np.linspace(0.1, 1, 10)):
    plt.style.use('seaborn-darkgrid')
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_train, y_train, 
        scoring='neg_mean_squared_error',
        cv=cv, 
        n_jobs=-1, 
        train_sizes=train_sizes,
        shuffle=True,
        random_state=42
        )
    train_mean_scores = np.mean(train_scores, axis=1)
    test_mean_scores = np.mean(test_scores, axis=1)

    plt.title('Learning curve')
    plt.plot(train_sizes, train_mean_scores, 'y', label='Train Learning curve')
    plt.plot(train_sizes, test_mean_scores, 'b', label='Test Learning curve')
    plt.legend()

In [5]:
def neg_rmsle(y_true, y_pred):
    y_pred = np.abs(y_pred)
    
    return -1 * np.sqrt(mean_squared_log_error(y_true, y_pred))

In [6]:
def score_model(model, X, Y):
    
    scores = cross_validate(
        model, X, Y, 
        scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], cv=2,
        n_jobs=-1, verbose=0)

    rmsle_score = cross_val_score(model, X, Y, cv=2, scoring=make_scorer(neg_rmsle))

    mse_score = -1 * scores['test_neg_mean_squared_error'].mean()
    mse_std = scores['test_neg_mean_squared_error'].std()

    mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
    mae_std = scores['test_neg_mean_absolute_error'].std()

    r2_score_mean = scores['test_r2'].mean()
    r2_std = scores['test_r2'].std()

    print('[CV] MSE: %.4f (%.4f)' % (mse_score, mse_std))
    print('[CV] MAE: %.4f (%.4f)' % (mae_score, mae_std))
    print('[CV] R^2: %.4f (%.4f)' % (r2_score_mean, r2_std))
    print('[CV] RMSLE: %.6f (%.4f)' % (-1 * rmsle_score.mean(), rmsle_score.std()))

In [7]:
def get_columns_from_transformer(column_transformer, input_colums):    
    col_name = []

    for transformer_in_columns in column_transformer.transformers_[:-1]: #the last transformer is ColumnTransformer's 'remainder'
        raw_col_name = transformer_in_columns[2]
        if isinstance(transformer_in_columns[1],Pipeline): 
            transformer = transformer_in_columns[1].steps[-1][1]
        else:
            transformer = transformer_in_columns[1]
        try:
            names = transformer.get_feature_names(raw_col_name)
        except AttributeError: # if no 'get_feature_names' function, use raw column name
            names = raw_col_name
        if isinstance(names,np.ndarray): # eg.
            col_name += names.tolist()
        elif isinstance(names,list):
            col_name += names    
        elif isinstance(names,str):
            col_name.append(names)

    [_, _, reminder_columns] = column_transformer.transformers_[-1]

    for col_idx in reminder_columns:
        col_name.append(input_colums[col_idx])

    return col_name

## Data Cleaning and Preparation

In [8]:
num_features = [f for f in full_df.columns if full_df.dtypes[f] != 'object']
num_features.remove('Id')
num_features.remove('SalePrice')

cat_features = [f for f in full_df.columns if full_df.dtypes[f] == 'object']

In [9]:
for feature in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
    'MSSubClass',
):
    train_df[feature] = train_df[feature].fillna('None')
    test_df[feature] = test_df[feature].fillna('None')
    full_df[feature] = full_df[feature].fillna('None')

for feature in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    train_df[feature] = train_df[feature].fillna(0)
    test_df[feature] = test_df[feature].fillna(0)
    full_df[feature] = full_df[feature].fillna(0)

for feature in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'MSZoning',
    'Utilities',
):
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])
    full_df[feature] = full_df[feature].fillna(test_df[feature].mode()[0])

train_df['Functional'] = train_df['Functional'].fillna('Typ')
test_df['Functional'] = test_df['Functional'].fillna('Typ')
full_df['Functional'] = full_df['Functional'].fillna('Typ')

In [10]:
for dataframe in [train_df, test_df]:
    dataframe['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
    dataframe['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)

In [11]:
ordinal_feature_mapping = {
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, 
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1},
    'Alley': {'None': 0, 'Pave': 1, 'Grvl': 2},
    'Street': {'Pave': 0, 'Grvl': 1},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
}

non_ordinal_cat_features = list(set(cat_features) - set(ordinal_feature_mapping.keys()))

for cat_feature in non_ordinal_cat_features:
    train_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(train_df[cat_feature])
    test_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(test_df[cat_feature])

for ordinal_feature, feature_mapping in ordinal_feature_mapping.items():
    train_df[ordinal_feature + 'Enc'] = train_df[ordinal_feature].map(feature_mapping)
    test_df[ordinal_feature + 'Enc'] = test_df[ordinal_feature].map(feature_mapping)

### Num Feature Scaling

In [12]:
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

## Feature Engineering

In [13]:
for dataframe in [train_df, test_df]:
    dataframe['Has2ndFloor'] = dataframe['2ndFlrSF'].apply(lambda x: int(x > 0))
    dataframe['HasBsmnt'] = dataframe['TotalBsmtSF'].apply(lambda x: int(x > 0))
    dataframe['HasGarage'] = dataframe['GarageArea'].apply(lambda x: int(x > 0))
    dataframe['HasPool'] = dataframe['PoolArea'].apply(lambda x: int(x > 0))
    dataframe['HasFence'] = dataframe['Fence'].apply(lambda x: int(x != 'None'))
    dataframe['HasFireplace'] = dataframe['Fireplaces'].apply(lambda x: int(x > 0))
    dataframe['HasMasVnr'] = dataframe['MasVnrType'].apply(lambda x: int(x != 'None'))
    dataframe['HasShed'] = (dataframe['MiscFeature'] == 'Shed') * 1

    dataframe['HouseAge'] = dataframe['YrSold'].astype('int') - dataframe['YearBuilt'].astype('int')
    dataframe['HouseAgeSinRemod'] = dataframe['YrSold'].astype('int') - dataframe['YearRemodAdd'].astype('int')

In [14]:
# Remove outliers
train_df.drop(
    train_df[(train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 700000)].index
);

In [15]:
subclassCategories = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 160, 180, 190] #removed 150 class as useless during predictions
basementFinishCategories = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
electricalCategories = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']

exteriorCategories = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

conditionCategories = train_df['Condition1'].unique()
neighborhoodCategories = full_df['Neighborhood'].unique()
saleCondCategories = full_df['SaleCondition'].unique()
garageTypeCategories = full_df['GarageType'].unique()
lotConfigCategories = full_df['LotConfig'].unique() # feature was removed from the model
lotShapeCategories = full_df['LotShape'].unique()
landSlopeCategories = full_df['LandSlope'].unique()

In [16]:
year_built_bins = np.linspace(1871, 2010, 10) # 10 bins

In [17]:
# Reference: https://www.kaggle.com/cerberus4229/voting-regressor-with-pipelines

for dataframe in [train_df, test_df]:
    dataframe['TotalBathrooms'] = (dataframe['FullBath'] + (0.5 * dataframe['HalfBath']) +
                                dataframe['BsmtFullBath'] + (0.5 * dataframe['BsmtHalfBath']))

    dataframe['OverallHouseQCBin'] = pd.qcut(dataframe['OverallQual'] + dataframe['OverallCond'], q=3, labels=[0, 1, 2])
    dataframe['IsPavedDrive'] = (dataframe['PavedDrive'] == 'Y') * 1
    dataframe['IsNeighborhoodElite'] = (dataframe['Neighborhood'].isin(['NridgHt', 'CollgeCr', 'Crawfor', 'StoreBr', 'Timber'])) * 1 
    dataframe['YearBuiltBin'] = pd.cut(dataframe['YearBuilt'], bins=year_built_bins, labels=range(1, 10)) 
    dataframe['KitchenQCBin'] = pd.cut(dataframe['KitchenQualEnc'] * dataframe['KitchenAbvGr'], [-1, 2, 7], labels=[0, 1])

    dataframe['IsFunctional'] = (dataframe['Functional'] == 'Typ') * 1
    dataframe['FunctionalGroup'] = dataframe['Functional'].map({
        'Typ': 2,
        'Min1': 1,
        'Min2': 1,
        'Mod': 1,
        'Maj1': 0,
        'Maj2': 0,
        'Sev': 0,
        'Sav': 0,
    })

    dataframe['IsModernHouseType'] = dataframe['MSSubClass'].map(
        {20: 1, 30: 0, 40: 0, 45: 0,50: 0, 60: 1, 70: 0, 75: 0, 80: 0, 85: 0, 90: 0, 120: 1, 150: 0, 160: 0, 180: 0, 190: 0}
    )
    dataframe['IsLandFlat'] = (dataframe['LandContour'] == 'Lvl') * 1
    dataframe['IsNewHouseSold'] = (full_df['YearBuilt'] == full_df['YrSold'].astype(int)) * 1

    dataframe['IsExterCondGood'] = dataframe['ExterCond'].map({'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

    dataframe['IsGasHeating'] = dataframe['Heating'].map({'GasA': 1, 'GasW': 1, 'Grav': 0, 'Wall': 0, 'OthW': 0, 'Floor': 0})
    dataframe['IsHeatingGood'] = dataframe['HeatingQC'].map({'Po': 0, 'Fa': 0, 'TA': 0, 'Gd': 1, 'Ex': 1})

    dataframe['IsNewElectrBreakers'] = dataframe['Electrical'].map({'SBrkr': 1, 'FuseF': 0, 'FuseA': 0, 'FuseP': 0, 'Mix': 0})

    dataframe['IsGarageCondGood'] = dataframe['GarageCond'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})
    dataframe['IsGarageQualGood'] = dataframe['GarageQual'].map({'None': 0, 'Po': 0, 'Fa': 0, 'TA': 1, 'Gd': 1, 'Ex': 1})

In [18]:
# https://www.kaggle.com/humananalog/xgboost-lasso

In [19]:
baseline_features = [
    '1stFlrSF',
    '2ndFlrSF',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtUnfSF', 
    'BsmtFinType1Enc',
    'BsmtFinType2Enc',
    'OverallQual',
    'GarageCars',
    'OverallCond', 
    'Neighborhood',
    'MSSubClass', 
    'LotShape',
    'LandSlope',
    'BsmtCondEnc',
    'BsmtQualEnc', 
    'SaleCondition',
    'CentralAirEnc',
    'Condition1',
    'Condition2',
    'TotalBathrooms',
    'GarageFinishEnc',
    'KitchenQualEnc',
    'BedroomAbvGr',
    'MSZoning',
    'ExterQualEnc',
    'IsNewHouseSold',
    'LandContour',
    'HasFireplace',
    'FunctionalGroup',
    'HouseAge',
    'FenceEnc',
    'IsModernHouseType',
    'IsGasHeating',
    'IsHeatingGood',
    'IsNewElectrBreakers',
    'IsGarageCondGood'
]

X = train_df[baseline_features]
Y = train_df['SalePrice']

## Modeling

In [20]:
# Build feature transformer

logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

featureTransformer = ColumnTransformer([
        ('basement_area_log', logTransformer, ['BsmtFinSF1', 'BsmtFinSF2']),
        ('neighborhood_onehot', OneHotEncoder(categories=[neighborhoodCategories]), ['Neighborhood']),
        ('subclass_onehot', OneHotEncoder(categories=[subclassCategories], handle_unknown='ignore'), ['MSSubClass']),
        ('lot_shape_onehot', OneHotEncoder(categories=[lotShapeCategories]), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(categories=[landSlopeCategories]), ['LandSlope']),
        ('sale_condtion_onehot', OneHotEncoder(categories=[saleCondCategories]), ['SaleCondition']),
        ('land_contour_onehot', OneHotEncoder(), ['LandContour']),
        ('zoning_onehot', OneHotEncoder(), ['MSZoning']),
        ('condition_onehot', OneHotEncoder(categories=[conditionCategories, conditionCategories]), ['Condition1', 'Condition2']), #
    ],
    remainder='passthrough'
)

### ElasticNet

In [21]:
%%time

elasticnet_model = ElasticNet(
    alpha=0.5,
    l1_ratio=0.8,
    random_state=42,
)

elasticnet_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('elasticnet', elasticnet_model)
])

print('ElasticNet:')
score_model(elasticnet_pipeline, X, Y)

ElasticNet:
[CV] MSE: 1152136483.8834 (87481744.5543)
[CV] MAE: 20110.6882 (574.1290)
[CV] R^2: 0.8156 (0.0254)
[CV] RMSLE: 0.192544 (0.0291)


In [22]:
parameters = {
    'elasticnet__alpha': [9e-4, 7e-4, 5e-4, 3e-4, 1e-4, 1e-3, 5e-2, 1e-2, 0.1, 0.5, 1, 2, 5],
    'elasticnet__l1_ratio': [0.01, 0.1, 0.2, 0.3, 0.5, 0.6, 0.7, 0.8, 1],
}

paramSearch = GridSearchCV(
  estimator=elasticnet_pipeline,
  scoring='neg_mean_squared_error',
  param_grid=parameters, 
  cv=2,
  n_jobs=-1, 
  verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### Ridge

In [23]:
# https://www.kaggle.com/marktsvirko/votingregressor-xgb-svm-top-10

In [24]:
%%time

ridge_model = Ridge(
        solver='auto',
        alpha=522,
        random_state=42,
    )

ridge_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('ridge', ridge_model),
])

print('Ridge:')
score_model(ridge_pipeline, X, Y)

Ridge:
[CV] MSE: 1365141356.2157 (87592705.2026)
[CV] MAE: 22099.5218 (440.3525)
[CV] R^2: 0.7817 (0.0275)
[CV] RMSLE: 0.176990 (0.0003)


In [25]:
parameters = {
    'ridge__solver': ['auto', 'saga', 'sag', 'cholesky'],
    'ridge__alpha': [510, 519, 520, 522, 530],
}

paramSearch = GridSearchCV(
  estimator=ridge_pipeline,
  scoring=make_scorer(neg_rmsle),
  param_grid=parameters, 
  cv=2,
  n_jobs=-1, 
  verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### SVR

In [26]:
%%time

svr_model = SVR(
    kernel='linear',
    C=200
)

svr_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('svr', svr_model),
])

print('SVR:')
score_model(svr_pipeline, X, Y)

SVR:
[CV] MSE: 1326518950.8863 (22448291.5774)
[CV] MAE: 20511.2439 (889.2198)
[CV] R^2: 0.7885 (0.0168)
[CV] RMSLE: 0.161865 (0.0084)


In [27]:
parameters = {
    'svr__C': [1, 20, 50, 100, 120, 150, 200],
    'svr__kernel': ['linear'],
    #'svr__tol': [1e-3, 1e-5, 1e-4, 1e-2, 0.1]
}

paramSearch = GridSearchCV(
   estimator=svr_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### Random Forest

In [28]:
%%time

random_forest_model = RandomForestRegressor(
        bootstrap=True,
        max_depth=15,
        max_features='sqrt',
        max_leaf_nodes=None,
        max_samples=None,
        min_samples_leaf=5,
        min_samples_split=5,
        n_estimators=2500,
        random_state=42,
    )

random_forest_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('random_forest', random_forest_model),
])

# Measure performance

print('RandomForest Regressor:')
score_model(random_forest_pipeline, X, Y)

## RF(max_depth=25, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=1000)
## CV=2, 0.15007
#[CV] MSE: 889831012.8445 (46061331.0130)
#[CV] MAE: 17721.0043 (817.3113)
#[CV] R^2: 0.8587 (0.0015)

RandomForest Regressor:
[CV] MSE: 1227189351.4163 (814319.8645)
[CV] MAE: 20672.8035 (762.0916)
[CV] R^2: 0.8045 (0.0121)
[CV] RMSLE: 0.168217 (0.0071)


In [29]:
random_forest_pipeline.fit(X, Y)

X_columns = get_columns_from_transformer(random_forest_pipeline.named_steps['preprocessing'], list(X.columns))

features_list = sorted(zip(random_forest_pipeline.named_steps['random_forest'].feature_importances_, X_columns), reverse=True)
random_forest_pipeline.named_steps['preprocessing'].transformers_

features_list

[(0.1473394414479701, 'OverallQual'),
 (0.09473689626361305, 'GarageCars'),
 (0.08255122527551798, '1stFlrSF'),
 (0.07976855119898248, 'ExterQualEnc'),
 (0.06730423854276237, 'BsmtQualEnc'),
 (0.06650464044186613, 'TotalBathrooms'),
 (0.06601880974712472, 'KitchenQualEnc'),
 (0.05872667055681261, 'HouseAge'),
 (0.046293983304633815, 'BsmtFinSF1'),
 (0.04134684977974674, '2ndFlrSF'),
 (0.03704061339654462, 'GarageFinishEnc'),
 (0.03383274588348624, 'HasFireplace'),
 (0.01933792031459685, 'BsmtFinType1Enc'),
 (0.014735207052583231, 'BsmtUnfSF'),
 (0.014532984236152425, 'MSSubClass_60'),
 (0.013647778591461832, 'Neighborhood_NridgHt'),
 (0.011552013149265789, 'IsModernHouseType'),
 (0.011241629261648846, 'BedroomAbvGr'),
 (0.011080505197984154, 'Neighborhood_NoRidge'),
 (0.00790894810526982, 'SaleCondition_Partial'),
 (0.007612207942180314, 'IsHeatingGood'),
 (0.005672509664686905, 'MSZoning_RM'),
 (0.005449624528569497, 'OverallCond'),
 (0.004722856156031359, 'LotShape_Reg'),
 (0.0045352

In [30]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

parameters = {
    'random_forest__max_depth': [None, 15,  20, 25],
    'random_forest__max_features': ['sqrt'],
    'random_forest__max_leaf_nodes': [None, 5, 15, 17, 20],
    'random_forest__min_samples_leaf': [1, 2, 3],
    'random_forest__min_samples_split': [2, 1, 3],
    'random_forest__max_samples': [None, 2, 5],
    'random_forest__n_estimators': [1500, 2000, 2500]
}

paramSearch = GridSearchCV(
  estimator=random_forest_pipeline,
  scoring=make_scorer(neg_rmsle),
  param_grid=parameters, 
  cv=2,
  n_jobs=-1, 
  verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

In [31]:
#plot_learning_curve(random_forest_pipeline, X, Y, cv=3)

### XGBoostRegressor

In [32]:
%%time

from xgboost import XGBRegressor

xgb_model = XGBRegressor(
        max_depth=5,
        n_estimators=7050,
        learning_rate=0.01,
        min_child_weight=1.5,
        subsample=0.2,
        gamma=0,
        reg_alpha=1,
        reg_lambda=0.1,
        objective='reg:gamma',
        booster='gbtree'
    )

xgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('xgb_regressor', xgb_model),
])

# Measure Performance

print('XGB Regressor:')
score_model(xgb_pipeline, X, Y)

XGB Regressor:
[CV] MSE: 733013467.9466 (10248330.7964)
[CV] MAE: 15624.5991 (281.8154)
[CV] R^2: 0.8831 (0.0089)
[CV] RMSLE: 0.124497 (0.0047)


In [33]:
xgb_pipeline.fit(X, Y)
X_columns = get_columns_from_transformer(xgb_pipeline.named_steps['preprocessing'], list(X.columns))

features_list = sorted(zip(xgb_pipeline.named_steps['xgb_regressor'].feature_importances_, X_columns), reverse=True)
features_list

[(0.09893528, 'OverallQual'),
 (0.058594067, 'ExterQualEnc'),
 (0.054355096, 'GarageCars'),
 (0.05265582, 'TotalBathrooms'),
 (0.04070321, 'KitchenQualEnc'),
 (0.039280742, 'MSZoning_C (all)'),
 (0.037668053, 'HasFireplace'),
 (0.024993883, 'CentralAirEnc'),
 (0.022148168, 'BsmtQualEnc'),
 (0.021954713, 'IsGarageCondGood'),
 (0.020551415, 'IsGasHeating'),
 (0.019206144, '1stFlrSF'),
 (0.018297909, 'MSSubClass_30'),
 (0.016538993, 'BsmtFinSF1'),
 (0.013820155, 'SaleCondition_Family'),
 (0.01298328, 'GarageFinishEnc'),
 (0.0129729705, 'MSSubClass_180'),
 (0.0127187185, 'Neighborhood_IDOTRR'),
 (0.01259859, '2ndFlrSF'),
 (0.011665741, 'LotShape_IR3'),
 (0.010319879, 'SaleCondition_Alloca'),
 (0.010107886, 'SaleCondition_Abnorml'),
 (0.009696873, 'MSZoning_RM'),
 (0.009354685, 'Condition1_PosN'),
 (0.00934708, 'Condition2_Norm'),
 (0.008600765, 'Neighborhood_MeadowV'),
 (0.008299672, 'MSSubClass_45'),
 (0.008248682, 'Neighborhood_OldTown'),
 (0.007933357, 'Neighborhood_Blmngtn'),
 (0.00775

In [34]:
parameters = {
    'xgb_regressor__objective': ['reg:gamma'], # 'reg:squarederror', 'reg:squaredlogerror'
    'xgb_regressor__learning_rate': [0.01],
    'xgb_regressor__n_estimators': [6900, 7000, 7100],
    'xgb_regressor__max_depth': [5],
    'xgb_regressor__booster': ['gbtree'],
    'xgb_regressor__min_child_weight': [1.5],
    'xgb_regressor__gamma': [0],
    'xgb_regressor__subsample': [0.2],
    'xgb_regressor__reg_alpha': [1],
    'xgb_regressor__reg_lambda': [0.1],
}

paramSearch = GridSearchCV(
   estimator=xgb_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

### StackingRegressor

In [35]:
# https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition

In [36]:
%%time

stacking_final_model = XGBRegressor()

stacking_model = StackingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('random_forest', random_forest_model),
        ('elasticnet', elasticnet_model),
        ('ridge', ridge_model),
        ('svr', svr_model)
    ], 
    #final_estimator=stacking_final_model,
    cv=2,
    n_jobs=-1,
)

stacking_regressor_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('stacking_regressor', stacking_model)
])

print('Stacking Regressor:')
score_model(stacking_regressor_pipeline, X, Y)

Stacking Regressor:
[CV] MSE: 782497273.3233 (50637385.5474)
[CV] MAE: 16222.3362 (147.1010)
[CV] R^2: 0.8748 (0.0158)
[CV] RMSLE: 0.132203 (0.0019)
CPU times: user 8.05 s, sys: 709 ms, total: 8.75 s
Wall time: 1min 5s


### VotingRegressor

In [39]:
%%time

# https://www.kaggle.com/marktsvirko/votingregressor-xgb-svm-top-10

voting_model = VotingRegressor(
    estimators=[
        ('xgb', xgb_model), 
        ('random_forest', random_forest_model),
        ('elasticnet', elasticnet_model),
        ('ridge', ridge_model),
        ('svr', svr_model)
    ],
    n_jobs=-1,
)

voting_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('voting_regressor', voting_model)
])

print('Voting Regressor:')
score_model(voting_pipeline, X, Y)

Voting Regressor:
[CV] MSE: 1001855845.8300 (30064232.9448)
[CV] MAE: 17787.0179 (686.2564)
[CV] R^2: 0.8401 (0.0147)
[CV] RMSLE: 0.140481 (0.0053)
CPU times: user 7.48 s, sys: 850 ms, total: 8.33 s
Wall time: 35.6 s


In [40]:
parameters = {
    'voting_regressor__weights': [None, [0.6, 0.3, 0.1]],
}

paramSearch = GridSearchCV(
   estimator=voting_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

## Predict Submissions

In [41]:
voting_pipeline.fit(X, Y)

x_test = test_df[baseline_features]
y_test_predicted = voting_pipeline.predict(x_test)

y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_voting.csv', index=False)

In [42]:
xgb_pipeline.fit(X, Y)

x_test = test_df[baseline_features]
y_test_predicted = xgb_pipeline.predict(x_test)

y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_xgb.csv', index=False)

In [43]:
stacking_regressor_pipeline.fit(X, Y)

x_test = test_df[baseline_features]
y_test_predicted = stacking_regressor_pipeline.predict(x_test)

y_test_predicted = np.rint(y_test_predicted).astype(int)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_stacking.csv', index=False)