In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, \
    RobustScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, learning_curve
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [2]:
np.random.seed(42)
set_config(display='diagram')

plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style="whitegrid")

In [3]:
train_df = trainDf = pd.read_csv("data/train.csv")
test_df = testDf = pd.read_csv("data/test.csv")
full_df = fullDf = pd.concat([trainDf, testDf], sort=True).reset_index(drop=True)

## Misc Functions

In [4]:
def plot_learning_curve(estimator, X_train, y_train, cv, train_sizes=np.linspace(0.1, 1, 10)):
    plt.style.use('seaborn-darkgrid')
    
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X_train, y_train, 
        scoring='neg_mean_squared_error',
        cv=cv, 
        n_jobs=-1, 
        train_sizes=train_sizes,
        shuffle=True,
        random_state=42
        )
    train_mean_scores = np.mean(train_scores, axis=1)
    test_mean_scores = np.mean(test_scores, axis=1)

    plt.title('Learning curve')
    plt.plot(train_sizes, train_mean_scores, 'y', label='Train Learning curve')
    plt.plot(train_sizes, test_mean_scores, 'b', label='Test Learning curve')
    plt.legend()

In [5]:
def neg_rmsle(y_true, y_pred):
    return -1 * np.sqrt(mean_squared_log_error(y_true, y_pred))

In [6]:
def score_model(model, X, Y):
    
    scores = cross_validate(
        model, X, Y, 
        scoring=['r2', 'neg_mean_absolute_error', 'neg_mean_squared_error'], cv=2,
        n_jobs=-1, verbose=0)

    rmsle_score = cross_val_score(model, X, Y, cv=2, scoring=make_scorer(neg_rmsle))

    mse_score = -1 * scores['test_neg_mean_squared_error'].mean()
    mse_std = scores['test_neg_mean_squared_error'].std()

    mae_score = -1 * scores['test_neg_mean_absolute_error'].mean()
    mae_std = scores['test_neg_mean_absolute_error'].std()

    r2_score_mean = scores['test_r2'].mean()
    r2_std = scores['test_r2'].std()

    print('[CV] MSE: %.4f (%.4f)' % (mse_score, mse_std))
    print('[CV] MAE: %.4f (%.4f)' % (mae_score, mae_std))
    print('[CV] R^2: %.4f (%.4f)' % (r2_score_mean, r2_std))
    print('[CV] RMSLE: %.6f (%.4f)' % (rmsle_score.mean(), rmsle_score.std()))

## Data Cleaning and Preparation

In [7]:
num_features = [f for f in full_df.columns if full_df.dtypes[f] != 'object']
num_features.remove('Id')
num_features.remove('SalePrice')

cat_features = [f for f in full_df.columns if full_df.dtypes[f] == 'object']

In [8]:
for feature in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
    'MSSubClass',
):
    train_df[feature] = train_df[feature].fillna('None')
    test_df[feature] = test_df[feature].fillna('None')
    full_df[feature] = full_df[feature].fillna('None')

for feature in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    train_df[feature] = train_df[feature].fillna(0)
    test_df[feature] = test_df[feature].fillna(0)
    full_df[feature] = full_df[feature].fillna(0)

for feature in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'MSZoning',
    'Utilities',
):
    train_df[feature] = train_df[feature].fillna(train_df[feature].mode()[0])
    test_df[feature] = test_df[feature].fillna(test_df[feature].mode()[0])
    full_df[feature] = full_df[feature].fillna(test_df[feature].mode()[0])

train_df['Functional'] = train_df['Functional'].fillna('Typical')
test_df['Functional'] = test_df['Functional'].fillna('Typical')
full_df['Functional'] = full_df['Functional'].fillna('Typical')

In [9]:
for dataframe in [train_df, test_df]:
    dataframe['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
    dataframe['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)

In [10]:
ordinal_feature_mapping = {
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}, 
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PoolQC': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'CentralAir': {'N': 0, 'Y': 1},
    'Alley': {'None': 0, 'Pave': 1, 'Grvl': 2},
    'Street': {'Pave': 0, 'Grvl': 1},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7}
}

non_ordinal_cat_features = list(set(cat_features) - set(ordinal_feature_mapping.keys()))

for cat_feature in non_ordinal_cat_features:
    train_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(train_df[cat_feature])
    test_df[cat_feature + 'Enc'] = LabelEncoder().fit_transform(test_df[cat_feature])

for ordinal_feature, feature_mapping in ordinal_feature_mapping.items():
    train_df[ordinal_feature + 'Enc'] = train_df[ordinal_feature].map(feature_mapping)
    test_df[ordinal_feature + 'Enc'] = test_df[ordinal_feature].map(feature_mapping)

### Num Feature Scaling

In [11]:
# https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

## Feature Engineering

In [12]:
for dataframe in [train_df, test_df]:
    dataframe['Has2ndFloor'] = dataframe['2ndFlrSF'].apply(lambda x: int(x > 0))
    dataframe['HasBsmnt'] = dataframe['TotalBsmtSF'].apply(lambda x: int(x > 0))
    dataframe['HasGarage'] = dataframe['GarageArea'].apply(lambda x: int(x > 0))
    dataframe['HasPool'] = dataframe['PoolArea'].apply(lambda x: int(x > 0))
    dataframe['HasFence'] = dataframe['Fence'].apply(lambda x: int(x != 'None'))
    dataframe['HasFireplace'] = dataframe['Fireplaces'].apply(lambda x: int(x > 0))
    dataframe['HasMasVnr'] = dataframe['MasVnrType'].apply(lambda x: int(x != 'None'))

    dataframe['HouseAge'] = dataframe['YrSold'].astype('int') - dataframe['YearBuilt'].astype('int')
    dataframe['HouseAgeSinRemod'] = dataframe['YrSold'].astype('int') - dataframe['YearRemodAdd'].astype('int')

In [13]:
# Remove outliers
train_df.drop(
    train_df[(train_df['GrLivArea'] > 4000) & (train_df['SalePrice'] < 700000)].index
);

In [14]:
subclassCategories = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
basementFinishCategories = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
electricalCategories = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']

exteriorCategories = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']

conditionCategories = train_df['Condition1'].unique()
neighborhoodCategories = full_df['Neighborhood'].unique()
saleCondCategories = full_df['SaleCondition'].unique()
garageTypeCategories = full_df['GarageType'].unique()
lotConfigCategories = full_df['LotConfig'].unique() # feature was removed from the model
lotShapeCategories = full_df['LotShape'].unique()
landSlopeCategories = full_df['LandSlope'].unique()

In [15]:
year_built_bins = np.linspace(1871, 2010, 10) # 10 bins

year_built_bins

array([1871.        , 1886.44444444, 1901.88888889, 1917.33333333,
       1932.77777778, 1948.22222222, 1963.66666667, 1979.11111111,
       1994.55555556, 2010.        ])

In [16]:
# Reference: https://www.kaggle.com/cerberus4229/voting-regressor-with-pipelines

for dataframe in [train_df, test_df]:
    dataframe['TotalBathrooms'] = (dataframe['FullBath'] + (0.5 * dataframe['HalfBath']) +
                                dataframe['BsmtFullBath'] + (0.5 * dataframe['BsmtHalfBath']))

    dataframe['OverallHouseQCBin'] = pd.qcut(dataframe['OverallQual'] + dataframe['OverallCond'], q=3, labels=[0, 1, 2]) # performs worse than OverallQual
    dataframe['IsPavedDrive'] = (dataframe['PavedDrive'] == 'Y') * 1 # performs worse
    dataframe['IsNeighborhoodElite'] = (dataframe['Neighborhood'].isin(['NridgHt', 'CollgeCr', 'Crawfor', 'StoreBr', 'Timber'])) * 1 # worse
    dataframe['YearBuiltBin'] = pd.cut(dataframe['YearBuilt'], bins=year_built_bins, labels=range(1, 10)) 

In [17]:
# https://www.kaggle.com/humananalog/xgboost-lasso

In [18]:
baseline_features = [
    '1stFlrSF',
    '2ndFlrSF',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtUnfSF', #
    'BsmtFinType1Enc',
    'BsmtFinType2Enc',
    'OverallQual',
    'GarageCars',
    'OverallCond', 
    'Neighborhood',
    'MSSubClass', 
    'LotShape',
    'LandSlope',
    'BsmtCondEnc',
    'BsmtQualEnc', #
    'SaleCondition',
    'CentralAirEnc',
    'Condition1',
    'Condition2',
    'TotalBathrooms',
    'GarageFinishEnc',
    'KitchenQualEnc',
    'BedroomAbvGr',
    #'IsNeighborhoodElite',
    #'YearBuiltBin',    
]

X = train_df[baseline_features]
Y = train_df['SalePrice']

## Modeling

In [19]:
# Build feature transformer

logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

featureTransformer = ColumnTransformer([
        ('basement_area_log', logTransformer, ['BsmtFinSF1', 'BsmtFinSF2']),
        ('neighborhood_onehot', OneHotEncoder(categories=[neighborhoodCategories]), ['Neighborhood']),
        ('subclass_onehot', OneHotEncoder(categories=[subclassCategories]), ['MSSubClass']),
        ('lot_shape_onehot', OneHotEncoder(categories=[lotShapeCategories]), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(categories=[landSlopeCategories]), ['LandSlope']),
        ('sale_condtion_onehot', OneHotEncoder(categories=[saleCondCategories]), ['SaleCondition']),
        ('condition_onehot', OneHotEncoder(categories=[conditionCategories, conditionCategories]), ['Condition1', 'Condition2']), #
    ],
    remainder='passthrough'
)

### ElasticNet

### SVR

In [20]:
# https://www.kaggle.com/marktsvirko/votingregressor-xgb-svm-top-10

### Random Forest

In [21]:
random_forest_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('random_forest', RandomForestRegressor(
        bootstrap=True,
        max_depth=25,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=1000,
        random_state=42,
    )),
])

# Measure performance

score_model(random_forest_pipeline, X, Y)

## RF(max_depth=15, max_features='sqrt', min_samples_leaf=1, min_samples_split=3, n_estimators=3650)
## CV=2, ?
#[CV] MSE: 904073278.0912 (34098840.2146)
#[CV] MAE: 17896.4251 (795.0885)
#[CV] R^2: 0.8563 (0.0036)

## RF(max_depth=25, max_features='sqrt', min_samples_leaf=1, min_samples_split=2, n_estimators=1000)
## CV=2, 0.15007
#[CV] MSE: 889831012.8445 (46061331.0130)
#[CV] MAE: 17721.0043 (817.3113)
#[CV] R^2: 0.8587 (0.0015)

[CV] MSE: 889831012.8445 (46061331.0130)
[CV] MAE: 17721.0043 (817.3113)
[CV] R^2: 0.8587 (0.0015)
[CV] RMSLE: -0.147753 (0.0071)


In [22]:
random_forest_pipeline.fit(X, Y)
random_forest_pipeline.named_steps['random_forest'].feature_importances_

features_list = sorted(zip(random_forest_pipeline.named_steps['random_forest'].feature_importances_, X.columns), reverse=True)

features_list

[(0.05519694734604319, '1stFlrSF'),
 (0.02045785438167182, 'LotShape'),
 (0.014138410467452114, 'BsmtFinType1Enc'),
 (0.0044588645275103235, 'TotalBathrooms'),
 (0.0037306632246330842, '2ndFlrSF'),
 (0.0034934266166744892, 'BsmtUnfSF'),
 (0.003006848415812425, 'CentralAirEnc'),
 (0.002968784317162521, 'LandSlope'),
 (0.0028891298209607653, 'OverallQual'),
 (0.0026660984904356413, 'OverallCond'),
 (0.0019403479623190366, 'BsmtFinSF1'),
 (0.001765659505820689, 'GarageFinishEnc'),
 (0.0016733526160927765, 'Condition1'),
 (0.0012974703679510767, 'Condition2'),
 (0.0012167302515332097, 'GarageCars'),
 (0.0011803714262773907, 'BsmtQualEnc'),
 (0.0010871917076662066, 'MSSubClass'),
 (0.0009796199960676042, 'BsmtCondEnc'),
 (0.0009447523171870062, 'BsmtFinType2Enc'),
 (0.0008163424115430446, 'SaleCondition'),
 (0.000740781374314616, 'Neighborhood'),
 (0.0006476959841260213, 'BedroomAbvGr'),
 (0.0005070349684867455, 'BsmtFinSF2'),
 (0.00011473826000025075, 'KitchenQualEnc')]

In [23]:
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram

parameters = {
    'random_forest__bootstrap': [True],
    'random_forest__max_depth': [8, 10, 15, 20, 25, 30],
    'random_forest__max_features': ['sqrt'],
    'random_forest__min_samples_leaf': [1, 2, 5],
    'random_forest__min_samples_split': [1, 2, 5],
    'random_forest__n_estimators': [800, 1000, 1500, 2000]
}

#paramSearch = GridSearchCV(
#  estimator=random_forest_pipeline,
#  scoring=make_scorer(neg_rmsle),
#  param_grid=parameters, 
#  cv=2,
#  n_jobs=-1, 
#  verbose=2
#)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

#{'random_forest__bootstrap': True,
# 'random_forest__max_depth': 25,
# 'random_forest__max_features': 'sqrt',
# 'random_forest__min_samples_leaf': 1,
# 'random_forest__min_samples_split': 2,
# 'random_forest__n_estimators': 1000}

In [24]:
#plot_learning_curve(random_forest_pipeline, X, Y, cv=3)

### XGBoostRegressor

In [25]:
from xgboost import XGBRegressor

xgb_pipeline = Pipeline([
    ('preprocessing', featureTransformer),
    ('xgb_regressor', XGBRegressor(
        max_depth=5,
        n_estimators=7050,
        learning_rate=0.01,
        min_child_weight=1.5,
        subsample=0.2,
        gamma=0,
        reg_alpha=1,
        reg_lambda=0.1,
        objective='reg:gamma',
        booster='gbtree'
    )),
])

# Measure Performance

score_model(xgb_pipeline, X, Y)

## XGB (max_depth=8, n_estimators=200, learning_rate=0.06, min_child_weight=3, booster='gbtree', subsample=0.7, gamma=0)
## CV=2, 0.13494
# [CV] MSE: 782138754.8110 (37985553.8400)
# [CV] MAE: 16999.5631 (458.6830)
# [CV] R^2: 0.8758 (0.0017)
# [CV] RMSLE: -0.133976

# XGB (max_depth=4, n_estimators=7000, learning_rate=0.01, min_child_weight=1.5, subsample=0.2, gamma=0, reg_alpha=1, reg_lambda=0.1, booster='gbtree')
## CV=2, 0.13105
# [CV] MSE: 782885721.1446 (6743520.5426)
# [CV] MAE: 16302.8672 (471.5882)
# [CV] R^2: 0.8754 (0.0067)
# [CV] RMSLE: -0.130275 (0.0048)

# XGB (max_depth=5, n_estimators=7000, learning_rate=0.01, min_child_weight=1.5, subsample=0.2, gamma=0, reg_alpha=1, reg_lambda=0.1, booster='gbtree')
## CV=2, ?
# [CV] MSE: 783160286.1791 (1231674.5549)
# [CV] MAE: 16241.7409 (441.4740)
# [CV] R^2: 0.8752 (0.0080)
# [CV] RMSLE: -0.130159 (0.0045)

##n_estimators=7050
# CV=2, 0.13104
#[CV] MSE: 783066174.7649 (1966607.6233)
#[CV] MAE: 16238.1809 (437.5458)
#[CV] R^2: 0.8752 (0.0081)
#[CV] RMSLE: -0.130143 (0.0044)

[CV] MSE: 783066174.7649 (1966607.6233)
[CV] MAE: 16238.1809 (437.5458)
[CV] R^2: 0.8752 (0.0081)
[CV] RMSLE: -0.130143 (0.0044)


In [26]:
xgb_pipeline.fit(X, Y)
xgb_pipeline.named_steps['xgb_regressor'].feature_importances_

features_list = sorted(zip(xgb_pipeline.named_steps['xgb_regressor'].feature_importances_, X.columns), reverse=True)

features_list

[(0.014051641, 'BsmtQualEnc'),
 (0.013857039, 'OverallCond'),
 (0.013152896, '1stFlrSF'),
 (0.01233868, 'SaleCondition'),
 (0.009447186, 'CentralAirEnc'),
 (0.009291395, 'BedroomAbvGr'),
 (0.0064481725, 'LandSlope'),
 (0.005881669, 'TotalBathrooms'),
 (0.0055863447, 'MSSubClass'),
 (0.005530742, 'BsmtUnfSF'),
 (0.0054482636, 'BsmtFinSF1'),
 (0.0052987006, 'GarageFinishEnc'),
 (0.00524099, 'GarageCars'),
 (0.004854181, 'OverallQual'),
 (0.004814755, 'BsmtCondEnc'),
 (0.0047640437, 'Neighborhood'),
 (0.004680495, 'BsmtFinType2Enc'),
 (0.0046787006, 'LotShape'),
 (0.004650164, 'Condition1'),
 (0.0043684505, '2ndFlrSF'),
 (0.0042710085, 'Condition2'),
 (0.0042392905, 'BsmtFinType1Enc'),
 (0.00390006, 'KitchenQualEnc'),
 (0.003170629, 'BsmtFinSF2')]

In [27]:
parameters = {
    'xgb_regressor__objective': ['reg:gamma'], # 'reg:squarederror', 'reg:squaredlogerror'
    'xgb_regressor__learning_rate': [0.01],
    'xgb_regressor__n_estimators': [6900, 7000, 7100],
    'xgb_regressor__max_depth': [5],
    'xgb_regressor__booster': ['gbtree'],
    'xgb_regressor__min_child_weight': [1.5],
    'xgb_regressor__gamma': [0],
    'xgb_regressor__subsample': [0.2],
    'xgb_regressor__reg_alpha': [1],
    'xgb_regressor__reg_lambda': [0.1],
}

paramSearch = GridSearchCV(
   estimator=xgb_pipeline,
   scoring=make_scorer(neg_rmsle),
   param_grid=parameters,
   cv=2,
   n_jobs=-1, 
   verbose=3
)

#paramSearch.fit(X, Y)
#paramSearch.best_params_, paramSearch.best_score_

#({'xgb_regressor__booster': 'gbtree',
#  'xgb_regressor__gamma': 0.03,
#  'xgb_regressor__learning_rate': 0.04,
#  'xgb_regressor__max_depth': 5,
#  'xgb_regressor__min_child_weight': 2,
#  'xgb_regressor__n_estimators': 2000,
#  'xgb_regressor__objective': 'reg:gamma',
#  'xgb_regressor__subsample': 0.7},
# -0.1293080016874709)

#({'xgb_regressor__booster': 'gbtree',
#  'xgb_regressor__gamma': 0.03,
#  'xgb_regressor__learning_rate': 0.06,
#  'xgb_regressor__max_depth': 5,
#  'xgb_regressor__min_child_weight': 2,
#  'xgb_regressor__n_estimators': 2000,
#  'xgb_regressor__objective': 'reg:gamma',
#  'xgb_regressor__subsample': 0.7},
# -0.13023288188100762)

#({'xgb_regressor__booster': 'gbtree',
#  'xgb_regressor__gamma': 0.02,
#  'xgb_regressor__learning_rate': 0.07,
#  'xgb_regressor__max_depth': 2,
#  'xgb_regressor__min_child_weight': 1,
#  'xgb_regressor__n_estimators': 800,
#  'xgb_regressor__objective': 'reg:gamma',
#  'xgb_regressor__subsample': 0.8},
# -0.12779856264607697)

### StackRegressor

In [28]:
# https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition

### VotingRegressor

In [29]:
# https://www.kaggle.com/marktsvirko/votingregressor-xgb-svm-top-10

## Predict Submissions

In [30]:
random_forest_pipeline.fit(X, Y)

x_test = test_df[baseline_features]
y_test_predicted = random_forest_pipeline.predict(x_test)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_random_forest.csv', index=False)

In [31]:
xgb_pipeline.fit(X, Y)

x_test = test_df[baseline_features]
y_test_predicted = xgb_pipeline.predict(x_test)

submission_df = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': y_test_predicted,
})

submission_df.to_csv('./data/submission_xgb.csv', index=False)