# House Prices Models Benchmark<br/><sup>Regression</sup>


### **Dataset:**  [house-prices](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)
#### Classification Benchmark: [Titanic Models Benchmark](https://www.kaggle.com/aravrs/titanic-models-benchmark)

<sup style="color:red;">Work in progress.</sup><br/>

---

In [None]:
import os
import time
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# sns.set_style("whitegrid")

# Notebook-wide plot defaults: Set1 colour cycle and wide, high-resolution figures.
sns.set_palette('Set1')
plt.rcParams.update({
    'figure.figsize': (20, 8),
    'figure.dpi': 200,
})

# plt.rcParams['axes.grid'] = False

# Suppress every warning category so cell output stays readable.
import warnings
warnings.filterwarnings('ignore')

### Load Data

In [None]:
# Location of the competition files.
DATA_DIR = '../input/house-prices-advanced-regression-techniques/'
os.listdir(DATA_DIR)  # result is discarded; raises early if the directory is missing

# Read the three competition CSVs in one pass.
train_df, test_df, sub_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

print(' Train:', train_df.shape, ' Test:', test_df.shape, ' Sub:', sub_df.shape)

# Basic EDA

In [None]:
# Heatmap of the boolean null mask of train_df (one row per record, one column per feature).
# NOTE(review): `fig` is bound to the return value of set_title(), not to a Figure object.
fig = sns.heatmap(train_df.isnull(), cbar=False, cmap='hot_r', yticklabels=[]).set_title('Missing Values', fontsize=24, y=1.1);

In [None]:
# Pass the column as `x=`: newer seaborn accepts only `data` positionally, so a
# positional Series is no longer interpreted as the variable to count.
sns.countplot(x=train_df['Neighborhood']).set_title('Neighborhood count plot', fontsize=24, y=1.02);

In [None]:
# Categorical columns to draw a count plot for.
# cols = train_df.select_dtypes(include=['object']).columns
cols = ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl','MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition']

n_cols = 3
# Ceiling division: a partially filled last row is kept instead of silently
# dropping the trailing columns when len(cols) is not a multiple of n_cols.
n_rows = max(1, -(-len(cols) // n_cols))
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*6, n_rows*5), squeeze=False)
fig.suptitle('Count Plots', fontsize=26, y=1.015)

for i, ax in enumerate(axs.flat):
    if i >= len(cols):
        # Unused cell in the last row: hide it rather than show an empty frame.
        ax.axis('off')
        continue
    # `x=` keyword: newer seaborn accepts only `data` positionally.
    sns.countplot(x=train_df[cols[i]], ax=ax)
    ax.set_title(cols[i]+' count', fontsize=20)
plt.tight_layout()

In [None]:
# Numeric columns to draw a distribution plot for.
# cols = list(train_df.select_dtypes(include=['float64']).columns) + list(train_df.select_dtypes(include=['int64']).columns)
cols = ['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
        'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LotFrontage', 'MasVnrArea', 'GarageYrBlt',
        'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF',
        'OpenPorchSF', 'MoSold', 'YrSold']

n_cols = 3
# Ceiling division: a partially filled last row is kept instead of silently
# dropping the trailing columns when len(cols) is not a multiple of n_cols.
n_rows = max(1, -(-len(cols) // n_cols))
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*6, n_rows*5), squeeze=False)
fig.suptitle('Distribution Plots', fontsize=26, y=1.015)

for i, ax in enumerate(axs.flat):
    if i >= len(cols):
        # Unused cell in the last row: hide it rather than show an empty frame.
        ax.axis('off')
        continue
    col = cols[i]
    try:
        sns.distplot(train_df[col], color='#222f3e', ax=ax)
        ax.set_title(col+' distribution', fontsize=20)
    except Exception as err:
        # Best effort: report the failing column and continue with the rest.
        print(f'Could not plot {col}: {err}')
plt.tight_layout()

In [None]:
# Distribution of the regression target.
(
    sns.distplot(train_df['SalePrice'], color='g')
       .set_title('Sale Price distribution plot', fontsize=24, y=1.02)
);

In [None]:
# Correlate numeric columns only: train_df also holds object (string) columns,
# and DataFrame.corr() raises on those in recent pandas versions instead of
# silently dropping them.
numeric_corr = train_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=False, cmap='RdBu_r',
            center=0, vmin=-1, vmax=1, linewidths=2, annot_kws={"fontsize":12},
            square=False, cbar=True).set_title('Correlation matrix', fontsize=24, y=1.02);

# Multi Model Benchmark

Install the necessary libraries and set up the environment.

In [None]:
!pip install pycaret -q

In [None]:
from pycaret.utils import version
# Explicit names instead of `import *`, so it is clear which pycaret functions
# the notebook relies on and nothing else is pulled into the namespace.
from pycaret.regression import (
    compare_models,
    interpret_model,
    plot_model,
    predict_model,
    setup,
)
print('Pycaret Version:', version())

In [None]:
# Initialise the pycaret regression experiment on the training frame.
reg = setup(
    train_df,
    target='SalePrice',
    session_id=42,
    experiment_name='house-prices',
    log_experiment=True,
    silent=True,
)

Compare various models to find the best model

In [None]:
# Rank candidate models by RMSLE and keep up to 25 of them.
models = compare_models(
    n_select=25,
    sort='RMSLE',
)

## Analyse all models

In [None]:
#### hacky

plot_types = ['residuals', 'error', 'cooks', 'learning', 'vc', 'manifold', 'feature'] # 'rfe'


def grid_dims(n_items, n_cols=2):
    """Return ``(n_rows, n_cols)`` for a grid large enough to hold ``n_items``.

    The row count is rounded up so a trailing, partially filled row is kept
    (floor division would drop the last image for odd counts). At least one
    row is always returned so the grid can be created for 0 or 1 items.
    """
    n_rows = max(1, -(-n_items // n_cols))  # ceiling division without math.ceil
    return n_rows, n_cols


def _render_plot(model, plot):
    """Save one pycaret plot for ``model`` and read it back as an image array.

    Best effort: if the plot cannot be produced for this model, an all-ones
    (blank) RGBA array is returned so the grid keeps its layout.
    """
    try:
        return plt.imread(plot_model(model, plot=plot, save=True))
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        return np.ones((1100, 1600, 4))


def _show_image_grid(imgs, title):
    """Draw ``imgs`` in a two-column grid under a shared ``title``."""
    n_rows, n_cols = grid_dims(len(imgs))
    # squeeze=False keeps `axs` two-dimensional even for a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(n_cols*4, n_rows*3), squeeze=False)
    fig.suptitle(title, fontsize=12, y=0.89)

    for i, ax in enumerate(axs.flat):
        if i < len(imgs):
            ax.imshow(imgs[i])
        # Also hides the unused cell when the number of images is odd.
        ax.axis('off')
    fig.subplots_adjust(wspace=0, hspace=0)


# to plot same plots for different models
def plot_util(models, plot, title='Comparison plot'):
    """Show the same pycaret plot type for every model in ``models``.

    Parameters
    ----------
    models : iterable of fitted models accepted by ``plot_model``.
    plot : str, plot identifier passed to ``plot_model`` (e.g. 'residuals').
    title : str, figure title.
    """
    _show_image_grid([_render_plot(model, plot) for model in models], title)


# to plot all plots for same model
def model_all_plots(model, title='Model plot'):
    """Show every plot type listed in ``plot_types`` for a single ``model``.

    Parameters
    ----------
    model : fitted model accepted by ``plot_model``.
    title : str, figure title.
    """
    _show_image_grid([_render_plot(model, plot) for plot in plot_types], title)

Various analytical plots of models from best to worst. <br/>
> If a metric/plot is not possible for a particular model, it's left blank.

In [None]:
# Smoke test: render a single plot for the first model in the ranking.
plot_model(models[0], plot='feature')

# Styling for the plots that follow (title size and saved-figure settings).
sns.set_palette('Set1')
plt.rcParams.update({
    'axes.titlesize': 18,
    'savefig.dpi': 200,
    'savefig.bbox': 'tight',
    'savefig.pad_inches': 0.3,
})

In [None]:
# Residual plots, one panel per compared model.
plot_util(models, plot='residuals', title='Residual Comparison Plot')

In [None]:
# Prediction-error plots, one panel per compared model.
plot_util(models, plot='error', title='Prediction Error Comparison Plot')

In [None]:
# Cook's distance plots, one panel per compared model.
plot_util(models, plot='cooks', title="Cook's Distance Outlier Comparison Plot")

In [None]:
# Learning curves, one panel per compared model.
plot_util(models, plot='learning', title='Learning Curve Comparison Plot')

In [None]:
# Validation curves, one panel per compared model.
plot_util(models, plot='vc', title='Validation Curve Comparison Plot')

In [None]:
# Feature-importance plots, one panel per compared model.
plot_util(models, plot='feature', title='Feature Importance Comparison Plot')

In [None]:
# `model` is never defined at notebook scope (it is only a loop variable inside
# the plotting helpers), so the original call fails with NameError on a fresh
# kernel. Plot the manifold for the first model in the ranking instead.
plot_model(models[0], plot='manifold')

## The Best Model

In [None]:
# compare_models() was asked to sort by RMSLE, so the first entry is taken as the best model.
final_model = models[0]

In [None]:
# Every plot type from `plot_types`, drawn for the selected model.
model_all_plots(final_model, title='Final Model Plots')

In [None]:
# Summary interpretation plot for the selected model.
# NOTE(review): presumably this only works for estimator types that interpret_model supports — confirm for whichever model ranks first.
interpret_model(final_model, plot='summary')

### Make predictions

In [None]:
# Score the competition test set with the selected model; the result feeds the submission file built below.
predictions = predict_model(final_model, data=test_df)

In [None]:
# Keep the row identifier and the predicted value, exposing the latter under
# the 'SalePrice' header before writing the file.
id_and_label = predictions.loc[:, ['Id', 'Label']]
submission = id_and_label.rename(columns={'Label': 'SalePrice'})

submission.to_csv('submission.csv', index=False)
print('Saved submission.csv')

# Preview the first rows of the written file's contents.
submission.head()

---