In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import importlib
import subprocess
import sys


def _ensure_package(name):
    """Import and return module ``name``, installing it with pip first when it is missing.

    The install runs through ``sys.executable -m pip`` so it targets the
    interpreter that backs this kernel. Only ``ImportError`` triggers an
    install; any other failure while importing is allowed to surface.
    """
    try:
        return importlib.import_module(name)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", name])
        # Make the freshly installed distribution visible to the import system.
        importlib.invalidate_caches()
        return importlib.import_module(name)


# Bind the module names in both paths (already installed / just installed):
# a later cell calls `pycaret.anomaly.setup(...)` and needs `pycaret` defined.
# NOTE(review): that later call passes `silent=True`, which presumably needs a
# pycaret 2.x release -- consider pinning the version; confirm before changing.
pycaret = _ensure_package("pycaret")
missingno = _ensure_package("missingno")

<hr style="border: solid 3px blue;">

# Introduction

![](https://live.staticflickr.com/3203/5873183944_6927e3e0b9_b.jpg)

Picture Credit: https://live.staticflickr.com



We want to analyze datasets and model them simply and clearly. However, we will not be able to do everything simply. The strategy we can take in this situation is to keep things simple and focus more on things that require complexity.

What complex tasks do we need to focus on? I think it's two things:
1. EDA: It seems that we should focus on understanding the dataset we want to analyze as much as possible and processing it to fit the model.
2. Model interpretation: It seems that we should focus on things to understand the results of the model and gain insight to improve performance.

On the other hand, it seems that the following tasks can be easily performed using good libraries.
1. Outlier Detection and removal: Finding outliers in high-dimensional datasets is a very difficult task. Attempts to visually find outliers are very difficult and sometimes unsuccessful. We will simplify outlier detection/removal using Pycaret's anomaly library.
2. Modeling: Deciding which model to use is a difficult task. Pycaret and Fastai will make these tasks simpler.
3. Tuning Hyperparameters: Tuning hyperparameters is also complex and difficult. Using Pycaret and Fastai for this as well will make the work simpler.

Our strategy is to keep the simple things as simple as possible and spend our effort on the things that are genuinely complex.

------------------------------------------
# Setting up

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from matplotlib import rcParams

In [None]:
# Directory (relative to the notebook) holding the competition CSV files read below.
PATH = '../input/house-prices-advanced-regression-techniques'

In [None]:
# Load the training set, the test set and the sample submission from PATH.
train = pd.read_csv(f'{PATH}/train.csv')
X_test = pd.read_csv(f'{PATH}/test.csv')
submission_data = pd.read_csv(f'{PATH}/sample_submission.csv')

In [None]:
# Stack the training rows on top of the test rows under one fresh 0..n-1 index.
house_df = pd.concat((train, X_test), ignore_index=True, sort=False)
# Boolean mask, True where SalePrice is present.
tr_idx = house_df['SalePrice'].notna()

<hr style="border: solid 3px blue;">

# Anomaly Detection and Removal

![](https://zindpublic.blob.core.windows.net/public/uploads/blog_post/image/17/big_thumb_4d7a83bc-af4a-4912-81b2-8b95d4b08322.jpg)

Picture Credit: https://zindpublic.blob.core.windows.net

We are not experts, and we lack domain-knowledge. It is very difficult to find outliers in this situation. Even if we are experts, it can be very difficult to find outliers in the case of a large number of features like this dataset. In this case, we will be able to effectively detect and remove them using a well-crafted library.


In [None]:
from pycaret.anomaly import *

In [None]:
# Initialise PyCaret's anomaly-detection experiment on the training frame only.
# NOTE(review): `silent=True` presumably suppresses the interactive dtype
# confirmation and is version dependent -- confirm against the installed pycaret.
pycaret.anomaly.setup(
    data=train,
    silent=True)

In [None]:
# Create the anomaly detector registered in PyCaret under the ID 'pca'.
pca = pycaret.anomaly.create_model('pca')

In [None]:
# Score the data with the detector; the cell below reads the resulting
# `Anomaly` and `Anomaly_Score` columns from this frame.
pca_df = pycaret.anomaly.assign_model(pca)

In [None]:
# Keep only the rows flagged as anomalies, highest anomaly score first.
is_anomaly = pca_df['Anomaly'] == 1
abnormal_data = pca_df.loc[is_anomaly].sort_values(by='Anomaly_Score', ascending=False)
print("the size of anomaly = ", len(abnormal_data))
# Show the top rows with white text on a black background.
cell_style = {'background-color': 'black', 'color': 'white', 'border-color': 'white'}
abnormal_data.head().style.set_properties(**cell_style)

<span style="color:Blue"> Observation:

In all, there are 73 anomaly data. The number of anomaly data is not large compared to the size of the entire dataset.

In [None]:
# `tune_model` comes from the star import of pycaret.anomaly.
# NOTE(review): presumably tunes the 'pca' detector against the supervised
# target SalePrice -- confirm the exact objective in the PyCaret docs.
tuned_pca = tune_model(model = 'pca', supervised_target = 'SalePrice')

In [None]:
# Switch matplotlib to the dark theme, then draw PyCaret's 'umap' plot for the tuned detector.
plt.style.use("dark_background")
plot_model(tuned_pca,plot='umap')

In [None]:
# Same tuned detector, drawn with PyCaret's 'tsne' plot.
plot_model(tuned_pca,plot='tsne')

Let's visually check outliers!

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
fig.subplots_adjust(wspace=0.3)
# Per panel: axes, feature on x, title, x position of the vertical guide,
# and the (x, y) anchor of the 'Outliers' annotation.
panels = [
    (ax1, 'GrLivArea', 'Outliers in GrLivArea', 4000, (4500, 150000)),
    (ax2, 'TotalBsmtSF', 'Outliers in TotalBsmtSF', 4500, (5000, 200000)),
]
for panel_ax, x_feature, panel_title, x_guide, note_xy in panels:
    sns.regplot(data=house_df, x=x_feature, y='SalePrice', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=20)
    # Dashed guides bounding the region treated as outliers.
    panel_ax.axhline(y=250000, color='Green', linestyle='--', linewidth=3)
    panel_ax.axvline(x=x_guide, color='Green', linestyle='--', linewidth=3)
    panel_ax.text(note_xy[0], note_xy[1], 'Outliers', color='red')
sns.despine()

<span style="color:Blue"> Observation:

Even with our eyes, we can confirm that there are outliers.

We decide to remove the 73 anomaly data detected above.

In [None]:
# Remove the rows flagged as anomalies, matched by index label.
# NOTE(review): assumes abnormal_data keeps train's row labels, which equal
# house_df's labels because train comes first in the ignore_index concat -- confirm.
# errors='ignore' keeps the cell re-runnable (a second run would otherwise raise
# KeyError) and matches the other drop cells in this notebook.
house_df.drop(index=abnormal_data.index, inplace=True, errors='ignore')

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
fig.subplots_adjust(wspace=0.3)
# Same two panels as before the anomaly rows were dropped, for a direct comparison.
# Per panel: axes, feature on x, title, vertical guide, annotation anchor.
panels = [
    (ax1, 'GrLivArea', 'Outliers in GrLivArea', 4000, (4100, 150000)),
    (ax2, 'TotalBsmtSF', 'Outliers in TotalBsmtSF', 4500, (4800, 200000)),
]
for panel_ax, x_feature, panel_title, x_guide, note_xy in panels:
    sns.regplot(data=house_df, x=x_feature, y='SalePrice', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=20)
    panel_ax.axhline(y=250000, color='Green', linestyle='--', linewidth=3)
    panel_ax.axvline(x=x_guide, color='Green', linestyle='--', linewidth=3)
    panel_ax.text(note_xy[0], note_xy[1], 'Outliers', color='red')
sns.despine()

<span style="color:Blue"> Observation:

It was confirmed that outliers were effectively removed.

--------------------------------------------------
# EDA

Id feature is simply unique ID, so it is not helpful for learning. Let's remove it.

In [None]:
# Discard the row identifier column; a missing column is ignored, so re-runs are safe.
house_df.drop(columns='Id', inplace=True, errors='ignore')

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
plt.figure(figsize=(8, 6))
# Number of columns per dtype, drawn as bars.
dtype_counts = house_df.dtypes.value_counts()
ax = dtype_counts.plot.bar(grid=False, fontsize=20)
# Write each count just above its bar.
for bar in ax.patches:
    count = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., count + 1, count, ha='center', size=25)
sns.despine()

<span style="color:Blue"> Observation:

There are 37 numerical features and 43 object (string) types.
Among the numeric features, 25 are int types and 12 are float types.
There must be a reason for using a different type like this. Let's check some more.

# Categorizing Columns

In [None]:
# dtypes of the predictor columns only (everything up to and including SaleCondition).
feature_dtypes = house_df.loc[:, :'SaleCondition'].dtypes

# String-typed (object) columns
categorical_cols = feature_dtypes.index[feature_dtypes == "object"].tolist()

# Numerical columns, split by storage type
int_cols = feature_dtypes.index[feature_dtypes == 'int64'].tolist()
float_cols = feature_dtypes.index[feature_dtypes == 'float64'].tolist()

numerical_cols = int_cols + float_cols

## Checking Missing Values

In [None]:
import missingno as msno
# tr_idx was computed before the anomalous rows were dropped, so its index is a
# superset of house_df's. Align it explicitly instead of relying on pandas'
# implicit (warning-emitting) boolean reindexing.
train_rows = tr_idx.reindex(house_df.index, fill_value=False)
# Missing-value matrix for the rows that have a SalePrice.
msno.matrix(house_df[train_rows])

## Imputing Numerical Missing Values

Since it is a regression problem, filling it with KNN seems to be a wise choice.

> Each sample’s missing values are imputed using the mean value from n_neighbors nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.

Ref: https://scikit-learn.org/stable

In [None]:
from sklearn.impute import KNNImputer
# Impute each missing numerical value from its 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)
# The imputer is fitted on every row currently in house_df (train and test rows
# together) and the filled values are written back into the same columns.
# NOTE(review): whether int64 columns keep their dtype after this `.loc`
# assignment depends on the pandas version -- verify if later cells rely on int_cols.
house_df.loc[:,numerical_cols] = imputer.fit_transform(house_df.loc[:,numerical_cols])

## Checking Categorical Missing Values

Numerical missing values were filled in. Let's check again what the remaining missing values are.

In [None]:
# Missing-value count per predictor column (columns up to SaleCondition).
isnull_series = house_df.loc[:, :'SaleCondition'].isnull().sum()
# Only the columns that still contain gaps, largest count first.
missing_counts = isnull_series[isnull_series > 0].sort_values(ascending=False)
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
plt.figure(figsize=(20, 10))
ax = missing_counts.plot.bar(grid=False, fontsize=20)
plt.legend(loc='upper right')
# Annotate every bar with its count.
for bar in ax.patches:
    count = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., count + 15, count, ha='center', size=20)
sns.despine()

<span style="color:Blue"> Observation:
    
* Garage-related features have 157 to 159 missing values. It is unknown whether the houses lacked garages or were intentionally omitted.
* Basement-related features also have 79 to 82 missing values.

It seems that we need to focus more on the process of filling in the missing values of the corresponding Garage and Basement features.

In [None]:
# Drop the PoolQC column. errors='ignore' makes the cell safe to re-run
# (a second run would otherwise raise KeyError) and matches the other drop cells.
house_df.drop(columns="PoolQC", inplace=True, errors='ignore')

----------------------------------------------------------
# Checking Target

The problem is a regression problem. Therefore, we analyze the distribution of the target and check whether any preprocessing is necessary based on it.
If it were a classification problem, we would need to check for target imbalance instead.

In [None]:
# Scratch copy of the target; later cells overwrite it with the log-transformed
# prices for plotting and then drop it again.
house_df['SalePriceCpy'] = house_df['SalePrice'].copy()

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
sale_price = house_df['SalePrice']
# Left: histogram of the raw target with a dashed line at its mean.
sns.histplot(sale_price, kde=True, ax=ax1)
ax1.axvline(x=sale_price.mean(), color='g', linestyle='--', linewidth=3)
ax1.text(sale_price.mean(), 125, "Mean", horizontalalignment='left', size=20, color='yellow', weight='semibold')
ax1.set_title('Original Sale Price Histogram', fontsize=20)
# Right: raw target against above-ground living area.
sns.regplot(data=house_df, y="SalePrice", x="GrLivArea", ax=ax2)
# Title typo fixed ('Orignaal' -> 'Original').
ax2.set_title('Original Sale Price', fontsize=20)
sns.despine()

In [None]:
# Summary statistics of the raw target.
mean = house_df['SalePrice'].mean()
std = house_df['SalePrice'].std()
# Stored as `skewness`, not `skew`: a later cell imports scipy.stats.skew under
# the name `skew`, and re-running this cell afterwards would otherwise replace
# that function with a float.
skewness = house_df['SalePrice'].skew()
print('SalePrice : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skewness))

<span style="color:Blue"> Observation:

The skewness was about 1.88. Also, since the metric is RMSLE, we will perform log scaling.

In [None]:
# Overwrite the scratch column with log(1 + SalePrice); SalePrice itself is left untouched.
house_df['SalePriceCpy'] = np.log1p(house_df['SalePrice'])

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
fig.subplots_adjust(hspace=0.3)
log_price = house_df['SalePriceCpy']
# Left: histogram of the log-transformed target with a dashed line at its mean.
sns.histplot(log_price, kde=True, ax=ax1)
ax1.axvline(x=log_price.mean(), color='g', linestyle='--', linewidth=3)
ax1.text(log_price.mean(), 125, "Mean", horizontalalignment='left', size=20, color='yellow', weight='semibold')
ax1.set_title('Log transformed Sale Price Histogram', fontsize=20)
# Right: log-transformed target against above-ground living area.
sns.regplot(data=house_df, y="SalePriceCpy", x="GrLivArea", ax=ax2)
ax2.set_title('Log transformed Sale Price', fontsize=20)
sns.despine()

> Logarithm function increases the spacing between small numbers and reduces the spacing between large numbers. When certain features are dense with values in small values, by increasing these intervals, our models increase the intervals for small values, and we can improve the performance of the model when training and testing using these values.

Ref: https://www.kaggle.com/ohseokkim/preprocessing-linear-nonlinear-scaling

<span style="color:Blue"> Observation:
    
If you look at the regression plot between GrLivArea and SalePrice, you can see that a clearer regression line is drawn after conversion. Log transform will definitely help with learning.

In [None]:
# Summary statistics of the log1p-transformed target held in SalePriceCpy.
mean = house_df['SalePriceCpy'].mean()
std = house_df['SalePriceCpy'].std()
# Stored as `skewness`, not `skew`, so a re-run cannot clobber the
# scipy.stats.skew function that a later cell imports under that name.
skewness = house_df['SalePriceCpy'].skew()
# Label the output as log-scale so it is not mistaken for the raw-price statistics.
print('log1p(SalePrice) : mean: {0:.4f}, std: {1:.4f}, skew: {2:.4f}'.format(mean, std, skewness))

Skewness was also improved.

In [None]:
# Remove the scratch target copy; an already-missing column is ignored.
house_df.drop(columns='SalePriceCpy', inplace=True, errors='ignore')

---------------------------------------------------------------------------
# Doing EDA for Numerical Features

![](https://static-assets.codecademy.com/Courses/Hypothesis-Testing/Intro_to_variable_types_4.png)

Picture Credit: https://t3.ftcdn.net

-----------------------------------------------------------------------------
## Continuous Numerical Features

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
plt.figure(figsize=(20, 40))
# Two panels per row; at least the original 6 rows, more if float_cols ever
# grows beyond 12 entries (a fixed 6x2 grid would raise ValueError then).
n_rows = max(6, -(-len(float_cols) // 2))
# One histogram with a KDE overlay per float column.
for i, feature in enumerate(float_cols, start=1):
    plt.subplot(n_rows, 2, i)
    sns.histplot(house_df[feature], kde=True)

<span style="color:Blue"> Observation:
    
* GarageCars, BsmtHalfBath and BsmtFullBath are discrete variables.
* Some features have a skewed shape to one side.

In [None]:
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
plt.figure(figsize=(20, 40))
plt.subplots_adjust(hspace=0.2)
# Regression plot of SalePrice against each float column, on a 6x2 grid.
for slot, feature in enumerate(float_cols, start=1):
    plt.subplot(6, 2, slot)
    sns.regplot(data=house_df, x=feature, y='SalePrice')

<span style="color:Blue"> Observation:
    
It seems that there are outliers that deviate from the regression line. Let's check some more.

-------------------------------------------
## Discrete Numerical Features

In [None]:
sns.set(style="ticks", context="talk", font_scale=2)
plt.style.use("dark_background")
plt.figure(figsize=(50, 60))
plt.subplots_adjust(wspace=0.4, hspace=0.5)
# Histogram with KDE for each integer column, on a 9x4 grid.
for slot, feature in enumerate(int_cols, start=1):
    plt.subplot(9, 4, slot)
    sns.histplot(house_df[feature], kde=True)

In [None]:
# Store these code-like columns as strings so they are picked up as categorical.
house_df['MSSubClass'] = house_df['MSSubClass'].map(str)
for date_part in ('YrSold', 'MoSold'):
    house_df[date_part] = house_df[date_part].astype(str)

Let's update the column lists again.

In [None]:
# Rebuild the column lists after the dtype changes above.
# dtypes of the predictor columns only (everything up to and including SaleCondition).
feature_dtypes = house_df.loc[:, :'SaleCondition'].dtypes

# String-typed (object) columns
categorical_cols = feature_dtypes.index[feature_dtypes == "object"].tolist()

# Numerical columns, split by storage type
int_cols = feature_dtypes.index[feature_dtypes == 'int64'].tolist()
float_cols = feature_dtypes.index[feature_dtypes == 'float64'].tolist()

numerical_cols = int_cols + float_cols

------------------------------------------------------
# Adding New Derived Features using Numerical Feature

Let's create a new derived variable so that our model can learn better.

* TotalBsmtSF: Total square feet of basement area
* GrLivArea: Above grade (ground) living area square feet
* YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
* 1stFlrSF: First Floor square feet 
* 2ndFlrSF: Second floor square feet

## Question 1: Does the combination of underground and above-ground area have a high correlation with the Sale Price?

* TotalBsmtSF: Total square feet of basement area
* GrLivArea: Above grade (ground) living area square feet

In [None]:
# Above-ground living area plus basement area.
house_df["AllArea"] = house_df['GrLivArea'] + house_df['TotalBsmtSF']
sns.set(style="ticks", context="talk", font_scale=1)
plt.style.use("dark_background")
area_ax = sns.regplot(data=house_df, x='AllArea', y='SalePrice')
area_ax.set_title('AllArea-SalePrice', fontsize=20)
sns.despine()

## Question 2: If you recently remodeled and have a large basement, will your sale price increase?

* TotalBsmtSF: Total square feet of basement area
* YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

In [None]:
# Remodel year added to basement area, as a single combined feature.
house_df["NewBsmtSF"] = house_df[['YearRemodAdd', 'TotalBsmtSF']].sum(axis=1, skipna=False)
bsmt_ax = sns.regplot(data=house_df, x='NewBsmtSF', y='SalePrice')
bsmt_ax.set_title('NewBsmtSF-SalePrice', fontsize=20)
sns.despine()

## Question 3: Can the combined area of the 1st and 2nd floors affect the sale price?
* 1stFlrSF: First Floor square feet 
* 2ndFlrSF: Second floor square feet

In [None]:
# Combined first- and second-floor area.
house_df["HighQualSF"] = house_df["1stFlrSF"] + house_df["2ndFlrSF"]
# Plot the feature just created: the original plotted 'NewBsmtSF' under a
# HighQualSF title (copy-paste slip from the previous cell).
sns.regplot(data=house_df, x='HighQualSF', y='SalePrice')
plt.title('HighQualSF-SalePrice', fontsize=20)
sns.despine()

**Good derivative features come from good questions. Good questions come from a lot of domain-knowledge.**

Should I really be a real estate agent? If you have any good information, please share.

----------------------------------------------------
# Scaling

There are various scaling methods for numerical features. However, we did log scaling, which is a non-linear scaling of our target. Therefore, other numerical features are also subjected to log scaling.

In [None]:
# Rebuild the column lists before scaling.
# dtypes of the predictor columns only (everything up to and including SaleCondition).
feature_dtypes = house_df.loc[:, :'SaleCondition'].dtypes

# String-typed (object) columns
categorical_cols = feature_dtypes.index[feature_dtypes == "object"].tolist()

# Numerical columns, split by storage type
int_cols = feature_dtypes.index[feature_dtypes == 'int64'].tolist()
float_cols = feature_dtypes.index[feature_dtypes == 'float64'].tolist()

numerical_cols = int_cols + float_cols

First, let's check skewness. A skewness greater than 1 is generally judged to be skewed, so check mainly those greater than 1.

In [None]:
from scipy.stats import skew
plt.figure(figsize=(18, 10))
# Skewness of every numerical column; keep those above the threshold of 1.
skew_features = house_df[numerical_cols].apply(lambda x : skew(x))
skew_features = skew_features[skew_features > 1].sort_values(ascending=False)
ax = sns.barplot( x =skew_features.index,y=skew_features.values,color='grey')
for p in ax.patches:
    height = p.get_height().round(1)
    # True division: `get_width() // 2` floor-divided the bar width to 0, so the
    # label was never centred. Centre it, as the other annotated bar charts do.
    ax.text(p.get_x() + p.get_width() / 2, height+0.5, height, ha = 'center', size = 30)
plt.xticks(rotation=45)
plt.text(5, 1.2, 'Threshold',color='red')
plt.axhline(y=1, color='green', linestyle='--', linewidth=3)
plt.title('Skewness',fontsize=30)
sns.despine()

<span style="color:Blue"> Observation:
    
It is confirmed that 14 features are skewed. We will apply a log transformation to these features.

In [None]:
# Replace every column listed in skew_features (skewness > 1) with log(1 + value).
# NOTE(review): not idempotent -- re-running this cell transforms the columns again.
house_df[skew_features.index] = np.log1p(house_df[skew_features.index])

In [None]:
from scipy.stats import skew
plt.figure(figsize=(10, 5))
# Recompute skewness after the log transform; keep what is still above 1.
skew_features = house_df[numerical_cols].apply(lambda column: skew(column))
skew_features = skew_features.loc[skew_features > 1].sort_values(ascending=False)
ax = sns.barplot(x=skew_features.index, y=skew_features.values, color='grey')
# Label each bar with its rounded skewness, starting at the bar's left edge.
for bar in ax.patches:
    rounded = bar.get_height().round(1)
    ax.text(bar.get_x(), rounded + 0.5, rounded, ha='left', size=25)
plt.xticks(rotation=45)
plt.axhline(y=1, color='green', linestyle='--', linewidth=3)
plt.text(4, 1.2, 'Threshold', color='red')
plt.title('Skewness', fontsize=30)
sns.despine()

The number of skewed features is reduced from 14 to 9. The remaining 4 skewness was greatly reduced.

In [None]:
plt.figure(figsize=(20, 40))
plt.subplots_adjust(hspace=0.4)
# Histogram of every feature whose skewness is still above 1, on a 10x2 grid.
for slot, feature in enumerate(skew_features.index, start=1):
    plt.subplot(10, 2, slot)
    sns.histplot(house_df[feature])

Some features still have skewness greater than 1, but further improvement seems difficult.

----------------------------------------------------------------
# Doing EDA for Categorical Features

![](https://miro.medium.com/max/698/1*A2zAEX3OydZ0r_Gk4gYjEg.png)

Picture Credit: https://miro.medium.com

Categorical data can be classified into ordinal data and nominal data. In the case of an ordinal type, there is a difference in importance for each level. This value plays an important role in the case of regression, so encode it with care.

In [None]:
# Every object-dtype column of house_df, derived features included.
categorical_cols = house_df.columns[house_df.dtypes == "object"].tolist()

------------------------------------------------------------------------
## Filling missing values

A good way to fill in the missing values of categorical features in the absence of domain-knowledge is to take the most-frequent strategy.

> Imputation is the standard approach, and it usually works well. However, imputed values may be systematically above or below their actual values (which weren't collected in the dataset). Or rows with missing values may be unique in some other way. In that case, your model would make better predictions by considering which values were originally missing.

![](https://i.imgur.com/UWOyg4a.png)

> In this approach, we impute the missing values, as before. And, additionally, for each column with missing entries in the original dataset, we add a new column that shows the location of the imputed entries.
> 
> In some cases, this will meaningfully improve results. In other cases, it doesn't help at all.

Ref: https://www.kaggle.com/alexisbcook/missing-values

## Checking Missing Values Again

Only features with more than 20 missing values are checked. It was judged that the smaller features were probably omitted due to a mistake during the recording process.

In [None]:
# Missing-value count per predictor column (columns up to SaleCondition).
isnull_series = house_df.loc[:, :'SaleCondition'].isnull().sum()
# Only columns with at least 20 gaps, largest count first.
frequent_gaps = isnull_series[isnull_series >= 20].sort_values(ascending=False)

plt.figure(figsize=(20, 10))
ax = frequent_gaps.plot.bar(grid=False, fontsize=20)
plt.legend(loc='upper right')
# Annotate every bar with its count.
for bar in ax.patches:
    count = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2., count + 20, count, ha='center', size=20)
sns.despine()

In [None]:
def plot_boxen_reg(feature):
    """Plot SalePrice against ``feature`` twice, side by side.

    The left panel is a boxen plot and the right panel a regression plot.
    Both read the notebook-level ``house_df``; ``feature`` is the name of
    one of its columns.
    """
    fig, (boxen_ax, reg_ax) = plt.subplots(1, 2, figsize=(20, 6))
    fig.subplots_adjust(wspace=0.3)
    sns.boxenplot(data=house_df, x=feature, y='SalePrice', palette='Set2', ax=boxen_ax)
    sns.regplot(data=house_df, x=feature, y='SalePrice', color='lightblue', ax=reg_ax)

## MiscFeature: Miscellaneous feature not covered in other categories

**Question: Is there a difference in house price with and without miscellaneous features?**

In [None]:
# 1 where MiscFeature holds a value, 0 where it is missing; then plot the flag against SalePrice.
misc_present = house_df['MiscFeature'].notna()
house_df['HasMiscFeature'] = misc_present.astype(int)
plot_boxen_reg('HasMiscFeature')

## Alley: Type of alley access to property

**Question: Is there a difference in house price with and without Alley access?**

In [None]:
# 1 where Alley holds a value, 0 where it is missing; then plot the flag against SalePrice.
alley_present = house_df['Alley'].notna()
house_df['HasAlley'] = alley_present.astype(int)
plot_boxen_reg('HasAlley')

## Fence: Fence quality

**Question: Is there a difference in house price with and without fence?**

In [None]:
# 1 where Fence holds a value, 0 where it is missing; then plot the flag against SalePrice.
fence_present = house_df['Fence'].notna()
house_df['HasFence'] = fence_present.astype(int)
plot_boxen_reg('HasFence')

## FireplaceQu: Fireplace quality

**Question: Is there a difference in house prices with and without Fireplace?**

In [None]:
# 1 where FireplaceQu holds a value, 0 where it is missing; then plot the flag against SalePrice.
fireplace_present = house_df['FireplaceQu'].notna()
house_df['HasFireplaceQu'] = fireplace_present.astype(int)
plot_boxen_reg('HasFireplaceQu')

## Garage Features

**Question: Is there a difference in house price with and without Garage?**

In [None]:
# 1 where GarageQual holds a value, 0 where it is missing; then plot the flag against SalePrice.
garage_present = house_df['GarageQual'].notna()
house_df['HasGarageQual'] = garage_present.astype(int)
plot_boxen_reg('HasGarageQual')

## Basement features

**Question: Is there a difference in the house price with and without a Basement?**

In [None]:
# 1 where BsmtQual holds a value, 0 where it is missing; then plot the flag against SalePrice.
basement_present = house_df['BsmtQual'].notna()
house_df['HasBsmtQual'] = basement_present.astype(int)
plot_boxen_reg('HasBsmtQual')

## MasVnrType: Masonry veneer type

**Question: Is there a difference in house price with and without Masonry veneer?**

In [None]:
# 1 where MasVnrType holds a value, 0 where it is missing; then plot the flag against SalePrice.
veneer_present = house_df['MasVnrType'].notna()
house_df['HasMasVnrType'] = veneer_present.astype(int)
plot_boxen_reg('HasMasVnrType')

In [None]:
# Columns whose gaps are replaced by the explicit placeholder level 'missing'.
fill_missing_features = [
    'Alley', 'GarageType', 'GarageCond', 'Fence', 'Street', 'LotShape',
    'LandContour', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'MiscFeature',
    'Utilities', 'SaleCondition',
]
house_df[fill_missing_features] = house_df[fill_missing_features].fillna('missing')

# Columns whose gaps are replaced by a specific existing level.
for column, default_level in (('MasVnrType', 'None'), ("Functional", "Typ")):
    house_df[column] = house_df[column].fillna(default_level)

> For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with scikit-learn estimators which assume that all values in an array are numerical, and that all have and hold meaning. A basic strategy to use incomplete datasets is to discard entire rows and/or columns containing missing values. However, this comes at the price of losing data which may be valuable (even though incomplete). A better strategy is to impute the missing values, i.e., to infer them from the known part of the data. 

Ref: https://scikit-learn.org/stable/modules/impute.html#impute

In [None]:
# Categorical columns whose gaps are filled with the column's most frequent level.
fill_mode_features = ['MSZoning','BsmtQual','BsmtCond','FireplaceQu','GarageFinish',
                     'GarageQual','BsmtExposure','Electrical','Exterior1st','Exterior2nd',
                     'KitchenQual','SaleType']

# Note: this rebinds `imputer`, which an earlier cell used for the KNN imputer.
# The mode is computed over all rows currently in house_df.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
house_df[fill_mode_features] = imputer.fit_transform(house_df[fill_mode_features])

-----------------------------------------------------
# Checking Ordinal Features
In some cases, common sense is enough to see that a feature's levels have an order. In many other cases the order is hard to judge. In this notebook, a feature is treated as ordinal when visualization shows a consistent ordering of its levels. If you have real estate knowledge, you will be able to classify the ordinal features and rank their levels better than I can.





## MSZoning

In [None]:
# Ordinal code for each zoning label (1 = lowest rank, 5 = highest).
mszoning_label = {'C (all)': 1, 'RM': 2, 'RH': 3, 'RL': 4, 'FV': 5}
house_df['MSZoning'] = house_df['MSZoning'].replace(mszoning_label)

## Condition1/Condition2

In [None]:
# Three-level code: 1 for the artery/feeder/railroad labels, 2 for 'Norm',
# 3 for the 'PosA'/'PosN' labels.
condition1_label = {
    **dict.fromkeys(('Artery', 'RRAe', 'RRNn', 'Feedr', 'RRNe', 'RRAn'), 1),
    'Norm': 2,
    **dict.fromkeys(('PosA', 'PosN'), 3),
}
house_df['Condition1'] = house_df['Condition1'].replace(condition1_label)

In [None]:
# Three-level code for the second condition: only 'RRNn' maps to 1, the other
# artery/feeder/railroad labels share level 2 with 'Norm', 'PosA'/'PosN' map to 3.
condition2_label = {
    'RRNn': 1,
    **dict.fromkeys(('Artery', 'Feedr', 'RRAn', 'RRAe', 'Norm'), 2),
    **dict.fromkeys(('PosA', 'PosN'), 3),
}
house_df['Condition2'] = house_df['Condition2'].replace(condition2_label)

## HouseStyle

In [None]:
def HouseStyleToInt(x):
    """Return the ordinal rank of a HouseStyle label.

    The known labels are ranked 0 ('1.5Unf') through 7 ('2.5Fin'); any other
    value, including a missing one, maps to 8.

    The original compared against ' 2.5Fin' (with a leading space), so that
    branch could never match and '2.5Fin' fell through to 8.
    """
    # Position in this tuple is the rank.
    ranked_styles = ('1.5Unf', 'SFoyer', '1.5Fin', '2.5Unf',
                     'SLvl', '1Story', '2Story', '2.5Fin')
    if x in ranked_styles:
        return ranked_styles.index(x)
    return 8

# Encode HouseStyle in place. Run once: a second pass would see integers, not labels.
house_df['HouseStyle'] = house_df['HouseStyle'].map(HouseStyleToInt)

## MasVnrType

In [None]:
def MasVnrTypeToInt(x):
    """Return the ordinal rank of a masonry veneer label.

    'BrkCmn' -> 1, 'BrkFace' -> 2, 'Stone' -> 3; every other value -> 0.
    """
    veneer_rank = {'Stone': 3, 'BrkFace': 2, 'BrkCmn': 1}
    for label, rank in veneer_rank.items():
        if x == label:
            return rank
    return 0

# Encode MasVnrType in place. Run once: a second pass would map every value to 0.
house_df['MasVnrType'] = house_df['MasVnrType'].map(MasVnrTypeToInt)

## Foundation

In [None]:
# Ordinal code per foundation label; 'BrkTil' and 'Stone' share level 2.
foundation_label = {'Slab':1,'BrkTil':2,'Stone':2,'CBlock':3,'Wood':4,'PConc':5}
# Labels absent from the mapping are left unchanged by replace().
house_df['Foundation'] = house_df['Foundation'].replace(foundation_label)

## GarageType

In [None]:
# Ordinal code per garage type. An earlier cell fills GarageType gaps with the
# placeholder 'missing'; map it to 0 so the column ends up fully numeric
# instead of a mix of integers and the string 'missing'.
garagetype_label = {'missing':0,'CarPort':1,'Basment':2,'Detchd':2,'Attchd':3,'2Types':3,'BuiltIn':4}
house_df['GarageType'] = house_df['GarageType'].replace(garagetype_label)

## GarageFinish

In [None]:
# Ordinal code for the garage finish labels: 'Unf' -> 1, 'RFn' -> 2, 'Fin' -> 3.
house_df['GarageFinish'] = house_df['GarageFinish'].replace({'Unf':1,'RFn':2,'Fin':3})

## PavedDrive

In [None]:
# Ordinal code for the driveway labels: 'N' -> 1, 'P' -> 2, 'Y' -> 3.
house_df['PavedDrive'] = house_df['PavedDrive'].replace({'N':1,'P':2,'Y':3})

## SaleCondition

In [None]:
# Ordinal code per sale condition; 'Abnorml', 'Family' and 'Alloca' share level 2.
# NOTE(review): the 'missing' placeholder written by the earlier fillna cell is
# not in this mapping and would stay a string if any row carries it -- verify.
salecon_label = {'AdjLand':1,'Abnorml':2,'Family':2,'Alloca':2,'Normal':3,'Partial':4}
house_df['SaleCondition'] = house_df['SaleCondition'].replace(salecon_label)

## Exterior1st / Exterior2nd

In [None]:
# Five-level ordinal code for exterior covering labels. The mapping lists both
# spellings that appear for some materials (e.g. 'WdShing'/'Wd Shng',
# 'CemntBd'/'CmentBd', 'BrkComm'/'Brk Cmn') so one dict serves both columns.
ext_lable = {'AsbShng':1,'AsphShn':1,
             'MetalSd':2,'Wd Sdng':2,'WdShing':2, 'Wd Shng':2,'Stucco':2,'CBlock':2,
             'HdBoard':3,'BrkFace':3,'Plywood':3,'Other':3,
             'VinylSd':4,'CemntBd':4,'BrkComm':4,'CmentBd':4,'Brk Cmn':4,
             'Stone':5,'ImStucc':5 }
# Apply the same mapping to the primary and the secondary exterior column.
house_df['Exterior1st'] = house_df['Exterior1st'].replace(ext_lable)
house_df['Exterior2nd'] = house_df['Exterior2nd'].replace(ext_lable)

## BsmtExposure

In [None]:
def BsmtExposureToInt(x):
    """Return the ordinal rank of a basement exposure label.

    'No' -> 1, 'Mn' -> 2, 'Av' -> 3, 'Gd' -> 4; every other value -> 0.
    """
    for rank, label in enumerate(('No', 'Mn', 'Av', 'Gd'), start=1):
        if x == label:
            return rank
    return 0

# Encode BsmtExposure in place. Run once: a second pass would map every value to 0.
house_df['BsmtExposure'] = house_df['BsmtExposure'].map(BsmtExposureToInt)

## BsmtFinType1

In [None]:
def BsmtFinType1ToInt(x):
    """Return the ordinal rank of a basement finish-type label.

    'Unf' -> 1, 'LwQ' -> 2, 'Rec' -> 3, 'BLQ' -> 4, 'ALQ' -> 5, 'GLQ' -> 6;
    anything else -> 0.
    """
    # Labels listed from lowest to highest rank; the rank is the 1-based position.
    ranked_labels = ('Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')
    for rank, label in enumerate(ranked_labels, start=1):
        if x == label:
            return rank
    return 0

# Unlike the neighbouring encodings, the result goes into a NEW column and the original
# 'BsmtFinType1' column is kept as it is.
# NOTE(review): presumably intentional (the raw column is still present afterwards) -- confirm.
house_df['BsmtFinType1_int'] = house_df['BsmtFinType1'].apply(BsmtFinType1ToInt)

## Quality Features

In [None]:
# Shared 1-5 ordinal scale for every quality/condition column listed below.
# Values that are not keys of the mapping are left unchanged by replace().
quality_label = dict(Po=1, Fa=2, TA=3, Gd=4, Ex=5)
quality_features = [
    'ExterQual', 'ExterCond', 'KitchenQual', 'HeatingQC',
    'BsmtQual', 'BsmtCond', 'FireplaceQu', 'GarageQual',
]

house_df[quality_features] = house_df[quality_features].replace(to_replace=quality_label)

In [None]:
# Names of the columns whose dtype is still "object" after the ordinal encodings above.
is_object_col = (house_df.dtypes == "object").values
categorical_cols = list(house_df.columns[is_object_col])

--------------------------------------------------------
# Making Derived Features for Categorical Data

## Question 1: Is total house quality correlated with sale price?

In [None]:
# Derived feature: the square of (OverallQual + OverallCond).
house_df['Total_Home_Quality'] = (house_df['OverallQual'] + house_df['OverallCond'])**2
plt.figure(figsize=(20,6))
plt.subplots_adjust(wspace=0.3)
# Left panel: SalePrice distribution for each distinct value of the new feature.
plt.subplot(1,2,1)
sns.boxenplot(data=house_df, x='Total_Home_Quality',y='SalePrice',palette='Set2')
plt.xticks(rotation=90)
# Right panel: scatter of the same two columns with a fitted regression line.
plt.subplot(1,2,2)
sns.regplot(data=house_df, x='Total_Home_Quality',y='SalePrice')

## Question 2: Is the total number of bathrooms correlated with the sale price?

In [None]:
# Derived feature: full baths count 1, half baths count 0.5, above ground and basement combined.
house_df['Total_Bathrooms'] = (house_df['FullBath'] + (0.5 * house_df['HalfBath']) + (house_df['BsmtFullBath'] + (0.5 * house_df['BsmtHalfBath'])))
plt.figure(figsize=(20,6))
plt.subplots_adjust(wspace=0.3)
# Left panel: SalePrice distribution for each distinct bathroom total.
plt.subplot(1,2,1)
sns.boxenplot(data=house_df, x='Total_Bathrooms',y='SalePrice',palette='Set2')
plt.xticks(rotation=90)
# Right panel: scatter of the same two columns with a fitted regression line.
plt.subplot(1,2,2)
sns.regplot(data=house_df, x='Total_Bathrooms',y='SalePrice')

## Question 3: Can Total Condition Affect Sale Price?

In [None]:
# Derived feature: element-wise sum of the two condition columns.
# NOTE(review): assumes Condition1/Condition2 were already converted to numbers earlier in the
# notebook; if they are still strings, `+` concatenates them instead -- confirm.
house_df['total_condition'] = house_df['Condition1'] + house_df['Condition2'] 
plt.figure(figsize=(20,6))
plt.subplots_adjust(wspace=0.3)
# Left panel: SalePrice distribution for each distinct value of the new feature.
plt.subplot(1,2,1)
sns.boxenplot(data=house_df, x='total_condition',y='SalePrice',palette='Set2')
# Right panel: scatter of the same two columns with a fitted regression line.
plt.subplot(1,2,2)
sns.regplot(data=house_df, x='total_condition',y='SalePrice')

## Question 4: Can area per room affect the sale price?

In [None]:
# Derived feature: GrLivArea divided by the sum of four room-count columns
# (TotRmsAbvGrd, FullBath, HalfBath, KitchenAbvGr).
# NOTE(review): a row whose four counts sum to 0 would give inf/NaN here -- confirm none exist.
house_df["SqFtPerRoom"] = house_df["GrLivArea"] / (house_df["TotRmsAbvGrd"] +
                                                       house_df["FullBath"] +
                                                       house_df["HalfBath"] +
                                                       house_df["KitchenAbvGr"])
plt.figure(figsize=(8,8))
# Scatter of the new feature against SalePrice with a fitted regression line.
sns.regplot(data=house_df, x='SqFtPerRoom',y='SalePrice')

<hr style="border: solid 3px blue;">

# Selecting Features

![](https://www.limra.com/siteassets/solutions-and-services/selection/selection_t2_16-11.jpg)

Picture Credit: https://www.limra.com

**Feature selection**
> In machine learning and statistics, feature selection, also known as variable selection, attribute selection or variable subset selection, is the process of selecting a subset of relevant features (variables, predictors) for use in model construction. Feature selection techniques are used for several reasons:
> 
> 1. simplification of models to make them easier to interpret by researchers/users,
> 2. shorter training times,
> 3. to avoid the curse of dimensionality,
> 4. improve data's compatibility with a learning model class,
> 5. encode inherent symmetries present in the input space.

Ref: https://en.wikipedia.org/wiki/Feature_selection

Here, we want to check feature importance in a variety of ways and make feature selection effective.

In [None]:
# Correlation matrix over the training rows, restricted to numeric/boolean columns.
# house_df may still contain object-typed nominal columns at this point (they are one-hot
# encoded only further down); pandas >= 2.0 raises on such columns in DataFrame.corr(),
# whereas older versions silently dropped them. Selecting the columns explicitly gives the
# same result on both.
corr = house_df[tr_idx].select_dtypes(include=['number', 'bool']).corr().round(3)
sns.set(style="ticks", context="talk",font_scale = 1)
plt.style.use("dark_background")
# Features ranked by absolute correlation with the target; [1:] skips the first entry,
# which is SalePrice correlated with itself.
abs(corr['SalePrice']).sort_values(ascending=False)[1:].plot.bar(figsize=(30,10),legend=False)
plt.title('Correlation with SalePrice',fontsize=20)

<span style="color:Blue"> Observation:

* The newly created derived variable AllArea has a high correlation with the house price!
* Pool-related features and fireplaces with many missing values have a low correlation with house price. 


## Encoding nominal data using one-hot encoding.

In [None]:
# One-hot encode the remaining nominal columns; drop_first=True omits the first level of each
# encoded column. house_df is rebound to the encoded frame.
house_df = pd.get_dummies(house_df, drop_first=True)

In [None]:
# Recompute the correlation matrix on the training rows, now including the dummy columns.
corr=house_df[tr_idx].corr().round(3)
# NOTE(review): the bar plot below passes its own figsize=(12,7); confirm which of the two
# sizes is the intended one.
plt.figure(figsize=(25, 10))
# [1:11] keeps the ten entries that follow the first one (SalePrice itself).
abs(corr['SalePrice']).sort_values(ascending=False)[1:11].plot.bar(figsize=(12,7),legend=False)
plt.title('Correlation with SalePrice',fontsize=20)

In [None]:
# Names of the 20 entries ranked directly after the first one by absolute correlation with SalePrice.
ranked_by_abs_corr = corr['SalePrice'].abs().sort_values(ascending=False)
sort_list = ranked_by_abs_corr.index[1:21]

In [None]:
# Split the training rows into the feature matrix and the target column.
train_rows = house_df[tr_idx]
X_train = train_rows.drop(columns='SalePrice')
y_train = train_rows['SalePrice']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import plot_partial_dependence

# Seed the forest so that its feature importances -- and therefore the set of columns that
# is dropped from house_df based on them further down -- are identical on every run.
# 42 matches the seed already used for permutation_importance in this notebook.
clf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

In [None]:
def rf_feat_importance(clf,df):
    """Pair each column of ``df`` with ``clf.feature_importances_``.

    Returns a DataFrame with columns 'cols' and 'imp', sorted by 'imp' in
    descending order (the original positional index is kept).
    """
    importance_table = pd.DataFrame({'cols': df.columns, 'imp': clf.feature_importances_})
    return importance_table.sort_values(by='imp', ascending=False)

# Importance table for the model fitted above, most important feature first.
fi = rf_feat_importance(clf,X_train)

In [None]:
def plot_fi(fi):
    """Bar-plot the 'imp' column of ``fi`` against its 'cols' column.

    Returns whatever ``DataFrame.plot`` returns for the chart.
    """
    return fi.plot(x='cols', y='imp', kind='bar', figsize=(15,7), legend=False)

# Show the 30 highest-ranked features (fi is sorted by importance, descending).
plot_fi(fi[:30])

In [None]:
# Features whose importance is below 0.00005 are marked for removal.
low_importance = fi['imp'] < 0.00005
remove_cols = fi.loc[low_importance, 'cols'].values

In [None]:
# Drop the low-importance columns in place. errors='ignore' makes the cell safe to re-run:
# without it a second execution raises KeyError because the columns are already gone.
house_df.drop(remove_cols, axis=1, inplace=True, errors='ignore')

In [None]:
# Snapshot of the column names that remain after the drop above.
all_cols = list(house_df.columns)

---------------------------------------
## Partial Dependence

> Partial dependence plots (PDP) show the dependence between the target response and a set of input features of interest, marginalizing over the values of all other input features (the ‘complement’ features). Intuitively, we can interpret the partial dependence as the expected target response as a function of the input features of interest

Ref: https://scikit-learn.org/stable/modules/partial_dependence.html#partial-dependence

In [None]:
# One tall figure; the partial-dependence panels are drawn into `ax`.
fig,ax = plt.subplots(figsize=(18,35))
sns.set(style="ticks", context="talk",font_scale = 1)
plt.style.use("dark_background")
fig.tight_layout()
# Partial dependence of the fitted model on the 20 highest-ranked features in `fi`.
# NOTE(review): plot_partial_dependence is not available in recent scikit-learn releases
# (PartialDependenceDisplay.from_estimator is the replacement) -- confirm the pinned version.
plot_partial_dependence(clf, X_train,fi[:20].cols,ax=ax)

<span style="color:Blue"> Observation:

* When the value of AllArea is changed, the House Price is strongly changed. That is, AllArea can be determined as the most important feature.
* If the value of OverallQual is changed, the change in the value of House Price is also large. In other words, OverallQual can also be judged as an important feature.    

In [None]:
# CatBoostRegressor is imported from the catboost package, not from sklearn.ensemble.
from catboost import CatBoostRegressor
# Rebinds `clf`: from here on, every cell that uses `clf` works with this CatBoost model
# instead of the random forest fitted earlier.
clf = CatBoostRegressor(n_estimators=100).fit(X_train, y_train)

In [None]:
# Same partial-dependence figure as before, now for the CatBoost model bound to `clf`.
fig,ax = plt.subplots(figsize=(18,35))
sns.set(style="ticks", context="talk",font_scale = 1.2)
plt.style.use("dark_background")
fig.tight_layout()
# The feature list still comes from `fi`, i.e. from the random-forest importance ranking.
plot_partial_dependence(clf, X_train, fi[:20].cols,ax=ax)

<span style="color:Blue"> Observation:

* When the values of AllArea and OverallQual increase, the house price clearly increases.
* When using the CatBoost model compared to RandomForest, the house price tends to increase when other features are changed.

Judging from the above results, it can be seen that the performance of the CatBoost model is higher than that of the RandomForest model.

------------------------------------------
## Feature importance based on feature permutation
> The estimator is required to be a fitted estimator. X can be the data set used to train the estimator or a hold-out set. The permutation importance of a feature is calculated as follows. First, a baseline metric, defined by scoring, is evaluated on a (potentially different) dataset defined by the X. Next, a feature column from the validation set is permuted and the metric is evaluated again. The permutation importance is defined to be the difference between the baseline metric and metric from permutating the feature column.

Ref: https://scikit-learn.org/stable

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of `clf` on the training data: 10 shuffles per feature, fixed seed.
# `clf` is the CatBoost model at this point (it was rebound after the random forest).
result = permutation_importance(
    clf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)

# Mean importance per feature, indexed by position (no feature names attached).
forest_importances = pd.Series(result.importances_mean)

In [None]:
# Feature positions ordered from least to most important (argsort is ascending).
sorted_idx = result.importances_mean.argsort()
# Discard the 150 least important positions and plot only the rest.
# NOTE(review): 150 is hard-coded; if X_train has 150 columns or fewer nothing is left to
# plot -- consider slicing from the end (e.g. the last N) instead.
sorted_idx = sorted_idx[150:]
# NOTE(review): this changes the global matplotlib font size to 1 for later plots as well,
# until something resets it -- confirm that such a small size is intended.
plt.rcParams.update({'font.size': 1})
fig, ax = plt.subplots(figsize=(15,12))
# One horizontal box per feature, built from the per-repeat importance scores.
ax.boxplot(
    result.importances[sorted_idx].T, vert=False, labels=X_train.columns[sorted_idx]
)
ax.set_title("Permutation Importances")
fig.tight_layout()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
plt.show()

<span style="color:Blue"> Observation:

* Even if it is checked by the feature permutation method, AllArea and OverallQual features seem to be more important when compared with other features.

In [None]:
# Replace SalePrice with log1p(SalePrice). Popping the column and assigning it back also
# moves it to the last position of the frame, exactly as the copy/drop sequence did.
# Running this cell twice applies the logarithm twice.
house_df['SalePrice'] = np.log1p(house_df.pop('SalePrice'))

In [None]:
# Target (log-transformed above) and feature frame over ALL rows of house_df,
# plus the list of feature column names.
house_train_y = house_df['SalePrice']
house_train_x = house_df.drop(columns='SalePrice')
all_cols = list(house_train_x.columns)

<hr style="border: solid 3px blue;">

![](https://mblogthumb-phinf.pstatic.net/MjAxOTA0MTdfOTMg/MDAxNTU1NDY0MzYwMTcw.WhR2YPngYm7zJlHNN5VuFsYjS_HffTIQGHJnidoopCwg.kYhq2EOIUA5_FmhMPJMWYNjqljhr4E0Mm0IotOtt4gYg.JPEG.jaeminyx/simple-is-best.jpg?type=w800)

Picture Credit: https://mblogthumb-phinf.pstatic.net

> Simple can be harder than complex: you have to work hard to get your thinking clean to make it simple. But it's worth it in the end because once you get there, you can move mountains.
> 
> ― Steve Jobs

We will implement a deep learning model using fastai and make predictions after training. With fastai, we have far fewer decisions to worry about when choosing the hyperparameters.

In [None]:
from fastai.tabular.all import *

-------------------
## Defining TabularDataLoaders

![](https://miro.medium.com/max/1838/1*3vAYjhGh_EopD0cRdxrbOQ.png)

Picture Credit: https://miro.medium.com


> This class should not be used directly, one of the factory methods should be preferred instead. All those factory methods accept as arguments:
> 
> * cat_names: the names of the categorical variables
> * cont_names: the names of the continuous variables
> * y_names: the names of the dependent variables
> * y_block: the TransformBlock to use for the target
> * valid_idx: the indices to use for the validation set (defaults to a random split otherwise)
> * bs: the batch size
> * val_bs: the batch size for the validation DataLoader (defaults to bs)
> * shuffle_train: if we shuffle the training DataLoader or not
> * n: overrides the numbers of elements in the dataset
> * device: the PyTorch device to use (defaults to default_device())

Ref: https://docs.fast.ai/tabular.data.html

In [None]:
# Training rows (with target) and test rows (target column removed).
train = house_df[tr_idx]
test = house_df[~tr_idx].drop('SalePrice',axis=1)
# Object/bool columns are treated as categorical, everything else as continuous.
cat_cols = list(train.select_dtypes(include = ['object', 'bool']).columns)
# The target must not be listed as an input feature. The previous
# `num_cols = num_cols.remove('SalePrice')` bound num_cols to None, because list.remove()
# mutates in place and returns None; build the filtered list directly instead.
num_cols = [cname for cname in train.select_dtypes(exclude = ['object', 'bool']).columns
            if cname != 'SalePrice']
# Preprocessing steps applied by fastai, and a random 80/20 train/validation split.
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter(valid_pct=0.2)(range_of(train))

In [None]:
# Build the fastai tabular data object from the training rows; SalePrice is the target.
# NOTE(review): the factory-method parameters quoted in the markdown above list `valid_idx`,
# not `splits` -- confirm that `splits` is actually honoured here.
# NOTE(review): bs=4 is a very small batch size -- presumably deliberate; verify.
to = TabularDataLoaders.from_df(train,
                                procs=procs, 
                                cat_names=cat_cols, 
                                cont_names=num_cols, 
                                splits = splits,
                                y_names="SalePrice",
                                bs = 4)

In [None]:
# NOTE(review): `to` was created by TabularDataLoaders.from_df, which by its name already
# yields data loaders -- confirm this extra .dataloaders() call is needed and does what is intended.
dls = to.dataloaders()

--------------------------------
## Defining Model

In [None]:
# Output range for the network: 0 up to 20% above the largest target in the training rows.
# SalePrice was log1p-transformed earlier, so these bounds are on the log scale.
max_log_y = train['SalePrice'].max() * 1.2
y_range = torch.tensor([0, max_log_y])

Since we did log scaling of skewed features and targets, choose rmse for metrics. 

In [None]:
# Tabular network with six hidden layers of the given widths, predictions limited to
# y_range, RMSE reported as metric, and ActivationStats recording per-layer activation
# statistics (with histograms) for the inspection cells further down.
learn = tabular_learner(dls,layers=[300,100,25,25,12,8],  
                        y_range = y_range,
                        metrics = rmse,
                        cbs = [ActivationStats(with_hist=True)])
# Display the resulting model architecture.
learn.model

-------------------------------
## Find the proper learning rate

![](https://149695847.v2.pressablecdn.com/wp-content/uploads/2019/05/learning-rate.gif)

Picture Credit: https://149695847.v2.pressablecdn.com

> In machine learning and statistics, the learning rate is a tuning parameter in an optimization algorithm that determines the step size at each iteration while moving toward a minimum of a loss function. Since it influences to what extent newly acquired information overrides old information, it metaphorically represents the speed at which a machine learning model "learns". In the adaptive control literature, the learning rate is commonly referred to as gain.
> 
> In setting a learning rate, there is a trade-off between the rate of convergence and overshooting. While the descent direction is usually determined from the gradient of the loss function, the learning rate determines how big a step is taken in that direction. A too high learning rate will make the learning jump over minima but a too low learning rate will either take too long to converge or get stuck in an undesirable local minimum.

Ref: https://en.wikipedia.org/wiki/Learning_rate

Learning rate is one of the important parameters among hyperparameters. However, choosing a learning rate is not an easy task with many considerations.
Fastai finds the learning rate so that an appropriate learning rate can be determined.

In [None]:
# Default figure size for this and later plots.
plt.rcParams["figure.figsize"] = (8,6)
# Run fastai's learning-rate finder and display its `valley` suggestion.
sr = learn.lr_find()
sr.valley

We decided to use the learning rate found with the Fastai library.

In [None]:
# Train for 100 epochs with the one-cycle policy, using the suggested learning rate.
learn.fit_one_cycle(100, sr.valley)

In [None]:
# Loss curves recorded during the training run above.
learn.recorder.plot_loss()

-----------------------------------------------------------
## Checking learning rate and momentum scheduling
![](https://www.andreaperlato.com/img/momentum.png)

Picture Credit: https://www.andreaperlato.com

Momentum combines the direction obtained from the previous optimization step with the direction obtained from the current step, which helps gradient descent cope with noisy gradients.

If you look at the figure below, fast.ai first increases the learning rate gradually and then lowers it again, so that training can settle at an appropriate convergence point. In contrast to the learning rate, momentum starts at a large value and then changes to a low value.

In [None]:
# Plot the hyperparameter schedules recorded during training, then widen the gap between panels.
learn.recorder.plot_sched()
plt.subplots_adjust(wspace=0.5)

-------------------------------------------
## Interpreting Model

In [None]:
def plot_layer_stats(self, idx):
    """Plot the series returned by ``self.layer_stats(idx)`` in a row of three panels.

    The panels are titled 'mean', 'std' and '% near zero', in that order.
    ``self`` is the object that provides ``layer_stats`` (the notebook passes
    ``learn.activation_stats``).
    """
    panel_titles = ('mean', 'std', '% near zero')
    fig, axes = subplots(1, 3, figsize=(15,3))
    fig.subplots_adjust(wspace=0.5)
    for series, panel, title in zip(self.layer_stats(idx), axes, panel_titles):
        panel.plot(series)
        panel.set_title(title)

---------------------------------------------------
## Checking Layer Activation Statistics

Fastai provides statistics such as the mean and standard deviation for each layer of the network. If training is successful, mean and std are evenly distributed. If these values are concentrated near zero, it means that the learning is not done properly. In this case, it would be better to redesign the model and train it again.

In [None]:
plt.style.use("dark_background")
# NOTE(review): plot_layer_stats creates its own figure and sets its own spacing afterwards,
# so this call most likely has no effect on the panels shown -- confirm and remove if so.
plt.subplots_adjust(wspace=1)
# Activation statistics of the third-from-last recorded layer.
plot_layer_stats(learn.activation_stats,-3)

In [None]:
plt.style.use("dark_background")
# NOTE(review): plot_layer_stats creates its own figure and sets its own spacing afterwards,
# so this call most likely has no effect on the panels shown -- confirm and remove if so.
plt.subplots_adjust(wspace=1)
# Activation statistics of the second-from-last recorded layer.
plot_layer_stats(learn.activation_stats,-2)

-------------------------------------------
## Predicting using model

In [None]:
# Wrap the test rows in a DataLoader built from the learner's existing data loaders.
test_dl = learn.dls.test_dl(test)
# Uncomment to inspect a batch of the processed test data.
#test_dl.show_batch()

In [None]:
# Predict on the test rows, then undo the log1p transform of the target with expm1.
# NOTE(review): the submission cell uses only the PyCaret predictions; confirm whether
# preds_fastai is meant to be used (e.g. blended in) or is exploratory only.
preds_fastai, test_labels = learn.get_preds(dl=test_dl)
preds_fastai = np.expm1(preds_fastai)

<hr style="border: solid 3px blue;">

# Ensemble


**In this notebook, I would like to organize the following ensemble model.**
* Soft Voting Model

## Setting up

In [None]:
from pycaret.regression import *

In [None]:
# Initialise the PyCaret regression experiment on the training rows.
# Normalisation and PyCaret's own preprocessing are switched off because the frame was
# prepared by hand above; every feature column is declared numeric.
# silent=True suppresses the interactive confirmation prompt.
reg = setup(data = house_df[tr_idx],
            target = 'SalePrice',
            normalize = False,
            preprocess = False,
            numeric_features = all_cols,
            silent = True)

## Creating Models

In [None]:
# Train one model per PyCaret estimator ID with default hyperparameters.
# NOTE(review): `catboost` and `lightgbm` are also package names; these variables would
# shadow the modules if they were imported under the same names -- verify.
catboost = create_model('catboost')
br = create_model('br')
ridge = create_model('ridge')
gbr = create_model('gbr')
lr = create_model('lr')
lightgbm = create_model('lightgbm')
mlp = create_model('mlp')
dt = create_model('dt')

## Tuning Hyperparameters

In [None]:
# Hyperparameter search for each model, optimising RMSE with early stopping enabled.
tuned_catboost = tune_model(catboost,early_stopping=True,optimize='RMSE',search_library='optuna')
tuned_br = tune_model(br,early_stopping=True,optimize='RMSE',search_library='optuna')
tuned_gbr = tune_model(gbr,early_stopping=True,optimize='RMSE',search_library='optuna')
tuned_ridge = tune_model(ridge,early_stopping=True,optimize='RMSE',search_library='optuna')
tuned_lr = tune_model(lr,early_stopping=True,optimize='RMSE',search_library='optuna')
tuned_lightgbm = tune_model(lightgbm,early_stopping=True,optimize='RMSE',search_library='optuna')
# NOTE(review): unlike the others, the MLP is tuned without search_library='optuna'
# (i.e. with PyCaret's default search) -- confirm this is intentional.
tuned_mlp = tune_model(mlp,early_stopping=True,optimize='RMSE')

In [None]:
plt.figure(figsize=(10, 8))
# Learning curve of the tuned MLP, with figure colours overridden only inside this block.
with plt.rc_context({'figure.facecolor':'black','text.color':'white'}):
    plot_model(tuned_mlp, plot='learning')

In [None]:
# The custom grid holds a single candidate, so the "search" effectively fixes
# min_samples_leaf at 70 for the decision tree.
params = { "min_samples_leaf": [70]}
tuned_dt = tune_model(dt,early_stopping=True,optimize='RMSE',custom_grid = params)

In [None]:
plt.figure(figsize=(10, 8))
# Learning curve of the tuned decision tree, with colours overridden only inside this block.
with plt.rc_context({'figure.facecolor':'black','text.color':'white'}):
    plot_model(tuned_dt, plot='learning')

## Interpreting Models

In [None]:
# Draw the structure of the tuned decision tree; colours are overridden only inside this block.
with plt.rc_context({'figure.facecolor':'black','text.color':'blue'}):
    plot_model(tuned_dt, plot='tree')

<span style="color:Blue"> Observation

* We can confirm that it is divided first by OverallQual. In the decision tree, the OverallQual feature seems to be the most important feature.
* AllArea, GarageCars, and BsmtFinSF1 also appear to be important features in the decision tree.

In [None]:
plt.style.use("dark_background")
# Model interpretation plot for the tuned CatBoost model on a light-grey figure background.
with plt.rc_context({'figure.facecolor':'lightgrey'}):
    interpret_model(tuned_catboost)

In [None]:
plt.style.use("dark_background")
# Model interpretation plot for the tuned LightGBM model on a light-grey figure background.
with plt.rc_context({'figure.facecolor':'lightgrey'}):
    interpret_model(tuned_lightgbm)

<span style="color:Blue"> Observation:

* Newly created derived variables play an important role in model learning.
* When the models are different, the feature importance is also slightly different. This diversity is the power of ensemble. 

-------------------------------------------------------------------------------
## Soft Voting

![](https://miro.medium.com/max/806/1*bliKQZGPccS7ho9Zo6uC7A.jpeg)

Picture Credit: https://miro.medium.com

In [None]:
# Blend six of the models into one ensemble, optimising RMSE.
# NOTE(review): the list contains the UNTUNED models (catboost, br, ...); the tuned_*
# variants created above are not used -- confirm this is intentional.
blend_soft = blend_models(estimator_list = [catboost,br,ridge,gbr,lightgbm,lr],
                          optimize = 'RMSE')

------------------------------------
## Finalizing the last model
> This function trains a given estimator on the entire dataset including the holdout set.

Ref: https://pycaret.readthedocs.io/en/latest/api/regression.html


**The blend model seems to be stable. Let's use this model as our final model.**

In [None]:
# Refit the blended model on the complete data given to setup(), hold-out part included.
final_model = finalize_model(blend_soft)

In [None]:
# Reset the plotting theme (these calls change global style for later cells too).
sns.set(style="ticks", context="talk",font_scale = 1,rc={'figure.figsize':(8,6)})
plt.style.use("grayscale")
# Residuals plot of the finalised blend.
plot_model(final_model, plot='residuals')

In [None]:
# Reset the plotting theme (these calls change global style for later cells too).
sns.set(style="ticks", context="talk",font_scale = 1,rc={'figure.figsize':(8,6)})
plt.style.use("grayscale")
# Prediction-error plot of the finalised blend.
plot_model(final_model, plot='error')

<span style="color:Blue"> Observation:

* R squared is about 97%. This means that our model can explain the dataset by 97%. 

**R-squared**
> **R-squared** is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression.
> 
> The definition of R-squared is fairly straight-forward; it is the percentage of the response variable variation that is explained by a linear model. Or:
> 
> * R-squared = Explained variation / Total variation
> 
> R-squared is always between 0 and 100%:
> 
> * 0% indicates that the model explains none of the variability of the response data around its mean.
> * 100% indicates that the model explains all the variability of the response data around its mean.
> In general, the higher the R-squared, the better the model fits your data. However, there are important conditions for this guideline that I’ll talk about both in this post and my next post.

Ref: https://blog.minitab.com/en

-------------------------------------------
# Submitting Result

We trained on the logarithm of the target values. The predicted values are therefore also on the log scale, so we convert them back to actual prices with the exponential function.

In [None]:
# Features of the test rows (everything except the target column).
X_test_df = house_df[~tr_idx].drop(columns='SalePrice')
# The model predicts log1p(SalePrice); expm1 converts the predictions back to prices.
log_preds_test = final_model.predict(X_test_df)
preds_test_pycaret = np.expm1(log_preds_test)

In [None]:
# Write the back-transformed PyCaret predictions into the submission frame and save it.
# NOTE(review): assumes submission_data rows are in the same order as the test rows of
# house_df -- confirm against where submission_data is loaded.
submission_data.loc[:,'SalePrice'] = preds_test_pycaret
submission_data.to_csv('submission.csv', index=False)

<hr style="border: solid 3px blue;">