In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Table of Contents

* [1. Introduction](#1.-Introduction)
    * [1.1. Goals](#1.1.-Goals)
    * [1.2. Libraries & Tools](#1.2.-Libraries-&-Tools)
* [2. Exploratory Data Analysis](#2.-Exploratory-Data-Analysis)
    * [2.1. Overview of the data](#2.1.-Overview-of-the-data)
    * [2.2. House Building Variables](#2.2.-House-Building-Variables)
    * [2.3. Continuous Variables](#2.3.-Continuous-Variables)
        * [2.3.1. Continuous Variables (EDA)](#2.3.1.-Continuous-Variables-(EDA))
        * [2.3.2. Correlations Among Continuous Variables](#2.3.2.-Correlations-Among-Continuous-Variables)
        * [2.3.3. Continuous Variables Pipeline](#2.3.3.-Continuous-Variables-Pipeline)
    * [2.4. Categorical Variables](#2.4.-Categorical-Variables)
        * [2.4.1. Categorical Variables (EDA)](#2.4.1.-Categorical-Variables-(EDA))
        * [2.4.2. Discardable Categorical Variables](#2.4.2.-Discardable-Categorical-Variables)
        * [2.4.3. Categorical Variables Pipeline](#2.4.3.-Categorical-Variables-Pipeline)
    * [2.5. General House Pricing Pipeline](#2.5.-General-House-Pricing-Pipeline)
* [3. House Pricing Model](#3.-House-Pricing-Model)
    * [3.1. Model Selection](#3.1.-Model-Selection)
    * [3.2. Fine Tune Model](#3.2.-Fine-Tune-Model)
    * [3.3. System Evaluation on the Test Set](#3.3.-System-Evaluation-on-the-Test-Set)
* [4. Conclusion](#4.-Conclusion)

# 1. Introduction

I recently started reading the book [Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) 2nd Edition by [Aurélien Géron](https://www.oreilly.com/people/aurelien-geron/). I have just finished the second chapter, <b style="color: #DE3163">End-to-End Machine Learning Project</b>, in which we worked on a housing dataset to predict sale prices. It was a regression problem, and by the end of the chapter I was inspired to practice what I had learnt. I went in search of a good dataset to practice with, because I didn't want to reuse the dataset from the book. I came across this dataset in my search, and it is quite an amazing one, exceeding my expectations by far. So here we are: my first Kaggle competition after reading two chapters of a textbook.

**_Description and context:_**
_Ask a home buyer to describe the home of their dreams and he probably won't start the description with "basement ceiling height" or "proximity to an east-west railroad". However, the data set of this competition proves that there are influences in the negotiation of houses in addition to the number of bedrooms or bathrooms. With approximately 80 explanatory variables describing virtually any residential aspect of homes in Ames, Iowa, this competition challenges the user to predict the final price of homes._


## 1.1. Goals

My goals in this **notebook** are to:

1. Discover and visualize the data to gain insights.
2. Prepare the data for Machine Learning algorithms.
3. Select a model and train it.
4. Fine-tune the model.
5. Present my solution.

### If you find this notebook insightful, please don't forget to upvote :)

So let's get started.

## 1.2. Libraries & Tools

In [None]:
from IPython.display import Image
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score


%matplotlib inline

In [None]:
def pie_plot(column, data=None, title=None, ax=None, fontsize=15, explode=0,
             autopct='%1.1f%%', shadow=None, figsize=(10, 6), colors=None, color_with_label=None):
    """
    Draw a pie chart of the frequency of each unique value in a column.

    Parameters
    ----------
    column : str or pandas.Series
        Name of a column in `data`, or the series to count directly.
    data : pandas.DataFrame, optional
        Required when `column` is given by name.
    title : str, optional
        Title of the chart; no title is set when None.
    ax : matplotlib Axes, optional
        Axes to draw on. When None a new figure of size `figsize` is created.
    fontsize : int
        Font size of the title.
    explode : number or sequence of numbers
        Offset applied to the wedges. A single number is applied to every wedge;
        a sequence is forwarded unchanged (one entry per wedge).
    autopct, shadow, colors
        Forwarded to the matplotlib `pie` call.
    figsize : tuple
        Size of the figure created when `ax` is None.
    color_with_label : mapping, optional
        Maps each unique value to a colour. When given it overrides `colors`.
        Anything that is not a mapping (None, True, False, ...) is ignored.

    Returns
    -------
    None. The chart is drawn on `ax` (or on a newly created figure).

    Raises
    ------
    TypeError
        If `column` is a name but no `data` was supplied.
    KeyError
        If `color_with_label` has no entry for one of the unique values.
    """

    if isinstance(column, str):
        if data is None:
            raise TypeError("`data` is required when `column` is given as a column name")
        target = data[column].value_counts()
    else:
        target = column.value_counts()

    # A scalar offset is broadcast to every wedge; a per-wedge sequence is kept as is.
    if isinstance(explode, (int, float)):
        explode = [explode] * len(target)

    # Only index into `color_with_label` when it really is a label -> colour mapping;
    # flags such as True/False would otherwise raise "object is not subscriptable".
    if hasattr(color_with_label, '__getitem__') and not isinstance(color_with_label, str):
        colors = [color_with_label[key] for key in target.index]

    if ax is None:
        # No axes supplied: draw on a fresh single-axes figure.
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title, fontsize=fontsize)

    ax.pie(target, labels=target.index, autopct=autopct, shadow=shadow, explode=explode, colors=colors)
        

def bar_plot(column, data=None, title=None, ax=None, fontsize=15, figsize=(10, 6), color='b'):
    """
    Draw a bar chart of how often each unique value occurs in a column.

    `column` is either a column name looked up in `data` or a series that is
    counted directly. The bars are drawn on `ax` when one is supplied, otherwise
    on a new figure of size `figsize`. Nothing is returned.
    """

    series = data[column] if type(column) is str else column
    counts = series.value_counts()

    if ax is None:
        # No axes supplied: fall back to the pyplot state machine on a new figure.
        plt.figure(figsize=figsize)
        if title is not None:
            plt.title(title, fontsize=fontsize)
        plt.bar(counts.index, counts, color=color)
        return

    if title is not None:
        ax.set_title(title, fontsize=fontsize)
    ax.bar(counts.index, counts, color=color)
    

def compare_plots(shape, columns, titles=None, data=None, kind='pie', explode=0, color='b',
                  fontsize=15, autopct='%1.1f%%', figsize=(20, 10), shadow=None, color_with_label=True):
    """
    Draw one pie or bar chart per entry of `columns` on a grid of subplots.

    Parameters
    ----------
    shape : tuple
        (rows, cols) of the subplot grid.
    columns : sequence
        Column names (looked up in `data`) or series, one per chart.
    titles : sequence of str, optional
        Title per chart; charts without a matching title get none.
    data : pandas.DataFrame, optional
        Frame the column names refer to.
    kind : {'pie', 'bar'}
        Type of chart drawn in every cell of the grid.
    explode, autopct, shadow
        Forwarded to `pie_plot` (pie charts only).
    color
        Bar colour for bar charts. For pie charts it is forwarded as the wedge
        colours unless it is a plain string, which would paint every wedge the same.
    fontsize, figsize
        Title font size and overall figure size.
    color_with_label : mapping or flag
        Label -> colour mapping forwarded to `pie_plot`. Non-mapping values
        (including the default True) mean "no explicit mapping".

    Returns
    -------
    None. The figure is created through pyplot.

    Raises
    ------
    TypeError
        If `kind` is not 'pie' or 'bar'.
    """

    # Fail before creating an empty figure when the kind is unsupported.
    if kind not in ('pie', 'bar'):
        raise TypeError(f"Unsupported plot kind {kind!r}; expected 'pie' or 'bar'.")

    # Only a real mapping can be indexed by label; flags are translated to "no mapping".
    is_mapping = hasattr(color_with_label, '__getitem__') and not isinstance(color_with_label, str)
    label_colors = color_with_label if is_mapping else None

    # A single colour string is meaningful for bars but not as a list of wedge colours.
    pie_colors = None if isinstance(color, str) else color

    # squeeze=False keeps `axes` a 2-D array even for a 1x1 or 1xN grid.
    fig, axes = plt.subplots(*shape, figsize=figsize, squeeze=False)

    for i, ax in enumerate(axes.ravel()):
        if i >= len(columns):
            # More grid cells than columns: hide the unused axes instead of failing.
            ax.set_visible(False)
            continue

        title = titles[i] if titles is not None and i < len(titles) else None

        if kind == 'pie':
            pie_plot(columns[i], data=data, title=title, ax=ax, fontsize=fontsize, colors=pie_colors,
                     autopct=autopct, figsize=figsize, explode=explode, shadow=shadow,
                     color_with_label=label_colors)
        else:
            bar_plot(columns[i], data=data, title=title, ax=ax, fontsize=fontsize, figsize=figsize, color=color)

def known_vs_unknown(columns, data, unknown=np.nan):
    """
    Flag every value of the selected columns as 'Known' or 'Unknown'.

    Parameters
    ----------
    columns : sequence of str
        Names of the columns of `data` to inspect.
    data : pandas.DataFrame
        Source frame; it is not modified.
    unknown : scalar or list-like, default np.nan
        Marker(s) of an unknown value. A missing-value marker (NaN/None) matches
        every missing entry; any other scalar is matched by equality; a list-like
        matches any of its members.

    Returns
    -------
    pandas.DataFrame
        Same index as `data`, restricted to `columns`, holding only the strings
        'Known' and 'Unknown'.
    """

    columns = list(columns)
    subset = data.loc[:, columns]

    if pd.api.types.is_scalar(unknown):
        unknown_mask = subset.isna() if pd.isna(unknown) else subset.eq(unknown)
    else:
        unknown_mask = subset.isin(list(unknown))

    # Build the result in one vectorised step instead of chained assignment
    # (`frame[col][mask] = ...`), which does not write through under pandas Copy-on-Write.
    flags = np.where(unknown_mask, 'Unknown', 'Known')

    return pd.DataFrame(flags, index=data.index, columns=columns)

def compare_models(data, labels, models, scoring="neg_mean_squared_error", cv=10):
    """
    Fit each model on the full data and cross-validate it, collecting the scores.

    `models` is an iterable of (name, estimator) pairs. For every pair the
    estimator is fitted on `data`/`labels`, scored on that same data (MSE and
    RMSE), and then cross-validated with `scoring` and `cv`. The cross-validation
    scores are negated and square-rooted, which is meaningful for the default
    negated-MSE scoring.

    returns: a dictionary keyed by model name holding the fitted model, its
            training errors and its cross-validation scores (with mean and std).
    """

    def _evaluate(estimator):
        # Error on the data the model was trained on.
        estimator.fit(data, labels)
        train_mse = mean_squared_error(labels, estimator.predict(data))
        train_rmse = np.sqrt(train_mse)

        # Error on held-out folds.
        neg_scores = cross_val_score(estimator, data, labels, scoring=scoring, cv=cv)
        cv_scores = np.sqrt(-neg_scores)

        return {
            "model": estimator,
            "mean_squared_error": train_mse,
            "root_mean_squared_error": train_rmse,
            "scores": cv_scores,
            "scores_mean": cv_scores.mean(),
            "scores_std": cv_scores.std()
        }

    return {name: _evaluate(estimator) for name, estimator in models}

# 2. Exploratory Data Analysis

The housing data-set has already been divided into two distinctive set - the train and test set. We'll start by loading and performing data analysis on the training-set.


## 2.1. Overview of the data

This dataset consists of 80 features with their descriptions below:

* **_1stFlrSF_**: Total area of the first floor of the house
* **_2ndFlrSF_**: Total area of the second floor of the house
* **_3SsnPorch_**: Three-season porch area
* **_Alley_**: Characteristic of the alley that gives access to housing
* **_BedroomAbvGr_**: Number of beds in the house (above the basement)
* **_BldgType_**: Type of housing
* **_BsmtCond_**: Classifies the basement's general condition
* **_BsmtExposure_**: Exposition of the basement of the dwelling
* **_BsmtFinSF1_**: Area covered by type 1 finish (BsmtFinType1 attribute)
* **_BsmtFinSF2_**: Area covered by type 2 finish (BsmtFinType2 attribute)
* **_BsmtFinType1_**: Score of the finish of the basement-related region
* **_BsmtFinType2_**: Score of the finish of the basement-related region (if more than one exists)
* **_BsmtFullBath_**: Full bathrooms in the basement-related area
* **_BsmtHalfBath_**: Incomplete bathrooms (half) of the basement-related area
* **_BsmtQual_**: Classifies the house according to the size of the basement
* **_BsmtUnfSF_**: Basement area without finishing
* **_CentralAir_**: Defines whether or not there is a central air conditioner (Boolean attribute)
* **_Condition1_**: Proximity to important points in the city
* **_Condition2_**: Proximity to important points in the city (if there are more than one)
* **_Electrical_**: Type of home electrical system
* **_EnclosedPorch_**: Closed balcony area in the house
* **_ExterCond_**: Condition of the external material on the observation date
* **_Exterior1st_**: External coverage of the house
* **_Exterior2nd_**: External roof of the house (if there is more than one roof)
* **_ExterQual_**: Quality of the material used on the exterior
* **_Fence_**: Quality of the enclosure present in the house
* **_FireplaceQu_**: Quality of fireplaces
* **_Fireplaces_**: Number of fireplaces in the house
* **_Foundation_**: Type of foundation used in construction
* **_FullBath_**: Number of full bathrooms in the house (above the basement)
* **_Functional_**: Describes features of the house under warranty
* **_GarageArea_**: Garage area in square feet
* **_GarageCars_**: Size of the garage related to the number of possible cars
* **_GarageCond_**: Score that defines the conditions of the garage
* **_GarageFinish_**: Internal garage finish
* **_GarageQual_**: Quality of the garage
* **_GarageType_**: Type of garage in the house
* **_GarageYrBlt_**: Year of construction of the garage
* **_GrLivArea_**: Total above-ground living area
* **_HalfBath_**: Number of incomplete bathrooms (half) in the house (above the basement)
* **_Heating_**: Type of house heating
* **_HeatingQC_**: Heating quality
* **_HouseStyle_**: Housing style
* **_KitchenAbvGr_**: Number of kitchens in the house (above the basement)
* **_KitchenQual_**: Quality of the kitchens
* **_LandContour_**: Housing leveling
* **_LandSlope_**: Property slope
* **_LotArea_**: Allotment area
* **_LotConfig_**: Allotment configuration
* **_LotFrontage_**: Dimension of the front perimeter of the house
* **_LotShape_**: General housing format
* **_LowQualFinSF_**: Total area of low quality finishes throughout the house
* **_MasVnrArea_**: Area covered by masonry
* **_MasVnrType_**: Type of masonry used
* **_MiscFeature_**: Some features not included in other categories
* **_MiscVal_**: Value of features not included in quantity criteria
* **_MoSold_**: Month in which the sale of the house was made
* **_MSSubClass_**: Identifies the type of residence
* **_MSZoning_**: Classifies the property by zone
* **_Neighborhood_**: Locality related to city boundaries
* **_variable_name_**: description
* **_OpenPorchSF_**: Open balcony area in the house
* **_OverallCond_**: Score of the general condition of the house
* **_OverallQual_**: Score of the material and finish of the house
* **_PavedDrive_**: Attribute that defines the paving of the street (inside the house)
* **_PoolArea_**: Pool area in the house
* **_PoolQC_**: Quality of the pool
* **_RoofMatl_**: Material used in the roof
* **_RoofStyle_**: Type of roof of the house (roof)
* **_SaleCondition_**: Conditions of sale
* **_SaleType_**: Type of sale
* **_ScreenPorch_**: Screen area on the balcony of the house
* **_Street_**: Characteristic of the street that gives access to housing
* **_TotalBsmtSF_**: Total basement area
* **_TotRmsAbvGrd_**: Total number of rooms in the house (above the basement)
* **_Utilities_**: Utilities
* **_WoodDeckSF_**: Wooden deck area present in the house
* **_YearBuilt_**: Year of construction of the house
* **_YearRemodAdd_**: Year of remodeling of the house (same as YearBuilt if the house has not been remodeled)
* **_YrSold_**: Year in which the sale of the house was made

In [None]:
house_prices_train = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv", index_col='Id')
house_prices_train.head()

In [None]:
house_prices_train.shape

There are 1460 training instances in this set with 80 columns. This is a lot to handle, but the large number of features makes up for the small number of training instances, so we definitely want to keep as many features as necessary.

In [None]:
house_prices_train.duplicated().any()

Thankfully there are no duplicates, so we'll keep all the data.

In [None]:
house_prices_train.describe()

The `describe` method here describes all the continuous data in the data-set which comes to a total of 37 continuous variables.

In [None]:
house_prices_train.columns

## 2.2. House Building Variables

The house building variables can generally be divided in two major categories which are the **continuous variables** consisting of floating point and integer data types and the **categorical variables** consisting of the numpy.object data type.

In [None]:
house_prices_train.info()

Now let's properly distinguish between continuous and categorical variables

In [None]:
continuous_col= list(house_prices_train.describe().columns)
categorical_col = [_d for _d in house_prices_train.columns if _d not in continuous_col]

With that settled, we should also separate the continuous and categorical variables for easier and more effective data analysis.

In [None]:
continuous_data = house_prices_train.loc[:, continuous_col]
categorical_data = house_prices_train.loc[:, categorical_col]

## 2.3. Continuous Variables

Let's look through the continuous variables to identify features containing null values. 

In [None]:
continuous_data.columns, f"Length: {len(continuous_col)}"

In [None]:
continuous_data.info()

### 2.3.1. Continuous Variables (EDA)

In [None]:
# Boolean series: True for every continuous column holding at least one null.
cont_d = continuous_data.isna().any()
missing_continuous = cont_d.index[cont_d.values].tolist()
missing_continuous

We have identified three (3) features containing `null` values. 

1. `LotFrontage`: Linear feet of street connected to property
2. `MasVnrArea`: Masonry veneer area in square feet
3. `GarageYrBlt`: Year garage was built

We now have to compare the quantity of known values to the quantity of unknown values. This will help us determine how useful an imputer will be for the columns with missing values. It should also tell us if any of these features should be discarded.

In [None]:
data = known_vs_unknown(missing_continuous, continuous_data)

In [None]:
colors = {
    "Unknown": "#FF5733",
    "Known": "#2471A3",
}

compare_plots((1, 3), columns=missing_continuous, titles=missing_continuous, data=data, color_with_label=colors)

We can see that the amount of unknown for `LotFrontage`, `MasVnrArea` & `GarageYrBlt` are relatively small. We can thus fill all the unknown with the median of each feature.

Before going into that, we need to extract the labels (`SalePrice`) from the continuous variables.

In [None]:
# Read the target from the untouched training frame and drop it without mutating
# in place, so this cell gives the same result when it is run more than once.
labels = house_prices_train["SalePrice"]
continuous_data = continuous_data.drop(columns="SalePrice", errors="ignore")
continuous_col = list(continuous_data.columns)

Now let's plot an histogram of all the features in the continuous variables to get a feel of the type of data we are dealing with.

In [None]:
plt.style.use("ggplot")
continuous_data.hist(bins=50, figsize=(20,20))
plt.show()

From the plots we notice these attributes have very different scales.

Another thing to notice is that many histograms are tail-heavy, i.e. skewed: most of them extend much farther to the right of the median than to the left (positive skew). This might make it a bit harder for some Machine Learning algorithms to detect patterns. We will have to transform some of these attributes to have more bell-shaped distributions.

We should also look for **correlations** among continuous variables, for example which features relate best to:

1. `YearBuilt`: Original construction date
2. `1stFlrSF`: First Floor square feet
3. `GarageArea`: Size of garage in square feet

### 2.3.2. Correlations Among Continuous Variables

Correlation coefficients range from –1 to 1. A value close to 1 indicates a strong positive correlation, a value close to –1 indicates a strong negative correlation, and a value close to 0 indicates no linear correlation.

In [None]:
Image("../input/correlation/correlation.png")

In [None]:
corr_matrix = continuous_data.corr()

In [None]:
corr_matrix["YearBuilt"].sort_values(ascending=False)

In [None]:
corr_matrix["1stFlrSF"].sort_values(ascending=False)

In [None]:
corr_matrix["GarageArea"].sort_values(ascending=False)

We can tell most of the features are correlated. Now let's see correlations between just a few features.

In [None]:
sns.set_style("ticks")
attributes = ["TotRmsAbvGrd", "GrLivArea", "TotalBsmtSF", "2ndFlrSF", "BedroomAbvGr", "OverallQual", "1stFlrSF", 
              "GarageArea"]
sns.pairplot(continuous_data[attributes])

Now we have two means we wish to apply in handling the continuous variables.

1. Replacing missing values with the median for each column. To achieve this we will be making use of the `sklearn.impute.SimpleImputer` which will calculate the median for all features and automatically replace missing values with the median.
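2. Putting all features on a comparable scale. To achieve this we will apply standardization using the `sklearn.preprocessing.StandardScaler`, which rescales each feature to zero mean and unit variance (it does not change the shape of a skewed distribution).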

### 2.3.3. Continuous Variables Pipeline

To effectively carry this out - even to new datasets, we should create a pipeline for continuous variables.

In [None]:
continuous_data_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('num_scaler', StandardScaler()),
])

In [None]:
continuous_data_tr = continuous_data_pipeline.fit_transform(continuous_data)

Now that we have successfully analysed and created a pipeline for the continuous variables, we can switch focus to the categorical variables.

## 2.4. Categorical Variables

Let's look through the categorical variables to identify features containing null values. 

In [None]:
categorical_data.columns, f"Length: {len(categorical_col)}"

In [None]:
categorical_data.info()

### 2.4.1. Categorical Variables (EDA)

In [None]:
# Boolean series: True for every categorical column holding at least one null.
cat_d = categorical_data.isna().any()
missing_categorical = cat_d.index[cat_d.values].tolist()
len(missing_categorical), missing_categorical

Just like we did with the continuous variables. We have identified sixteen (16) features containing `null` values. 

1. `Alley`: Type of alley access to property
2. `MasVnrType`: Masonry veneer type
3. `BsmtQual`: Evaluates the height of the basement
4. `BsmtCond`: Evaluates the general condition of the basement
5. `BsmtExposure`: Refers to walkout or garden level walls
6. `BsmtFinType1`: Rating of basement finished area
7. `BsmtFinType2`: Rating of basement finished area (if multiple types)
8. `Electrical`: Electrical system
9. `FireplaceQu`: Fireplace quality
10. `GarageType`: Garage location
11. `GarageFinish`: Interior finish of the garage
12. `GarageQual`: Garage quality
13. `GarageCond`: Garage condition
14. `PoolQC`: Pool quality
15. `Fence`: Fence quality
16. `MiscFeature`: Miscellaneous feature not covered in other categories

We now have to compare quantity of known to the unknown values. This will help us determine useful columns and columns that should be discarded. 

In [None]:
data = known_vs_unknown(missing_categorical, categorical_data)

In [None]:
compare_plots((4, 4), columns=missing_categorical, titles=missing_categorical, data=data, 
              figsize=(20, 20), color_with_label=colors)

### 2.4.2. Discardable Categorical Variables

Right from the output of the `info` method above, we could tell that the features `Alley`, `FireplaceQu`, `PoolQC`, `Fence` & `MiscFeature` are missing a lot of values, but visualizing them lets us see the impact of these missing values on these features.

In situations like this, where the majority or close to half of a feature's values are unknown, it only makes sense to discard the feature. We are thus discarding `["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]`.

In [None]:
drop_categorical = ["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]
# Non-mutating drop that tolerates already-removed columns, so the cell can be re-run.
categorical_data = categorical_data.drop(columns=drop_categorical, errors="ignore")
categorical_col = list(categorical_data.columns)

The proportions of unknown in other features are significantly small enough. Let's preview the category of each feature in a bar plot to understand them better.

In [None]:
# Size the grid from the number of columns: with a fixed 9x3 grid, zip() would
# silently skip every categorical column after the 27th.
n_grid_cols = 3
n_features = len(categorical_data.columns)
n_grid_rows = max(1, -(-n_features // n_grid_cols))  # ceiling division

f, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 4 * n_grid_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, col in zip(flat_axes, categorical_data.columns):
    target = categorical_data[col].value_counts()
    ax.bar(target.index, target)
    ax.set_title(col)

# Hide the cells of the last row that have no feature to show.
for ax in flat_axes[n_features:]:
    ax.set_visible(False)

There's a positive skewness across every categorical feature in terms of frequencies: a few categories dominate each feature. So one way we can handle missing values is by imputing the most frequent value.

We also have to encode every column of the categorical variables. From the [data description](../input/house-prices-advanced-regression-techniques/data_description.txt), we can tell that most of the categorical features of the dataset are ordered, therefore an OrdinalEncoder should work well for encoding each column.

### 2.4.3. Categorical Variables Pipeline

we would now create a pipeline for categorical variables.

In [None]:
categorical_data_pipeline = Pipeline([
    ('freq_imputer', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OrdinalEncoder())
])

In [None]:
categorical_data_tr = categorical_data_pipeline.fit_transform(categorical_data)

## 2.5. General House Pricing Pipeline

With pipelines for both continuous and categorical variables created, we can create a general pipeline for the housing price dataset and then a function that will autonomously prepare the dataset for a Machine Learning model.

In [None]:
housing_price_pipeline = ColumnTransformer([
    ("continous", continuous_data_pipeline, continuous_col),
    ("categorical", categorical_data_pipeline, categorical_col),
])

In [None]:
def prepare_data(data, drop_cols, fit_trasform=True):
    """
    Drop unwanted columns and run the remaining ones through `housing_price_pipeline`.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw housing frame. It is left unmodified; the drop works on a copy.
    drop_cols : list of str
        Columns to remove before the pipeline is applied.
    fit_trasform : bool, default True
        When True the pipeline is fitted on `data` and then applied to it
        (use this for the training set). When False the already fitted pipeline
        is only applied (use this for the test set or any new data).

    Returns
    -------
    The output of the pipeline's `fit_transform` / `transform` call.
    """
    # Non-mutating drop: the caller's frame keeps all of its columns.
    features = data.drop(columns=drop_cols)

    if fit_trasform:
        return housing_price_pipeline.fit_transform(features)
    return housing_price_pipeline.transform(features)

In [None]:
data = house_prices_train.copy()

In [None]:
data = prepare_data(data, drop_categorical + ["SalePrice"])

In [None]:
data.shape

Now we're all set to train models

# 3. House Pricing Model

Since we are dealing with a regression problem, let's try four (4) regression models and compare them to each other. We'll be making use of `Linear Regression`, `Decision Tree Regression`, `Random Forest Regression` and `Support Vector Regression`. We will use the `mean_squared_error` metric to evaluate the models.

## 3.1. Model Selection

Let's compare the models.

In [None]:
# Seed the estimators that use randomness so repeated runs give the same scores.
RANDOM_STATE = 42

lin_reg = LinearRegression()
dt_reg = DecisionTreeRegressor(random_state=RANDOM_STATE)
rf_reg = RandomForestRegressor(random_state=RANDOM_STATE)
svm_reg = SVR()

In [None]:
# Materialise the pairs: a bare zip() is exhausted after one pass, so running
# the comparison a second time would silently produce an empty record.
models = list(zip(
    ["Linear Regression", "Decision Tree Regression", "Random Forest Regression", "Support Vector Regression"],
    [lin_reg, dt_reg, rf_reg, svm_reg]
))

The `compare_models` function performs the following operation on each model.

1. Train a model on the whole training set, then get both the `mean_squared_error` & the `root_mean_squared_error` to evaluate the model's performance on the whole dataset.
2. Then perform cross validation (with cross validation folds set to 10) to see how well the model scores to new instance after being trained.

In [None]:
records = compare_models(data, labels, models)

In [None]:
records["Linear Regression"]

For the `Linear Regression` model, we have an RMSE of 31785.37 when the model is trained and evaluated on the whole training set, but a mean cross-validation RMSE of 44628698410.08. Such a huge gap means the model generalizes very poorly to the held-out folds (at least one fold produces enormous errors), so `Linear Regression` is not a suitable model here.

In [None]:
records["Decision Tree Regression"]

For the `Decision Tree Regression` model, we have a rmse of 0.0 after the model trained on the whole dataset. This is actually a perfect score but after training on a cross validation set we get an overall mean of 39577.57, which tells us that the `Decision Tree Regression` model is overfitting.

In [None]:
records["Random Forest Regression"]

For the `Random Forest Regression` model, we have a rmse of 10654.21 after the model trained on the whole dataset. This is better than the `Linear Regression` model and after training on a cross validation set we get an overall mean of 28638.76, which tells us that the `Random Forest Regression` model is not overfitting like the `Decision Tree Regression`.

In [None]:
records["Support Vector Regression"]

The `Support Vector Regression` model, on the other hand, is clearly underfitting on both the whole training set and the cross validation set.

## 3.2. Fine Tune Model

We'll now make use of Scikit-Learn’s `GridSearchCV` to search for the best hyperparameters for our model, instead of fiddling with them manually.

In [None]:
param_grid = [
    {'bootstrap': [True, False], 'n_estimators': [10, 30, 40], 'max_features': [6, 8, 10, 12]}
]

grid_search = GridSearchCV(records["Random Forest Regression"]["model"], param_grid, cv=5, 
                           scoring='neg_mean_squared_error', return_train_score=True)

In [None]:
grid_search.fit(data, labels)

In [None]:
grid_search.best_estimator_

After running the grid search model we get the parameters to be `{'bootstrap': False, 'max_features': 12, 'n_estimators': 40}`

Now that we have successfully fine-tuned our model, let's set our final model.

In [None]:
model = grid_search.best_estimator_

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
attributes = continuous_col + categorical_col

We can now check the importance of each feature to the model and exclude features below the threshold of 0.01.

In [None]:
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
most_important_features = ['OverallQual', 'GarageCars', 'ExterQual', 'GrLivArea', 'GarageArea', 'TotalBsmtSF', 
                          'BsmtQual', '1stFlrSF', 'FullBath', 'YearBuilt', '2ndFlrSF', 'BsmtFinSF1', 
                          'YearRemodAdd', 'KitchenQual', 'LotArea', 'TotRmsAbvGrd', 'GarageFinish', 'MasVnrArea', 
                          'Fireplaces']

In [None]:
continuous_col = [_c for _c in continuous_col if _c in most_important_features]
categorical_col = [_c for _c in categorical_col if _c in most_important_features]

In [None]:
housing_price_pipeline = ColumnTransformer([
    ("continous", continuous_data_pipeline, continuous_col),
    ("categorical", categorical_data_pipeline, categorical_col),
])

## 3.3. System Evaluation on the Test Set

Now we evaluate the system on the test set

In [None]:
data = house_prices_train.copy()
data = prepare_data(data, drop_categorical)

In [None]:
model.fit(data, labels)

In [None]:
house_prices_test = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv", index_col="Id")

In [None]:
test_data = house_prices_test.copy()

In [None]:
test_data.shape

In [None]:
# Reuse the imputer/scaler/encoder fitted on the training set; re-fitting on the
# test set would feed the model features encoded and scaled differently from training.
# NOTE(review): transform() may raise if the test set contains a category the
# encoder never saw during fitting - confirm on the actual data.
test_data = prepare_data(test_data, drop_categorical, fit_trasform=False)

In [None]:
predictions = model.predict(test_data)

In [None]:
sales_price = pd.Series(predictions, name="SalePrice")

In [None]:
result = pd.DataFrame({
    "Id": house_prices_test.index, 
    "SalePrice": sales_price
})

In [None]:
result = result.set_index("Id")

In [None]:
result

# 4. Conclusion

In this notebook I try making use of techniques I learnt from the book [Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) to create a model to try and predict housing prices in line with the competition challenges to predict the final price of each home.

Practice Skills Gained

1. Creative feature engineering 
2. Advanced regression techniques (Random Forest) 

I wish to end this note by thanking Dean De Cock and all who compiled the Ames Housing dataset for use in data science education.

### Next Notebook

[EDA on the various categorical data (0.80464)](https://www.kaggle.com/ganiyuolalekan/eda-on-the-various-categorical-data-0-80464)