In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Objective 
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition's challenge is to predict the final price of each home.

# Data Description
Here's a brief version of what you'll find in the data description file.

SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
MSSubClass: The building class
MSZoning: The general zoning classification
LotFrontage: Linear feet of street connected to property
LotArea: Lot size in square feet
Street: Type of road access
Alley: Type of alley access
LotShape: General shape of property
LandContour: Flatness of the property
Utilities: Type of utilities available
LotConfig: Lot configuration
LandSlope: Slope of property
Neighborhood: Physical locations within Ames city limits
Condition1: Proximity to main road or railroad
Condition2: Proximity to main road or railroad (if a second is present)
BldgType: Type of dwelling
HouseStyle: Style of dwelling
OverallQual: Overall material and finish quality
OverallCond: Overall condition rating
YearBuilt: Original construction date
YearRemodAdd: Remodel date
RoofStyle: Type of roof
RoofMatl: Roof material
Exterior1st: Exterior covering on house
Exterior2nd: Exterior covering on house (if more than one material)
MasVnrType: Masonry veneer type
MasVnrArea: Masonry veneer area in square feet
ExterQual: Exterior material quality
ExterCond: Present condition of the material on the exterior
Foundation: Type of foundation
BsmtQual: Height of the basement
BsmtCond: General condition of the basement
BsmtExposure: Walkout or garden level basement walls
BsmtFinType1: Quality of basement finished area
BsmtFinSF1: Type 1 finished square feet
BsmtFinType2: Quality of second finished area (if present)
BsmtFinSF2: Type 2 finished square feet
BsmtUnfSF: Unfinished square feet of basement area
TotalBsmtSF: Total square feet of basement area
Heating: Type of heating
HeatingQC: Heating quality and condition
CentralAir: Central air conditioning
Electrical: Electrical system
1stFlrSF: First Floor square feet
2ndFlrSF: Second floor square feet
LowQualFinSF: Low quality finished square feet (all floors)
GrLivArea: Above grade (ground) living area square feet
BsmtFullBath: Basement full bathrooms
BsmtHalfBath: Basement half bathrooms
FullBath: Full bathrooms above grade
HalfBath: Half baths above grade
BedroomAbvGr: Number of bedrooms above basement level
KitchenAbvGr: Number of kitchens
KitchenQual: Kitchen quality
TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
Functional: Home functionality rating
Fireplaces: Number of fireplaces
FireplaceQu: Fireplace quality
GarageType: Garage location
GarageYrBlt: Year garage was built
GarageFinish: Interior finish of the garage
GarageCars: Size of garage in car capacity
GarageArea: Size of garage in square feet
GarageQual: Garage quality
GarageCond: Garage condition
PavedDrive: Paved driveway
WoodDeckSF: Wood deck area in square feet
OpenPorchSF: Open porch area in square feet
EnclosedPorch: Enclosed porch area in square feet
3SsnPorch: Three season porch area in square feet
ScreenPorch: Screen porch area in square feet
PoolArea: Pool area in square feet
PoolQC: Pool quality
Fence: Fence quality
MiscFeature: Miscellaneous feature not covered in other categories
MiscVal: $Value of miscellaneous feature
MoSold: Month Sold
YrSold: Year Sold
SaleType: Type of sale
SaleCondition: Condition of sale

# Data Inspection

Import the libraries used below: numpy and pandas for data handling, matplotlib and seaborn for visualization, warnings to silence warning messages, and missingno to visualize missing values.

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore")
import missingno as msno
import sklearn
import category_encoders as ce
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


Read our data in 'house_price'

In [None]:
house_price = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')


Read our test data

In [None]:
house_price_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

Check head of our dataframe

In [None]:
house_price.head()

Check shape of our dataframe.

In [None]:
house_price.shape

check info about our variables.

In [None]:
house_price.info()

Check statistical description of our different numerical variables.

In [None]:
house_price.describe()

Check for null value percentage in our dataframe.

In [None]:
# Percentage of missing entries per column, rounded to two decimals.
row_count = house_price.shape[0]
null_check = house_price.isnull().sum().div(row_count).mul(100).round(2)
null_check.sort_values(ascending=False)

Visualise the null values of the different variables. White lines denote null values.

In [None]:
msno.matrix(house_price)
plt.show()

Check variables those have null values greater than 30%.

In [None]:
null_check[null_check>30.00]

Remove the variables with more than 30% null values.

In [None]:
# Drop the five columns reported above as having more than 30% missing values.
# inplace=True mutates house_price itself, so this cell cannot be re-run on the
# same frame (the columns are already gone and drop raises KeyError).
house_price.drop(['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'],axis=1,inplace=True)

Again visualise null values.

In [None]:
msno.matrix(house_price)
plt.show()

In [None]:
null_check[(null_check>0.00) & (null_check<30.00)]

* There is a pattern in the null values: houses which don't have a garage have null values in GarageType, GarageYrBlt, GarageFinish, GarageQual and GarageCond.
* Houses which don't have a basement have null values in BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2.
* LotFrontage has randomly missing values.
* Houses which don't have a masonry veneer have missing values in MasVnrType and MasVnrArea.

Remove the records of houses that don't have a garage.

In [None]:
# Keep only the rows whose GarageType is present; rows with a missing
# GarageType are discarded from the training frame.
# NOTE(review): house_price_test is not filtered here, so the model is later
# asked to predict houses of a kind it never saw in training — confirm intended.
house_price = house_price[~house_price['GarageType'].isnull()]

Recheck null value percentage of variables.

In [None]:
# Recompute the per-column missing percentage on the filtered frame.
remaining_rows = house_price.shape[0]
null_check_new = house_price.isnull().sum().div(remaining_rows).mul(100).round(2)
null_check_new.sort_values(ascending=False)

Check variables with top null values.

In [None]:
# Columns that still contain missing values (more than 0% and less than 30%).
# Both conditions are evaluated on null_check_new: the older null_check was
# computed before columns and rows were dropped, so its index and percentages
# no longer describe the current frame.
null_check_new[(null_check_new > 0.00) & (null_check_new < 30.00)]

* Impute missing values of numerical columns with their mean/average values.
* Impute missing values of categorical columns with their mode/most frequent values.


In [None]:
# Columns that still contain missing values and need to be imputed.
impute_list = ['LotFrontage','MasVnrType','MasVnrArea','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Electrical']

# One fill value per column: the mean for float columns, the most frequent
# value (mode) for every other column.
fill_values = {}
for col in impute_list:
    if house_price[col].dtype == 'float64':
        fill_values[col] = house_price[col].mean()
    else:
        fill_values[col] = house_price[col].mode().values[0]

# Assign the filled frame back instead of calling fillna(inplace=True) on a
# column selection: that form is chained assignment on a filtered frame, and
# pandas does not guarantee it writes through to house_price (with
# copy-on-write it never does), which would silently leave the NaNs in place.
house_price = house_price.fillna(value=fill_values)


Check null values of the dataframe.

In [None]:
# Column names are kept in `cols` because later cells iterate over them.
cols = house_price.columns.tolist()
# Total number of missing cells across all columns.
null_sum = house_price[cols].isnull().sum().sum()
print("Null Valuse in DataFrame : ",null_sum)
    

# EDA

In [None]:
# Partition every column except the first one in `cols` by dtype:
# float64/int64 columns are numeric, everything else is categorical.
num_cols, cat_cols = [], []
for column in cols[1:]:
    is_numeric = house_price[column].dtype in ('float64', 'int64')
    (num_cols if is_numeric else cat_cols).append(column)


Two different lists are created one contains object type variables and another list contains float and int type variables.

Violin plots are used to check the distribution of the numerical variables that have more than 25 unique values. Variables with 25 or fewer unique values are treated as categorical numerical variables and are shown with count plots instead.

**violin plots** are a method of plotting numeric data and can be considered a combination of the box plot with a kernel density plot.
For more information about violin plots see: [https://towardsdatascience.com/violin-plots-explained-fb1d115e023d](https://towardsdatascience.com/violin-plots-explained-fb1d115e023d)

In [None]:
plt.figure(figsize=(30,20))

# Split the numeric columns by cardinality: more than 25 distinct values is
# treated as continuous (violin plot), anything else as discrete (count plot,
# drawn in a later cell).
violin_plots = []
countplot_cols = []
for col in num_cols:
    if house_price[col].nunique() > 25:
        violin_plots.append(col)
    else:
        countplot_cols.append(col)

# Keep the original 5x4 grid, but grow the number of rows if there are ever
# more than 20 columns so plt.subplot does not fail on an out-of-range index.
n_grid_cols = 4
n_grid_rows = max(5, -(-len(violin_plots) // n_grid_cols))

for idx, col in enumerate(violin_plots):
    plt.subplot(n_grid_rows, n_grid_cols, idx + 1)
    # Pass the column by keyword: the meaning of the first positional argument
    # differs between seaborn versions (x vs. data), which changes the
    # orientation of the violin and leaves the x label on the wrong axis.
    ax = sns.violinplot(x=house_price[col])
    ax.set_xlabel(col, fontsize=15)
plt.tight_layout()
plt.show()

**Comments:**
* LotFrontage: Linear feet of street connected to property is right-skewed (most values at the low end, with a long tail to the right) and mostly falls in the range 50-100.
* LotArea: Lot size in square feet is also right-skewed, with most observations gathered in the range 0-2000 (approx).
* YearBuilt: Original construction date and GarageYrBlt: Year garage was built are spread over the whole range, but most of the data belongs to 2000-2020.
* YearRemodAdd: Remodel date is spread over the whole range.
* MasVnrArea: Masonry veneer area in square feet and BsmtFinSF1 & 2: Type 1 / Type 2 finished square feet are right-skewed with some outliers.
* TotalBsmtSF: Total square feet of basement area is spread out, but most observations belong to the lower range, with very few low-end outliers.
* 1stFlrSF: First Floor square feet and 2ndFlrSF: Second floor square feet are mainly right-skewed, but 2ndFlrSF is gathered around two different ranges.
* GarageArea: Size of garage in square feet is mostly gathered in two ranges, 400-600 and 200-400.
* WoodDeckSF: Wood deck area in square feet is mostly right-skewed with some outliers.
* OpenPorchSF: Open porch area in square feet is mostly right-skewed with some outliers.
* EnclosedPorch: Enclosed porch area in square feet is mostly right-skewed with some outliers.
* 3SsnPorch: Three season porch area in square feet is mostly right-skewed with some outliers.
* ScreenPorch: Screen porch area in square feet is mostly right-skewed with some outliers.


In [None]:
plt.figure(figsize=(30,30))
# One count plot per low-cardinality numeric column, laid out on a 6x3 grid.
for panel_no, feature in enumerate(countplot_cols, start=1):
    plt.subplot(6, 3, panel_no)
    ax = sns.countplot(x=feature, data=house_price)
    ax.set_xlabel(feature, fontsize=15)
plt.tight_layout()
plt.show()

**Comments:**
* MSSubClass: The building class max count 20.
* OverallCond: Overall condition rating max count 5,6.
* LowQualFinSF: Low quality finished square feet (all floors) max counts 0, BsmtFullBath: Basement full bathrooms max counts 0 and BsmtHalfBath: Basement half bathrooms mostly counts 0.
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms) mostly 6.
* Kitchen: Number of kitchens and Fireplaces: Number of fireplaces mostly 1.
* GarageCars: Size of garage in car capacity max 2 
* FullBath: Full bathrooms above grade and BedroomAbvGr: Number of bedrooms above basement level have their maximum counts at 2.



In [None]:
plt.figure(figsize=(30,30))

# One scatter panel per high-cardinality numeric column, each plotted against
# SalePrice on a 5x4 grid.
for panel_no, feature in enumerate(violin_plots, start=1):
    plt.subplot(5, 4, panel_no)
    ax = sns.scatterplot(x=feature, y='SalePrice', data=house_price)
    ax.set_xlabel(feature, fontsize=15)
    ax.set_ylabel("Sale Price", fontsize=15)

plt.tight_layout()
plt.show()

**Comments:**
* Sale price increasing with recent YearBuilt: Original construction date same for GarageYrBlt: Year garage was built
* Sale price increases with 1stFlrSF: First Floor square feet and 2ndFlrSF: Second floor square feet, although there are some houses with no 2nd floor but a high selling price.
* Sale price also increasing with GrLivArea: Above grade (ground) living area square feet.

In [None]:
plt.figure(figsize=(30,40))
# SalePrice distribution for each level of every low-cardinality numeric column.
for panel_no, feature in enumerate(countplot_cols, start=1):
    plt.subplot(6, 3, panel_no)
    ax = sns.boxplot(x=feature, y="SalePrice", data=house_price)
    ax.set_xlabel(feature, fontsize=15)
plt.tight_layout()
plt.show()

**Comments:**
* Sale price increasing with OverallQual: Overall material(9) and finish quality but few high selling houses don't have max OverallQual: Overall material and finish quality (5)
* Sale price increasing with max FullBath: Full bathrooms above grade.
* Sale price increasing with more fireplace and more rooms.
* Sale price is also increasing with more cars in garages.

In [None]:
plt.figure(figsize = (30, 30))

# Split the categorical columns by cardinality: columns with more than 5
# distinct values are plotted as scatter plots in a later cell, the rest get
# one SalePrice density curve per category here.
large_cat_cols = []
small_cat_cols = []
for col in cat_cols:
    if house_price[col].nunique() > 5:
        large_cat_cols.append(col)
    else:
        small_cat_cols.append(col)

for idx, col in enumerate(small_cat_cols):
    # Create the panel once per column rather than once per category.
    ax = plt.subplot(6, 4, idx + 1)
    for level in house_price[col].unique():
        # SalePrice values of the rows belonging to this category.
        prices = house_price.loc[house_price[col] == level, "SalePrice"]
        # fill= replaces the deprecated shade= argument.
        sns.kdeplot(x=prices, fill=True, label="{}".format(level), ax=ax)
    ax.set_xlabel(col, fontsize=15)
    # Build the legend explicitly: kdeplot does not necessarily create one,
    # in which case ax.get_legend() is None and styling its texts fails.
    ax.legend(fontsize=20)
plt.tight_layout()
plt.show()


Distribution of different features by their values.

In [None]:
plt.figure(figsize=(40,30))
# SalePrice against each high-cardinality categorical column, coloured by category.
for panel_no, feature in enumerate(large_cat_cols, start=1):
    plt.subplot(4, 4, panel_no)
    ax = sns.scatterplot(x=feature, y='SalePrice', hue=feature, data=house_price)
    ax.set_xlabel(feature, fontsize=15)
    # Rotate the category labels so they stay readable.
    ax.tick_params(axis="x", labelsize=15, rotation=45)

plt.tight_layout()
plt.show()

**Comments:**
* high Sale price depends on following type of features
* Condition1: Proximity to various conditions Norm
* HouseStyle: Style of dwelling 2Story	
* RoofStyle: Type of roof Hip and Gable
* RoofMatl: Roof material CompShg	Standard (Composite) Shingle
* Foundation: Type of foundation PConc	Poured Contrete	
* Heating: Type of heating GasA	Gas forced warm air furnace
* An attached GarageType and a normal sale condition are preferable.

# Data Preprocessing

Divide X_train and y_train.

In [None]:
# pop() removes SalePrice from house_price in place and returns it as the target.
y_train = house_price.pop("SalePrice")
# X_train is another name for the same (now target-free) frame, not a copy.
X_train = house_price

**One Hot Encoding** refers to splitting a column which contains categorical data into many columns, one per category present in that column. Each new column contains “0” or “1”, indicating whether the row belongs to that column's category.

In [None]:
# One-hot encoder restricted to the columns collected in cat_cols.
encoder = ce.OneHotEncoder(cols=cat_cols)

X_train = encoder.fit_transform(X_train) ## fit the encoder on the training features and replace X_train with the encoded frame

In [None]:
# Encode the test set with the encoder that was already fitted on the training
# data. Calling fit_transform here would re-learn the category-to-column
# mapping from the test file, so a dummy column such as `LotShape_2` could
# stand for a different category in X_test than in X_train.
#
# house_price still holds the raw training features at this point (null-heavy
# columns dropped, SalePrice popped), so selecting its columns gives the
# encoder exactly the input layout it was fitted on.
X_test = house_price_test[house_price.columns].copy()

X_test = encoder.transform(X_test)

In [None]:
X_test.shape ## check shape of test

In [None]:
X_train.shape ## check shape of train

**Remove Constant Features**

Constant features are those that show the same value, just one value, for all the observations of the dataset. This is, the same value for all the rows of the dataset. These features provide no information that allows a machine learning model to discriminate or predict a target.
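Variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features which variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples. In this notebook the threshold is set to 0.1, so near-constant features (variance below 0.1) are removed as well, not only strictly constant ones.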

In [None]:
# threshold=0.1 marks every feature whose training-set variance is below 0.1
# for removal — near-constant features as well, not only strictly constant ones.
sel = VarianceThreshold(threshold=0.1)
sel.fit(X_train)  # fit computes the per-feature variances on X_train

In [None]:
# if we sum over get_support, we get the number of features that are not constant
sum(sel.get_support())

In [None]:
# Keep only the columns flagged by the fitted variance selector.
kept_mask = sel.get_support()
X_train = X_train.loc[:, kept_mask]

**Recursive Feature Elimination**

It is a greedy optimization algorithm which aims to find the best performing feature subset. It repeatedly creates models and keeps aside the best or the worst performing feature at each iteration. It constructs the next model with the left features until all the features are exhausted. It then ranks the features based on the order of their elimination.

Recursive feature elimination performs a greedy search to find the best performing feature subset. It iteratively creates models and determines the best or the worst performing feature at each iteration. It constructs the subsequent models with the left features until all the features are explored. It then ranks the features based on the order of their elimination. In the worst case, if a dataset contains N number of features RFE will do a greedy search for $2^N$ combinations of features.

In [None]:
# Importing RFE and LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression


In [None]:
# Plain linear regression used as the ranking model inside RFE.
lm = LinearRegression().fit(X_train, y_train)
# Recursive feature elimination down to 42 features.
# n_features_to_select is passed by keyword: current scikit-learn releases
# accept only `estimator` positionally, so RFE(lm, 42) raises a TypeError.
rfe = RFE(estimator=lm, n_features_to_select=42).fit(X_train, y_train)

In [None]:
rfe_cols = X_train.columns[rfe.support_]
## choose features provided by rfe

In [None]:
## check column names provided by rfe 
rfe_cols

In [None]:
X_train = X_train[rfe_cols] ## X_train contains only features selected by rfe.

Let's identify some predictor variables that are correlated with each other and remove one of them.

In [None]:
# Annotated heatmap of the pairwise correlations between the selected features.
plt.figure(figsize=(30,15))
feature_corr = X_train.corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

 Remove some variables that are correlated with another variable, like 'LotShape_2','LotConfig_3','ExterQual_2','BsmtQual_2','KitchenQual_2','Exterior2nd_1','Exterior2nd_2'

In [None]:
# Remove the features identified as correlated with another feature.
correlated_features = ['LotShape_2','LotConfig_3','ExterQual_2','BsmtQual_2','KitchenQual_2','Exterior2nd_1','Exterior2nd_2']
X_train = X_train.drop(columns=correlated_features)

In [None]:
X_train.shape ## checking shape

**Variance inflation factor (VIF)** is a measure of the amount of multicollinearity in a set of multiple regression variables. Mathematically, the VIF for a regression model variable is equal to the ratio of the overall model variance to the variance of a model that includes only that single independent variable.

We calculate the VIFs, remove a high-VIF feature, and check the VIFs of the remaining features again. We repeat this until all the features have a VIF below 5.

In [None]:
# Calculate the VIFs for the new model
import statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF of every feature currently in X_train, rounded and sorted highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove TotRmsAbvGrd before the next VIF pass.
X_train = X_train.drop(columns='TotRmsAbvGrd')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove OverallQual before the next VIF pass.
X_train = X_train.drop(columns='OverallQual')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove OverallCond before the next VIF pass.
X_train = X_train.drop(columns='OverallCond')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove BedroomAbvGr before the next VIF pass.
X_train = X_train.drop(columns='BedroomAbvGr')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove FullBath before the next VIF pass.
X_train = X_train.drop(columns='FullBath')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove SaleType_1 before the next VIF pass.
X_train = X_train.drop(columns='SaleType_1')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove GarageCars before the next VIF pass.
X_train = X_train.drop(columns='GarageCars')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove Foundation_1 before the next VIF pass.
X_train = X_train.drop(columns='Foundation_1')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove Condition1_1 before the next VIF pass.
X_train = X_train.drop(columns='Condition1_1')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove BldgType_1 before the next VIF pass.
X_train = X_train.drop(columns='BldgType_1')

In [None]:
# Recompute the VIFs for the remaining features, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# Remove SaleCondition_1 before the final VIF pass.
X_train = X_train.drop(columns='SaleCondition_1')

In [None]:
# Final VIF table for the features that remain in X_train, highest first.
vif = pd.DataFrame({
    'Features': X_train.columns,
    'VIF': [variance_inflation_factor(X_train.values, col_idx) for col_idx in range(X_train.shape[1])],
})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Check final shape of X_train

In [None]:
X_train.shape 

In [None]:
X_test = X_test[X_train.columns]

Prepare X_test with the same feature as X_train.

In [None]:
X_test.shape ## check shape

In [None]:
X_test.isnull().sum() ## check for null value

In [None]:
# Impute the missing BsmtFullBath values in the test features with the most
# frequent value of that column.
# The filled column is assigned back through assign(): calling
# fillna(inplace=True) on X_test['BsmtFullBath'] is chained assignment on a
# column of a sliced frame and is not guaranteed to update X_test itself.
bsmt_full_bath_mode = X_test['BsmtFullBath'].mode().values[0]
X_test = X_test.assign(BsmtFullBath=X_test['BsmtFullBath'].fillna(bsmt_full_bath_mode))

In [None]:
# Total number of missing cells left in the test features.
null_count = X_test.isnull().sum().sum()
print("Null Values in X_test : ",null_count)
    

# Model Build

A **random forest** is a meta estimator that fits a number of decision trees (regression trees, for the `RandomForestRegressor` used here) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

For more information: [https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

In [None]:
from sklearn.ensemble import RandomForestRegressor ## import libraries

rfr = RandomForestRegressor(random_state=1).fit(X_train,y_train)

In [None]:
y_train_pred = rfr.predict(X_train) ## predict sale price

Check the r2 score and the mean squared error on the training data.

In [None]:
from sklearn.metrics import r2_score
r2_score_default = r2_score(y_train,y_train_pred) ## check r2 score of the model

In [None]:
r2_score_default

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the training data.
# This is the MSE itself, not its root — take the square root for an RMSE.
mse = mean_squared_error(y_train,y_train_pred)

In [None]:
mse

The model looks overfitted, hence it is a very unstable model. Let's do some hyperparameter tuning to make a more stable model.

# Hyper parameter Tuning

In [None]:
# Parameter grid for the exhaustive search: 5 * 6 * 4 * 5 = 600 combinations.
# NOTE(review): the original comment said this grid is based on the results of
# a random search, but no random search appears in this notebook — confirm.
params = {
    'max_depth': [1, 2, 5, 10, 20 ],
    'min_samples_leaf': [10, 20, 50, 100 , 200 , 400],
    'max_features': [4 , 8 , 15 , 20],
    'n_estimators': [10, 30, 50, 100, 200]
}

In [None]:
from sklearn.model_selection import GridSearchCV
# Instantiate the grid search model
# 4-fold cross-validated grid search over `params`, scored by R^2,
# with n_jobs=-1 so the fits run in parallel.
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=params,
    scoring="r2",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [None]:
%%time
grid_search.fit(X_train,y_train)

Choose the best estimator and fit it on the data.

In [None]:
rf_best = grid_search.best_estimator_

In [None]:
rf_best

In [None]:
# NOTE(review): GridSearchCV refits the best estimator on the full training
# data by default (refit=True), so this extra fit is presumably redundant — confirm.
rf_best = rf_best.fit(X_train,y_train)

In [None]:
y_train_pred_tune = rf_best.predict(X_train)

In [None]:
r2_score_best = r2_score(y_train,y_train_pred_tune)

Check the r2 score and the mean squared error of our best-estimator random forest regressor model.

In [None]:
r2_score_best

In [None]:
mse_best = mean_squared_error(y_train,y_train_pred_tune)

In [None]:
mse_best

Predict on the test data using our best model.

In [None]:
test_pred = rf_best.predict(X_test)

In [None]:
house_price_test['SalePrice'] = test_pred

In [None]:
house_price_test = house_price_test[['Id','SalePrice']]

In [None]:
house_price_test.to_csv("Submission_house_price.csv",index=False)


# Conclusion
We can use other processes to make better predictions: we can try a different set of parameters when tuning the hyperparameters of our model, and we can also choose a different process to reduce the dimensionality of the data set.