In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, we explore the factors that affect the SalePrice of houses in Ames. The goal is to make sense of the provided Ames Housing dataset and use these insights to predict house prices with advanced regression techniques such as random forest and gradient boosting.
What I have done in this notebook is:
1. Understanding the problem 
2. Study of the Independent and Dependent variable
3. Data Cleaning 
4. Creating the Regression Model
5. Prediction

# 1. Importing data and Libraries

In [None]:
!pip install pycaret

In [None]:
from pycaret.regression import *
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import sys
import warnings
import catboost as cb
warnings.filterwarnings("ignore")
sns.set()
np.set_printoptions(threshold=sys.maxsize)

**Import the training and testing datasets**

In [None]:
# Training dataset
# Use the fully-qualified option name: the short pattern "max_columns" is ambiguous
# in newer pandas releases (it matches more than one option) and raises OptionError.
pd.set_option("display.max_columns", None)
train_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
train_data.head()

In [None]:
# Testing dataset
# Use the fully-qualified option name: the short pattern "max_columns" is ambiguous
# in newer pandas releases (it matches more than one option) and raises OptionError.
pd.set_option("display.max_columns", None)
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
test_data.head()

In [None]:
# Print the data description file that accompanies the dataset.
# The context manager closes the file handle as soon as the text has been read
# (the original left the handle open for the life of the kernel).
with open("../input/house-prices-advanced-regression-techniques/data_description.txt", 'r') as description_data:
    print(description_data.read())

In [None]:
# Inspect the column names of the training data
train_data.columns   

# 2. PLOT THE CORRELATION MATRIX OF HEAT MAP STYLE 

#### To find out which independent variables correlate with SalePrice (the dependent variable)

In [None]:
# Correlate the numeric columns only: the frame also holds string (object) columns,
# which DataFrame.corr() no longer silently skips in pandas >= 2.0 (it raises instead).
corr_mat = train_data.select_dtypes(include='number').corr()
cmap = sns.diverging_palette(230, 20, as_cmap=True)  # custom diverging colormap
f, ax = plt.subplots(figsize=(12, 12))  # figure size
sns.heatmap(corr_mat, cmap=cmap, cbar=True, xticklabels=corr_mat.columns,
            yticklabels=corr_mat.columns, square=True, vmax=1, ax=ax)

From the heatmap above we can see that ***OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF*** correlate the most with ***SalePrice***. We can also see that ***YearBuilt, FullBath*** can be considered as potential independent variables for obtaining predictions. To get a clearer picture of these independent variables, let's zoom in on the heatmap and add annotations. We keep the 10 variables with the largest correlation with 'SalePrice' (this list includes 'SalePrice' itself).

In [None]:
# Zoomed heatmap: the k columns with the largest correlation to SalePrice, annotated.
k = 10  # number of variables for heatmap
cols = corr_mat['SalePrice'].nlargest(k).index
corrm = np.corrcoef(train_data[cols].to_numpy(), rowvar=False)
f1, ax1 = plt.subplots(figsize=(8, 8))
sns.set(font_scale=1.25)
heatmap_kwargs = dict(
    cmap=cmap,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 12},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
hm = sns.heatmap(corrm, **heatmap_kwargs)
plt.show()

From the zoomed heatmap above, OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF, FullBath, TotRmsAbvGrd and YearBuilt show strong correlations with SalePrice.

# 3.Study of the dependent variable-SalePrice 

We have to predict the SalePrice of a house, so we first study this target variable, which is the main focus of the notebook.

In [None]:
train_data['SalePrice'].describe()

In [None]:
# Plot the distribution of the target.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement and draws the equivalent figure.
sns.histplot(train_data['SalePrice'], kde=True, stat='density')
plt.title('Distribution plot of SalePrice')
plt.xlabel('SalePrice')  # add x-label
plt.show()

In [None]:
# Skewness and kurtosis of the target column
sale_price = train_data['SalePrice']
print('Skewness: ', sale_price.skew())
print('Kurtosis: ', sale_price.kurt())

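Note that pandas' `kurt()` returns *excess* kurtosis (a normal distribution scores 0, not 3). Since the kurtosis is greater than that of a normal distribution, the distribution is leptokurtic in nature, which means outliers are present.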

## 4. Studying the relationship between the dependent and independent variable as per the heatmap

'SalePrice' is the dependent variable. 
'OverallQual', 'GrLivArea', 'GarageCars','TotalBsmtSF', 'YearBuilt','GarageArea','1stFlrSF','FullBath','TotRmsAbvGrd'
are the independent variables. Here, we draw scatter, box and line plots to get a clear understanding of how SalePrice varies with each independent variable.

In [None]:
# GrLivArea against SalePrice (scatter)
f, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train_data, ax=ax)

In [None]:
# TotalBsmtSF against SalePrice (scatter)
f, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='TotalBsmtSF', y='SalePrice', data=train_data, ax=ax)

In [None]:
# OverallQual against SalePrice (line)
f, ax = plt.subplots(figsize=(8, 6))
sns.lineplot(x='OverallQual', y='SalePrice', data=train_data, ax=ax)

In [None]:
# YearBuilt against SalePrice (line, wide figure)
f, ax = plt.subplots(figsize=(18, 6))
sns.lineplot(x='YearBuilt', y='SalePrice', data=train_data, ax=ax)

In [None]:
# SalePrice per GarageCars value (box plot)
f, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(x='GarageCars', y='SalePrice', data=train_data, ax=ax)

In [None]:
# GarageArea against SalePrice (scatter)
f, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='GarageArea', y='SalePrice', data=train_data, ax=ax)

In [None]:
# 1stFlrSF against SalePrice (scatter)
f, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='1stFlrSF', y='SalePrice', data=train_data, ax=ax)

In [None]:
# SalePrice per FullBath value (box plot)
f, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='FullBath', y='SalePrice', data=train_data, ax=ax)

In [None]:
# SalePrice per TotRmsAbvGrd value (box plot)
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='TotRmsAbvGrd', y='SalePrice', data=train_data, ax=ax)

# 5. Data Cleaning - Handling Missing Data

Here comes the most important and time-consuming step: data cleaning. Missing data is like a cavity in the data, and its presence can be harmful while training the model. Hence we remove entire variables that fall under the following types:
* Type 1: Variables which are not generally considered while buying the house.
* Type 2: Variables that are not good enough to influence the SalePrice(as per HeatMap Analysis)
* Type 3: Variables whose data is already described in any other Variable 


In [None]:
# Missing-value summary for the training data set:
# absolute null count and percentage of rows per column, largest first.
train_null_counts = train_data.isnull().sum()
total = train_null_counts.sort_values(ascending=False)
percent = (train_null_counts / len(train_data) * 100).sort_values(ascending=False)
miss_data = pd.concat([total, percent], axis=1, keys=['Total_Null', 'Percent'])
miss_data.head(25)

In [None]:
train_data.shape

In [None]:
# Drop every column in which more than 2% of the values are missing.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
sparse_cols = miss_data[miss_data['Percent'] > 2].index
train = train_data.drop(columns=sparse_cols)

In [None]:
train.shape

In [None]:
# MasVnrArea and MasVnrType belong to Type 1 and Type 2.
# A single non-in-place drop keeps the cell re-runnable; errors='ignore' tolerates a
# column that is already gone (e.g. removed earlier by the >2% missing-value filter).
train = train.drop(columns=['MasVnrArea', 'MasVnrType'], errors='ignore')

In [None]:
train.shape

In [None]:
# Per the original note only one Electrical value is missing at this point,
# so remove just the affected row(s) instead of the whole column.
train = train.dropna(subset=['Electrical'])

In [None]:
train.shape

In [None]:
# Report whether any null value is left in the training frame.
train_has_nulls = train.isnull().sum().max() > 0
print("Missing Values" if train_has_nulls else 'No Missing Values')

In [None]:
train.shape

## 5. Removing all unwanted Variables as per **Type1,Type2,Type3** 

In [None]:
# Split the remaining column names by storage dtype:
# int64/float64 columns are numeric, object columns are categorical.
numcols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
catcols = train.select_dtypes(include=['object']).columns.tolist()


In [None]:
print(numcols)

In [None]:
print(catcols)

In [None]:
# All categorical columns fall under Type 1, Type 2 or Type 3, so remove them together.
train = train.drop(columns=catcols)
train.shape

In [None]:
train.head()

### Now, lets remove all the numerical variables as per the Types: Type1,Type2,Type3

In [None]:
# Numerical columns that fall under Type 1, Type 2 or Type 3.
var = [
    'Id', 'MSSubClass', 'LotArea', 'OverallCond', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtFinSF2', 'GarageArea', 'BsmtUnfSF', '2ndFlrSF',
    'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
]
train = train.drop(columns=var)

In [None]:
train.head()

In [None]:
train.shape

In [None]:
#for Test data set 

In [None]:
# Missing-value summary for the test data set:
# absolute null count and percentage of rows per column, largest first.
test_null_counts = test_data.isnull().sum()
total = test_null_counts.sort_values(ascending=False)
percent = (test_null_counts / len(test_data) * 100).sort_values(ascending=False)
miss_test_data = pd.concat([total, percent], axis=1, keys=['Total_Null', 'Percent'])
miss_test_data.head(40)

In [None]:
# Drop every test column that has more than one missing value.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
sparse_test_cols = miss_test_data[miss_test_data['Total_Null'] > 1].index
test = test_data.drop(columns=sparse_test_cols)
test.shape


In [None]:
test.columns

In [None]:
# Test-set columns to discard so that only the selected features remain.
vari = [
    'Id', 'MSSubClass', 'LotArea', 'Street', 'LotShape', 'LandContour',
    'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'OverallCond',
    'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'Heating', 'HeatingQC', 'CentralAir',
    'Electrical', '2ndFlrSF', 'LowQualFinSF', 'HalfBath', 'BedroomAbvGr',
    'KitchenAbvGr', 'KitchenQual', 'Fireplaces', 'PavedDrive',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
    'SaleCondition',
]
test = test.drop(columns=vari)

In [None]:
test.shape

In [None]:
# GarageArea belongs to Type 3, so remove it from the test features.
test = test.drop(columns=['GarageArea'])

In [None]:
test.head()

In [None]:
# Fill the remaining gaps in the test features with each column's median.
# The result is assigned back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form may act on a temporary copy (it is
# deprecated in pandas 2.x and has no effect once copy-on-write is enabled).
for col in ('GarageCars', 'TotalBsmtSF'):
    test[col] = test[col].fillna(test[col].median())



In [None]:
test.head()

In [None]:
test.shape

In [None]:
# Report whether any null value is left in the test frame.
test_has_nulls = test.isnull().sum().max() > 0
print("Missing Values" if test_has_nulls else 'No Missing Values')

# 6. Outliers

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['OverallQual'])

In [None]:
# Remove the rows whose OverallQual is below 2.
outlier_rows = train['OverallQual'] < 2
train = train.drop(index=train.index[outlier_rows])

In [None]:
train.shape

In [None]:
# Replace YearBuilt by YearOld, the age of the house counted back from 2020.
train = train.assign(YearOld=2020 - train['YearBuilt']).drop(columns='YearBuilt')
train.head()

In [None]:
# Same YearBuilt -> YearOld conversion for the test features (age counted back from 2020).
test = test.assign(YearOld=2020 - test['YearBuilt']).drop(columns='YearBuilt')
test.head()

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['TotalBsmtSF'])

In [None]:
# Remove the rows whose TotalBsmtSF exceeds 3500.
outlier_rows = train['TotalBsmtSF'] > 3500
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['1stFlrSF'])

In [None]:
# Remove the rows whose 1stFlrSF exceeds 2500.
outlier_rows = train['1stFlrSF'] > 2500
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['GrLivArea'])

In [None]:
# Remove the rows whose GrLivArea exceeds 4000.
outlier_rows = train['GrLivArea'] > 4000
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['FullBath'])
#No outlier

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['TotRmsAbvGrd'])

In [None]:
# Remove the rows whose TotRmsAbvGrd exceeds 11.
outlier_rows = train['TotRmsAbvGrd'] > 11
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
train.head()

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['GarageCars'])

In [None]:
# Remove the rows whose GarageCars exceeds 3.5.
outlier_rows = train['GarageCars'] > 3.5
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
# Pass the series as x= explicitly: newer seaborn releases take `data` as the first
# positional parameter, so the keyword keeps this a horizontal box plot of the column.
sns.boxplot(x=train['YearOld'])

In [None]:
# Remove the rows whose YearOld exceeds 145.
outlier_rows = train['YearOld'] > 145
train = train.drop(index=train.index[outlier_rows])
train.shape

In [None]:
#Now no Outliers are present

In [None]:
# Put the columns in a fixed order: the eight features first, the target last.
ordered_cols = [
    'OverallQual', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea', 'FullBath',
    'TotRmsAbvGrd', 'GarageCars', 'YearOld', 'SalePrice',
]
train = train.reindex(columns=ordered_cols)
train.shape

# 7: Scaling the data

In [None]:
train.head()

In [None]:
test.head()

In [None]:
xtrain=train.drop('SalePrice',axis=1)
xtrain.head()

In [None]:
ytrain=train['SalePrice']

In [None]:
# NOTE: plain assignment, not a copy - xtest and test refer to the same DataFrame object.
xtest=test

In [None]:
ytrain

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Standardise the training features; the scaler is fitted on the training data here.
# A new DataFrame is built instead of `xtrain[:] = ...`, which writes float values
# into the existing integer columns (an implicit upcast that pandas has deprecated).
xtrain = pd.DataFrame(
    scaler.fit_transform(xtrain),
    columns=xtrain.columns,
    index=xtrain.index,
)

In [None]:
# Scale the test features with the statistics already learned from the training set.
# fit_transform would re-fit the scaler on the test data, so train and test would be
# standardised with different means/variances; transform reuses the fitted scaler.
xtest = pd.DataFrame(
    scaler.transform(xtest),
    columns=xtest.columns,
    index=xtest.index,
)

In [None]:
xtrain.head()

In [None]:
xtest.head()

# 9. Model

In [None]:
## comparing all models
# PyCaret experiment on the cleaned training frame, with SalePrice as the target.
experiment = setup(
    train,
    target="SalePrice",
    normalize=True,
    use_gpu=True,
)

In [None]:
compare_models()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination with a linear model as the ranking estimator.
lm = LinearRegression()
lm.fit(xtrain, ytrain)

# n_features_to_select must be passed by keyword (positional use was removed in
# scikit-learn 1.0). The requested 15 is capped at the number of available
# columns, so every feature is kept when fewer than 15 exist.
n_keep = min(15, xtrain.shape[1])
rfe = RFE(lm, n_features_to_select=n_keep)  # running RFE
rfe = rfe.fit(xtrain, ytrain)

In [None]:
list(zip(xtrain.columns,rfe.support_,rfe.ranking_))

In [None]:
#RFE has taken it all 

In [None]:
# Columns selected by RFE
cols = xtrain.columns[rfe.support_]
cols

In [None]:
from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest's random sampling so that re-running the notebook
# produces the same model and predictions.
rfreg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)

In [None]:
rfreg.fit(xtrain[cols],ytrain)


In [None]:
yhat=rfreg.predict(xtest[cols])

In [None]:
train_dataset = cb.Pool(xtrain, ytrain)
test_dataset = cb.Pool(xtest)

In [None]:
model = cb.CatBoostRegressor(loss_function='RMSE')

In [None]:
# Hyper-parameter grid: 3 * 2 * 4 * 4 = 96 combinations.
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
# Run the search on the training pool; the call's return value is the cell output.
model.grid_search(grid, train_dataset)

In [None]:
pred = model.predict(xtest)

In [None]:
# Pair the Ids from the untouched test_data frame with the CatBoost predictions.
# The test preprocessing above only drops columns and fills gaps, so presumably
# pred has one value per test_data row - verify the lengths match.
finaloutput = pd.DataFrame({'Id': test_data.Id,'SalePrice':pred})
finaloutput.head(15)

In [None]:
finaloutput.to_csv('submission.csv',index = False)