# **House Prices - Advanced Regression Techniques**
Predict sales prices and practice feature engineering, RFs, and gradient boosting

Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.


Method 1: Correlation Heatmap

Method 2: VIF

Method 3: Variance Threshold

Method 4: SelectKBest

Method 5: ExtraTreesRegressor

Method 6: Mutual Information Gain

Method 7: Recursive Feature Elimination

Method 8: Forward Feature selection

Method 9: Backward elimination 

Method 10: Bidirectional/ Stepwise feature elimination


In [None]:
#Importing Packages/Libraries
#from google.colab import files
from sklearn import svm
from sklearn import metrics
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import matplotlib.pylab as pylab
import warnings
import pandas_profiling
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.naive_bayes import GaussianNB
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile
%matplotlib inline
warnings.filterwarnings('ignore')

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Global chart defaults: one figure size plus uniformly large legend/axis/tick fonts
params = {'figure.figsize': (15, 5)}
params.update(dict.fromkeys(
    ['legend.fontsize', 'axes.labelsize', 'axes.titlesize',
     'xtick.labelsize', 'ytick.labelsize', 'legend.title_fontsize'],
    'x-large'))
pylab.rcParams.update(params)

In [None]:
Train=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
Test=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [None]:
print(Train.shape)
print(Test.shape)

In [None]:
Train.head()

In [None]:
Train.columns

In [None]:
Train.info()

### Checking Missing values percentage

In [None]:
# Null count and null percentage for every column of Train; only columns with gaps are displayed
count = Train.isnull().sum().round(2)
percent = (count / Train.shape[0] * 100).round(2)
data = (
    pd.concat([count, percent], axis=1)
    .reset_index()
    .rename(columns={0: 'Missing Values Count', 1: 'Missing Values %'})
)
data[data['Missing Values Count'] != 0]

The Alley, PoolQC, Fence and MiscFeature columns can be dropped, as more than 80% of their values are null.

### Checking for duplicate records

In [None]:
# No duplicates.
Train.duplicated().sum()

### Checking for unique values count in each column

In [None]:
# Number of distinct (non-null) values in every column of Train
features = Train.columns
print("Number of unique values are as below:\n")
for column in features:
    # nunique() is evaluated once per column; the unused ratio computed before was dropped
    print(column, Train[column].nunique())

### Dropping variables obtained from above observations

In [None]:
# Remove the mostly-null columns and the 'Id' column from both data sets
dropColList = ['Alley', 'PoolQC', 'Fence', 'MiscFeature', 'Id']
for frame in (Train, Test):
    frame.drop(columns=dropColList, inplace=True)

In [None]:
print(Train.shape)
print(Test.shape)

# Univariate Analysis

In [None]:
# Mean and Median are almost same
Train.describe(include = 'all').T

### Separating Numerical and categorical features for plotting the data

In [None]:
#separating Numerical and categorical variables
NumericData=Train.select_dtypes(include=['float64','int64'])
rows,col=(NumericData.shape)
print("Number of Numeric columns are:",col)
print(NumericData.columns)

In [None]:
#separating Numerical and categorical variables
CategoricData=Train.select_dtypes(include=['object','category'])
rows,col=(CategoricData.shape)
print("Number of Categorical columns are:",col)
print(CategoricData.columns)

## Univariate Analysis Plots for categorical data

In [None]:
CategoricData.head()

In [None]:
# Reshape to long form ('variable', 'value') so every column gets its own facet
ConvertedCatDataMelt = CategoricData.melt()

# Univariate analysis: one count plot per categorical variable.
# `height` replaces the former `size` argument of FacetGrid (renamed in seaborn 0.9).
CatFacetGrid = sns.FacetGrid(ConvertedCatDataMelt, col='variable', sharex=False, sharey=False,
                             dropna=True, height=4, col_wrap=4)
countPlot = CatFacetGrid.map(sns.countplot, 'value')
# Rotate the x tick labels after drawing, so the rotation applies to the plotted category labels
for facet_ax in CatFacetGrid.axes.flat:
    facet_ax.tick_params(axis='x', labelrotation=90)
plt.show()

'OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','GarageCars',
'Fireplaces','MiscVal','MoSold','YrSold' are stored as int64/float64, but they can be treated as categorical. 'KitchenQual' and 'GarageType' (already categorical) are plotted alongside them. Below are the bar plots for these variables.

In [None]:
# Columns that are plotted as categories: discrete numeric columns plus KitchenQual and GarageType
CategoricalConData = Train[['OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                            'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
                            'GarageCars', 'Fireplaces', 'GarageType', 'MiscVal', 'MoSold', 'YrSold']].copy()

# Reshape to long form ('variable', 'value') so every column gets its own facet
CategoricalConDataMelt = CategoricalConData.melt()

# Univariate analysis: one count plot per variable.
# `height` replaces the former `size` argument of FacetGrid (renamed in seaborn 0.9).
CatFacetGrid = sns.FacetGrid(CategoricalConDataMelt, col='variable', sharex=False, sharey=False,
                             dropna=True, height=4, col_wrap=4)
countPlot = CatFacetGrid.map(sns.countplot, 'value')
# Rotate the x tick labels after drawing, so the rotation applies to the plotted category labels
for facet_ax in CatFacetGrid.axes.flat:
    facet_ax.tick_params(axis='x', labelrotation=90)
plt.show()


## Univariate Analysis plot for Numerical data

In [None]:
# Histograms for the numeric variables, excluding the discrete columns listed below.
# The result is assigned back instead of dropping in place on a frame derived from Train.
NumericData = NumericData.drop(columns=['OverallQual', 'OverallCond', 'BsmtFullBath', 'BsmtHalfBath',
                                        'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
                                        'TotRmsAbvGrd', 'GarageCars', 'Fireplaces', 'MiscVal',
                                        'MoSold', 'YrSold'])
NumericDataMelt = NumericData.melt()

# `height` replaces the former `size` argument of FacetGrid (renamed in seaborn 0.9)
CatFacetGrid = sns.FacetGrid(NumericDataMelt, col='variable', sharex=False, sharey=False,
                             dropna=True, height=4, col_wrap=4)
countPlot = CatFacetGrid.map(sns.histplot, 'value')
plt.show()

In [None]:
NumericData.shape

In [None]:
# Histogram (with KDE overlay) and boxplot for every numeric variable, one grid row per column.
numeric_cols = list(NumericData.select_dtypes(include=['int64', 'float64']).columns)
# The grid is sized from the data instead of a hard-coded row count; squeeze=False keeps
# `axes` two-dimensional even when only one column is plotted.
fig, axes = plt.subplots(nrows=len(numeric_cols), ncols=2,
                         figsize=(20, 3.75 * len(numeric_cols)), squeeze=False)
fig.subplots_adjust(hspace=.8, wspace=.3)
for i, col in enumerate(numeric_cols):
    # histplot(kde=True, stat="density") is the replacement for the deprecated distplot
    sns.histplot(NumericData[col], kde=True, stat="density", ax=axes[i][0]).set_title("Histogram of " + col)
    # pass the values as x= explicitly; the meaning of the first positional argument changed across seaborn versions
    sns.boxplot(x=NumericData[col], ax=axes[i][1]).set_title("Boxplot of " + col)

# Bivariate Analysis

## Categorical vs Target variable

In [None]:
CategoricalConData.columns

In [None]:
size = (20, 50)
CategoricalConData['SalePrice'] = Train['SalePrice']

fig, axs = plt.subplots(ncols=2, nrows=8, figsize=size)
fig.subplots_adjust(hspace=.2, wspace=.2)

# One SalePrice boxplot per variable, filling the 8x2 grid row by row
box_features = ['OverallQual', 'OverallCond',
                'BsmtFullBath', 'BsmtHalfBath',
                'FullBath', 'HalfBath',
                'BedroomAbvGr', 'KitchenAbvGr',
                'KitchenQual', 'TotRmsAbvGrd',
                'GarageCars', 'Fireplaces',
                'GarageType', 'MiscVal',
                'MoSold', 'YrSold']
for axis, feature in zip(axs.flat, box_features):
    panel = sns.boxplot(x=CategoricalConData[feature], y=CategoricalConData['SalePrice'], ax=axis)
    panel.set_title("Boxplot of " + feature)

In [None]:
CategoricData['SalePrice']=Train['SalePrice'].copy()

In [None]:
size = (20, 60)

fig, axs = plt.subplots(ncols=2, nrows=8, figsize=size)
fig.subplots_adjust(hspace=.4, wspace=.2)

# One SalePrice boxplot per categorical variable, filling the 8x2 grid row by row.
# Titles are derived from the column name, which fixes the Street plot that was
# previously titled "Boxplot of OverallCond".
boxplot_columns = ['MSZoning', 'Street',
                   'LotShape', 'LandContour',
                   'Utilities', 'LotConfig',
                   'LandSlope', 'Neighborhood',
                   'Condition1', 'Condition2',
                   'BldgType', 'HouseStyle',
                   'RoofStyle', 'RoofMatl',
                   'Exterior1st', 'Exterior2nd']
# Columns whose x tick labels are drawn vertically
rotated_columns = {'Neighborhood', 'Exterior1st', 'Exterior2nd'}

for ax, column in zip(axs.flat, boxplot_columns):
    sns.boxplot(x=CategoricData[column], y=CategoricData['SalePrice'], ax=ax)
    ax.set_title("Boxplot of " + column)
    if column in rotated_columns:
        ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

## Continuous vs Target variable

In [None]:
# Bivariate analysis: pairwise scatter plots of the columns in NumericData together with the target SalePrice
NumericData['SalePrice']=Train['SalePrice']
sns.pairplot(NumericData)

In [None]:
# Annotated correlation heatmap of NumericData (coefficients rounded to 2 decimals)
plt.figure(figsize=(15, 15))
corr_rounded = NumericData.corr().round(2)
sns.heatmap(corr_rounded, annot=True, cmap='GnBu')
plt.show()

## Missing values imputation

In [None]:
# Null count and null percentage for every column of Train; columns with gaps are kept in missingData
count = Train.isnull().sum().round(2)
percent = (count / Train.shape[0] * 100).round(2)
data = pd.concat([count, percent], axis=1).reset_index()
data = data.rename(columns={0: 'Missing Values Count', 1: 'Missing Values %'})
missingData = data[data['Missing Values Count'] != 0]
missingData

In [None]:
missingDf=Train[['LotFrontage', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']].copy()
missingDf

In [None]:
msno.heatmap(Train[missingDf.columns])

In [None]:
# Replace missing garage attributes in Train with the placeholder 'No Garage'.
# The filled columns are assigned back to the frame: calling fillna(..., inplace=True) on a
# selected column is chained assignment, which is deprecated in pandas 2.x and does not
# modify the frame when Copy-on-Write is enabled.
garage_cols = ["GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
Train[garage_cols] = Train[garage_cols].fillna("No Garage")
# NOTE(review): GarageYrBlt looks like a year column; filling it with a string presumably
# turns it into an object column - confirm this is intended.

In [None]:
# Replace missing garage attributes in Test with the placeholder 'No Garage'.
# The filled columns are assigned back to the frame: calling fillna(..., inplace=True) on a
# selected column is chained assignment, which is deprecated in pandas 2.x and does not
# modify the frame when Copy-on-Write is enabled.
garage_cols = ["GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
Test[garage_cols] = Test[garage_cols].fillna("No Garage")
# NOTE(review): GarageYrBlt looks like a year column; filling it with a string presumably
# turns it into an object column - confirm this is intended.

In [None]:
# Replace missing basement attributes in Train with the placeholder 'No Basement'.
# Assigned back to the frame instead of the chained fillna(..., inplace=True), which is
# deprecated in pandas 2.x and has no effect on the frame under Copy-on-Write.
basement_cols = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
Train[basement_cols] = Train[basement_cols].fillna("No Basement")

In [None]:
# Replace missing basement attributes in Test with the placeholder 'No Basement'.
# Assigned back to the frame instead of the chained fillna(..., inplace=True), which is
# deprecated in pandas 2.x and has no effect on the frame under Copy-on-Write.
basement_cols = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
Test[basement_cols] = Test[basement_cols].fillna("No Basement")

In [None]:
# Replace missing 'MasVnrType' / 'MasVnrArea' values in Train with the string "None".
# Assigned back to the frame instead of the chained fillna(..., inplace=True), which is
# deprecated in pandas 2.x and has no effect on the frame under Copy-on-Write.
masonry_cols = ["MasVnrType", "MasVnrArea"]
Train[masonry_cols] = Train[masonry_cols].fillna("None")
# NOTE(review): MasVnrArea looks like a numeric area; the string "None" presumably makes it an
# object column that is later one-hot encoded - consider filling with 0 instead (confirm).

In [None]:
# Replace missing 'MasVnrType' / 'MasVnrArea' values in Test with the string "None".
# Assigned back to the frame instead of the chained fillna(..., inplace=True), which is
# deprecated in pandas 2.x and has no effect on the frame under Copy-on-Write.
masonry_cols = ["MasVnrType", "MasVnrArea"]
Test[masonry_cols] = Test[masonry_cols].fillna("None")
# NOTE(review): MasVnrArea looks like a numeric area; the string "None" presumably makes it an
# object column that is later one-hot encoded - consider filling with 0 instead (confirm).

In [None]:
# Fill missing 'Electrical' values in Train with the most frequent value (mode).
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Train['Electrical'] = Train['Electrical'].fillna(Train['Electrical'].mode()[0])

In [None]:
# Fill missing 'Electrical' values in Test with the mode computed on Train.
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Test['Electrical'] = Test['Electrical'].fillna(Train['Electrical'].mode()[0])

In [None]:
# Replace missing 'FireplaceQu' values in Train with the placeholder 'No Fireplace'.
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Train["FireplaceQu"] = Train["FireplaceQu"].fillna("No Fireplace")

In [None]:
# Replace missing 'FireplaceQu' values in Test with the placeholder 'No Fireplace'.
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Test["FireplaceQu"] = Test["FireplaceQu"].fillna("No Fireplace")

In [None]:
Train['LotFrontage'].describe()

In [None]:
# Fill missing 'LotFrontage' values in Train with the column mean.
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Train['LotFrontage'] = Train['LotFrontage'].fillna(Train['LotFrontage'].mean())

In [None]:
# Fill missing 'LotFrontage' values in Test with the mean of the Test column.
# Assigned back instead of the chained fillna(..., inplace=True), which does not
# modify the frame under pandas Copy-on-Write.
Test['LotFrontage'] = Test['LotFrontage'].fillna(Test['LotFrontage'].mean())
# NOTE(review): 'Electrical' in Test is imputed with a Train statistic, while this column uses
# the Test mean - confirm which convention is intended.

In [None]:
# # filling a null values of 'LotFrontage' with mean
# Train['LotFrontage'].fillna(Train['LotFrontage'].mean(), inplace=True)

In [None]:
# Null count and null percentage per Train column after imputation; any remaining gaps end up in missingData
count = Train.isnull().sum().round(2)
percent = (count / Train.shape[0] * 100).round(2)
data = pd.concat([count, percent], axis=1).reset_index()
data = data.rename(columns={0: 'Missing Values Count', 1: 'Missing Values %'})
missingData = data[data['Missing Values Count'] != 0]
missingData

In [None]:
missing=Test[['MSZoning','Utilities','Exterior1st','Exterior2nd','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','KitchenQual','Functional','GarageArea','SaleType']].copy()
missing

In [None]:
missing.info()

In [None]:
# Fill missing 'TotalBsmtSF' values in Test with the column mean
# (assigned back; the chained fillna(..., inplace=True) does not modify the frame under Copy-on-Write)
Test['TotalBsmtSF'] = Test['TotalBsmtSF'].fillna(Test['TotalBsmtSF'].mean())
# Fill missing 'GarageArea' values in Test with the column mean
Test['GarageArea'] = Test['GarageArea'].fillna(Test['GarageArea'].mean())

In [None]:
# Fill every remaining gap in Test with that column's most frequent value:
# value_counts() is ordered by frequency, so index[0] is the most common entry.
# NOTE(review): a column without any non-null value would raise IndexError here - confirm none exist.
Test = Test.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [None]:
# Null count and null percentage per Test column after imputation; any remaining gaps end up in missingData
count = Test.isnull().sum().round(2)
percent = (count / Test.shape[0] * 100).round(2)
data = pd.concat([count, percent], axis=1).reset_index()
data = data.rename(columns={0: 'Missing Values Count', 1: 'Missing Values %'})
missingData = data[data['Missing Values Count'] != 0]
missingData

# Feature Selection

In [None]:
CategoricData=Train.select_dtypes(include=['object','category'])
rows,col=(CategoricData.shape)
print("Number of Categorical columns are:",col)
print(CategoricData.columns)

In [None]:
# #Displaying the number of occurences for each categorical variable
# CatCol = CategoricData.select_dtypes(include = "object").columns
# print(CatCol)
# print("\n")
# for col in CatCol:
#     print(CategoricData[col].value_counts())
#     print("\n")

In [None]:
# #taking databackup into temp for plotting a correlation plot
# temp=CategoricData.copy()

# #selecting only categorical variables for Label encoding
# CatCol = temp.select_dtypes(include = "object").columns
# print(CatCol)

# #instantiating LabelEncoder() object
# le = LabelEncoder()

# #Label encoding the categorical columns by converting them into string type
# for feat in CatCol:
#     temp[feat] = le.fit_transform(temp[feat].astype(str))

# #plottting correlational plot checking Correlations and dependencies for imputation
# plt.figure(figsize=(25,15))
# sns.heatmap(round(temp.corr(method='kendall'),2),annot=True,mask=None,cmap='GnBu')
# plt.show()

In [None]:
# #plottting correlational plot checking Correlations and dependencies for imputation
# plt.figure(figsize=(25,15))
# sns.heatmap(round(NumericData.corr(method='pearson'),2),annot=True,mask=None,cmap='GnBu')
# plt.show()

In [None]:
# NumericData=Train.select_dtypes(include=['float64','int64'])
# # NumericData.drop(['OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','GarageCars',
# # 'Fireplaces','MiscVal','MoSold','YrSold'],axis=1,inplace=True)
# NumericData.isnull().sum()

# NumericData.drop(['SalePrice','BsmtUnfSF','2ndFlrSF','YrSold','YearRemodAdd'],axis=1,inplace=True)
# #checking for presence onf multi-collieanrity
# vif=pd.DataFrame()
# vif['Features']=NumericData.columns
# vif['VIF']=[variance_inflation_factor(NumericData.values,i) for i in range(NumericData.shape[1])]
# vif['VIF']=round(vif['VIF'],2)
# vif=vif.sort_values(by="VIF",ascending=False)
# vif

### One hot encoding for categorical data



In [None]:
# Columns removed from both frames before encoding
# (presumably the redundant / highly correlated features found in the multicollinearity analysis - confirm)
redundant_cols = ['1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                  'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
                  'TotRmsAbvGrd', 'GarageCars']
Train.drop(columns=redundant_cols, inplace=True)
Test.drop(columns=redundant_cols, inplace=True)

In [None]:
#taking databackup 
Train_bkup=Train.copy()
temp=CategoricData.copy()

In [None]:
# Keep only the non-categorical columns in the backup; the object/category columns are
# one-hot encoded separately and concatenated back afterwards.
# The columns are selected by dtype (the same rule used to build CategoricData) instead of a
# hard-coded name list, so the two selections cannot drift apart.
Train_bkup.drop(columns=Train_bkup.select_dtypes(include=['object', 'category']).columns,
                inplace=True)

In [None]:
#taking databackup 
Test_bkup=Test.copy()

In [None]:
# Keep only the non-categorical columns in the backup; the object/category columns are
# one-hot encoded separately and concatenated back afterwards.
# The columns are selected by dtype (the same rule used to build CategoricDataT) instead of a
# hard-coded name list, so the two selections cannot drift apart.
Test_bkup.drop(columns=Test_bkup.select_dtypes(include=['object', 'category']).columns,
               inplace=True)

In [None]:
CategoricDataT=Test.select_dtypes(include=['object','category'])
rows,col=(CategoricDataT.shape)
print("Number of Categorical columns are:",col)
print(CategoricDataT.columns)

In [None]:
tempT=CategoricDataT.copy()

In [None]:
# One-hot encode the categorical Test columns (the first level of each variable is dropped).
# NOTE(review): Train and Test are encoded by separate get_dummies calls, so the two frames are
# not guaranteed to end up with the same dummy columns - align them before predicting on Test.
oheDataT=pd.get_dummies(tempT,drop_first=True)

In [None]:
oheData=pd.get_dummies(temp,drop_first=True)

In [None]:
oheData.columns

In [None]:
Train_bkup.columns

In [None]:
#concatenating with categoricalEncoded DF
oheData=pd.concat([oheData,Train_bkup],axis=1)

In [None]:
#concatenating with categoricalEncoded DF
oheDataT=pd.concat([oheDataT,Test_bkup],axis=1)

In [None]:
#one hot encoded dataframe shape
oheData.shape

In [None]:
oheDataT.shape

In [None]:
#taking backup of oheDataDF
oheDataDF=oheData.copy()

In [None]:
# import sys
# !{sys.executable} -m pip install -U pandas-profiling[notebook]
# !jupyter nbextension enable --py widgetsnbextension

In [None]:
# #Panda Profiling gives almost all the data analysis required for the EDA
# pandas_profiling.ProfileReport(CategoricData)

# Method:1 Correlation heatmap

In [None]:
CategoricData.columns

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    Every column is compared with the columns that precede it in
    ``dataset.corr()``; a column is reported when the absolute correlation
    with at least one of them exceeds ``threshold``. The earlier column of
    each pair is never reported on account of that pair.

    Parameters
    ----------
    dataset : DataFrame whose pairwise column correlations are computed.
    threshold : absolute correlation above which a column is flagged.

    Returns
    -------
    set of column names.
    """
    corr_matrix = dataset.corr()
    names = corr_matrix.columns
    strength = corr_matrix.abs().to_numpy()
    # Only the strict lower triangle is scanned, so each pair is checked once
    return {
        names[row]
        for row in range(len(names))
        for col in range(row)
        if strength[row, col] > threshold
    }

In [None]:
corr_features = correlation(NumericData, 0.8)
len(set(corr_features))

In [None]:
corr_features

In [None]:
NumericData=Train.select_dtypes(include=['float64','int64'])

In [None]:
plt.figure(figsize=(25,15))
sns.heatmap(round(NumericData.corr(method='pearson'),2),annot=True,mask=None,cmap='GnBu')
plt.show()

From the correlation analysis above, we understand that the following variables have high correlation with other features:

{'1stFlrSF', 'GarageArea', 'GarageYrBlt', 'TotRmsAbvGrd'}

# Method:2 VIF values

In [None]:
# Multicollinearity check: variance inflation factor (VIF) of every numeric predictor.
# The target 'SalePrice' is left out - VIF describes collinearity among the explanatory
# variables, so the response must not be one of the regressors.
vif_inputs = NumericData.drop(columns=['SalePrice'], errors='ignore')
# Build the design matrix once instead of once per column
vif_matrix = vif_inputs.values
vif = pd.DataFrame()
vif['Features'] = vif_inputs.columns
vif['VIF'] = [variance_inflation_factor(vif_matrix, idx) for idx in range(vif_matrix.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
# Highest VIF (most collinear) first
vif = vif.sort_values(by="VIF", ascending=False)
vif

1stFlrSF+ 2ndFlrSF + LowQualFinSF= GrLivArea	
BsmtFinSF1	+ BsmtFinSF2 + BsmtUnfSF = TotalBsmtSF

6 variables can be dropped

In [None]:
# Pearson correlation heatmap of the numeric columns currently in Train
NumericData = Train.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(25, 15))
pearson_rounded = NumericData.corr(method='pearson').round(2)
sns.heatmap(pearson_rounded, annot=True, cmap='GnBu')
plt.show()

# Method 3: VarianceThreshold

In [None]:
oheDataDF.shape

In [None]:
oheDataDF.columns

In [None]:
# Target and feature matrix taken from the one-hot encoded frame
y = oheDataDF['SalePrice']
X = oheDataDF.drop(labels=['SalePrice'], axis=1)

from sklearn.model_selection import train_test_split
# 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=55)

In [None]:
var_thres=VarianceThreshold(threshold=0.05)
var_thres.fit(X_train)

In [None]:
### Finding non constant features
sum(var_thres.get_support())

In [None]:
# Lets Find non-constant features 
len(X_train.columns[var_thres.get_support()])

In [None]:
# Columns rejected by the variance threshold (variance at or below the threshold).
# The support mask is computed once and inverted, instead of being rebuilt for every column.
low_variance_mask = ~var_thres.get_support()
constant_columns = list(X_train.columns[low_variance_mask])

print(len(constant_columns))

In [None]:
X_train.drop(constant_columns,axis=1,inplace=True)
X_test.drop(constant_columns,axis=1,inplace=True)

In [None]:
print(X_test.shape)
print(y_test.shape)

In [None]:
print(X_train.shape)
print(X_test.shape)

we are left with 97 variables and this can be used for further reduction

# Method 4 :SelectKBest

https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2/207396#207396?newreg=bc481cdb1ae54e85acad7fc29d220346

https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/

For classification, the score function would be score_func=chi2.

In [None]:
oheDataDF.shape

In [None]:
#  #independent columns
# X = oheDataDF.iloc[:,0:665] 
#  #target column i.e price range
# y = oheDataDF.iloc[:,-1]   

In [None]:
X=X_train.copy()
y=y_train.copy()

In [None]:
# Fit SelectKBest with the univariate F-test (f_regression), configured to keep the 10
# best-scoring features of X (the features left after the variance-threshold step)
bestfeatures = SelectKBest(score_func=f_regression, k=10)
fit = bestfeatures.fit(X,y)

In [None]:
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

In [None]:
# Put feature names and their F-scores side by side
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# Name the two columns
featureScores.columns = ['Specs','Score']  
# Print the 15 highest-scoring features
print(featureScores.nlargest(15,'Score'))  

# Method 5: ExtraTreesRegressor

Feature importance gives you a score for each feature of your data, the higher the score more important or relevant is the feature towards your output variable.

In [None]:
#  #independent columns
# X = oheDataDF.iloc[:,0:665] 
#  #target column i.e price range
# y = oheDataDF.iloc[:,-1]   

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
import matplotlib.pyplot as plt
# Fixed seed so the feature-importance ranking is reproducible between runs
model = ExtraTreesRegressor(random_state=123)
model.fit(X,y)

In [None]:
# #use inbuilt class feature_importances of tree based classifiers
# print(model.feature_importances_) 

In [None]:
#plot graph of feature importances for better visualization
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(15).plot(kind='barh')
plt.show()

In [None]:
feat_importances.nlargest(15)

# Method 6: Mutual Information gain

In [None]:
# Estimate the mutual information between every feature and the target.
# The estimator uses randomness internally, so a fixed seed keeps the scores reproducible.
mutual_info = mutual_info_regression(X_train, y_train, random_state=123)
mutual_info

In [None]:
# Label the scores with their feature names and show them from most to least informative
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
mutual_info.sort_values(ascending=False).plot.bar(figsize=(15,5))

In [None]:
## Selecting the features in the top 15 percent of mutual-information scores
selected_top_columns = SelectPercentile(mutual_info_regression, percentile=15)
selected_top_columns.fit(X_train, y_train)


In [None]:
X_train.columns[selected_top_columns.get_support()]

# Method 7: RFE (Recursive Feature Elimination)

In [None]:
rfe = RFE(estimator=DecisionTreeRegressor(), n_features_to_select=15)
model = DecisionTreeRegressor()

In [None]:
# Run RFE keeping 15 features. n_features_to_select is passed by keyword: scikit-learn >= 1.0
# accepts only the estimator positionally and raises TypeError for RFE(model, 15).
rfe = RFE(estimator=model, n_features_to_select=15)
rfe = rfe.fit(X_train, y_train)

In [None]:
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

In [None]:
col = X_train.columns[rfe.support_]

In [None]:
X_train.columns[~rfe.support_]

In [None]:
col

# Method 8: Forward feature Selection

https://www.analyticsvidhya.com/blog/2020/10/a-comprehensive-guide-to-feature-selection-using-wrapper-methods-in-python/

In [None]:
# Sequential Forward Selection (SFS): add features one at a time to a linear regression
# until 15 features are selected, scoring candidate subsets by r2.
# NOTE(review): cv = 0 presumably disables cross-validation in mlxtend, so r2 would be
# measured on the data used for fitting - confirm against the mlxtend documentation.
sfs = SFS(LinearRegression(),
          k_features=15,
          forward=True,
          floating=False,
          scoring = 'r2',
          cv = 0)

In [None]:
#fitting and predicting for top 15 features
sfs.fit(X, y)
sfs.k_feature_names_

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
# Score of the selector at every subset size
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
# The title now names the error-bar kind that is actually requested (std_dev, not std_err)
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

# Method 9: Backward elimination

In [None]:
sbs = SFS(LinearRegression(),
         k_features=15,
         forward=False,
         floating=False,
         cv=0)


In [None]:
# Fit on the DataFrame/Series (as done for the forward selector) rather than on bare arrays,
# so that k_feature_names_ reports column names instead of positional indices
sbs.fit(X, y)

In [None]:
sbs.k_feature_names_

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
# Score of the backward selector at every subset size
fig1 = plot_sfs(sbs.get_metric_dict(), kind='std_dev')
# Title corrected: this plot shows the backward selector, with std_dev error bars
plt.title('Sequential Backward Selection (w. StdDev)')
plt.grid()
plt.show()

# Method 10: Bidirectional/Step-wise feature Elimination

In [None]:
sffs = SFS(LinearRegression(),
         k_features=(3,15),
         forward=True,
         floating=True,
         cv=0)

In [None]:
# Fit on the DataFrame/Series (as done for the forward selector) rather than on bare arrays,
# so that k_feature_names_ reports column names instead of positional indices
sffs.fit(X, y)
sffs.k_feature_names_

In [None]:
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
# Score of the floating (step-wise) selector at every subset size
fig1 = plot_sfs(sffs.get_metric_dict(), kind='std_dev')
# Title corrected: this plot shows the floating selector, with std_dev error bars
plt.title('Sequential Forward Floating Selection (w. StdDev)')
plt.grid()
plt.show()

# Model Building

## Decision Tree

In [None]:
def check_performance(model):
    """Print R-squared, RMSE and MAPE of a fitted search object on the train and test splits.

    ``model`` must expose ``best_estimator_``. That estimator is evaluated on
    the notebook-level ``X_train``/``y_train`` and ``X_test``/``y_test``.
    Nothing is returned; the six metrics are printed.
    """
    best = model.best_estimator_
    print("Train_R_squared =", best.score(X_train, y_train))
    print("Test_R_squared =", best.score(X_test, y_test))

    # Predict once per split and reuse the predictions for both error metrics
    train_pred = best.predict(X_train)
    print("RMSE_train =", np.sqrt(mean_squared_error(y_train, train_pred)))
    test_pred = best.predict(X_test)
    print("RMSE_test = ", np.sqrt(mean_squared_error(y_test, test_pred)))
    print("MAPE_train =", mean_absolute_percentage_error(y_train, train_pred))
    print("MAPE_test = ", mean_absolute_percentage_error(y_test, test_pred))

In [None]:
# Function to calculate MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error between two targets, in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Lists, NumPy arrays, pandas Series and
        single-column DataFrames are accepted; both are flattened to 1-D.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.

    Notes
    -----
    Zeros in ``y_true`` lead to a division by zero (inf/nan result).
    """
    # Flatten both sides: comparing an (n,) array with an (n, 1) array would
    # broadcast to an (n, n) matrix and average over every cross pair.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of values, "
            f"got {y_true.size} and {y_pred.size}"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# # Building Decision Tree Regressor
# DTR = DecisionTreeRegressor(random_state=123)
# DTR.fit(X_train,y_train)

In [None]:
# print(X_train.shape)
# print(y_train.shape)

In [None]:
# #Using the model to predict on test data
# y_pred_dt = decisionTree.predict(X_test)

In [None]:
# %%time
# param_grid = {
#               'max_depth' : [4,6,8,10,12],
#               'min_samples_leaf' : [20,30,40,50], # 1-3% of length of dataset
#               'min_samples_split' : [40,60,80]
#              }

# DTR = DecisionTreeRegressor(random_state=123)

# DT_random = RandomizedSearchCV(estimator = DTR, param_distributions = param_grid, cv = 3)

# DT_random.fit(X_train,y_train)

In [None]:
# # Best parameters
# DT_random.best_params_

In [None]:
# check_performance(DT_random)

In [None]:
# Training feature matrix restricted to the 15 hand-picked predictors.
# .copy() detaches the result from X_train so later edits do not touch it.
DT_Train = X_train.loc[:, [
    'ExterQual_Gd',
    'ExterQual_TA',
    'KitchenQual_TA',
    'LotArea',
    'MSSubClass',
    'LotFrontage',
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'TotalBsmtSF',
    'GrLivArea',
    'FullBath',
    'Fireplaces',
    'GarageArea',
    'OpenPorchSF',
]].copy()



In [None]:
 DT_Test=X_test[['ExterQual_Gd',
'ExterQual_TA',
'KitchenQual_TA',
'LotArea',
'MSSubClass',
'LotFrontage',
'OverallQual',
'YearBuilt',
'YearRemodAdd',
'TotalBsmtSF',
'GrLivArea',
'FullBath',
'Fireplaces',
'GarageArea',
'OpenPorchSF'
 ]].copy()

In [None]:
# Natural-log transform of the training target; the models below are fitted on
# this log-scale target rather than on y_train itself.
y_train_log=np.log(y_train)

In [None]:
# Same log transform for the hold-out target, so metrics are computed on the
# scale the models were trained on.
y_test_log=np.log(y_test)

In [None]:
# Building Decision Tree Regressor
# Baseline tree with default hyper-parameters; random_state fixes the seed.
DTR = DecisionTreeRegressor(random_state=123)
DTR.fit(DT_Train,y_train_log)

In [None]:
# Sanity check: training features and log target shapes.
print(DT_Train.shape)
print(y_train_log.shape)

In [None]:
# Sanity check: hold-out features and log target shapes.
print(DT_Test.shape)
print(y_test_log.shape)

In [None]:
#Using the model to predict on test data
# Predictions are on the log scale because DTR was fitted on y_train_log.
y_pred_dt = DTR.predict(DT_Test)

In [None]:
%%time
# Candidate hyper-parameter values for the decision tree.
param_grid = {
    'max_depth': [4, 6, 8, 10, 12],
    'min_samples_leaf': [20, 30, 40, 50],  # 1-3% of length of dataset
    'min_samples_split': [40, 60, 80],
}

# The search clones its estimator, so passing the already-fitted DTR is fine.
# random_state fixes which parameter combinations are sampled, making the
# search repeatable from run to run (same seed as the estimators use).
DT_random = RandomizedSearchCV(
    estimator=DTR,
    param_distributions=param_grid,
    cv=3,
    random_state=123,
)

DT_random.fit(DT_Train, y_train_log)

In [None]:
# Best parameters: the hyper-parameter combination the decision-tree search
# ranked highest across its cross-validation folds.
DT_random.best_params_

In [None]:
# Report R-squared, RMSE and MAPE of the tuned decision tree on the log-scale
# targets, for the training split and the hold-out split.
dt_best = DT_random.best_estimator_
dt_pred_train = dt_best.predict(DT_Train)
dt_pred_test = dt_best.predict(DT_Test)

print("Train_R_squared =", dt_best.score(DT_Train, y_train_log))
print("Test_R_squared =", dt_best.score(DT_Test, y_test_log))
print("RMSE_train =", np.sqrt(mean_squared_error(y_train_log, dt_pred_train)))
print("RMSE_test = ", np.sqrt(mean_squared_error(y_test_log, dt_pred_test)))
print("MAPE_train =", mean_absolute_percentage_error(y_train_log, dt_pred_train))
print("MAPE_test = ", mean_absolute_percentage_error(y_test_log, dt_pred_test))

## Random Forest

In [None]:
# Building Random Forest Regressor
# Baseline forest with default hyper-parameters, fitted on the log-scale target.
# RFR is created again in the search cell that follows.
RFR = RandomForestRegressor(random_state=123)
RFR.fit(DT_Train,y_train_log)

In [None]:
%%time
# Candidate hyper-parameter values for the random forest.
param_grid = {
    # 200, 400, ..., 1000 trees
    'n_estimators': [int(x) for x in np.linspace(start=200, stop=1000, num=5)],
    # 1.0 means "use every feature", which is what 'auto' meant for regressors;
    # 'auto' itself is no longer accepted by current scikit-learn releases.
    'max_features': [1.0, 'sqrt'],
    # Integer depths 1..20. np.linspace produced floats, which scikit-learn's
    # parameter validation rejects for max_depth.
    'max_depth': list(range(1, 21)),
    'min_samples_leaf': [10, 20, 30, 50, 70, 90],  # 1-3% of length of dataset
    'min_samples_split': [30, 40, 50, 60, 70, 80, 90],  # approx 3 times the min_samples_leaf
}

RFR = RandomForestRegressor(random_state=123)

# random_state fixes which parameter combinations are sampled, making the
# search repeatable from run to run (same seed as the estimator uses).
RF_random = RandomizedSearchCV(
    estimator=RFR,
    param_distributions=param_grid,
    cv=3,
    random_state=123,
)

RF_random.fit(DT_Train, y_train_log)

In [None]:
# Best parameters: the hyper-parameter combination the random-forest search
# ranked highest across its cross-validation folds.
RF_random.best_params_

In [None]:
# Report R-squared, RMSE and MAPE of the tuned random forest on the log-scale
# targets. This section previously evaluated DT_random (the decision-tree
# search) by mistake, so it repeated the decision-tree numbers.
rf_best = RF_random.best_estimator_
rf_pred_train = rf_best.predict(DT_Train)
rf_pred_test = rf_best.predict(DT_Test)

print("Train_R_squared =", rf_best.score(DT_Train, y_train_log))
print("Test_R_squared =", rf_best.score(DT_Test, y_test_log))
print("RMSE_train =", np.sqrt(mean_squared_error(y_train_log, rf_pred_train)))
print("RMSE_test = ", np.sqrt(mean_squared_error(y_test_log, rf_pred_test)))
print("MAPE_train =", mean_absolute_percentage_error(y_train_log, rf_pred_train))
print("MAPE_test = ", mean_absolute_percentage_error(y_test_log, rf_pred_test))

# Testing on Kaggle test data

In [None]:
# Kaggle test features with exactly the columns (and column order) the models
# were trained on. Reusing DT_Train.columns keeps the feature set in sync
# instead of maintaining another hand-copied list.
DT_Test1 = oheDataT[list(DT_Train.columns)].copy()

In [None]:
# Count of missing values per selected column of the Kaggle test features.
DT_Test1.isnull().sum()

In [None]:
#Using the model to predict on test data
y_pred_dt1 = RF_random.predict(DT_Test)

In [None]:
y_pred_dt1

In [None]:
dataset = pd.DataFrame({'Column1': y_pred_dt})
dataset

In [None]:
#download result from colab
dataset.to_csv('result.csv',index=False)
files.download("result.csv")