House price prediction using the housing data. The approach is distinctive in how it deals with missing values and in how it filters out non-essential columns based on the share contributed by the most frequently occurring value.

### 1. Data Understanding and EDA

Importing the relevant libraries

In [None]:
import pandas as pd
from datetime import datetime
from calendar import month_abbr
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
warnings.filterwarnings('ignore')

Importing the data provided

In [None]:
train= pd.read_csv('../input/advance-house-price-predicitons/train.csv')
test=pd.read_csv('../input/advance-house-price-predicitons/test.csv')

In [None]:
train.info()

The above data has 81 attributes with a mixture of object and numeric data types

Loading Data Dictionary

In [None]:
# Print the full data dictionary; the context manager closes the file handle
# once the text has been read.
with open('../input/advance-house-price-predicitons/data description.txt',"r") as des:
    print(des.read())

Loading the description of each attribute into dictionaries for easy access and retrieval in any future use or reference

In [None]:
# dict_des: attribute name -> list of entries parsed from the lines below its header
# att_des:  attribute name -> the text that follows the first ':' on its header line
dict_des={}
att_des={}
with open('../input/advance-house-price-predicitons/data description.txt','r') as f:
    for i in f:
        # A header line contains ':' followed by whitespace, splits into exactly two
        # parts at the first ':', and has a single whitespace-free token before it.
        if len(re.findall('.:\s.',i))>0 and len(i.strip().split(':',1))==2 and len((i.strip().split(':',1))[0].split())==1 :
            dict_des[i.strip().split(':',1)[0]]=[]
            att_des[i.strip().split(':',1)[0]]=i.strip().split(':',1)[1]

# Second pass: keep every stripped line so the lines between two headers can be
# addressed by position.
list_dic=[]
with open('../input/advance-house-price-predicitons/data description.txt','r') as f:
    for i in f:
        list_dic.append(i.strip())


# For every attribute except the last one, locate its header line and collect the
# lines that follow until the header of the next attribute is reached.
# The last attribute is never filled by this loop, so its list stays empty and it
# is removed from dict_des further down.
for i in range(0,len(dict_des)-1):
    for j in range(0,len(list_dic)):
        if list(dict_des.keys())[i]==list_dic[j].strip().split(':',1)[0]:
            k=1
            while list(dict_des.keys())[i+1]!=list_dic[j+k].strip().split(':',1)[0]:
                #if list_dic[j+k].strip().split(':',1)[0]!=['']:
                # Split each line at the first tab; all-digit pieces are stored as int
                temp=list_dic[j+k].strip().split('\t',1)
                dict_des[list(dict_des.keys())[i]].append([int(i) if re.findall('^[0-9]+$',i) else i for i in temp])
                k=k+1
            # Drop the entries produced by blank lines
            dict_des[list(dict_des.keys())[i]]=[i for i in dict_des[list(dict_des.keys())[i]] if i!=['']]


# Remove the attributes for which no entries were collected
list_del=[i for i in dict_des.keys() if len(dict_des[i])==0 ]
dict_des=dict([(key,value) for key,value in dict_des.items() if key not in list_del])

# num_des: the subset of dict_des whose first entry starts with an all-digit code
num_dic=[i for i in dict_des.keys() if  re.findall(pattern='^[0-9]+$',string=str(dict_des[i][0][0]))]
num_des=dict([(key,value) for key,value in dict_des.items() if key in num_dic])

The description of each attribute is as below

In [None]:
att_des

Displaying the decription of attributes which have numerical values

In [None]:
numeric_val_dic={i:att_des[i] for i in list(set(att_des).difference(dict_des.keys()))}
numeric_val_dic.pop('SaleCondition')
numeric_val_dic

In [None]:
dict_des

All dictionary entries where `NA` is not a null value but carries a meaning

In [None]:
NA_dic={i:dict_des[i] for i in dict_des.keys() if  re.findall(pattern='NA',string=str(dict_des[i]))}
NA_dic

Create a dictionary that maps each column name to the corresponding explanation of 'NA' for that column, taken from `NA_dic`

In [None]:
NA_impute={i:NA_dic[i][-1][1] for i in NA_dic.keys()}
NA_impute

Checking for the attribute description with only ordinal variables i.e., ordered categorical data and their corresponding description

In [None]:
num_des

Checking the metadata of the data file loaded

In [None]:
train.head()

In [None]:
test.head()

Now the data needs some preliminary processing.

In [None]:
data=pd.concat([train.drop('SalePrice',axis=1),test],axis=0)

In [None]:
data['Id']=data.Id.apply(lambda x:str(x))

In [None]:
data.set_index('Id',inplace=True)

Checking for the number of columns where NA has some significance and the percentage of NA values in the respective columns

In [None]:
print((100*data.isnull().sum()/len(data))[NA_impute.keys()])

Replacing all the NA values with the respective valid strings from the list prepared previously

In [None]:
# Replace NaN with the meaningful label for each column listed in NA_impute.
# The filled column is assigned back: calling fillna(inplace=True) on a column
# selection can operate on a temporary copy and leave `data` unchanged.
for col, na_label in NA_impute.items():
    data[col]=data[col].fillna(na_label)

1. Again Checking for the columns with null values in % terms

In [None]:
print((100*data.isnull().sum()/len(data))[(100*data.isnull().sum()/len(data))>0])

Some of the values appearing as numerical namely `MSSubClass`,`OverallQual`,`OverallCond` are actually ordinal variables. Hence using the dictionary file for mapping the values

In [None]:
num_des['MSSubClass']

In [None]:
# Share of rows per MSSubClass code, ordered by code. Sorting on the index before
# reset_index() avoids depending on the column name reset_index() generates,
# which differs between pandas versions.
(data['MSSubClass'].value_counts()/len(data)).sort_index().reset_index()

Since there are around 15 categories which may pose a problem while creating a dummy variable we can create a new class of objects based on the map data `MS_Class` to reduce it to 8 variables

In [None]:
MS_Class=[[i[0],i[1].split('STORY')[0]+'STORY'] if re.findall('STORY',i[1]) else [i[0],i[1].split('SPLIT')[0]+'SPLIT'] if re.findall('SPLIT',i[1]) else [i[0],i[1]] for i in dict_des['MSSubClass']]
MS_Class

In [None]:
dict_des['OverallQual']

Creating a mapping that reduces the above 10 categories to five, namely `Excellent`, `Good`, `Average`, `Fair` and `Poor`

In [None]:
RateMap=[[i[0],i[1].replace('Very',' ').split()[0]] if re.findall('Very',i[1]) else [i[0],i[1].replace('Above',' ').split()[0]] if re.findall('Above',i[1]) else [i[0],i[1].replace('Below',' ').split()[0]] for i in dict_des['OverallQual']]

In [None]:
# Share of rows per OverallQual rating, ordered by rating. Sorting on the index
# before reset_index() avoids depending on the column name reset_index()
# generates, which differs between pandas versions.
(data['OverallQual'].value_counts()/len(data)).sort_index().reset_index()

Creating a function to map `MSSubClass` and `OverallQual`& `OverallCond` variables to create new grouped classes

In [None]:
def map_data(x,data):
    """Translate the values of `x` through a lookup built from `data`.

    `data` is a sequence of entries whose first element is the source value and
    whose second element is the replacement. Values of `x` without a matching
    entry become NaN, as with any `Series.map` on a dict.
    """
    lookup={}
    for entry in data:
        lookup[entry[0]]=entry[1]
    return x.map(lookup)

Mapping the variables `MSSubClass`,`OverallQual` and `OverallCond` using decription dictionary

In [None]:
data[['MSSubClass']]=data[['MSSubClass']].apply(lambda x:map_data(x,MS_Class))
data[['OverallQual']]=data[['OverallQual']].apply(lambda x:map_data(x,RateMap))
data[['OverallCond']]=data[['OverallCond']].apply(lambda x:map_data(x,RateMap))

The variables `YearBuilt`, `YearRemodAdd` and `GarageYrBlt` hold the year in which the property was built, the year in which remodelling was added and the year in which the garage was built. Since year is an interval variable, three **derived metrics** were created relative to `YrSold`:
`Age_property` (the age of the property), `Modification_Age` (the number of years since modification) and `Age_Garage` (the age of the garage).

In [None]:
data['Age_property']=data['YrSold']-data['YearBuilt']
data['Modification_Age']=data['YrSold']-data.YearRemodAdd
data['Age_Garage']=data['YrSold']-data.GarageYrBlt

Dropping the variables `YearBuilt`,`YearRemodAdd` and `GarageYrBlt` as the derived metrics has necessary information

In [None]:
data.drop(['YearBuilt','YearRemodAdd','GarageYrBlt'],inplace=True,axis=1)

In [None]:
data.YrSold.value_counts()

From the above we can see that the variable `YrSold` has only four types of data values and hence be converted into a categorical variable

In [None]:
data['YrSold']=data.YrSold.astype('object')

Month sold is also a categorical variable,, hence mapping the number of the month with the actual month name

In [None]:
data[['MoSold']]=data[['MoSold']].apply(lambda x:x.map({i:month_abbr[i] for i in range(1,13)}))

In [None]:
data[['MoSold']].value_counts()

Since, we have done some data processing, lets explore the numerical data

In [None]:
# One figure per non-object column: a box plot in the right-hand cell and a
# distribution plot in the left-hand cell of the first row of the subplot grid.
for i in data.select_dtypes(exclude=object).columns:
    plt.figure(figsize=(10, 100))
    # Grid has one row per numeric column and two columns; position 2 is row 1, right
    plt.subplot(len(data.select_dtypes(exclude=object).columns),2,2)
    sns.boxplot(data[i])
    
    # Position 1 is row 1, left
    plt.subplot(len(data.select_dtypes(exclude=object).columns),2,1)
    try:
        sns.distplot(data[i])
    except(Exception):
        # Any error raised by distplot is ignored and the loop moves to the next column
        continue
    
    

From the above distribution and box plots for numerical data we can see that there are outliers in the data and also there are some variables which take only few selected integer values

Let's explore categorical data by box plotting categorical varibles with respect to sale price

Bivariate Analysis of variables with respect to `SalePrice`

In [None]:
sns.pairplot(data=train,x_vars=['LotFrontage', 'LotArea', 'MasVnrArea'],y_vars='SalePrice')

In [None]:
sns.pairplot(data=train,x_vars=['BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF'],y_vars='SalePrice')

There is some relationship between `TotalBsmtSF` and `SalePrice` as observed above

In [None]:
sns.pairplot(data=train,x_vars=['1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea',],y_vars='SalePrice')

There is some relationship between `GrLivArea` and `SalePrice` as observed above

In [None]:
sns.pairplot(data=train,x_vars=['GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF'],y_vars='SalePrice') 

In [None]:
sns.pairplot(data=train,x_vars=['EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'],y_vars='SalePrice') 

Nothing much can be said about the above two plots

In [None]:
sns.pairplot(data[['Age_property', 'Modification_Age', 'Age_Garage']]) 

In [None]:
# Correlation heatmap of the training data. The frame is restricted to numeric
# columns first, because DataFrame.corr() raises on object columns in pandas >= 2.0.
plt.figure(figsize=(20,20))
sns.heatmap(train.select_dtypes(include='number').corr(),cmap="YlGnBu",annot=True,center=0,vmax=1,vmin=-1)

In [None]:
# Correlations with SalePrice whose absolute value exceeds 0.5. The matrix is
# computed once, on numeric columns only (corr() raises on object columns in
# pandas >= 2.0).
sale_corr=train.select_dtypes(include='number').corr()['SalePrice']
sale_corr[abs(sale_corr)>0.5]

Now for the variable `TotalBsmtSF` we can see that it is the sum of `BsmtFinSF1`,`BsmtFinSF2`,`BsmtUnfSF` as dipicted below

In [None]:
(data.TotalBsmtSF-(data.BsmtFinSF1+data.BsmtFinSF2+data.BsmtUnfSF)).value_counts()

Also `GrLivArea` is also sum of `1stFlrSF`,`2ndFlrSF`,`LowQualFinSF` variables as dipicted below

In [None]:
(data.GrLivArea-(data['1stFlrSF']+data['2ndFlrSF']+data['LowQualFinSF'])).value_counts()

Let check variables `TotRmsAbvGrd`,`BedroomAbvGr`, `KitchenAbvGr` for any such relationships

In [None]:
(data['TotRmsAbvGrd']-(data['BedroomAbvGr']+data['KitchenAbvGr'])).plot.box()

From the above it is clear that there are additional rooms other than bedrooms and kitchens included in the total above-ground room count. Hence a new variable `OtherroomsAbvGr` is created; the `TotRmsAbvGrd` variable is dropped later, as it already contains the counts of both `BedroomAbvGr` and `KitchenAbvGr`

In [None]:
data['OtherroomsAbvGr']=data['TotRmsAbvGrd']-(data['BedroomAbvGr']+data['KitchenAbvGr'])

From the above heatmap plots and analyis we can see some significantly high correlations between some variables. Now to get list of varibles lets further analyze correlation matrix

In [None]:
# For every numeric column, list the other columns whose correlation with it has
# an absolute value of at least 0.8 (entries equal to exactly 1 are excluded).
# Only numeric columns are passed to corr(): it raises on object columns in
# pandas >= 2.0.
corr_mat=data.select_dtypes(include='number').corr()
co_dic={
    col: corr_mat[col][(abs(corr_mat[col])>=0.8) & (corr_mat[col]!=1)]
    for col in corr_mat.columns
}
pd.concat(co_dic).reset_index()

From the above analysis it is evident that some significant correlations exist between the above listed variables with **Pearson Coefficient of more than `0.80`**. Since the variable `GrLivArea` and `1stFlrSF` are linearly related(expressed as a sum of it with other variables) and also `1stFlrSF` is highly correlated to `TotalBsmtSF`, we can drop `1stFlrSF` variable. Since we are keeping `GrLivArea`, we have to drop `TotRmsAbvGrd` as it also correlated to `GrLivArea` and the information of the variable is already incorporated in `OtherroomsAbvGr`. Hence,dropping `GarageCars` as the `GarageArea` is more flexible variable which can float instead of other which is a integer type variable. Also, dropping the variable `Age_Garage` as it is highly correlated to `Age_property`, hence it makes more sense to keep this variable as it more general variable than the other.

In [None]:
data.drop(['1stFlrSF','TotRmsAbvGrd','GarageCars','Age_Garage'],inplace=True,axis=1)

Lets Analyze the no of unique values for the numeric data types and the percentage of contribution by the highest occuring value by the below analysis

In [None]:
# Per non-object column: number of distinct values, the most frequent value and
# the fraction of all rows it accounts for. The value-count Series is read
# directly (index = values, data = shares), so the result does not depend on the
# column names reset_index() would generate in a given pandas version.
dtype_ana=pd.DataFrame()
for i in data.select_dtypes(exclude=object):
    share=data[i].value_counts()/len(data)
    dtype_ana.loc[i,'No_unique_values']=len(share)
    dtype_ana.loc[i,'Top Value']=share.index[0]
    dtype_ana.loc[i,'%Contri']=share.iloc[0]
dtype_ana

Lets filter out the items with no of unique items less than 20

In [None]:
dtype_ana[(dtype_ana.No_unique_values<20)]

From the above it is evident that most of the variables have unique values ranging from 3 to 8

Now analyzing the data frame for the % contribution of more than 95

In [None]:
dtype_ana[(dtype_ana['%Contri']>0.95)]

Since these variables do not contribute much to the data we can drop them; in addition, `LowQualFinSF` is already related to `GrLivArea`, which was shown above to be the sum of `1stFlrSF`, `2ndFlrSF` and `LowQualFinSF`

In [None]:
data.drop(list(dtype_ana[(dtype_ana['%Contri']>0.95)].index),inplace=True,axis=1)

Again conducting the above analysis for the new data

In [None]:
# Rebuild the per-column summary after the drop above and show the columns with
# fewer than 20 distinct values. The value-count Series is read directly
# (index = values, data = shares), so the result does not depend on the column
# names reset_index() would generate in a given pandas version.
dtype_ana=pd.DataFrame()
for i in data.select_dtypes(exclude=object):
    share=data[i].value_counts()/len(data)
    dtype_ana.loc[i,'No_unique_values']=len(share)
    dtype_ana.loc[i,'Top Value']=share.index[0]
    dtype_ana.loc[i,'%Contri']=share.iloc[0]
dtype_ana[(dtype_ana.No_unique_values<20)]

From the above we can see that the variable `BsmtHalfBath` still has more than 90% of its values equal to 0. Hence we can drop this variable, as it does not add much to our analysis

In [None]:
data.drop('BsmtHalfBath',inplace=True,axis=1)

Preparing the list of variables with number of unique values less than 20 as the list of variables with data type `int64`after all the above processing and removing the deleted variable from the list

In [None]:
list_int=list(dtype_ana[(dtype_ana.No_unique_values<20)].index)
list_int.remove('BsmtHalfBath')
list_int

Preparing another list of variables with dtype `float64`

In [None]:
list_float=[i for i in list(data.select_dtypes(exclude=object).columns) if i not in list_int]

In [None]:
list_float

Converting into the data types as per the above lists for better demarcation

In [None]:
data[list_float]=data[list_float].astype('float64')

In [None]:
data[list_int].isnull().sum()

In [None]:
data[list_int]=data[list_int].fillna(0)

In [None]:
data[list_int]=data[list_int].astype('int64')

In [None]:
data.info()

Checking the Missing values for the numeric data type including float and integer type

In [None]:
100*data.select_dtypes(exclude=object).isnull().sum()/len(data)

From the above it is clear that the numeric data two variables with missing values namely `LotFrontage` which is around 17% and `MasVnrArea` which is around 0.54%

Checking Missing Values for Object data type

In [None]:
(100*data.select_dtypes(include=object).isnull().sum()/len(data))[100*data.select_dtypes(include=object).isnull().sum()/len(data)>0]

From the above we can say the the highest number of missing values are for the variable `FireplaceQu` and others have below `6%` missing values. Storing the list of object datatype variables with missing values in a seperate list

In [None]:
null_obj=(100*data.select_dtypes(include=object).isnull().sum()/len(data))[100*data.select_dtypes(include=object).isnull().sum()/len(data)>0].index

#### Missing value and Outlier Treatment for Numeric datatype
Now we have 66 variables from the original 81 variables. Lets create a function to generate index of the data, filtering out the outliers

In [None]:
def quan_fil(data):
    """Return the index labels of the rows that contain no IQR outlier.

    For every column of the DataFrame `data`, a value is an outlier when it lies
    more than 1.5 times the inter-quartile range below the 25th percentile or
    above the 75th percentile. A row is kept only when none of its columns holds
    an outlier.
    """
    lower_q=data.quantile(0.25)
    upper_q=data.quantile(0.75)
    below_fence=data<(lower_q-1.5*(upper_q-lower_q))
    above_fence=data>(upper_q+1.5*(upper_q-lower_q))
    has_outlier=(below_fence|above_fence).any(axis=1)
    return data[~has_outlier].index

Creating a Function to classify continuous variable into various grades are listed below
 <br /> `LO` which has `LOwer` values below 25 percentile value
 <br /> `LM` which has `Lower-Middle` values above 25percentile but below 50 percentile value
 <br /> `UM` which has `Upper-Middle` values above 50percentile but below 75 percentile value
 <br /> `HG`which has `HiGher` values above 75percentile
 <br /> `NS`which represents `NotSpecified`for `None` or `Null` (missing values)

In [None]:
def sub_grouping(data):
    """Bin a numeric Series into quartile grades.

    Returns the tuple ``(q1, q2, q3, graded)`` where q1/q2/q3 are the 25th, 50th
    and 75th percentiles of `data` and `graded` is a Series of labels:
    'NS' for missing values, 'LO' for values <= q1, 'LM' for values <= q2,
    'UM' for values <= q3 and 'HG' for everything above q3.
    """
    q1,q2,q3=data.quantile(0.25),data.quantile(0.5),data.quantile(0.75)

    def _grade(x):
        # Missing values are tested first: comparing NaN with the quartiles is
        # always False and would otherwise fall through to 'HG'.
        if x is None or pd.isna(x):
            return 'NS'
        if x<=q1:
            return 'LO'
        if x<=q2:
            return 'LM'
        if x<=q3:
            return 'UM'
        return 'HG'

    data=data.apply(_grade)
    return q1,q2,q3,data

Binning `LotFrontage` as it has more than 15% of missing values using the above function and also if any outliers present they will also be taken care

In [None]:
q1_LotFrontage,q2_LotFrontage,q3_LotFrontage,data['LotFrontage']=sub_grouping(data['LotFrontage'])

In [None]:
data['LotFrontage'].value_counts()

Since `MasVnrArea` has less than 1% of missing values we can impute them with median values as it has outliers

In [None]:
# Impute missing MasVnrArea with the column median. The result is assigned back:
# fillna(inplace=True) on a column selection can operate on a temporary copy and
# leave `data` unchanged.
data['MasVnrArea']=data['MasVnrArea'].fillna(data['MasVnrArea'].median())

In [None]:
100*data[list_float].isnull().sum()/len(data)

Fromt the list of float variables lets seperate the dependent variable `SalePrice` as it will be handled seperately for Outlier Treatment

In [None]:
list_float

Removing the variable `LotFrontage` from the list of float variables as it is now object type with zero missing values after binning tranformation

In [None]:
list_float.remove('LotFrontage')

Let's analyze what percentage of the values are outliers for float type of variables and also what is the most occuring value and its contribution to the data in %

In [None]:
# Per float column: percentage of rows flagged as IQR outliers by quan_fil, the
# most frequent value, and the percentage of rows holding that value.
# value_counts() is evaluated once per column and read as a Series.
float_data={}
for i in list_float:
    kept_rows=quan_fil(data[[i]])
    share=data[i].value_counts()/len(data)
    float_data[i]=[100-(100*(len(kept_rows)/len(data))),share.index[0],100*share.iloc[0]]
float_data=pd.DataFrame(float_data).transpose().rename(columns={0:'%Outliers',1:'Most_Occ_Value',2:'%Contri_MOV'})
float_data

Checking for the variables in which a single value accounts for more than 80% of the data

In [None]:
float_data[float_data['%Contri_MOV']>80]

From the above we can see that the variables with most occuring values being zero and the outliers present in the data is also more than 5% of the data. Hence it doesnot add much to the data and hence can be dropped

In [None]:
lst_drp_fl=list(float_data[float_data['%Contri_MOV']>80].index)
lst_drp_fl

In [None]:
data.drop(lst_drp_fl,axis=1,inplace=True)

Again checking the data for outliers percentage after dropping the the above variables

In [None]:
float_data.drop(lst_drp_fl).sort_values(by='%Contri_MOV',ascending=False)

Updating the list of float variables for outlier treatement after removing the variables

In [None]:
list_float=list(float_data.drop(lst_drp_fl).sort_values(by='%Contri_MOV',ascending=False).index)
list_float

In [None]:
data[list_float].median()

Since the data provided has only 1460 datapoints and have to retain maximum amount of information, we will impute the outlier values which are less 10% of the data with median values as at most 6.7% is the highest percentage of outliers among the various variables of float type data

In [None]:
for i in list_float:
    plt.figure()
    sns.boxplot(data[i])
    
    

In [None]:
# Shift every float column by its own mean and take the natural logarithm.
# NOTE(review): the markdown before this cell describes imputing outliers with
# the median, but no imputation happens here — confirm which treatment is intended.
mean_float=data[list_float].mean()
data[list_float]=np.log(data[list_float]+mean_float)

Since the missing value treatment was taken care already them the below list of variables have null values in place of outliers

In [None]:
data[list_float].info()

Checking for the percentage of Outliers and Missing Values for `int64` type of data

In [None]:
int_data={}
for i in list_int:
    t=list(quan_fil(data[[i]]))
    int_data[i]=[len(data[i].value_counts()),100-(100*(len(t)/len(data))),(100*(data[i].isnull().sum()/len(data)))]
pd.DataFrame(int_data).transpose().rename(columns={0:'No of Unique Values',1:'%Outliers',2:'%NullValues'})

In [None]:
for i in list_int:
    plt.figure()
    sns.boxplot(data[i])

In [None]:
# Shift every column in list_int by its own mean and take the natural logarithm;
# the columns no longer hold integer values after this step.
mean_int=data[list_int].mean()
data[list_int]=np.log(data[list_int]+mean_int)

In [None]:
data[list_int].info()

Analyzing  Object Type of data (categorical data) before all the transformations were performed on the varibles for **missing values**

In [None]:
100*data[null_obj].isnull().sum()/len(data)

In [None]:
obj_data={}
for i in null_obj:
    obj_data[i]=[100*(data[i].value_counts()/len(data)).min(),(100*(data[i].isnull().sum()/len(data)))]
obj_data=pd.DataFrame(obj_data).transpose().rename(columns={0:'%Least_Occ_Cat',1:'%Missing_Values'})

In [None]:
obj_data

Since this is categorical data, we would like to impute all the missing values which are contributing more than the lowest occuring category in % terms and also the data which has more than 5% of the missing values in it with a new category `NotSpec`. This methodology preserves the data without loosing much information.

In [None]:
# Fill missing categories with 'NotSpec' when the missing share is larger than
# the share of the rarest existing category, or larger than 5%.
# The previous condition was built from tuples such as `(value,0)`; a non-empty
# tuple is always truthy, so every column was filled regardless of its shares.
# Columns that meet neither criterion are left unchanged by this cell.
for col in obj_data.index:
    least_pct=obj_data.loc[col,'%Least_Occ_Cat']
    missing_pct=obj_data.loc[col,'%Missing_Values']
    if least_pct<missing_pct or missing_pct>5:
        data[col]=data[col].fillna('NotSpec')

In [None]:
100*data[null_obj].isnull().sum()/len(data)

Checking the variable `FireplaceQu` after imputation to know its effect

In [None]:
data['FireplaceQu'].value_counts()

In [None]:
100*data.isnull().sum()/len(data)

Now as we see that we have sucessfully completed the missing value and outlier treatment for independent variables, lets analyze the dependent variable `SalePrice` for oulier treatment

In [None]:
sns.distplot(train.SalePrice)

In [None]:
y_train=train.SalePrice
y_train=np.log(y_train)
y_train

Since the % of outliers is less than 1%, `SalePrice` being an dependent variable, its safe to drop the missing values rather than imputing them

checkin the data using .decribe() and correlation heatmap plot

In [None]:
data.select_dtypes(exclude=object).describe()

In [None]:
plt.figure(figsize=(16,8))
sns.heatmap(data.corr(),cmap="YlGnBu",annot=True,center=0,vmax=1,vmin=-1)

From the above we can see that most of the variables are to some extent independent of each other

Creating dummy variables for object data

In [None]:
dummies={}
obj_col=data.select_dtypes(include=object).columns
for i in obj_col:
    dummies[i]=pd.get_dummies(data[i],prefix=i,drop_first = True)

In [None]:
dum_df=pd.concat([dummies[i] for i in dummies.keys()],axis=1)

In [None]:
dum_df

Dropping the original object data and combining with the dummy variables

In [None]:
data=data.drop(obj_col,axis=1)
data=pd.concat([data,dum_df],axis=1)

In [None]:
data.info(verbose=1)

In [None]:
data.shape

### Model Building and Evaluation

Spliting the data into train and test data

In [None]:
train['Id']=train.Id.apply(lambda x:str(x))

In [None]:
train.set_index('Id',inplace=True)

In [None]:
train.index

In [None]:
data.index

In [None]:
X_train=data.loc[list(train.index),:]

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test=data.drop(list(train.index),axis=0)

In [None]:
X_test.shape

Scaling the train data and test data using appropriate methodologies

In [None]:
X_train.describe()

In [None]:
X_train.shape

Since the number of features is high, to save computation time lets analyze the data using cross validation score to check for the number of features at which the Rsquared becomes negative

In [None]:
# Sweep the number of features kept by RFE and record the mean 5-fold
# cross-validated R-squared, stopping at the first feature count whose mean
# score is negative.
lm=LinearRegression()
lm.fit(X_train,y_train)
# The splitter is seeded, so it yields the same folds for every feature count
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)
opt=pd.DataFrame()
for i in range(1,X_train.shape[1]+1):
    rfe=RFE(lm, n_features_to_select=i)
    # Evaluate once per feature count; the score is reused for the stop test and
    # for the stored value instead of running the cross-validation twice.
    score=cross_val_score(rfe, X_train, y_train, scoring='r2', cv=folds).mean()
    if score<0:
        break
    opt.loc[i,'cross_val_score']=score
   

The above code takes 15 minutes to compute the cross_val_scores for different features

In [None]:
opt=opt.reset_index().rename(columns={'index':'No_Attributes'})

In [None]:
ft=opt.No_Attributes.iloc[-1]
opt

In [None]:
plt.plot(opt.No_Attributes,opt.cross_val_score)

As per the above analysis we can see that 34 is the number of attributes that an RFE can select based with Rsquared Value within the realistic limits 

We now plot the Train and Validation scores using grid search to know the actual optimum number of features as shown below

In [None]:
folds=KFold(n_splits=5,shuffle=True,random_state=100)
hyper_params=[{'n_features_to_select':list(range(1,ft+1))}]
lm=LinearRegression()
lm.fit(X_train,y_train)
rfe=RFE(lm)
model_cv=GridSearchCV(estimator=rfe,param_grid=hyper_params,scoring='r2',cv=folds,verbose=1,return_train_score=True)
model_cv.fit(X_train,y_train)

cv_results = pd.DataFrame(model_cv.cv_results_)

The implementation of the above code takes 8 to 10 min based on the configuration of the PC or Laptop and the below are the results

In [None]:
# plotting cv results
plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])

plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')

Plotting the difference between the two scores

In [None]:
plt.figure(figsize=(16,6))
plt.plot(cv_results["param_n_features_to_select"],(cv_results["mean_train_score"]-cv_results["mean_test_score"]))

Now filtering the data for more than 0.7 of Rsquared value but less than 0.05 difference between the two scores

In [None]:
# Keep the grid points whose train and test R-squared both exceed 0.7 and whose
# train-test gap is at most 0.05.
score_gap=cv_results["mean_train_score"]-cv_results["mean_test_score"]
selected=(score_gap<=0.05) & (cv_results["mean_train_score"]>0.7) & (cv_results["mean_test_score"]>0.7)
# .copy() makes `mini` an independent frame, so adding a column below does not
# write into a slice of cv_results.
mini=cv_results[selected][['param_n_features_to_select','mean_train_score','mean_test_score']].copy()
mini['diff']=score_gap
mini

From the above we can see that 21 is the optimum number of features on which we can work on using RFE

In [None]:
# Fit RFE on the full training data and keep the names of the selected columns.
# NOTE(review): the markdown before this cell names 21 as the optimum number of
# features, but n_features_to_select is 45 here — confirm which value is intended.
lm = LinearRegression()
lm.fit(X_train, y_train)

rfe = RFE(lm, n_features_to_select=45)             
rfe = rfe.fit(X_train, y_train)
# Columns of X_train picked out by rfe.support_
cols=list(X_train.columns[rfe.support_])

In [None]:
def LRM_vif(y_train,X_train):
    """Fit an OLS model and tabulate VIF and p-value for every column of X_train.

    A constant column is added to `X_train` before fitting `y_train` on it with
    statsmodels OLS. Returns ``(model, summary, table)`` where `table` is indexed
    by column name, holds 'VIF' (rounded to 2 decimals) and 'P>|t|' (rounded to
    3 decimals) and is sorted by 'P>|t|' in descending order.

    NOTE(review): the VIF values are computed on `X_train` itself, i.e. without
    the constant column used for the fit — confirm this is intended.
    """
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    design=sm.add_constant(X_train)
    lm=sm.OLS(y_train,design).fit()
    summ=lm.summary()

    vif_values=[]
    for position in range(X_train.shape[1]):
        vif_values.append(round(variance_inflation_factor(X_train.values, position),2))

    table=pd.DataFrame({'Columns':X_train.columns, 'VIF':vif_values}).set_index('Columns')
    p_values=pd.DataFrame(lm.pvalues).rename(columns={0:'P>|t|'})
    table=table.join(p_values)
    table['P>|t|']=table['P>|t|'].apply(lambda x:round(x,3))
    return lm,summ,table.sort_values('P>|t|',ascending=False)

In [None]:
X_train[cols].reset_index()

In [None]:
X_train1=X_train[cols].reset_index().drop('Id',axis=1)
lm1,summ1,VIF_Pval1=LRM_vif(y_train,X_train1)
summ1,VIF_Pval1

In [None]:
# Backward elimination on `cols`: refit, then drop the column with the highest
# VIF and/or the column with the highest p-value while the thresholds are exceeded.
# NOTE(review): each threshold is tested on the second-ranked row while the
# top-ranked column is the one removed; this is kept as in the original logic.
for _ in range(len(cols)):
    # The threshold tests read the second-ranked row, so two columns are needed
    if len(cols)<2:
        break
    lm1,summ1,VIF_Pval1=LRM_vif(y_train,X_train[cols].reset_index().drop('Id',axis=1))
    by_vif=VIF_Pval1.sort_values(by='VIF',ascending=False)
    by_pval=VIF_Pval1.sort_values(by='P>|t|',ascending=False)
    removed=False

    if by_vif.iloc[1,:]['VIF']>3:
        # The table was just built from `cols`, so its top entry is always present
        cols.remove(by_vif.index[0])
        removed=True

    if by_pval.iloc[1,:]['P>|t|']>0.05:
        worst_pval_col=by_pval.index[0]
        # May already be gone if it was also the highest-VIF column removed above
        if worst_pval_col in cols:
            cols.remove(worst_pval_col)
            removed=True

    # With `cols` unchanged, every further pass would refit the same model and
    # reach the same decision, so stop here.
    if not removed:
        break

summ1,VIF_Pval1

In [None]:
X_train1=X_train[cols].reset_index().drop('Id',axis=1)

### Regularization

#### Ridge Regression

In [None]:
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


ridge = Ridge()

# cross validation
folds = 5
model_cv_rd = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv_rd.fit(X_train1, y_train)
cv_results_rd = pd.DataFrame(model_cv_rd.cv_results_)

In [None]:
# plotting mean test and train scores with alpha
cv_results_rd['param_alpha'] = cv_results_rd['param_alpha'].astype('float32')

# The grid search for Ridge was run with scoring='r2', so the plotted scores are
# R-squared values; the axis label and title say so.
plt.plot(np.log10(cv_results_rd['param_alpha']), cv_results_rd['mean_train_score'])
plt.plot(np.log10(cv_results_rd['param_alpha']), cv_results_rd['mean_test_score'])
plt.xlabel('log10(alpha)')
plt.ylabel('R-squared')

plt.title("R-squared and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
cv_results_rd[cv_results_rd['mean_test_score']==cv_results_rd['mean_test_score'].max()][['param_alpha','mean_train_score','mean_test_score']]

In [None]:
alpha = 2
ridge = Ridge(alpha=alpha)

ridge.fit(X_train1, y_train)
ridge.coef_

In [None]:
sorted(list(zip(ridge.coef_, cols)))

'OverallQual','Exterior1st','Age_property','Condition2','2ndFlrSF','TotalBsmtSF' are the most important predictor variables

Model Evaluation on unseen Test Data

#### Lasso Regression

In [None]:
lasso = Lasso()
#params = {'alpha': list(map(lambda x:x*10**-3,list(range(0,20))))+list(map(lambda x:x*10**-2,list(range(0,20))))}

# cross validation
model_cv_ls = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            

model_cv_ls.fit(X_train1, y_train)
cv_results_ls = pd.DataFrame(model_cv_ls.cv_results_)

In [None]:
# plotting mean test and train scores with alpha
cv_results_ls['param_alpha'] = cv_results_ls['param_alpha'].astype('float32')

# The grid search for Lasso was run with scoring='r2', so the plotted scores are
# R-squared values; the axis label and title say so.
plt.plot(np.log10(cv_results_ls['param_alpha']), cv_results_ls['mean_train_score'])
plt.plot(np.log10(cv_results_ls['param_alpha']), cv_results_ls['mean_test_score'])
plt.xlabel('log10(alpha)')
plt.ylabel('R-squared')

plt.title("R-squared and alpha")
plt.legend(['train score', 'test score'], loc='upper left')
plt.show()

In [None]:
cv_results_ls[cv_results_ls['mean_test_score']==cv_results_ls['mean_test_score'].max()]['param_alpha']

In [None]:
# Use the alpha chosen by the Lasso grid search (model_cv_ls) instead of a
# hand-picked constant, so the final fit matches the cross-validation result.
alpha = model_cv_ls.best_params_['alpha']

lasso = Lasso(alpha=alpha)

lasso.fit(X_train1, y_train)
# Columns ordered by the absolute size of their Lasso coefficient
sorted(list(zip(abs(lasso.coef_), cols)))

In [None]:
lasso.coef_

In [None]:
list(zip(lasso.coef_, cols))

'OverallQual','Exterior1st','Age_property','Condition2','2ndFlrSF','TotalBsmtSF' are the most important predictor variables

In [None]:
# Names of the columns whose Lasso coefficient is non-zero. Coefficients and
# names are paired before filtering; filtering the coefficients first would pair
# the k survivors with the first k names instead of their own.
colsn=[name for coef,name in zip(lasso.coef_,cols) if coef!=0]
# NOTE(review): colsn is not used below — the refit still uses every column in
# cols. Confirm whether the final model was meant to use colsn.
X_train1=X_train[cols].reset_index().drop('Id',axis=1)
lm1,summ1,VIF_Pval1=LRM_vif(y_train,X_train1)
summ1,VIF_Pval1