**Problem Statement**

A US-based housing company named Surprise Housing has decided to enter the Australian market. The company uses data analytics to purchase houses at a price below their actual values and flip them on at a higher price. For the same purpose, the company has collected a data set from the sale of houses in Australia. The data is provided in the CSV file below.

 

The company is looking at prospective properties to buy to enter the market. You are required to build a regression model using regularisation in order to predict the actual value of the prospective properties and decide whether to invest in them or not.

 

The company wants to know:

Which variables are significant in predicting the price of a house, and

How well those variables describe the price of a house.

#### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
from statsmodels.stats.outliers_influence import variance_inflation_factor

# hide warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Show up to 500 rows and 500 columns whenever a DataFrame is displayed
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
#functions 
#Function to print null values in all columns
def nulls(df):
    """Return the percentage of null values in each column, largest first.

    The null fraction of every column is rounded to 4 decimals, sorted in
    descending order and then scaled to a percentage.
    """
    null_fraction = df.isnull().sum() / len(df)
    ranked = null_fraction.round(4).sort_values(ascending=False)
    return 100 * ranked
def getvif(df):
    """Compute the variance inflation factor (VIF) of every column of ``df``.

    A column named 'const', if present, is left out of the computation.
    Returns a DataFrame with the columns 'Features' and 'VIF' (rounded to
    2 decimals), sorted by VIF in descending order.
    """
    predictors = df.drop('const', axis=1) if 'const' in list(df.columns) else df.copy()
    design = predictors.values
    vif = pd.DataFrame()
    vif['Features'] = predictors.columns
    vif['VIF'] = [variance_inflation_factor(design, pos) for pos in range(predictors.shape[1])]
    vif['VIF'] = vif['VIF'].round(2)
    return vif.sort_values(by='VIF', ascending=False)

In [None]:
#import dataset
# Read the house-sales training CSV into `df`; the path is relative to the working directory
df = pd.read_csv('../input/house-prices-data/train.csv')

## EDA and Missing Value Treatment

In [None]:
#inspecting df
# Column names, non-null counts and dtypes
df.info()

In [None]:
# Preview the first rows of the dataset
df.head()

In [None]:
#checking null values
# Percentage of nulls per column, highest first
nulls(df)

In [None]:
# Summary statistics of the dataset's columns
df.describe()

In [None]:
#Let's drop the columns in which at least 90% of the values are null
#If we imputed the nulls instead, these columns would be highly skewed and of no use to our model
nulls_list = nulls(df)
mostly_null_cols = nulls_list[nulls_list >= 90].index.tolist()
df.drop(columns=mostly_null_cols, inplace=True)
nulls(df)

In [None]:
#Nulls in the Fence column inidcates no fence present in the house
df['Fence'].fillna('No Fence',inplace = True)

In [None]:
#Nulls in the FireplaceQu column inidcates no fireplace present in the house
df['FireplaceQu'].fillna('No Fireplace',inplace = True)

In [None]:
#Nulls in the GarageCond, GarageType, GarageYrBlt, GarageFinish, GarageQual column inidcates no Garage present in the house
df['GarageCond'].fillna('No Garage',inplace = True)
df['GarageType'].fillna('No Garage',inplace = True)
df['GarageYrBlt'].fillna('No Garage',inplace = True)
df['GarageQual'].fillna('No Garage',inplace = True)
df['GarageFinish'].fillna('No Garage',inplace = True)

In [None]:
#Nulls in the BsmtExposure, BsmtFinType2, BsmtFinType1, BsmtCond, BsmtQual column inidcates no basement present in the house
df['BsmtExposure'].fillna('No Basement',inplace = True)
df['BsmtFinType2'].fillna('No Basement',inplace = True)
df['BsmtFinType1'].fillna('No Basement',inplace = True)
df['BsmtCond'].fillna('No Basement',inplace = True)
df['BsmtQual'].fillna('No Basement',inplace = True)

In [None]:
#Nulls in the MasVnrArea, MasVnrType column inidcates no Masonry Veneer present in the house
df['MasVnrArea'].fillna(0,inplace = True)
df['MasVnrType'].fillna('None',inplace = True)

In [None]:
#checking Electrical
df.Electrical.value_counts()

In [None]:
#replacing nulls in LotFrontage with 0 (no frontage)
df['LotFrontage'].fillna(0,inplace = True)

In [None]:
#replacing nulls in Electrical with mode
df['Electrical'].fillna('SBrkr',inplace = True)

In [None]:
df.info()

In [None]:
#Checking YearBuilt to Age
df['Age'] = 2020 - df['YearBuilt']

In [None]:
#dropping the YearBuilt variables since we won't need to use these, age will suffice for our goal
df.drop(['YearBuilt','YrSold','YearRemodAdd','GarageYrBlt','MoSold'],axis=1,inplace=True)

In [None]:
#'SaleType' would not be available to us while making predicitons (it is part of target variable), hence we will drop it. 
df.drop(['SaleType'],axis=1,inplace=True)

In [None]:
df.info()

In [None]:
#Ordinal categorical variables have a natural order, so we label-encode them as integers

# Scales shared by several columns
quality_scale = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1}
basement_finish_scale = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'No Basement':0}

# One mapping per column; a higher number means a better / more complete feature
ordinal_maps = {
    'LandSlope': {'Gtl':3, 'Mod':2, 'Sev':1},
    'LotShape': {'Reg':4, 'IR1':3, 'IR2':2, 'IR3':1},
    'Utilities': {'AllPub':4, 'NoSewr':3, 'NoSeWa':2, 'ELO':1},
    'ExterQual': quality_scale,
    'ExterCond': quality_scale,
    'BsmtQual': {**quality_scale, 'No Basement':0},
    'BsmtCond': {**quality_scale, 'No Basement':0},
    'BsmtExposure': {'Gd':4, 'Av':3, 'Mn':2, 'No':1, 'No Basement':0},
    'BsmtFinType1': basement_finish_scale,
    'BsmtFinType2': basement_finish_scale,
    'HeatingQC': quality_scale,
    'Electrical': {'SBrkr':5, 'FuseA':4, 'FuseF':3, 'FuseP':2, 'Mix':1},
    'KitchenQual': quality_scale,
    'Functional': {'Typ':8, 'Min1':7, 'Min2':6, 'Mod':5, 'Maj1':4, 'Maj2':3, 'Sev':2, 'Sal':1},
    'FireplaceQu': {**quality_scale, 'No Fireplace':0},
    'GarageFinish': {'Fin':4, 'RFn':3, 'Unf':2, 'No Garage':1},
    'GarageQual': {**quality_scale, 'No Garage':0},
    'GarageCond': {**quality_scale, 'No Garage':0},
    'PavedDrive': {'Y':3, 'P':2, 'N':1},
    'Fence': {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'No Fence':0},
}
for column, scale in ordinal_maps.items():
    df[column] = df[column].map(scale)

In [None]:
#We have dealt with all the null values in the dataset; re-check the null percentages to confirm
nulls(df)

In [None]:
#Treat MSSubClass as a categorical variable by casting it to object dtype
df['MSSubClass'] = df['MSSubClass'].astype('object')

In [None]:
#We have these categorical (non-numeric) variables now, Id excluded.
#select_dtypes is the public counterpart of the private _get_numeric_data(), and it keeps
#the DataFrame's column order; the earlier set difference returned an arbitrary order.
cat_vars = df.drop('Id',axis=1).select_dtypes(exclude=['number', 'bool']).columns.tolist()
cat_vars

In [None]:
#Adding convereted cateogrical variables too
for each in ['LandSlope','LotShape','Utilities','ExterQual','ExterCond','BsmtQual','BsmtCond','BsmtExposure','BsmtQual','BsmtFinType1','BsmtFinType2','HeatingQC','Electrical','KitchenQual','Functional','FireplaceQu','GarageFinish','GarageQual','GarageCond','PavedDrive','Fence']:
    cat_vars.append(each)

In [None]:
len(cat_vars)

In [None]:
#We will be inspecting all the categorical columns now

#Size the grid from the number of variables (2 plots per row) instead of a hard-coded
#26 rows, so plt.subplot is never asked for a position outside the grid
n_rows = max(1, (len(cat_vars) + 1) // 2)
plt.figure(figsize=(20,8*n_rows))
for i,each in enumerate(cat_vars):
    plt.subplot(n_rows,2,i+1)
    sns.countplot(y=df[each])

In [None]:
#From the above plots, we can spot some features with a considerable amount of skew in the data.
#We will analyze exactly how skewed the data is, and then take action based on that.
#As a benchmark, a feature with more than 90% of its data in one category is treated as skewed.
#We will find which of our categorical columns can be categorized as skewed
skewed_cols = []
for each in cat_vars:
    category_share = df[each].value_counts(normalize=True)*100
    if not max(category_share) > 90:
        continue
    print(category_share)
    print('\n')
    skewed_cols.append(each)

In [None]:
#We now have the following heavily skewed columns, where more than 90% of the values belong to a single category
skewed_cols

In [None]:
#Dropping the heavily skewed columns for a cleaner dataset, then re-checking the remaining columns
df.drop(skewed_cols, axis=1, inplace=True)
df.info()

In [None]:
#We now look at how the numerical variables are spread
num_vars = list(df.drop('Id',axis=1)._get_numeric_data().columns)
num_vars

In [None]:
len(num_vars)

In [None]:
#We will be inspecting all the numerical columns now

#Derive the grid from the number of variables (2 plots per row); the figure height was
#previously sized for 22 rows while the subplot grid used 27
n_rows = max(1, (len(num_vars) + 1) // 2)
plt.figure(figsize=(20,8*n_rows))
for i,each in enumerate(num_vars):
    plt.subplot(n_rows,2,i+1)
    sns.boxplot(x=df[each])

In [None]:
#Certain features are highly skewed here. We can drop them to simplify our model
#before dropping, we look at their actual spread
i=1
plt.figure(figsize=(20,30))
for each in ['LowQualFinSF','ExterCond','BsmtCond','BsmtFinType2','BsmtFinSF2','BsmtHalfBath','KitchenAbvGr','GarageQual','PoolArea','Fence','EnclosedPorch','ScreenPorch','MiscVal','3SsnPorch']:
    plt.subplot(4,4,i)
    sns.distplot(df[each])
    i+=1

In [None]:
#Dropping more skewed features
df.drop(['LowQualFinSF','ExterCond','BsmtCond','BsmtFinType2','BsmtFinSF2','BsmtHalfBath','KitchenAbvGr','GarageQual','PoolArea','Fence','EnclosedPorch','ScreenPorch','MiscVal','3SsnPorch'],axis=1,inplace=True)

In [None]:
#Let's see how the sale price varies across the overall quality rating of the house
data = df[['SalePrice', 'OverallQual']].copy()
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(data=data, x='OverallQual', y="SalePrice")
# fig.axis(ymin=0, ymax=800000);

In [None]:
#Sale price is also related to the age of the house
#A high age does not necessarily mean a low price, but a slight downward trend is visible
data = df[['SalePrice', 'Age']].copy()
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(data=data, x='Age', y="SalePrice")
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);

In [None]:
#We can see how some of the important looking features relate to each other
#(`height` is the current name of pairplot's per-facet size argument; `size` is deprecated)
sns.pairplot(df[['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'Age']], height = 2.5)
plt.show()

* We find there is a considerable amount of variance in the data
* We choose not to perform outlier value treatment, since these features are what might be impacting prices of the house

## Transforming the dependent variable to a normal distribution

Linear regression assumes normally distributed errors, and a strongly skewed target variable (dependent variable) usually violates that assumption. We can see in the distplot below that ours is highly skewed towards the right, so we will transform it towards a normal distribution to get better performance.

In [None]:
#target variable
# Plot the distribution of SalePrice
sns.distplot(df['SalePrice'])

In [None]:
#We note that the target (dependent) variable is not normally distributed; there is some skewness.

In [None]:
#The variable is quite skewed, and all the values are positive (boxcox can be used here)
df.SalePrice.describe()

In [None]:
#There is a high degree of skew in this column
df.SalePrice.skew()

In [None]:
#We try log transformation
from math import log
sns.distplot(df.SalePrice.apply(lambda x: log(x)))

In [None]:
#We are able to get the skew down to a great extent by using log transform
df.SalePrice.apply(lambda x: log(x)).skew()

In [None]:
#We can use Box-Cox Transform to reduce the skew further and make the data resemble a normal distribution
from scipy import stats as ss
df['SalePrice'] = pd.Series(ss.boxcox(df.SalePrice)[0])
df.SalePrice.skew()

In [None]:
#We can see that the distribution is much closer to normal now, hence we can fit a linear regression model effectively
sns.distplot(df.SalePrice)

## Getting dummy variables for categorical features

In [None]:
#Updating the list of categorical (non-numeric) variables, Id excluded.
#select_dtypes is the public counterpart of the private _get_numeric_data() and keeps the
#DataFrame's column order, so the dummy columns built from this list come out in the same
#order on every run (the earlier set difference returned an arbitrary order).
cat_vars = df.drop('Id',axis=1).select_dtypes(exclude=['number', 'bool']).columns.tolist()
cat_vars

In [None]:
#creating dataframe with dummy variables
#The dtype is pinned to uint8: recent pandas versions return boolean dummies by default,
#and a frame mixing bool and float columns converts to an object array, which the
#statsmodels OLS fit further down cannot use.
dummy = pd.get_dummies(df[cat_vars], drop_first = True, dtype = np.uint8)
dummy.head()

In [None]:
#merging dummy variables into original dataframe
df = pd.concat([df,dummy],axis = 1)
df.info()

In [None]:
#dropping original categorical variables that have been dummified
df.drop(cat_vars,axis=1,inplace=True)
df.shape

## Test-Train split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Predictors: every column except the target and the Id; target: SalePrice as a one-column frame
non_feature_cols = ['SalePrice', 'Id']
x = df.drop(columns=non_feature_cols)
y = df[['SalePrice']]
# 75% of the rows go to the training set; the seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=44)

In [None]:
# (rows, columns) of the training predictors
x_train.shape

In [None]:
# (rows, columns) of the test predictors
x_test.shape

In [None]:
# (rows, columns) of the training target
y_train.shape

In [None]:
# (rows, columns) of the test target
y_test.shape

## Scaling of numerical features

In [None]:
#To be able to interpret the final model, and for a faster convergence, we will scale our features
#We have chosen MinMax Scaler for the process

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler() #initializing minmaxscaler
#Scale only the numeric variables that are still present in x_train. Filtering against
#x_train.columns replaces the hand-maintained list of dropped columns (plus SalePrice),
#and the comprehension keeps a stable order, unlike the previous set difference.
num_vars = [col for col in num_vars if col in x_train.columns]

In [None]:
#We will be scaling only the original continuous variables. Dummy variables can be used as-is.
#Summary of the unscaled training values, for comparison with the scaled ones below
x_train[list(num_vars)].describe()

In [None]:
#fitting the scaler on train set and transforming variables
#(the min/max are learned from the training rows only)
x_train[num_vars] = scaler.fit_transform(x_train[num_vars])
x_train[num_vars].describe()

In [None]:
#Scaling features in test set using the scaler fitted on train set
#(transform only, so no test-set statistics are used for the scaling)
x_test[num_vars] = scaler.transform(x_test[num_vars])

We will first begin by running Recursive Feature Elimination to determine the top 30 most important features by fitting a linear regression model on the data.

## Building the Model

## RFE

In [None]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
from sklearn.feature_selection import RFE
#n_features_to_select has to be passed by keyword: current scikit-learn releases reject it
#as a positional argument
rfe = RFE(linreg, n_features_to_select=30)             # running RFE with 30 variables as output
rfe = rfe.fit(x_train, y_train)
#Boolean mask over x_train's columns: True for the features RFE kept
print(rfe.support_)

In [None]:
#columns chosen by running RFE
cols=x_train.columns[rfe.support_]
cols

In [None]:
import statsmodels.api as sm
x_train_rfe = sm.add_constant(x_train[list(cols)])

In [None]:
#Building a basic model with the selected features to gauge performance

lm = sm.OLS(y_train, x_train_rfe).fit()
print(lm.summary())

Now we can choose to proceed with the set of features identified by RFE, but we can also take the route of directly going to ridge and lasso techniques. We can let the lasso model take care of feature selection as well. 

## Ridge Regression

In [None]:
#List of parameters (alpha values) to tune
params = {'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


ridge = Ridge()
#The bare `%time` line magic that used to sit here timed an empty statement (a line magic
#only times code written on its own line), so it has been removed; per-candidate fit
#times are available in model_cv.cv_results_['mean_fit_time'].
# cross validation: 5 shuffled folds, scored by r2, also recording the train scores
folds = KFold(n_splits = 5, shuffle = True, random_state = 1)
model_cv = GridSearchCV(estimator = ridge, 
                        param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(x_train, y_train)

In [None]:
# One row per alpha candidate, with the fit times and the train/validation scores
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
#this is the best value of alpha identified by grid search cv
model_cv.best_params_

In [None]:
#this is the best mean cross-validated r2 score obtained by grid search on the validation folds
model_cv.best_score_

In [None]:
# plotting mean test and train scores with alpha
# alpha must stay a float: casting to int32 truncated every alpha below 1 to 0, and a
# value of 0 cannot be drawn on the log-scaled x axis used below
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.figure(figsize=(16,10))

# plotting
plt.plot(cv_results['param_alpha'], cv_results['mean_train_score'])
plt.plot(cv_results['param_alpha'], cv_results['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('R2 Score')
plt.title("R2 Score Against Alpha - Ridge Regression")
plt.legend(['train score', 'validation score'], loc='upper right')
plt.xscale('log')
plt.show()

In [None]:
#Selecting the optimum value of alpha as 4.0
ridge = Ridge(alpha = 4.0)
ridge.fit(x_train, y_train)
#Predict on both splits
y_train_pred = ridge.predict(x_train)
y_test_pred = ridge.predict(x_test)
#Report the r2 score on train and test
train_r2 = metrics.r2_score(y_true = y_train, y_pred = y_train_pred)
test_r2 = metrics.r2_score(y_true = y_test, y_pred = y_test_pred)
print('r2 score for Train set', train_r2, sep='\n')
print('r2 score for Test set', test_r2, sep='\n')


In [None]:
# ridge model parameters
# ridge.coef_ is 2-D here because y_train is a one-column DataFrame, so it is flattened and
# the intercept is placed in front; the values then line up with ["constant", *features].
# (Previously the intercept was dropped while "constant" was still prepended to the names,
# which paired every coefficient with the wrong feature name.)
intercept = float(np.ravel(ridge.intercept_)[0])
model_parameters = [intercept] + [float(coef) for coef in np.ravel(ridge.coef_)]
model_parameters = [round(x, 3) for x in model_parameters]
cols = x_train.columns
cols = cols.insert(0, "constant")
ridge_f = list(zip(cols, model_parameters))
ridge_f

In [None]:
d = {'Feature':list(list(zip(*ridge_f))[0]),'Coeff':list(list(zip(*ridge_f))[1])}
ridge_params = pd.DataFrame(data = d)

In [None]:
#Let's see what is the importance given to each feature by our model
ridge_params.reindex(ridge_params.Coeff.abs().sort_values(ascending = False).index)

#### Ridge Model Explained
* Using ridge regression to regularize our parameters, we were able to determine the value of alpha (hyperparameter) to be optimal at 4.0
* Upon choosing this value, our model achieves a best validation r-squared of 0.87 during 5-fold cross validation. 
* We obtained a test data r2 value of 0.89 on fitting the model on training data (which gave train data r-squared of 0.90, indicating a good stable model)
* The important features can be observed in the sorted dataframe in the above cell.
    * OverallQual - High rating for the overall quality tends to bump the price up
    * LotShape - Regularly shaped lots would fetch a higher price for the house
    * 2nd/1st Floor Area - The area of both floors contribute to the price of the house.
    * HeatingQC - Houses equipped with better heating fetch better prices
    * GarageFinish - Better finish in the garage causes bump in price. 
    * KitchenQual - A higher quality kitchen adds to the price
    * Basements that contain a full bath add value a lot.
    * High density residential zones are a major selling point
    * The quality of material used on the exterior is also being observed to impact the cost considerably.
    * There are certain neighborhood which are more preferred as well: 
        * Somerset
        * College Creek
        * North Ridge

## Lasso Regression

In [None]:
#List of parameters (alpha values) to tune
params = {'alpha': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


lasso = Lasso()
#The bare `%time` line magic that used to sit here timed an empty statement (a line magic
#only times code written on its own line), so it has been removed; per-candidate fit
#times are available in model_cv.cv_results_['mean_fit_time'].
# cross validation: 5 shuffled folds, scored by r2, also recording the train scores
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(x_train, y_train)

In [None]:
# One row per alpha candidate; show up to 50 rows so the whole grid is visible
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head(50)

In [None]:
# Best value of alpha identified by the grid search
model_cv.best_params_

In [None]:
# Mean cross-validated r2 score of the best alpha
model_cv.best_score_

In [None]:
# plotting mean test and train scores against alpha
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.figure(figsize=(16,10))

# one curve per score column: train first, then validation (matches the legend order)
for score_col in ('mean_train_score', 'mean_test_score'):
    plt.plot(cv_results['param_alpha'], cv_results[score_col])
plt.xlabel('alpha')
plt.ylabel('R2 Score')
plt.title("R2 Score Against Alpha - Lasso Regression")
plt.legend(['train score', 'validation score'], loc='upper right')
# plt.xscale('log')
# zoom in on the small alphas
plt.xlim(0.0001,0.05)
plt.show()


In [None]:
#Selecting the optimum value of alpha as 0.0003
lasso = Lasso(alpha = 0.0003)
lasso.fit(x_train, y_train)
#Predict on both splits
y_train_pred = lasso.predict(x_train)
y_test_pred = lasso.predict(x_test)
#Report the r2 score on train and test
train_r2 = metrics.r2_score(y_true = y_train, y_pred = y_train_pred)
test_r2 = metrics.r2_score(y_true = y_test, y_pred = y_test_pred)
print('r2 score for Train set', train_r2, sep='\n')
print('r2 score for Test set', test_r2, sep='\n')


In [None]:
# lasso model parameters
# The intercept goes in front of the flattened coefficients so that the values line up
# with ["constant", *features]. (Previously the intercept was sliced off while "constant"
# was still prepended to the names, which paired every coefficient with the wrong feature name.)
intercept = float(np.ravel(lasso.intercept_)[0])
model_parameters = [intercept] + [float(coef) for coef in np.ravel(lasso.coef_)]
model_parameters = [round(x, 3) for x in model_parameters]
cols = x_train.columns
cols = cols.insert(0, "constant")
lasso_f = list(zip(cols, model_parameters))
lasso_f

In [None]:
d = {'Feature':list(list(zip(*lasso_f))[0]),'Coeff':list(list(zip(*lasso_f))[1])}
lasso_params = pd.DataFrame(data = d)

In [None]:
#Let's see what is the importance given to each feature by our model
lasso_params.reindex(lasso_params.Coeff.abs().sort_values(ascending = False).index)

#### Lasso model explained
* Using Lasso regression to regularize our parameters, we were able to determine the value of alpha (hyperparameter) to be optimal at 0.0003
* Lasso has an added benefit of performing feature selection. 
* Upon choosing this value, our model performs at a validation set r-squared of 0.87 during 5-fold cross validation. 
* We obtained a test data r2 value of 0.89 on fitting the model on training data (which gave train data r-squared of 0.90, indicating a good stable model)
* The most important features can be observed in the sorted dataframe in the above cell.
    * 2ndFlrSF - The area of the second floor
    * Lot Shape - The shape of the lot regular/irregular etc. The more it is towards regular, higher is the price
    * OverallQual - High rating (9/10) for the overall quality tends to bump the price up. 
    * ExterQual - The quality of the material with which the exterior of the house is built
    * GarageFinish - Interior finish of the garage
    * Some neighborhoods fetch better prices for the house
        * North Ridge
        * College Creek
        * Sawyer West
    * Sales which are made as 'partial' will bring the price down considerably. 
    

These models can be further tuned using feature selection techniques like Forward, Backward and Stepwise. 

## Understanding Alpha

### Doubling the values of Alpha

In [None]:
#Doubling the alpha for ridge to 8.0
ridge = Ridge(alpha = 8.0)
ridge.fit(x_train, y_train)
#predict
y_train_pred = ridge.predict(x_train)
y_test_pred = ridge.predict(x_test)
#Checking r2 score on train
print('r2 score for Train set')
print(metrics.r2_score(y_true = y_train, y_pred = y_train_pred))
print('r2 score for Test set')
print(metrics.r2_score(y_true = y_test, y_pred = y_test_pred))

In [None]:
#Ridge Model Parameters
model_parameters = list(ridge.coef_)
model_parameters.insert(0, ridge.intercept_)
model_parameters = [round(x, 3) for x in model_parameters[1]]
cols = x_train.columns
cols = cols.insert(0, "constant")
ridge_f = list(zip(cols, model_parameters))
#Let's see what is the importance given to each feature by our model
pd.DataFrame(data = {'Feature':list(list(zip(*ridge_f))[0]),'Coeff':list(list(zip(*ridge_f))[1])}).reindex(ridge_params.Coeff.abs().sort_values(ascending = False).index)

In [None]:
#Doubling the alpha for lasso to 0.0006
lasso = Lasso(alpha = 0.0006)
lasso.fit(x_train, y_train)
#predict
y_train_pred = lasso.predict(x_train)
y_test_pred = lasso.predict(x_test)
#Checking r2 score on train
print('r2 score for Train set')
print(metrics.r2_score(y_true = y_train, y_pred = y_train_pred))
print('r2 score for Test set')
print(metrics.r2_score(y_true = y_test, y_pred = y_test_pred))


In [None]:
#Lasso Model Parameters
model_parameters = list(lasso.coef_)
model_parameters.insert(0, lasso.intercept_)
model_parameters = [round(x, 3) for x in model_parameters[1:]]
cols = x_train.columns
cols = cols.insert(0, "constant")
lasso_f = list(zip(cols, model_parameters))
#Let's see what is the importance given to each feature by our model
d = {'Feature':list(list(zip(*lasso_f))[0]),'Coeff':list(list(zip(*lasso_f))[1])}
lasso_params = pd.DataFrame(data = d)
lasso_params.reindex(lasso_params.Coeff.abs().sort_values(ascending = False).index)

We see a dip in r-squared for both models, on both the train set and the test set. This is to be expected, since the models are now more strongly regularized and fit the training data less tightly than before.

## Re-Run Lasso with Reduced set of input variables

* The top 5 features from the original Lasso model were ['2ndFlrSF','LotShape','OverallQual','ExterQual','GarageFinish'].
* These will be removed from the training (and test) set, and the model will be re-tuned and re-evaluated.

In [None]:
#List of parameters (alpha values) to tune
params = {'alpha': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.05, 0.1, 
 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0, 3.0, 
 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 20, 50, 100, 500, 1000 ]}


lasso = Lasso()
#The bare `%time` line magic that used to sit here timed an empty statement (a line magic
#only times code written on its own line), so it has been removed; per-candidate fit
#times are available in model_cv.cv_results_['mean_fit_time'].
# cross validation: 5 shuffled folds, scored by r2, also recording the train scores
folds = KFold(n_splits = 5, shuffle = True, random_state = 42)
model_cv = GridSearchCV(estimator = lasso, 
                        param_grid = params, 
                        scoring= 'r2', 
                        cv = folds, 
                        return_train_score=True,
                        verbose = 1)            
model_cv.fit(x_train.drop(['2ndFlrSF','LotShape','OverallQual','ExterQual','GarageFinish'],axis=1), y_train) #top features removed

In [None]:
# One row per alpha candidate for the reduced-feature grid search
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results.head()

In [None]:
# Best value of alpha identified by the grid search
model_cv.best_params_

In [None]:
# Mean cross-validated r2 score of the best alpha
model_cv.best_score_

In [None]:
# plotting mean test and train scores against alpha
cv_results['param_alpha'] = cv_results['param_alpha'].astype('float64')
plt.figure(figsize=(16,10))

# only the candidates with alpha <= 1.0 are drawn
small_alpha = cv_results.loc[cv_results['param_alpha']<=1.0]
plt.plot(small_alpha['param_alpha'], small_alpha['mean_train_score'])
plt.plot(small_alpha['param_alpha'], small_alpha['mean_test_score'])
plt.xlabel('alpha')
plt.ylabel('R2 Score')
plt.title("R2 Score Against Alpha - Lasso Regression")
plt.legend(['train score', 'validation score'], loc='upper right')
plt.xlim(0.0001,0.05)
# plt.xscale('log')
plt.show()

In [None]:
#Selecting the optimum value of alpha as 0.0003
removed_top_features = ['2ndFlrSF','LotShape','OverallQual','ExterQual','GarageFinish']
x_train_reduced = x_train.drop(removed_top_features, axis=1)
x_test_reduced = x_test.drop(removed_top_features, axis=1)
lasso = Lasso(alpha = 0.0003)
lasso.fit(x_train_reduced, y_train)
#Predict on both splits
y_train_pred = lasso.predict(x_train_reduced)
y_test_pred = lasso.predict(x_test_reduced)
#Report the r2 score on train and test
train_r2 = metrics.r2_score(y_true = y_train, y_pred = y_train_pred)
test_r2 = metrics.r2_score(y_true = y_test, y_pred = y_test_pred)
print('r2 score for Train set', train_r2, sep='\n')
print('r2 score for Test set', test_r2, sep='\n')


In [None]:
# lasso model parameters
# The intercept goes in front of the flattened coefficients so that the values line up
# with ["constant", *features]. (Previously the intercept was sliced off while "constant"
# was still prepended to the names, which paired every coefficient with the wrong feature name.)
intercept = float(np.ravel(lasso.intercept_)[0])
model_parameters = [intercept] + [float(coef) for coef in np.ravel(lasso.coef_)]
model_parameters = [round(x, 3) for x in model_parameters]
cols = x_train.drop(['2ndFlrSF','LotShape','OverallQual','ExterQual','GarageFinish'],axis=1).columns
cols = cols.insert(0, "constant")
lasso_f = list(zip(cols, model_parameters))
lasso_f

In [None]:
d = {'Feature':list(list(zip(*lasso_f))[0]),'Coeff':list(list(zip(*lasso_f))[1])}
lasso_params = pd.DataFrame(data = d)

In [None]:
#Let's see what is the importance given to each feature by our model
lasso_params.reindex(lasso_params.Coeff.abs().sort_values(ascending = False).index)