In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score

# Never abbreviate printed NumPy arrays with "...": an infinite threshold
# means every element is printed regardless of the array's size.
np.set_printoptions(threshold=np.inf)

In [2]:
# Load the training data; "train.csv" is resolved relative to the working directory.
data = pd.read_csv("train.csv")
# Preview the first rows.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Summarise each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


## Print the fraction of null values in each column

In [5]:
# Count the missing entries of every column in one pass, then report each
# count as a fraction of the total number of rows.
n_rows = len(data)
for col, n_missing in data.isna().sum().items():
    print(col, ": ", n_missing / n_rows)

Id :  0.0
MSSubClass :  0.0
MSZoning :  0.0
LotFrontage :  0.1773972602739726
LotArea :  0.0
Street :  0.0
Alley :  0.9376712328767123
LotShape :  0.0
LandContour :  0.0
Utilities :  0.0
LotConfig :  0.0
LandSlope :  0.0
Neighborhood :  0.0
Condition1 :  0.0
Condition2 :  0.0
BldgType :  0.0
HouseStyle :  0.0
OverallQual :  0.0
OverallCond :  0.0
YearBuilt :  0.0
YearRemodAdd :  0.0
RoofStyle :  0.0
RoofMatl :  0.0
Exterior1st :  0.0
Exterior2nd :  0.0
MasVnrType :  0.005479452054794521
MasVnrArea :  0.005479452054794521
ExterQual :  0.0
ExterCond :  0.0
Foundation :  0.0
BsmtQual :  0.025342465753424658
BsmtCond :  0.025342465753424658
BsmtExposure :  0.026027397260273973
BsmtFinType1 :  0.025342465753424658
BsmtFinSF1 :  0.0
BsmtFinType2 :  0.026027397260273973
BsmtFinSF2 :  0.0
BsmtUnfSF :  0.0
TotalBsmtSF :  0.0
Heating :  0.0
HeatingQC :  0.0
CentralAir :  0.0
Electrical :  0.0006849315068493151
1stFlrSF :  0.0
2ndFlrSF :  0.0
LowQualFinSF :  0.0
GrLivArea :  0.0
BsmtFullBath :  0

## Find the columns with more than 90% null values

In [6]:
# Fraction of missing entries per column; keep the names of the columns
# where more than 90% of the rows are missing.
null_fraction = data.isna().mean()
null_columns = null_fraction[null_fraction > 0.90].index.tolist()
print(null_columns)

['Alley', 'PoolQC', 'MiscFeature']


## Drop null columns from data frame

In [7]:
# Remove the mostly-empty columns listed in `null_columns`, then preview the result.
data = data.drop(columns=null_columns)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,,0,12,2008,WD,Normal,250000


## Drop the `Id` column

In [8]:
# `Id` is only a row identifier, so remove it before modelling.
data = data.drop(columns=["Id"])
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,3SsnPorch,ScreenPorch,PoolArea,Fence,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,,0,12,2008,WD,Normal,250000


In [9]:
# Snapshot of the column labels after the drops above (still includes the
# `SalePrice` target); reused later when scaling the Lasso train/test frames.
columns = data.columns
columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'WoodDeckSF'

# Replace all missing numeric values with the column mean

In [10]:
# Labels of every column holding a numeric dtype.
numeric_columns = data.select_dtypes(include=["number"]).columns

In [11]:
# Per-column means (NaNs are skipped when averaging) used as the fill value
# for the missing numeric entries.
numeric_means = data[numeric_columns].mean()
data[numeric_columns] = data[numeric_columns].fillna(numeric_means)

# Convert categorical variables to numeric variables

In [12]:
# Labels of every column stored with the generic `object` dtype (the text columns).
categorical_columns = data.select_dtypes(include=["object"]).columns
categorical_columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition'],
      dtype='object')

In [13]:
# Replace each text column with integer codes, using a fresh encoder per column.
# NOTE(review): missing entries are handed to the encoder as-is in a plain
# Python list — confirm how they end up being encoded.
for col in categorical_columns:
    raw_values = data[col].tolist()
    label = LabelEncoder()
    data[col] = label.fit_transform(raw_values)

## Find the features most highly correlated with `SalePrice`

In [14]:
# Number of entries to keep; the target itself is part of the ranking.
k = 15
# Pairwise correlation between every pair of columns.
corr_mat = data.corr()
# Labels of the k columns with the largest correlation to SalePrice.
top_corr_cols = corr_mat["SalePrice"].nlargest(k).index
top_corr_cols

Index(['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea',
       'TotalBsmtSF', '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'GarageYrBlt', 'Fireplaces',
       'BsmtFinSF1'],
      dtype='object')

# Location is strongly related to house prices, so let's add `Neighborhood` to `top_corr_cols`

In [15]:
# Merge the location column into the selected labels (set union of two indexes).
location_cols = pd.Index(["Neighborhood"])
top_corr_cols = top_corr_cols.union(location_cols)
top_corr_cols

Index(['1stFlrSF', 'BsmtFinSF1', 'Fireplaces', 'FullBath', 'GarageArea',
       'GarageCars', 'GarageYrBlt', 'GrLivArea', 'MasVnrArea', 'Neighborhood',
       'OverallQual', 'SalePrice', 'TotRmsAbvGrd', 'TotalBsmtSF', 'YearBuilt',
       'YearRemodAdd'],
      dtype='object')

In [None]:
# Pairwise scatter plots / distributions of the selected columns.
# pairplot expects one row per observation; passing `.corr()` instead would
# scatter the correlation coefficients against each other, which says nothing
# about the relationships between the features themselves.
sns.pairplot(data[top_corr_cols])

<seaborn.axisgrid.PairGrid at 0x7fb10835a100>

In [None]:
# Ridge only sees the selected columns (the SalePrice target is among them).
ridge_data = data.loc[:, top_corr_cols]

## Split the data into train and test sets

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# Both parts are assigned into column-wise when they are scaled, so take
# explicit copies: writing into the raw split outputs can trigger pandas'
# SettingWithCopyWarning because they are derived from `ridge_data`.
ridge_train, ridge_test = (
    part.copy()
    for part in train_test_split(ridge_data, test_size=0.25, random_state=100)
)

## Scale the data using MinMaxScaler

In [None]:
# Learn the per-column min/max from the training rows only, then rescale
# those rows (the SalePrice target included).
scaler = MinMaxScaler().fit(ridge_train[top_corr_cols])
ridge_train[top_corr_cols] = scaler.transform(ridge_train[top_corr_cols])

In [None]:
# Target: the scaled sale price. Features: every other selected column.
y_train_ridge = ridge_train["SalePrice"]
X_train_ridge = ridge_train[top_corr_cols].drop(columns="SalePrice")

In [None]:
# Preview the scaled training features instead of rendering the whole frame.
X_train_ridge.head()

## Transform the test data

In [None]:
# Apply the scaling learned on the training rows; the scaler is not refit here.
scaled_test_values = scaler.transform(ridge_test[top_corr_cols])
ridge_test[top_corr_cols] = scaled_test_values

In [None]:
# Same target/feature separation as for the training frame.
y_test_ridge = ridge_test["SalePrice"]
X_test_ridge = ridge_test[top_corr_cols].drop(columns="SalePrice")

## Ridge Regression

In [None]:
# Candidate regularisation strengths to compare.
lambdas = [0, 0.001, 0.01, 0.1, 1, 10, 100]
for i in lambdas:
    # Fit a Ridge model with this lambda on the scaled training features
    ridgereg = Ridge(alpha=i).fit(X_train_ridge, y_train_ridge)

    # Score it on the held-out rows and show the fitted coefficients.
    # NOTE(review): lambda is compared on the test split itself; consider a
    # separate validation split or cross-validation for model selection.
    y_pred_ridge = ridgereg.predict(X_test_ridge)
    test_r2 = r2_score(y_test_ridge, y_pred_ridge)
    print(i, "->", str(test_r2), ridgereg.coef_)
    

### Optimal value of lambda for `Ridge` regression is `0.01`
### Let's double it and see the model performance

In [None]:
# Refit Ridge with lambda = 0.02 on the scaled training features.
ridgereg = Ridge(alpha=0.02).fit(X_train_ridge, y_train_ridge)

# Predict the held-out rows and report the fit quality. All error metrics
# are in the scaled SalePrice units.
y_pred_ridge = ridgereg.predict(X_test_ridge)
residuals_ridge = y_pred_ridge - y_test_ridge
mse = mean_squared_error(y_test_ridge, y_pred_ridge)

print(ridgereg.coef_)
print("R2: ", r2_score(y_test_ridge, y_pred_ridge))
print("RSS: ", np.sum(np.square(residuals_ridge)))
print("MSE: ", mse)
print("RMSE: ", mse**0.5)

# Lasso Regression

## Test train split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# Both parts are modified later (column-wise scaling, popping the target), so
# take explicit copies: writing into the raw split outputs can trigger pandas'
# SettingWithCopyWarning because they are derived from `data`.
lasso_train, lasso_test = (
    part.copy()
    for part in train_test_split(data, test_size=0.3, random_state=100)
)

## scale the train data

In [None]:
# Fit a fresh scaler for the Lasso experiment. It is bound to `scaler`
# because that is the name the test-set transform uses; previously a new
# object was created under the unused name `scale` while the scaler left over
# from the Ridge section was silently refit, which fails on a fresh kernel
# if the Ridge cells have not been run.
scaler = MinMaxScaler()
lasso_train[columns] = scaler.fit_transform(lasso_train[columns])

In [None]:
# X_train_lasso is the same object as lasso_train; popping the target
# removes SalePrice from that frame in place and returns it as a Series.
X_train_lasso = lasso_train
y_train_lasso = X_train_lasso.pop("SalePrice")

## transform the test data

In [None]:
# Apply the already-fitted scaler to the held-out rows; it is not refit here.
scaled_test_values = scaler.transform(lasso_test[columns])
lasso_test[columns] = scaled_test_values

In [None]:
# X_test_lasso is the same object as lasso_test; popping the target
# removes SalePrice from that frame in place and returns it as a Series.
X_test_lasso = lasso_test
y_test_lasso = X_test_lasso.pop("SalePrice")

## Lasso performs feature selection on its own, so let's pass all the features

In [None]:
# Candidate regularisation strengths; 0 means no penalty at all.
lambdas = [0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
for i in lambdas:
    # With alpha=0 the Lasso objective is ordinary least squares. scikit-learn
    # advises against Lasso(alpha=0) for numerical reasons and recommends
    # LinearRegression instead, so use it for the unpenalised baseline.
    lasso_reg = LinearRegression() if i == 0 else Lasso(alpha = i)
    lasso_reg.fit(X_train_lasso, y_train_lasso)

    # Compute the r2 score on the held-out rows
    y_pred_lasso = lasso_reg.predict(X_test_lasso)
    print(i, "->", r2_score(y_test_lasso, y_pred_lasso))

### The R² value is the same for all the lambda values, so let's go with a lambda of `0.00001`
### Let's double it (`0.00002`) and see the model performance

In [None]:
# Fit Lasso with lambda = 0.00002 on the scaled training features.
lasso_reg = Lasso(alpha=0.00002).fit(X_train_lasso, y_train_lasso)

# Predict the held-out rows and report the fit quality. All error metrics
# are in the scaled SalePrice units.
y_pred_lasso = lasso_reg.predict(X_test_lasso)
residuals_lasso = y_pred_lasso - y_test_lasso
mse = mean_squared_error(y_test_lasso, y_pred_lasso)

print(lasso_reg.coef_)
print("R2: ", r2_score(y_test_lasso, y_pred_lasso))
print("RSS: ", np.sum(np.square(residuals_lasso)))
print("MSE: ", mse)
print("RMSE: ", mse**0.5)

## Delete the top 5 predictors and retrain the Lasso model

In [None]:
# Hard-coded list of the features to remove before retraining.
# NOTE(review): presumably the five largest coefficients of the Lasso fit
# above — confirm against the printed `lasso_reg.coef_`.
top_5_predictors = ["GrLivArea", "OverallQual", "BsmtFinSF1", "TotalBsmtSF", "MasVnrArea"]

In [None]:
# Remove the listed predictors from the training features (returns a new frame).
X_train_lasso = X_train_lasso.drop(columns=top_5_predictors)

In [None]:
# Remove the same predictors from the held-out features (returns a new frame).
X_test_lasso = X_test_lasso.drop(columns=top_5_predictors)

In [None]:
# Preview the reduced training features instead of rendering the whole frame.
X_train_lasso.head()

In [None]:
# Refit Lasso (lambda = 0.00001) on the features that remain after the drop.
lasso_reg = Lasso(alpha=0.00001).fit(X_train_lasso, y_train_lasso)

# Predict the held-out rows and report the fit quality. All error metrics
# are in the scaled SalePrice units.
y_pred_lasso = lasso_reg.predict(X_test_lasso)
residuals_lasso = y_pred_lasso - y_test_lasso
mse = mean_squared_error(y_test_lasso, y_pred_lasso)

# Coefficients are not printed here; to inspect them: print(list(lasso_reg.coef_))
print("R2: ", r2_score(y_test_lasso, y_pred_lasso))
print("RSS: ", np.sum(np.square(residuals_lasso)))
print("MSE: ", mse)
print("RMSE: ", mse**0.5)