In [433]:
import pandas as pd
import os
import pathlib
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [434]:
# Set universal paths to enable easier navigation
root = pathlib.Path.cwd()
root_parent = root.parent

#Declare the data path
data = root_parent/"data"

In [435]:
#Import the data Frame
df = pd.read_csv(data/"housing.csv")
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [436]:
#Choice of quantitative columns for normalization
columns = ["LotArea", "YearBuilt", "OverallQual", "1stFlrSF", "2ndFlrSF", "SalePrice"]

In [437]:
#Create a new dataframe based on the columns to be normalized
numerical_df = df[columns]
numerical_df

Unnamed: 0,LotArea,YearBuilt,OverallQual,1stFlrSF,2ndFlrSF,SalePrice
0,8450,2003,7,856,854,208500
1,9600,1976,6,1262,0,181500
2,11250,2001,7,920,866,223500
3,9550,1915,7,961,756,140000
4,14260,2000,8,1145,1053,250000
...,...,...,...,...,...,...
1455,7917,1999,6,953,694,175000
1456,13175,1978,6,2073,0,210000
1457,9042,1941,7,1188,1152,266500
1458,9717,1950,5,1078,0,142125


In [438]:
#create a correlation matrix to identify the features to select 
correlation_matrix = numerical_df.corr()
correlation_matrix.style.background_gradient(cmap = "gray")

Unnamed: 0,LotArea,YearBuilt,OverallQual,1stFlrSF,2ndFlrSF,SalePrice
LotArea,1.0,0.014228,0.105806,0.299475,0.050986,0.263843
YearBuilt,0.014228,1.0,0.572323,0.281986,0.010308,0.522897
OverallQual,0.105806,0.572323,1.0,0.476224,0.295493,0.790982
1stFlrSF,0.299475,0.281986,0.476224,1.0,-0.202646,0.605852
2ndFlrSF,0.050986,0.010308,0.295493,-0.202646,1.0,0.319334
SalePrice,0.263843,0.522897,0.790982,0.605852,0.319334,1.0


In [439]:
#Select the independent variables from the new dataframe
X_columns = ["LotArea", "YearBuilt", "OverallQual", "1stFlrSF", "2ndFlrSF"]
X_quantitative = numerical_df[X_columns]

In [440]:
#Select the target variable from the new dataframe
y_columns = ["SalePrice"]
y = numerical_df[y_columns]

In [441]:
X_norm = X_quantitative.copy()
y=norm = y.copy()

In [442]:
# Normalize all the Quantitative rows in the X variable
for col in X_norm:
  X_norm[col] = ((X_norm[col]-X_norm[col].min())/(X_norm[col].max()-X_norm[col].min()))
X_norm.shape

(1460, 5)

In [443]:
#Get categorical dataframe
categorical_df = df.copy()

In [444]:
#Get the list of all the columns with at list a single null row
columns = []
for col in categorical_df:
  if categorical_df[col].isnull().any()==True:
    # print(col)
    columns.append(col)
print(columns)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [445]:
#Drop columns with atleas a single null value in the rows
for col in columns:
  categorical_df.drop([col], axis=1, inplace=True)
categorical_df

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,112,0,0,0,0,4,2010,WD,Normal,142125


In [446]:
#Get at least one categorical variable for one hot encoding
X_enconder = categorical_df[["Neighborhood"]]
X_enconder

Unnamed: 0,Neighborhood
0,CollgCr
1,Veenker
2,CollgCr
3,Crawfor
4,NoRidge
...,...
1455,Gilbert
1456,NWAmes
1457,Crawfor
1458,NAmes


In [447]:
# X_merged = X_merged[["Neighborhood"]].values
# X_merged

In [448]:
# categorical = X_merged[["Neighborhood"]]

In [449]:
# Instanciate the one hot encoder 
hot_encoder = OneHotEncoder(sparse=False, handle_unknown= "ignore")

In [450]:
#Fit the one hot encoder on the selected categorical variable
X_enconder = hot_encoder.fit_transform(X_enconder)
X_enconder = pd.DataFrame(X_enconder)
X_enconder

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1456,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [451]:
# Merge the datasets of the X variables
X_merged = pd.concat([X_norm, X_enconder], axis='columns')
X_merged.shape 

(1460, 30)

In [452]:
#Select the categorical varib
X_merged

Unnamed: 0,LotArea,YearBuilt,OverallQual,1stFlrSF,2ndFlrSF,0,1,2,3,4,...,15,16,17,18,19,20,21,22,23,24
0,0.033420,0.949275,0.666667,0.119780,0.413559,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.038795,0.753623,0.555556,0.212942,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.046507,0.934783,0.666667,0.134465,0.419370,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.038561,0.311594,0.666667,0.143873,0.366102,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.060576,0.927536,0.777778,0.186095,0.509927,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.030929,0.920290,0.555556,0.142038,0.336077,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1456,0.055505,0.768116,0.555556,0.399036,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1457,0.036187,0.500000,0.666667,0.195961,0.557869,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1458,0.039342,0.565217,0.444444,0.170721,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [453]:
# Instantiate regression model
model = LinearRegression()

In [454]:
# Fit the regression model
model.fit(X_merged,y)

LinearRegression()

In [455]:
# Check the model Score
model.score(X_merged,y)

0.8069983084396559