In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [88]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, make_column_selector

In [122]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [131]:
# Load the pre-processed training table from the working directory.
df = pd.read_csv(filepath_or_buffer='./transformed_datset')

In [132]:
# Remove the 'Unnamed: 0' column (presumably an index saved by an earlier
# to_csv call -- verify against the notebook that wrote the file).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [133]:
# Feature matrix: the subset of columns of `df` used as model inputs.
# NOTE(review): 'PricePerSF' and 'NeighborhoodMedianPrices' look like they may
# be derived from SalePrice; if so they leak the target into the features --
# confirm how they were computed before trusting the scores below.
dfx = df.loc[:, [
    'LotArea', 'OverallQual', 'MasVnrArea', 'BsmtFinType1', 'TotalBsmtSF',
    'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'KitchenQual',
    'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageCars', 'GarageArea',
    'GarageQual', 'GarageCond', 'TotalBathrooms', 'HouseAge',
    'NewHouse', 'OldHouse', 'RemodeledAge', 'TotalSquareFootage',
    'TotalOutdoorSpace', 'PricePerSF', 'QualityScore', 'HasPorch',
    'HasGarage', 'HasFirePlace', 'NeighborhoodMedianPrices',
    'LotShapeScore', 'ExternalQualityScore', 'BsmtQaulityScore',
    'BsmtConditionScore', 'MSZoning', 'LotConfig', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'BsmtExposure', 'Heating', 'CentralAir', 'Electrical',
    'Functional', 'GarageType', 'GarageFinish', 'PavedDrive', 'Fence',
    'SaleType', 'SaleCondition', 'SeasonOfSale',
]]

In [134]:
# Target kept as a one-column DataFrame (2-D), which is the shape the
# scaler fitted on it expects.
dfy = df.loc[:, ['SalePrice']]

In [135]:
# Pipeline applied to the non-numeric columns: one-hot encode each category.
# handle_unknown='ignore' makes transform() encode a category that was not
# seen during fit as an all-zero row instead of raising, so the fitted
# preprocessing can later be applied to data other than the training frame.
# The encoding of the training data itself is unchanged.
cate_pipe = Pipeline([('categorical', OneHotEncoder(handle_unknown='ignore'))])

In [136]:
# Pipeline applied to the numeric columns: standardize each one.
num_pipe = Pipeline(steps=[('numerical', StandardScaler())])

In [137]:
# Route columns to a pipeline by dtype: numeric dtypes go to `num_pipe`,
# everything else goes to `cate_pipe`. Transformer order fixes the column
# order of the output (numeric block first, then the one-hot block).
numeric_columns = make_column_selector(dtype_include=np.number)
categorical_columns = make_column_selector(dtype_exclude=np.number)
preprocessing = make_column_transformer(
    (num_pipe, numeric_columns),
    (cate_pipe, categorical_columns),
)

In [138]:
# Fit the column transformer on the feature frame and encode it in one step.
X = preprocessing.fit_transform(X=dfx)

In [139]:
# Display the transformed feature matrix (NumPy abbreviates the repr).
X

array([[-0.09390025,  0.68901862,  1.24186441, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.17643593, -0.0509812 , -0.81430121, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.51247381,  0.68901862,  1.20564279, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 0.0495633 ,  0.68901862, -0.81430121, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.20210142, -0.79098101, -0.81430121, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.24953531, -0.79098101, -0.81430121, ...,  0.        ,
         0.        ,  0.        ]])

In [140]:
# Standardize the target with its own scaler; the fitted scaler is kept so
# predictions can be mapped back with inverse_transform later.
y_scaler = StandardScaler().fit(dfy)
y = y_scaler.transform(dfy)

In [141]:
# Display the standardized target (a single-column 2-D array).
y

array([[ 0.61404908],
       [ 0.25373484],
       [ 0.79454581],
       ...,
       [ 1.2517165 ],
       [-0.38162809],
       [-0.28518336]])

In [142]:
from sklearn.linear_model import Lasso

In [143]:
# L1-regularized linear regression; alpha is the penalty strength and
# max_iter caps the number of solver iterations.
lasso_reg = Lasso(
    alpha=0.01,
    max_iter=2000,
)

In [144]:
# Fit on the full encoded feature matrix and the standardized target.
lasso_reg.fit(X=X, y=y)

In [145]:
# Predict on the same matrix the model was fitted on, so any score computed
# from `y_pred` is an in-sample (training) score.
y_pred = lasso_reg.predict(X=X)

In [146]:
# Display the predictions in standardized target units (1-D array).
y_pred

array([ 0.60554   ,  0.24846467,  0.78337466, ...,  1.23384106,
       -0.37931016, -0.28325469])

RMSE on the scaled (standardized) data


In [147]:
# Training RMSE in standardized target units.
# Arguments follow the documented (y_true, y_pred) order, matching the other
# metric calls in this notebook; the unweighted RMSE value itself does not
# depend on the order.
root_mean_squared_error(y, y_pred)

0.014262132669316958

In [111]:
# Undo the target standardization; reshape(-1, 1) turns the 1-D predictions
# into the single-column 2-D shape the scaler expects.
# NOTE(review): this cell's execution count (111) is lower than that of the
# cells that fit the scaler and model (131-147), so the recorded outputs from
# here on may reflect an earlier kernel state -- re-run the notebook top to
# bottom to confirm them.
y_pred_o = y_scaler.inverse_transform(y_pred.reshape(-1,1))

In [113]:
# Exponentiate the unscaled predictions, presumably to undo a log transform
# applied to SalePrice upstream -- TODO confirm.
# Caution: this rebinds `y_pred_o` to exp of itself, so running the cell a
# second time exponentiates twice; re-run the cell that creates `y_pred_o`
# first.
y_pred_o = np.exp(y_pred_o)

In [114]:
# Display the predictions after unscaling and exponentiation.
y_pred_o

array([[201626.89474181],
       [178444.43725854],
       [213669.92453892],
       ...,
       [248666.19508334],
       [143793.90896815],
       [148960.2785173 ]])

In [116]:
# Undo the target standardization on the true values; reshape(-1, 1) forces
# the single-column 2-D shape the scaler expects.
y_o = y_scaler.inverse_transform(y.reshape(-1,1))

In [119]:
# Exponentiate the unscaled target, presumably to undo a log transform
# applied to SalePrice upstream -- TODO confirm.
# Caution: this rebinds `y_o` to exp of itself, so running the cell a second
# time exponentiates twice; re-run the cell that creates `y_o` first.
y_o = np.exp(y_o)

In [120]:
# Training MSE after mapping target and predictions back from the scaled space.
mean_squared_error(y_true=y_o, y_pred=y_pred_o)

147107053.5275429

In [121]:
# Training RMSE obtained as the square root of the MSE.
np.sqrt(mean_squared_error(y_true=y_o, y_pred=y_pred_o))

12128.769662564415

In [123]:
# Training RMSE computed directly with scikit-learn's RMSE helper.
root_mean_squared_error(y_true=y_o, y_pred=y_pred_o)

12128.769662564415

In [126]:
# Training RMSE expressed as a percentage of the mean of `y_o`.
rmse_original = root_mean_squared_error(y_o, y_pred_o)
mean_target = np.mean(y_o)
rmse_original / mean_target * 100

6.8427090588942345

In [125]:
# Mean of the back-transformed target, used as the scale reference for the
# relative RMSE.
y_o.mean()

177250.9916492693

In [127]:
# Load the competition test set from the shared Datasets directory.
# NOTE(review): the preview of this frame shows raw columns (e.g. MSSubClass,
# LotFrontage); the engineered features selected for training are not visible
# there -- confirm the same feature engineering is applied before passing
# this frame to the fitted preprocessing.
dftest = pd.read_csv(
    filepath_or_buffer='../../../../Datasets/home-data-for-ml-course/test.csv'
)

In [128]:
# Display the loaded test frame (pandas truncates the rows and columns shown).
dftest

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
