In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
house_price = pd.read_csv("Data/housing_iter_7/housing_prices.csv")
house_price

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
house_price.drop(columns = ["Street", "BsmtExposure", "FireplaceQu", "MSSubClass", "MasVnrArea", "MoSold",
                            "YrSold", "Id", "Alley", "LotShape", "LotConfig", "LandSlope", "RoofStyle", 
                            "RoofMatl", "MasVnrType", "PoolQC", "Fence", "MiscFeature", "SaleType"], inplace = True)
house_price

Unnamed: 0,MSZoning,LotFrontage,LotArea,LandContour,Utilities,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,...,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,SaleCondition,SalePrice
0,RL,65.0,8450,Lvl,AllPub,CollgCr,Norm,Norm,1Fam,2Story,...,Y,0,61,0,0,0,0,0,Normal,208500
1,RL,80.0,9600,Lvl,AllPub,Veenker,Feedr,Norm,1Fam,1Story,...,Y,298,0,0,0,0,0,0,Normal,181500
2,RL,68.0,11250,Lvl,AllPub,CollgCr,Norm,Norm,1Fam,2Story,...,Y,0,42,0,0,0,0,0,Normal,223500
3,RL,60.0,9550,Lvl,AllPub,Crawfor,Norm,Norm,1Fam,2Story,...,Y,0,35,272,0,0,0,0,Abnorml,140000
4,RL,84.0,14260,Lvl,AllPub,NoRidge,Norm,Norm,1Fam,2Story,...,Y,192,84,0,0,0,0,0,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,62.0,7917,Lvl,AllPub,Gilbert,Norm,Norm,1Fam,2Story,...,Y,0,40,0,0,0,0,0,Normal,175000
1456,RL,85.0,13175,Lvl,AllPub,NWAmes,Norm,Norm,1Fam,1Story,...,Y,349,0,0,0,0,0,0,Normal,210000
1457,RL,66.0,9042,Lvl,AllPub,Crawfor,Norm,Norm,1Fam,2Story,...,Y,0,60,0,0,0,0,2500,Normal,266500
1458,RL,68.0,9717,Lvl,AllPub,NAmes,Norm,Norm,1Fam,1Story,...,Y,366,0,112,0,0,0,0,Normal,142125


In [5]:
house_price[["OverallQual", "OverallCond"]] = house_price[["OverallQual", "OverallCond"]].astype(str)

In [6]:
y = house_price.pop("SalePrice")
X = house_price.copy()

In [7]:
# split the dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [8]:
# select categorical and numerical column names
X_num_col = X.select_dtypes(include = "number").copy().columns
X_cat_col = X.select_dtypes(exclude = "number").copy().columns

# define ordinal and onehot columns
onehot_col = X_cat_col.get_indexer(["MSZoning", "Condition1", "Heating", "CentralAir", "Foundation", 
                                    "LandContour", "Utilities", "Neighborhood", "Condition2", "BldgType", 
                                    "HouseStyle", "Exterior1st", "Exterior2nd", "GarageType", "GarageFinish",
                                    "PavedDrive", "SaleCondition"])
ord_col = X_cat_col.get_indexer(["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtFinType1", 
                                 "KitchenQual", "OverallQual", "OverallCond", "BsmtFinType2", "HeatingQC", 
                                 "Electrical", "Functional", "GarageQual", "GarageCond"])

# define categories for all ordinal columns
exqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
excond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtfin1_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
kitqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
overqual_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
overcond_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
bsmtfin2_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
heatqc_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
elec_cat = ["SBrkr", "FuseA", "FuseF", "FuseP", "Mix", "NA"] 
func_cat = ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal", "NA"]
garqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
garcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]

# define encoder for all categorial columns
categorical_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown = "ignore"), onehot_col),
    (OrdinalEncoder(categories = [exqual_cat, excond_cat, bsmtqual_cat, bsmtcond_cat, bsmtfin1_cat, 
                                  kitqual_cat, overqual_cat, overcond_cat, bsmtfin2_cat, heatqc_cat, 
                                  elec_cat, func_cat, garqual_cat, garcond_cat]), ord_col))

# create pipeline for all categorical columns
cat_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = "NA"), categorical_encoder)

# create pipeline for all numerical columns
num_pipeline = make_pipeline(SimpleImputer(), StandardScaler())

# create pipeline for the entire preprocessing for all columns
preprocess = ColumnTransformer(transformers = [("num_pipeline", num_pipeline, X_num_col),
                                               ("cat_pipeline", cat_pipeline, X_cat_col)])

In [11]:
# create the full pipeline with linear regression
lreg_pipeline = make_pipeline(preprocess, LinearRegression())
lreg_pipeline.fit(X_train, y_train)
y_train_pred = lreg_pipeline.predict(X_train)
y_train_pred

array([234643., 232736., 150214., ..., 296870., 112318., 274661.])

In [12]:
y_train.head()

133     220000
290     233230
24      154000
1093    146000
1288    278000
Name: SalePrice, dtype: int64

In [26]:
print("MSE:", mean_squared_error(y_train, y_train_pred))
print("RMSE:", mean_squared_error(y_train, y_train_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_pred))
print("R squared:", r2_score(y_train, y_train_pred))

MSE: 720988726.2602739
RMSE: 26851.23323537066
MAE: 16773.946917808218
R squared: 0.8851154269995644


In [16]:
y_test_pred = lreg_pipeline.predict(X_test)

In [27]:
print("MSE:", mean_squared_error(y_test, y_test_pred))
print("RMSE:", mean_squared_error(y_test, y_test_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_pred))
print("R squared:", r2_score(y_test, y_test_pred))

MSE: 2.2941849822948143e+28
RMSE: 151465672094201.4
MAE: 15233477327673.771
R squared: -3.5684442446591447e+18


In [55]:
# create the full pipeline with knn regression
knn_pipeline = make_pipeline(preprocess, KNeighborsRegressor())
knn_pipeline.fit(X_train, y_train)
y_train_knn_pred = knn_pipeline.predict(X_train)
y_train_knn_pred

array([199600. , 225613. , 142700. , ..., 231912.2, 118780. , 232158. ])

In [56]:
print("MSE:", mean_squared_error(y_train, y_train_knn_pred))
print("RMSE:", mean_squared_error(y_train, y_train_knn_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_knn_pred))
print("R squared:", r2_score(y_train, y_train_knn_pred))

MSE: 1024177163.8284934
RMSE: 32002.76806509858
MAE: 17809.91609589041
R squared: 0.8368044438731513


In [57]:
y_test_knn_pred = knn_pipeline.predict(X_test)

In [58]:
print("MSE:", mean_squared_error(y_test, y_test_knn_pred))
print("RMSE:", mean_squared_error(y_test, y_test_knn_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_knn_pred))
print("R squared:", r2_score(y_test, y_test_knn_pred))

MSE: 1416911232.2036986
RMSE: 37641.88135845097
MAE: 21713.078767123283
R squared: 0.7796093701784836


In [37]:
# create the full pipeline with svm regression
svm_pipeline = make_pipeline(preprocess, SVR())
svm_pipeline.fit(X_train, y_train)
y_train_svm_pred = svm_pipeline.predict(X_train)
y_train_svm_pred

array([161351.05856678, 161356.20816139, 161249.99512245, ...,
       161356.357988  , 161202.49013532, 161358.93156894])

In [38]:
print("MSE:", mean_squared_error(y_train, y_train_svm_pred))
print("RMSE:", mean_squared_error(y_train, y_train_svm_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_svm_pred))
print("R squared:", r2_score(y_train, y_train_svm_pred))

MSE: 6641929162.748718
RMSE: 81498.03164953567
MAE: 55443.25927806568
R squared: -0.058345530199133355


In [39]:
y_test_svm_pred = svm_pipeline.predict(X_test)

In [40]:
print("MSE:", mean_squared_error(y_test, y_test_svm_pred))
print("RMSE:", mean_squared_error(y_test, y_test_svm_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_svm_pred))
print("R squared:", r2_score(y_test, y_test_svm_pred))

MSE: 6855473216.627602
RMSE: 82797.78509493839
MAE: 55727.696799261634
R squared: -0.0663208997131346


In [42]:
# create the full pipeline with decision tree regression
tree_pipeline = make_pipeline(preprocess, DecisionTreeRegressor())
tree_pipeline.fit(X_train, y_train)
y_train_tree_pred = tree_pipeline.predict(X_train)
y_train_tree_pred

array([220000., 233230., 154000., ..., 318061., 110000., 269790.])

In [43]:
print("MSE:", mean_squared_error(y_train, y_train_tree_pred))
print("RMSE:", mean_squared_error(y_train, y_train_tree_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_tree_pred))
print("R squared:", r2_score(y_train, y_train_tree_pred))

MSE: 24079.62328767123
RMSE: 155.176104112944
MAE: 6.421232876712328
R squared: 0.9999961630783691


In [44]:
y_test_tree_pred = tree_pipeline.predict(X_test)

In [45]:
print("MSE:", mean_squared_error(y_test, y_test_tree_pred))
print("RMSE:", mean_squared_error(y_test, y_test_tree_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_tree_pred))
print("R squared:", r2_score(y_test, y_test_tree_pred))

MSE: 1200883886.5034246
RMSE: 34653.77160574913
MAE: 23561.56506849315
R squared: 0.8132109125302273


In [47]:
# create the full pipeline with random forest regression
forest_pipeline = make_pipeline(preprocess, RandomForestRegressor())
forest_pipeline.fit(X_train, y_train)
y_train_forest_pred = forest_pipeline.predict(X_train)
y_train_forest_pred

array([223813.12, 236993.48, 149113.  , ..., 315406.27, 110797.  ,
       255274.14])

In [48]:
print("MSE:", mean_squared_error(y_train, y_train_forest_pred))
print("RMSE:", mean_squared_error(y_train, y_train_forest_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_forest_pred))
print("R squared:", r2_score(y_train, y_train_forest_pred))

MSE: 121330842.40304665
RMSE: 11015.028025522524
MAE: 6592.338946917809
R squared: 0.9806667684062715


In [52]:
y_test_forest_pred = forest_pipeline.predict(X_test)

In [53]:
print("MSE:", mean_squared_error(y_test, y_test_forest_pred))
print("RMSE:", mean_squared_error(y_test, y_test_forest_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_forest_pred))
print("R squared:", r2_score(y_test, y_test_forest_pred))

MSE: 793015438.0278118
RMSE: 28160.529789544296
MAE: 17617.84397260274
R squared: 0.8766519963474965
