In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, f_regression, RFECV
from sklearn.decomposition import PCA

In [19]:
# Read the housing data set from disk and show it as the cell output
housing_csv = "Data/housing_iter_7/housing_prices.csv"
house_price = pd.read_csv(housing_csv)
house_price

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [20]:
# Columns that are not used as features; they are removed from house_price in place
columns_to_discard = ["Id", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]
house_price.drop(columns = columns_to_discard, inplace = True)

In [21]:
# Remove one feature from every pair of numeric columns whose absolute
# correlation exceeds 0.8.
# Only numeric columns are passed to corr(): since pandas 2.0 DataFrame.corr()
# no longer skips string columns by default and raises on them instead.
corr_matrix = house_price.select_dtypes(include = "number").corr().abs()

# Keep only the strict upper triangle so every pair is inspected once; for a
# correlated pair it is the column appearing later in the frame that gets flagged.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype = bool), k = 1))

# The target must never be flagged as "redundant": a later cell pops SalePrice
# from house_price and would fail if it had been dropped here.
to_drop = [column for column in upper.columns
           if column != "SalePrice" and (upper[column] > 0.8).any()]
house_price.drop(columns = to_drop, inplace = True)

In [22]:
# Store the two overall ratings as strings so that they are picked up together
# with the non-numeric columns and handled by the categorical encoders
rating_columns = ["OverallQual", "OverallCond"]
house_price[rating_columns] = house_price[rating_columns].astype(str)

In [23]:
# Separate the target from the features. pop() removes SalePrice from
# house_price itself, so the copy taken afterwards contains features only.
target_column = "SalePrice"
y = house_price.pop(target_column)
X = house_price.copy()

In [24]:
# Hold out 20 % of the rows as a test set; the fixed random_state makes the
# partition identical on every run
split_parts = train_test_split(X, y, test_size = 0.2, random_state = 100)
X_train, X_test, y_train, y_test = split_parts

In [25]:
# select categorical and numerical column names
X_num_col = X.select_dtypes(include = "number").columns
X_cat_col = X.select_dtypes(exclude = "number").columns

# nominal columns (no inherent order) that are one-hot encoded
onehot_names = ["MSZoning", "Street", "LotShape", "LandContour", "Utilities", "LotConfig",
                "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType",
                "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
                "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType",
                "GarageFinish", "PavedDrive", "SaleType", "SaleCondition"]

# define categories for all ordinal columns
# "NA" is the value the categorical imputer below writes for missing entries,
# so every list ends with it
overqual_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
overcond_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
exqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
excond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtexp_cat = ["Gd", "Av", "Mn", "No", "NA"]
bsmtfin1_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
bsmtfin2_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
heatqc_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
elec_cat = ["SBrkr", "FuseA", "FuseF", "FuseP", "Mix", "NA"]
kitqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
func_cat = ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal", "NA"]
garqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
garcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]

# ordinal column -> its category order; keeping both in one mapping guarantees
# that the i-th category list always belongs to the i-th ordinal column
ordinal_categories = {
    "OverallQual": overqual_cat,
    "OverallCond": overcond_cat,
    "ExterQual": exqual_cat,
    "ExterCond": excond_cat,
    "BsmtQual": bsmtqual_cat,
    "BsmtCond": bsmtcond_cat,
    "BsmtExposure": bsmtexp_cat,
    "BsmtFinType1": bsmtfin1_cat,
    "BsmtFinType2": bsmtfin2_cat,
    "HeatingQC": heatqc_cat,
    "Electrical": elec_cat,
    "KitchenQual": kitqual_cat,
    "Functional": func_cat,
    "GarageQual": garqual_cat,
    "GarageCond": garcond_cat,
}
ord_names = list(ordinal_categories)

# Index.get_indexer() returns -1 for a label it cannot find, and -1 used as a
# column position would silently select the LAST categorical column.
# Fail loudly instead of encoding the wrong column.
missing_cols = [name for name in onehot_names + ord_names if name not in X_cat_col]
if missing_cols:
    raise KeyError(f"Categorical columns not found in X: {missing_cols}")

# define ordinal and onehot columns
# positions within X_cat_col (not names) are used because the encoder sits
# behind SimpleImputer in cat_pipeline, which by default hands over a plain array
onehot_col = X_cat_col.get_indexer(onehot_names)
ord_col = X_cat_col.get_indexer(ord_names)

# define encoder for all categorial columns
categorical_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown = "ignore"), onehot_col),
    (OrdinalEncoder(categories = list(ordinal_categories.values())), ord_col))

# create pipeline for all categorical columns
# missing values become the literal category "NA" before encoding
cat_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = "NA"), categorical_encoder)

# create pipeline for all numerical columns
num_pipeline = make_pipeline(SimpleImputer(), StandardScaler(), PCA())

# create pipeline for the entire preprocessing for all columns
# the transformer names given here ("num_pipeline", "cat_pipeline") are part of
# the parameter paths used when tuning the pipeline
preprocess = ColumnTransformer(transformers = [("num_pipeline", num_pipeline, X_num_col),
                                               ("cat_pipeline", cat_pipeline, X_cat_col)])

In [26]:
# create a full pipeline with the random forest model
# random_state pins the forest's internal randomness so the metrics reported
# below are the same on every Restart & Run All
pca_pipeline = make_pipeline(preprocess, RandomForestRegressor(random_state = 100))
pca_pipeline.fit(X_train, y_train)

# predictions on the very rows the model was fitted on
y_train_pca_pred = pca_pipeline.predict(X_train)

In [27]:
# Error metrics of the untuned pipeline on the training data.
# The MSE is computed once and reused for the RMSE.
train_pca_mse = mean_squared_error(y_train, y_train_pca_pred)
train_pca_metrics = {
    "MSE:": train_pca_mse,
    "RMSE:": train_pca_mse**0.5,
    "MAE:": mean_absolute_error(y_train, y_train_pca_pred),
    "R squared:": r2_score(y_train, y_train_pca_pred),
}
for metric_name, metric_value in train_pca_metrics.items():
    print(metric_name, metric_value)

MSE: 168878967.00579807
RMSE: 12995.344051074526
MAE: 6742.606386986301
R squared: 0.9730903032092462


In [28]:
# Predict the held-out test rows with the untuned pipeline
y_test_pca_pred = pca_pipeline.predict(X_test)

In [29]:
# Error metrics of the untuned pipeline on the held-out test data.
# The MSE is computed once and reused for the RMSE.
test_pca_mse = mean_squared_error(y_test, y_test_pca_pred)
test_pca_metrics = {
    "MSE:": test_pca_mse,
    "RMSE:": test_pca_mse**0.5,
    "MAE:": mean_absolute_error(y_test, y_test_pca_pred),
    "R squared:": r2_score(y_test, y_test_pca_pred),
}
for metric_name, metric_value in test_pca_metrics.items():
    print(metric_name, metric_value)

MSE: 886586490.2601404
RMSE: 29775.602265279882
MAE: 18071.866506849317
R squared: 0.8620976737718531


In [30]:
# Grid search over the preprocessing + random forest pipeline.
# Parameter paths follow scikit-learn's "<step>__<parameter>" convention; the
# two prefixes below address the numeric sub-pipeline and the forest itself.
numeric_prefix = "columntransformer__num_pipeline__"
forest_prefix = "randomforestregressor__"

# Further forest settings (criterion, min_samples_leaf, max_features, bootstrap)
# are not part of the search and stay at their defaults.
forest_param_grid = {
    numeric_prefix + "simpleimputer__strategy": ["mean", "median"],
    numeric_prefix + "pca__n_components": [0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
    forest_prefix + "n_estimators": range(90, 111, 10),
    forest_prefix + "max_depth": range(1, 10),
}

# 5-fold cross-validation for every parameter combination, with progress messages
forest_search = GridSearchCV(estimator = pca_pipeline,
                             param_grid = forest_param_grid,
                             cv = 5,
                             verbose = 1)
forest_search.fit(X_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler()),
                                                                                         ('pca',
                                                                                          PCA())]),
                                                                         Index(['MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'Bsm

In [31]:
# Parameter combination that achieved the best mean cross-validated score
forest_search.best_params_

{'columntransformer__num_pipeline__pca__n_components': 0.7,
 'columntransformer__num_pipeline__simpleimputer__strategy': 'mean',
 'randomforestregressor__max_depth': 8,
 'randomforestregressor__n_estimators': 100}

In [32]:
# Mean cross-validated score of the best parameter combination; no `scoring`
# was passed to GridSearchCV, so this is the pipeline's default score
# (R squared for a regressor)
forest_search.best_score_

0.8256463191404034

In [33]:
# Training-set predictions from the grid search; predict() delegates to the
# best pipeline, which GridSearchCV refits on the full training data by default
y_train_forest_pred = forest_search.predict(X_train)

In [34]:
# Error metrics of the tuned pipeline on the training data.
# The MSE is computed once and reused for the RMSE.
train_forest_mse = mean_squared_error(y_train, y_train_forest_pred)
train_forest_metrics = {
    "MSE:": train_forest_mse,
    "RMSE:": train_forest_mse**0.5,
    "MAE:": mean_absolute_error(y_train, y_train_forest_pred),
    "R squared:": r2_score(y_train, y_train_forest_pred),
}
for metric_name, metric_value in train_forest_metrics.items():
    print(metric_name, metric_value)

MSE: 273350979.371007
RMSE: 16533.329349257125
MAE: 10735.734544686517
R squared: 0.9564434097226753


In [35]:
# Predict the held-out test rows with the best pipeline found by the grid search
y_test_forest_pred = forest_search.predict(X_test)

In [36]:
# Error metrics of the tuned pipeline on the held-out test data.
# The MSE is computed once and reused for the RMSE.
test_forest_mse = mean_squared_error(y_test, y_test_forest_pred)
test_forest_metrics = {
    "MSE:": test_forest_mse,
    "RMSE:": test_forest_mse**0.5,
    "MAE:": mean_absolute_error(y_test, y_test_forest_pred),
    "R squared:": r2_score(y_test, y_test_forest_pred),
}
for metric_name, metric_value in test_forest_metrics.items():
    print(metric_name, metric_value)

MSE: 909546318.2032887
RMSE: 30158.68561796566
MAE: 18427.799574495104
R squared: 0.8585264331563671
