In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile, f_regression, RFECV
from sklearn.decomposition import PCA

In [31]:
house_price = pd.read_csv("Data/kaggle_comp_train.csv")
house_price

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [32]:
house_price.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [33]:
house_price.drop(columns = ["Id", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"], inplace = True)

In [34]:
# drop columns with correlation higher than 0.8
corr_matrix = house_price.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
house_price.drop(columns = to_drop, inplace = True)

In [35]:
house_price[["OverallQual", "OverallCond"]] = house_price[["OverallQual", "OverallCond"]].astype(str)

In [36]:
y = house_price.pop("SalePrice")
X = house_price.copy()

In [37]:
# split the dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [73]:
# select categorical and numerical column names
X_num_col = X.select_dtypes(include = "number").copy().columns
X_cat_col = X.select_dtypes(exclude = "number").copy().columns

# define ordinal and onehot columns
onehot_col = X_cat_col.get_indexer(["MSZoning", "Street", "LotShape", "LandContour", "Utilities", "LotConfig",
                                    "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", 
                                    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                                    "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", 
                                    "GarageFinish", "PavedDrive", "SaleType", "SaleCondition"])
ord_col = X_cat_col.get_indexer(["OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual", 
                                 "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", 
                                 "Electrical", "KitchenQual", "Functional", "GarageQual", "GarageCond"])

# define categories for all ordinal columns
overqual_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
overcond_cat = ["10", "9", "8", "7", "6", "5", "4", "3", "2", "1", "NA"]
exqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
excond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
bsmtexp_cat = ["Gd", "Av", "Mn", "No", "NA"]
bsmtfin1_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
bsmtfin2_cat = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]
heatqc_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
elec_cat = ["SBrkr", "FuseA", "FuseF", "FuseP", "Mix", "NA"]
kitqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"] 
func_cat = ["Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal", "NA"]
garqual_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]
garcond_cat = ["Ex", "Gd", "TA", "Fa", "Po", "NA"]

# define encoder for all categorial columns
categorical_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown = "ignore"), onehot_col),
    (OrdinalEncoder(categories = [overqual_cat, overcond_cat, exqual_cat, excond_cat, bsmtqual_cat, 
                                  bsmtcond_cat, bsmtexp_cat, bsmtfin1_cat, bsmtfin2_cat, heatqc_cat, elec_cat,
                                  kitqual_cat, func_cat, garqual_cat, garcond_cat]), ord_col))

# create pipeline for all categorical columns
cat_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = "NA"), categorical_encoder)

# create pipeline for all numerical columns
num_pipeline = make_pipeline(SimpleImputer(), StandardScaler(), PCA(n_components = 0.9))

# create pipeline for the entire preprocessing for all columns
preprocess = ColumnTransformer(transformers = [("num_pipeline", num_pipeline, X_num_col),
                                               ("cat_pipeline", cat_pipeline, X_cat_col)])

In [39]:
# create a full pipeline with the random forest model
forest_pipeline = make_pipeline(preprocess, RandomForestRegressor())
forest_pipeline.fit(X_train, y_train)
y_train_forest_pred = forest_pipeline.predict(X_train)

In [40]:
print("MSE:", mean_squared_error(y_train, y_train_forest_pred))
print("RMSE:", mean_squared_error(y_train, y_train_forest_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_forest_pred))
print("R squared:", r2_score(y_train, y_train_forest_pred))

MSE: 136337690.57695293
RMSE: 11676.37317735918
MAE: 6668.284092465754
R squared: 0.9782755308157974


In [41]:
y_test_forest_pred = forest_pipeline.predict(X_test)

In [42]:
print("MSE:", mean_squared_error(y_test, y_test_forest_pred))
print("RMSE:", mean_squared_error(y_test, y_test_forest_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_forest_pred))
print("R squared:", r2_score(y_test, y_test_forest_pred))

MSE: 732118505.6361114
RMSE: 27057.688475479783
MAE: 17297.892773972606
R squared: 0.8861240881616967


In [58]:
forest_param_grid = {# "columntransformer__num_pipeline__simpleimputer__strategy": ["mean", "median"],
                     "randomforestregressor__n_estimators": range(90, 111, 10),
                     "randomforestregressor__criterion": ["squared_error", "absolute_error", "poisson"],
                     "randomforestregressor__max_depth": range(1, 10),
                     # "randomforestregressor__min_samples_leaf": range(5, 31, 5),
                     # "randomforestregressor__max_features": ["sqrt", "log2"],
                     # "randomforestregressor__bootstrap": [True, False]                    
                    }
forest_search = GridSearchCV(forest_pipeline, forest_param_grid, cv = 5, verbose = 1)
forest_search.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipeline',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer()),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         Index(['MSSubClass', 'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       '2ndFlrSF', 'L...
       'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'Sale

In [59]:
forest_search.best_params_

{'randomforestregressor__criterion': 'absolute_error',
 'randomforestregressor__max_depth': 9,
 'randomforestregressor__n_estimators': 90}

In [60]:
forest_search.best_score_

0.8502693634218368

In [61]:
y_train_search_pred = forest_search.predict(X_train)

In [62]:
print("MSE:", mean_squared_error(y_train, y_train_search_pred))
print("RMSE:", mean_squared_error(y_train, y_train_search_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_search_pred))
print("R squared:", r2_score(y_train, y_train_search_pred))

MSE: 200343225.4909563
RMSE: 14154.265275561156
MAE: 9436.624904870625
R squared: 0.9680766909720726


In [63]:
y_test_search_pred = forest_search.predict(X_test)

In [64]:
print("MSE:", mean_squared_error(y_test, y_test_search_pred))
print("RMSE:", mean_squared_error(y_test, y_test_search_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_search_pred))
print("R squared:", r2_score(y_test, y_test_search_pred))

MSE: 805339003.1226029
RMSE: 28378.49543444125
MAE: 17960.084284627097
R squared: 0.874735152009504


In [74]:
rfecv = RFECV(estimator = RandomForestRegressor(), step = 1, scoring = "r2", cv = 5, verbose = 0, n_jobs = -1)
rfecv_pipeline = make_pipeline(preprocess, rfecv)
rfecv_pipeline.fit(X_train, y_train)
y_train_rfecv_pred = rfecv_pipeline.predict(X_train)

In [69]:
print("MSE:", mean_squared_error(y_train, y_train_rfecv_pred))
print("RMSE:", mean_squared_error(y_train, y_train_rfecv_pred)**0.5)
print("MAE:", mean_absolute_error(y_train, y_train_rfecv_pred))
print("R squared:", r2_score(y_train, y_train_rfecv_pred))

MSE: 125720506.88658169
RMSE: 11212.515635957066
MAE: 6570.438835616439
R squared: 0.9799673056942512


In [70]:
y_test_rfecv_pred = rfecv_pipeline.predict(X_test)

In [71]:
print("MSE:", mean_squared_error(y_test, y_test_rfecv_pred))
print("RMSE:", mean_squared_error(y_test, y_test_rfecv_pred)**0.5)
print("MAE:", mean_absolute_error(y_test, y_test_rfecv_pred))
print("R squared:", r2_score(y_test, y_test_rfecv_pred))

MSE: 774347470.5370736
RMSE: 27827.099571048966
MAE: 17699.70715753425
R squared: 0.8795556680943656


In [43]:
test = pd.read_csv("Data/kaggle_comp_test.csv")
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [44]:
test_cp = test.drop(columns = ["Id", "Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"])

In [45]:
corr_matrix = test_cp.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k = 1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
test_cp.drop(columns = to_drop, inplace = True)

In [46]:
test_cp[["OverallQual", "OverallCond"]] = test_cp[["OverallQual", "OverallCond"]].astype(str)

In [47]:
test_pred = forest_pipeline.predict(test_cp)

In [48]:
sub2 = pd.DataFrame({"Id": test["Id"], "SalePrice": test_pred})

In [49]:
sub2.to_csv("sub2.csv", index = False)

In [65]:
search_pred = forest_search.predict(test_cp)

In [66]:
sub3 = pd.DataFrame({"Id": test["Id"], "SalePrice": search_pred})

In [67]:
sub3.to_csv("sub3.csv", index = False)