In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, \
    RobustScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn import set_config

In [2]:
np.random.seed(42)
set_config(display='diagram')

plt.rcParams['figure.figsize'] = (12, 8)
sns.set_theme(style="whitegrid")

In [3]:
trainDf = pd.read_csv("data/train.csv")
testDf = pd.read_csv("data/test.csv")

In [4]:
for column in (
    'PoolQC', 
    'FireplaceQu', 
    'Alley', 
    'Fence', 
    'MiscFeature', 
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'GarageType', 
    'GarageFinish', 
    'GarageQual', 
    'GarageCond',
    'BsmtQual', 
    'BsmtCond', 
    'BsmtExposure', 
    'BsmtFinType1', 
    'BsmtFinType2',
    'MasVnrType',
    'MSSubClass',
):
    trainDf[column] = trainDf[column].fillna('None')
    testDf[column] = testDf[column].fillna('None')

for column in (
    'BsmtFinSF1', 
    'BsmtFinSF2', 
    'BsmtUnfSF',
    'TotalBsmtSF', 
    'BsmtFullBath', 
    'BsmtHalfBath',
    'MasVnrArea',
    'GarageCars',
    'GarageArea',
    'GarageYrBlt',
):
    trainDf[column] = trainDf[column].fillna(0)
    testDf[column] = testDf[column].fillna(0)

for column in (
    'Electrical', 
    'KitchenQual', 
    'Exterior1st',
    'Exterior2nd', 
    'SaleType',
    'MSZoning',
    'Utilities',
):
    trainDf[column] = trainDf[column].fillna(trainDf[column].mode()[0])
    testDf[column] = testDf[column].fillna(testDf[column].mode()[0])

trainDf['Functional'] = trainDf['Functional'].fillna('Typical')
testDf['Functional'] = testDf['Functional'].fillna('Typical')

In [5]:
trainDf['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
testDf['Exterior1st'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
trainDf['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)
testDf['Exterior2nd'].replace(['Brk Cmn', 'CmentBd', 'Wd Shng'], ['BrkComm', 'CemntBd', 'Wd Sdng'], inplace=True)

In [6]:
# Remove outliers
trainDf.drop(
    trainDf[(trainDf["GrLivArea"] > 4000) & (trainDf["SalePrice"] < 700000)].index
);

In [7]:
baselineFeatures = [
    '1stFlrSF',
    '2ndFlrSF',
    'BsmtFinSF1', 
    'BsmtFinSF2',
    'BsmtFinType1',
    'BsmtFinType2',
    'BsmtUnfSF',
    'OverallQual',
    'GarageType',
    'GarageCond',
    'GarageCars', 
    'OverallCond', 
    'Neighborhood',
    'MSSubClass', 
    'LotShape',
    'LotConfig',
    'LandSlope',
    'BsmtCond',
    'BsmtQual',
    # 'HeatingQC'
    # 'SaleCondition',
    # 'Electrical',
    # 'Exterior1st',
    # 'Exterior2nd',
    # 'GarageFinish',
    # 'GarageQual',
    # 'KitchenAbvGr',
    #'KitchenQual',
    #'Foundation',
    'CentralAir',
    'Condition1',
    'Condition2',
]

X = trainDf[baselineFeatures]
Y = trainDf['SalePrice']

In [8]:
X_train, X_validation, y_train, y_validation = train_test_split(X, Y, test_size=0.3, random_state=42)

In [9]:
subclassCategories = [20, 30, 40, 45, 50, 60, 70, 75, 80, 85, 90, 120, 150, 160, 180, 190]
basementFinishCategories = ['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
#electricalCategories = ['Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr']
#heatingQCCategories = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
#exteriorCategories = ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']
#kitchenQCCategories = ['Po', 'Fa', 'TA', 'Gd', 'Ex']

garageQCCategories = ['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
conditionCategories = trainDf['Condition1'].unique()

In [10]:
# Build feature transformer

logTransformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1)

basementCondTransformer = Pipeline([
    ('basement_condition_impute', SimpleImputer(strategy="constant", fill_value='None')),
    ('basement_condition_onehot', OneHotEncoder()),
])

basementAreaTransformer = Pipeline([
    ('basement_area_impute', SimpleImputer(strategy="constant", fill_value=0)),
    ('basement_area_log', logTransformer),
])

basementFinishTransformer = Pipeline([
   ('basement_finish_impute', SimpleImputer(strategy="constant", fill_value='None')),
   ('basement_finish_onehot', OrdinalEncoder(categories=[basementFinishCategories, basementFinishCategories])),
])

featureTransformer = ColumnTransformer([
        ('garage_cars_impute', SimpleImputer(strategy="constant", fill_value=0), ['GarageCars']),
        ('basement_area_transformer', basementAreaTransformer, ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']),
        ('neighborhood_onehot', OneHotEncoder(), ['Neighborhood']),
        ('subclass_onehot', OneHotEncoder(categories=[subclassCategories]), ['MSSubClass']),
        ('lotconfig_onehot', OneHotEncoder(), ['LotConfig']),
        ('lot_shape_onehot', OneHotEncoder(), ['LotShape']),
        ('land_slope_onehot', OneHotEncoder(), ['LandSlope']),
        ('basement_condition_transformer', basementCondTransformer, ['BsmtCond', 'BsmtQual']),
        ('basement_finish_transformer', basementFinishTransformer, ['BsmtFinType1', 'BsmtFinType2']),
        #('sale_condtion_onehot', OrdinalEncoder(), ['SaleCondition']),
        #('electrical_onehot', OneHotEncoder(categories=[electricalCategories]), ['Electrical']),
        #('exterior_onehot', OneHotEncoder(categories=[exteriorCategories, exteriorCategories]), ['Exterior1st', 'Exterior2nd']),
        ('air_condition_onehot', OneHotEncoder(), ['CentralAir']),
        ('garage_type_onehot', OneHotEncoder(), ['GarageType']),
        ('garage_qc_onehot', OneHotEncoder(), ['GarageCond']),
        ('condition_onehot', OneHotEncoder(categories=[conditionCategories, conditionCategories]), ['Condition1', 'Condition2']),
    ],
    remainder='passthrough'
)

In [11]:
randomForestPipeline = Pipeline([
    ("preprocessing", featureTransformer),
    ("random_forest", RandomForestRegressor(random_state=42)),
])

randomForestPipeline.fit(X_train, y_train)

y_train_predicted = randomForestPipeline.predict(X_train)
y_validation_predicted_rf = randomForestPipeline.predict(X_validation)

print('[Train] MSE: %.2f' % mean_squared_error(y_train, y_train_predicted))
print('[Train] MAE: %.2f' % mean_absolute_error(y_train, y_train_predicted))
print('[Train] R^2: %.2f' % r2_score(y_train, y_train_predicted))
print('[Test] MSE: %.2f' % mean_squared_error(y_validation, y_validation_predicted_rf))
print('[Test] MAE: %.2f' % mean_absolute_error(y_validation, y_validation_predicted_rf))
print('[Test] R^2: %.2f' % r2_score(y_validation, y_validation_predicted_rf))

[Train] MSE: 158448077.63
[Train] MAE: 7480.17
[Train] R^2: 0.97
[Test] MSE: 714637891.32
[Test] MAE: 17560.74
[Test] R^2: 0.90


In [12]:
x_test = testDf[baselineFeatures]
y_test_predicted = randomForestPipeline.predict(x_test)

submissionDf = pd.DataFrame({
    'Id': testDf['Id'],
    'SalePrice': y_test_predicted,
})

submissionDf.to_csv('./data/submission_random_forest.csv', index=False)