In [1]:
# main imports
import importlib # for reloading custom libraries
# importlib.reload(nameOfModule) # use of reload

from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.exceptions import NotFittedError

from sklearn.base import BaseEstimator, TransformerMixin # baseclasses to create custom transformers

# custom transformers based on sklearn library
import pipeline_classes as plc
#from pipeline_classes import DFCreateAdditionalFeatures, DFReplaceMeaningfulNANs, DFJoinDates, DFCalcAge, DFDropColumns, DFConvertToNumpy, DFOneHotCategoriesCombined

In [57]:
#importlib.reload(plc)

<module 'pipeline_classes' from 'd:\\Work\\GoogleDrive\\DataAnalysis\\house_prices_advanced_regression\\pipeline_classes.py'>

Models classes:

In [2]:
# models selection
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

import xgboost as xgb

Performance measure is **Root-Mean-Squared-Error (RMSE) between the logarithm** of the predicted value and the logarithm of the observed sales price.  
Taking logs means that errors in predicting expensive houses and cheap houses will affect the result almost equally

In [3]:
from sklearn.metrics import mean_squared_log_error

Using ColumnTransformer (works with pandas DataFrames):  
  
num_attribs = [...]  
cat_attribs = [...]  

full_pipeline = ColumnTransformer([  
("num", num_pipeline, num_attribs),  
("cat", OneHotEncoder(), cat_attribs),  
])  

prepared_data = full_pipeline.fit_transform(original_data)

Data preparation pipeline

In [4]:
df_orig = pd.read_csv('datasets/processed/train_with_cat_dtypes.csv', index_col=0)
df_orig.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [5]:
y = df_orig['SalePrice'].to_numpy()

In [6]:
df = df_orig.copy()

From now on 'df' will be the name of dataset we are working with. All the versions we save as backups or milestones will have the suffix starting with '_'. This suffix with the cell comment will help us to recover why we saved this version

The data conversion pipeline should look as follow:


Split column names by lists to ease further pipeline processing 

In [7]:
ohe_combined_columns = [['Condition1', 'Condition2'], ['Exterior1st', 'Exterior2nd']]
ohe_single_columns = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood', 'BldgType', 'HouseStyle', 'RoofStyle', 
    'RoofMatl', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition'
]
ore_columns = [
    'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC',
    'LotShape', 'Utilities', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageFinish', 'PavedDrive', 'Fence'
]
num_columns = []
for column in df.columns:
    if (df[column].dtype != 'object') and (df[column].dtype != 'category') and (column not in ohe_single_columns+ore_columns):
        num_columns.append(column)
print('Check total number of columns:')
print('Categorical unordered combined columns = 4')
print('Categorical unordered single columns = {}'.format(len(ohe_single_columns)))
print('Categorical ordered single columns = {}'.format(len(ore_columns)))
print('Numrical columns = {}'.format(len(num_columns)))
print('-'*20, '\nTOTAL = {}'.format(4 + len(ohe_single_columns) + len(ore_columns) + len(num_columns)))
print('Checksum is: {}'.format(len(df.columns)))

Check total number of columns:
Categorical unordered combined columns = 4
Categorical unordered single columns = 20
Categorical ordered single columns = 20
Numrical columns = 36
-------------------- 
TOTAL = 80
Checksum is: 80


Constructing pipelines

In [None]:
ct_values_fix = ColumnTransformer(
    transformers=[
        
    ],
    remainder='passthrough'
)

In [35]:
# Transformer for categorical columns which have to be processed separately from each other (combined columns are processed in custom DFOneHotCategoriesCombined class)
ct_cat_encoder = ColumnTransformer(
    transformers=[
        ('ohe_unordered', plc.DFAllCategoriesOneHotEncoder(), ohe_single_columns),
        ('ore_ordered', OrdinalEncoder(), ore_columns),
    ],
    remainder='passthrough'
)

In [41]:
pipeline_data_process = Pipeline([
    ('values_fixing', #ColumnTransformer to replace incorrect values with correct ones),
    ('replace_meanful_nans', plc.DFReplaceMeaningfulNANs()),
    ('set_ordered_cat', plc.DFSetOrderedCategories(process=True)),
    ('set_unordered_cat', plc.DFSetUnorderedCategories(process=True)),
    ('add_has_features', plc.DFCreateAdditionalFeatures()),
    ('days_since_sold', plc.DFJoinDates(month_col='MoSold', year_col='YrSold', calc_period_to=datetime.today(), new_column_name='DaysSinceSold', drop_originals=False)),
    ('calc_ages', plc.DFCalcAge({'YearBuilt': 'BuiltAge', 'YearRemodAdd': 'RemodAge'}, calc_age_to=2022, drop_originals=False)),
    ('onehot_combined', plc.DFOneHotCategoriesCombined(
        features_kits=[
            (['Condition1', 'Condition2'], ['Artery', 'Feedr', 'RRNn', 'RRAn', 'PosN', 'PosA', 'RRNe', 'RRAe']),
            (['Exterior1st', 'Exterior2nd'], ['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'PreCast', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'])
        ],
        drop_originals=True
    )),
    ('drop_redundant', plc.DFDropColumns(cols=['SalePrice'])),
    ('cat_encoding', ct_cat_encoder),
    
    ('knn_impute', KNNImputer(n_neighbors=3)),
])

In [1]:
df_mod = DFSetOrderedCategories(process=True).fit_transform(df)
df_mod.info()

NameError: name 'DFSetOrderedCategories' is not defined

In [68]:
df_mod = plc.DFReplaceMeaningfulNANs().fit_transform(df_mod)
df_mod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1460 non-null   int64   
 1   MSZoning       1460 non-null   object  
 2   LotFrontage    1201 non-null   float64 
 3   LotArea        1460 non-null   int64   
 4   Street         1460 non-null   object  
 5   Alley          1460 non-null   object  
 6   LotShape       1460 non-null   category
 7   LandContour    1460 non-null   object  
 8   Utilities      1460 non-null   category
 9   LotConfig      1460 non-null   object  
 10  LandSlope      1460 non-null   category
 11  Neighborhood   1460 non-null   object  
 12  Condition1     1460 non-null   object  
 13  Condition2     1460 non-null   object  
 14  BldgType       1460 non-null   object  
 15  HouseStyle     1460 non-null   object  
 16  OverallQual    1460 non-null   int64   
 17  OverallCond    1460 non-null   in

In [69]:
df_mod = DFSetUnorderedCategories(
    categories={
        'MSZoning': pd.CategoricalDtype(categories=['Abs', 'A', 'C (all)', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'], ordered=False),
        'BldgType': pd.CategoricalDtype(categories=['Abs', '1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'], ordered=False)
    },
    process=True
).fit_transform(df_mod)
df_mod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   MSSubClass     1460 non-null   category
 1   MSZoning       1460 non-null   category
 2   LotFrontage    1201 non-null   float64 
 3   LotArea        1460 non-null   int64   
 4   Street         1460 non-null   category
 5   Alley          1460 non-null   category
 6   LotShape       1460 non-null   category
 7   LandContour    1460 non-null   category
 8   Utilities      1460 non-null   category
 9   LotConfig      1460 non-null   category
 10  LandSlope      1460 non-null   category
 11  Neighborhood   1460 non-null   category
 12  Condition1     1460 non-null   category
 13  Condition2     1460 non-null   category
 14  BldgType       1460 non-null   category
 15  HouseStyle     1460 non-null   category
 16  OverallQual    1460 non-null   int64   
 17  OverallCond    1460 non-null   in

In [75]:
for c in df_mod.columns:
    n = df_mod[c].isna().sum()
    if n > 0:
        print(f'{c} => {n} nans')

LotFrontage => 259 nans
Exterior2nd => 105 nans
MasVnrType => 8 nans
MasVnrArea => 8 nans
BsmtExposure => 1 nans
BsmtFinType2 => 1 nans
Electrical => 1 nans


In [76]:
df_orig['Exterior2nd'].value_counts(dropna=False)

VinylSd    504
MetalSd    214
HdBoard    207
Wd Sdng    197
Plywood    142
CmentBd     60
Wd Shng     38
Stucco      26
BrkFace     25
AsbShng     20
ImStucc     10
Brk Cmn      7
Stone        5
AsphShn      3
Other        1
CBlock       1
Name: Exterior2nd, dtype: int64

In [77]:
list(df_mod['Exterior2nd'].dtype.categories)

['Abs',
 'Other',
 'AsbShng',
 'AsphShn',
 'BrkComm',
 'BrkFace',
 'CBlock',
 'CemntBd',
 'HdBoard',
 'ImStucc',
 'MetalSd',
 'Plywood',
 'PreCast',
 'Stone',
 'Stucco',
 'VinylSd',
 'Wd Sdng',
 'WdShing']

In [48]:
df_orig[df_orig['MSZoning'] == 'C (all)']['MSZoning']

30      C (all)
88      C (all)
93      C (all)
495     C (all)
557     C (all)
711     C (all)
812     C (all)
916     C (all)
1061    C (all)
1279    C (all)
Name: MSZoning, dtype: object

In [42]:
processed_data = pipeline_data_process.fit_transform(df)
processed_data.shape

ValueError: Found unknown categories [nan] in column 0 during fit

In [41]:
X_train, X_test, y_train, y_test = train_test_split(processed_data, y, train_size=0.8, random_state=42)
print('Train X size:', X_train.shape)
print('Test X size:', X_test.shape)
print('Train Y size:', y_train.shape)
print('Test Y size:', y_test.shape)

Train X size: (1168, 224)
Test X size: (292, 224)
Train Y size: (1168, 1)
Test Y size: (292, 1)


In [42]:
tree_reg = RandomForestRegressor(n_estimators=100)

In [44]:
tree_reg.fit(X_train, y_train.ravel())

RandomForestRegressor()

In [45]:
tree_reg.score(X_test, y_test)

0.8866410097141778

In [47]:
mean_squared_log_error(y_test, tree_reg.predict(X_test))


0.023273505338622706

## Make 1st prediction submit for Kaggle

In [49]:
df_test = pd.read_csv('datasets/test.csv', index_col=0)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1459 non-null   int64  
 1   MSZoning       1455 non-null   object 
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   object 
 5   Alley          107 non-null    object 
 6   LotShape       1459 non-null   object 
 7   LandContour    1459 non-null   object 
 8   Utilities      1457 non-null   object 
 9   LotConfig      1459 non-null   object 
 10  LandSlope      1459 non-null   object 
 11  Neighborhood   1459 non-null   object 
 12  Condition1     1459 non-null   object 
 13  Condition2     1459 non-null   object 
 14  BldgType       1459 non-null   object 
 15  HouseStyle     1459 non-null   object 
 16  OverallQual    1459 non-null   int64  
 17  OverallCond    1459 non-null   int64  
 18  YearB

In [61]:
test_processed = pipeline_data_process.fit_transform(df_test)
test_processed.shape

(1459, 216)

In [62]:
tree_reg.predict(test_processed)

ValueError: X has 216 features, but RandomForestRegressor is expecting 224 features as input.

What if we get rid of garage and basement sections but leave 'HasBsmt' and 'HasGarage' features?

In [None]:
basement_features = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
garage_features = ['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond']