In [149]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.impute import SimpleImputer
import json
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
import tqdm
from sklearn.metrics import mean_absolute_error

In [150]:
# Load the feature dictionary. Later cells read feature_dict[col]["values"], a dict
# whose keys are the allowed category values of that column.
# The encoding is pinned because JSON text is UTF-8; without it open() falls back to
# the platform's locale encoding, which differs between machines.
with open("feature_dict.json", "r", encoding="utf-8") as json_file:
    feature_dict = json.load(json_file)

In [151]:
# Read the training table; presumably produced by an earlier preparation step (not shown here).
df = pd.read_csv('train_prepared_for_cv.csv')

In [152]:
# Inspect the column dtypes: they decide the numeric/categorical split made further down.
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 80, dtype: object

In [153]:
# Separate the regression target from the predictors.
# NOTE(review): 'Id' stays in X and is later picked up as a numeric feature
# (it is the first entry of numeric_cols) - confirm that is intended.
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

In [161]:
# Numeric predictors are the int64/float64 columns; every other column is categorical.
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = [name for name in X.columns if name not in numeric_cols]

# Categorical columns that are encoded with an explicit rank mapping.
ordinal_encode_cols = [
    "ExterQual", "ExterCond",
    "BsmtQual", "BsmtCond", "BsmtExposure",
    "HeatingQC", "KitchenQual", "FireplaceQu",
    "GarageQual", "GarageCond",
    "PoolQC",
    "Street", "Alley", "PavedDrive",
]

# The remaining categorical columns can be label-encoded or one-hot encoded.
# Both names deliberately refer to the same list object.
label_encode_cols = [name for name in categorical_cols if name not in ordinal_encode_cols]
ohe_encode_cols = label_encode_cols

In [162]:
# Show which columns were classified as numeric (int64/float64 dtype).
print(numeric_cols)

['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [163]:
# Show the columns that are not in numeric_cols and are therefore treated as categorical.
print(categorical_cols)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [164]:
# Show the hand-picked columns that get a rank (ordinal) encoding.
print(ordinal_encode_cols)

['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'Street', 'Alley', 'PavedDrive']


In [165]:
# Show the categorical columns that are not in ordinal_encode_cols.
print(label_encode_cols)

['MSZoning', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'BsmtFinType1', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [166]:
# Sanity check: the ordinal and label lists together should account for every
# categorical column, so the two printed counts are expected to match.
print(len(categorical_cols))
print(len(ordinal_encode_cols) + len(label_encode_cols))

42
42


In [167]:
# Build one {category: rank} dict per ordinal column from the feature dictionary.
ordinal_mappings = {}

for col in ordinal_encode_cols:
    if col not in X.columns or col not in feature_dict:
        continue  # columns without a dictionary entry get no mapping

    # Ranks follow the REVERSE of the dictionary order: the last listed value gets rank 1.
    levels = list(feature_dict[col]["values"])
    ranks = {}
    for rank, level in enumerate(levels[::-1], start=1):
        ranks[level] = rank

    # "No_item" is the placeholder category for an absent feature.
    ranks["No_item"] = -1
    # "NA" is dropped only after ranking, so the rank it held stays unused
    # (this is why some printed mappings start at 2).
    ranks.pop("NA", None)

    ordinal_mappings[col] = ranks
        
def safe_map(value, mapping):
    """Look up ``value`` in ``mapping``, failing loudly on unknown values.

    Args:
        value: Raw category value to translate.
        mapping: Dict from raw category value to its encoded value.

    Returns:
        The encoded value stored under ``value``.

    Raises:
        ValueError: If ``value`` is not a key of ``mapping``.
    """
    if value not in mapping:
        raise ValueError(f"Value '{value}' not found in the mapping!")
    return mapping[value]


def ordinal_encode_column(X, mappings):
    """Return a copy of ``X`` with every mapped column replaced by its codes.

    Args:
        X: DataFrame to encode; it is not modified.
        mappings: Dict of ``{column name: {category: code}}``. Columns of ``X``
            without an entry are copied through unchanged.

    Returns:
        A new DataFrame with the same columns as ``X``.

    Raises:
        ValueError: If a mapped column contains a value missing from its mapping.
    """
    encoded = X.copy()
    for col in X.columns:
        if col in mappings:
            col_mapping = mappings[col]
            # Encode the single column: DataFrame.apply would hand safe_map an
            # entire Series instead of one cell value.
            encoded[col] = X[col].apply(lambda value: safe_map(value, col_mapping))
    return encoded

# Print the generated rank mappings for a visual check.
print(ordinal_mappings)

{'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'BsmtQual': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'BsmtCond': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'BsmtExposure': {'No': 2, 'Mn': 3, 'Av': 4, 'Gd': 5, 'No_item': -1}, 'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'FireplaceQu': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'GarageQual': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'GarageCond': {'Po': 2, 'Fa': 3, 'TA': 4, 'Gd': 5, 'Ex': 6, 'No_item': -1}, 'PoolQC': {'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'No_item': -1}, 'Street': {'Grvl': 1, 'Pave': 2, 'No_item': -1}, 'Alley': {'Grvl': 2, 'Pave': 3, 'No_item': -1}, 'PavedDrive': {'N': 1, 'P': 2, 'Y': 3, 'No_item': -1}}


In [168]:
# Build one {category: integer code} dict per label-encoded column from the feature dictionary.
label_mappings = {}

for col in label_encode_cols:
    if col not in X.columns or col not in feature_dict:
        continue  # columns without a dictionary entry get no mapping

    # Codes follow the REVERSE of the dictionary order: the last listed value gets code 1.
    categories = list(feature_dict[col]["values"])
    codes = {}
    for code, category in enumerate(categories[::-1], start=1):
        codes[category] = code

    # "No_item" is the placeholder category for an absent feature.
    codes["No_item"] = -1
    # "NA" is dropped only after numbering, so the code it held stays unused.
    codes.pop("NA", None)

    label_mappings[col] = codes
        
# NOTE(review): safe_map is defined identically in the previous cell; this copy
# silently shadows it. It is kept so this cell still runs on its own.
def safe_map(value, mapping):
    """Return ``mapping[value]``, raising ``ValueError`` for an unknown value.

    Args:
        value: Raw category value to translate.
        mapping: Dict from raw category value to its encoded value.

    Returns:
        The encoded value stored under ``value``.

    Raises:
        ValueError: If ``value`` is not a key of ``mapping``.
    """
    if value not in mapping:
        raise ValueError(f"Value '{value}' not found in the mapping!")
    return mapping[value]


def label_encode_column(X, mappings):
    """Return a copy of ``X`` with every mapped column replaced by its codes.

    Args:
        X: DataFrame to encode; it is not modified.
        mappings: Dict of ``{column name: {category: code}}``. Columns of ``X``
            without an entry are copied through unchanged.

    Returns:
        A new DataFrame with the same columns as ``X``.

    Raises:
        ValueError: If a mapped column contains a value missing from its mapping.
    """
    encoded = X.copy()
    for col in X.columns:
        if col in mappings:
            col_mapping = mappings[col]
            # Read from the frame that was passed in, not from the notebook-global
            # df, so the function encodes the data it was actually given.
            encoded[col] = X[col].apply(lambda value: safe_map(value, col_mapping))
    return encoded

# Print the generated label mappings for a visual check.
print(label_mappings)

{'MSZoning': {'RM': 1, 'RP': 2, 'RL': 3, 'RH': 4, 'I': 5, 'FV': 6, 'C': 7, 'A': 8, 'No_item': -1}, 'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4, 'No_item': -1}, 'LandContour': {'Low': 1, 'HLS': 2, 'Bnk': 3, 'Lvl': 4, 'No_item': -1}, 'Utilities': {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4, 'No_item': -1}, 'LotConfig': {'FR3': 1, 'FR2': 2, 'CulDSac': 3, 'Corner': 4, 'Inside': 5, 'No_item': -1}, 'LandSlope': {'Sev': 1, 'Mod': 2, 'Gtl': 3, 'No_item': -1}, 'Neighborhood': {'Veenker': 1, 'Timber': 2, 'StoneBr': 3, 'Somerst': 4, 'SawyerW': 5, 'Sawyer': 6, 'SWISU': 7, 'OldTown': 8, 'NWAmes': 9, 'NridgHt': 10, 'NPkVill': 11, 'NoRidge': 12, 'Names': 13, 'Mitchel': 14, 'MeadowV': 15, 'IDOTRR': 16, 'Gilbert': 17, 'Edwards': 18, 'Crawfor': 19, 'CollgCr': 20, 'ClearCr': 21, 'BrkSide': 22, 'BrDale': 23, 'Blueste': 24, 'Blmngtn': 25, 'No_item': -1}, 'Condition1': {'RRAe': 1, 'RRNe': 2, 'PosA': 3, 'PosN': 4, 'RRAn': 5, 'RRNn': 6, 'Norm': 7, 'Feedr': 8, 'Artery': 9, 'No_item': -1}, 'Conditi

In [169]:
class DataFramePreprocessor(BaseEstimator, TransformerMixin):
    """Impute missing values and encode categorical columns of a DataFrame.

    ``fit`` learns the median of every numeric column and the mode of every
    ordinal / one-hot / label column. ``transform`` fills missing values with
    those statistics, replaces ordinal and label columns by the codes from the
    supplied mappings (values absent from a mapping become -1), and expands
    one-hot columns with ``pd.get_dummies``, aligned to the dummy columns seen
    during ``fit``. Columns not listed in any group pass through unchanged.

    Args:
        numeric_cols: List of numeric column names, imputed with the median.
        ordinal_cols: List of columns encoded through ``ordinal_mappings``.
        ohe_cols: List of columns to one-hot encode.
        label_cols: List of columns encoded through ``label_mappings``.
        ordinal_mappings: Dict of ``{column: {category: code}}`` for ``ordinal_cols``.
        label_mappings: Dict of ``{column: {category: code}}`` for ``label_cols``.
    """

    def __init__(self, numeric_cols, ordinal_cols, ohe_cols, label_cols, ordinal_mappings, label_mappings):
        self.numeric_cols = numeric_cols
        self.ordinal_cols = ordinal_cols
        self.ohe_cols = ohe_cols
        self.label_cols = label_cols
        self.ordinal_mappings = ordinal_mappings
        self.label_mappings = label_mappings
        # State learned in fit(); the attribute names are kept for backward compatibility.
        self.ohe_columns_names = None
        self.medians = {}
        self.modes = {}

    def _categorical_cols(self):
        """Return all non-numeric columns handled here, in processing order."""
        return list(self.ordinal_cols) + list(self.ohe_cols) + list(self.label_cols)

    @staticmethod
    def _encode_with_mappings(X, cols, mappings, kind):
        """Replace each column in ``cols`` of ``X`` in place by its mapped codes.

        Raises:
            KeyError: If a column has no entry in ``mappings``.
        """
        for col in cols:
            if col not in mappings:
                raise KeyError(f"No {kind} mapping provided for column '{col}'")
            # Values that are missing from the mapping come back as NaN and are coded -1.
            X[col] = X[col].map(mappings[col]).fillna(-1)

    def fit(self, X, y=None):
        """Learn imputation statistics and the one-hot column layout from ``X``.

        Args:
            X: Training DataFrame containing every configured column.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.medians = X[self.numeric_cols].median()

        categorical = self._categorical_cols()
        mode_rows = X[categorical].mode()
        if len(mode_rows):
            self.modes = mode_rows.iloc[0]
        else:
            # mode() has no rows when there are no categorical columns (or no
            # non-missing values at all); .iloc[0] would raise IndexError then.
            self.modes = pd.Series(index=categorical, dtype=object)

        if self.ohe_cols:
            ohe_encoded = pd.get_dummies(X[self.ohe_cols])
            self.ohe_columns_names = ohe_encoded.columns  # Save the column names for consistency
        return self

    def transform(self, X):
        """Return an imputed and encoded copy of ``X``.

        Args:
            X: DataFrame containing every configured column; it is not modified.

        Returns:
            A new DataFrame. One-hot source columns are dropped and their dummy
            columns appended at the end.

        Raises:
            KeyError: If an ordinal or label column has no mapping.
        """
        X = X.copy()
        for col in self.numeric_cols:
            X[col] = X[col].fillna(self.medians[col])
        for col in self._categorical_cols():
            X[col] = X[col].fillna(self.modes[col])

        self._encode_with_mappings(X, self.ordinal_cols, self.ordinal_mappings, "ordinal")
        self._encode_with_mappings(X, self.label_cols, self.label_mappings, "label")

        if self.ohe_cols:
            ohe_encoded = pd.get_dummies(X[self.ohe_cols])
            # Categories unseen in this frame get an all-zero column; categories
            # unseen during fit are dropped.
            ohe_encoded = ohe_encoded.reindex(columns=self.ohe_columns_names, fill_value=0)  # Ensure consistent columns
            X = X.drop(columns=self.ohe_cols)
            X = pd.concat([X, ohe_encoded], axis=1)
        return X

In [170]:
# Configuration 1: ordinal columns use the rank mappings and every other categorical
# column is one-hot encoded. label_cols is empty, so label_mappings is passed but
# never consulted by transform() in this configuration.
preprocessor = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=ohe_encode_cols,
    label_cols=[],
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings
)

In [171]:
# Smoke-test the preprocessor by fitting and transforming the whole table.
# NOTE(review): this passes df (which still contains SalePrice) rather than X, so the
# target column is carried through untouched into processed_data - confirm that is intended.
processed_data = preprocessor.fit_transform(df)

In [172]:
# Display the preprocessed frame for a visual check of the encodings.
processed_data

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,OverallQual,OverallCond,YearBuilt,YearRemodAdd,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,2,-1,7,5,2003,2003,...,False,False,False,True,False,False,False,False,True,False
1,2,20,80.0,9600,2,-1,6,8,1976,1976,...,False,False,False,True,False,False,False,False,True,False
2,3,60,68.0,11250,2,-1,7,5,2001,2002,...,False,False,False,True,False,False,False,False,True,False
3,4,70,60.0,9550,2,-1,7,5,1915,1970,...,False,False,False,True,True,False,False,False,False,False
4,5,60,84.0,14260,2,-1,8,5,2000,2000,...,False,False,False,True,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,1456,60,62.0,7917,2,-1,6,5,1999,2000,...,False,False,False,True,False,False,False,False,True,False
1452,1457,20,85.0,13175,2,-1,6,6,1978,1988,...,False,False,False,True,False,False,False,False,True,False
1453,1458,70,66.0,9042,2,-1,7,9,1941,2006,...,False,False,False,True,False,False,False,False,True,False
1454,1459,20,68.0,9717,2,-1,5,6,1950,1996,...,False,False,False,True,False,False,False,False,True,False


In [173]:
# Configuration 2: the non-ordinal categorical columns are label-encoded instead of
# one-hot encoded. This rebinds `preprocessor`, so the cross-validation loop that
# calls preprocessor.fit / preprocessor.transform runs with this configuration.
preprocessor = DataFramePreprocessor(
    numeric_cols=numeric_cols,
    ordinal_cols=ordinal_encode_cols,
    ohe_cols=[],
    label_cols=label_encode_cols,
    ordinal_mappings=ordinal_mappings,
    label_mappings=label_mappings
)

In [174]:
# Display the raw predictors (before imputation and encoding).
X

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,No_item,IR1,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,No_item,IR1,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,No_item,IR1,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,1456,60,RL,62.0,7917,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,8,2007,WD,Normal
1452,1457,20,RL,85.0,13175,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,MnPrv,No_item,0,2,2010,WD,Normal
1453,1458,70,RL,66.0,9042,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,GdPrv,Shed,2500,5,2010,WD,Normal
1454,1459,20,RL,68.0,9717,Pave,No_item,Reg,Lvl,AllPub,...,0,0,No_item,No_item,No_item,0,4,2010,WD,Normal


In [175]:
# NOTE(review): y is the SalePrice column (int64 in the dtypes listing), so comparing it
# with the string 'RL' can never match and this always evaluates to 0. It looks like a
# leftover debugging check; consider removing the cell.
sum(y == 'RL')

0

In [176]:
# Display the target series (SalePrice).
y

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1451    175000
1452    210000
1453    266500
1454    142125
1455    147500
Name: SalePrice, Length: 1456, dtype: int64

In [186]:
# 5-fold cross-validation of a random forest. The preprocessor is re-fitted inside
# every fold so that imputation statistics come from the training part only.
# NOTE(review): X still contains 'Id', which the model therefore sees as a feature.
model = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

r2 = {"train": [], "test": []}
mae = {"train": [], "test": []}
importances_by_fold = []  # one Series of feature importances per fold

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    preprocessor.fit(X_train)
    X_train_preprocessed = preprocessor.transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    model.fit(X_train_preprocessed, y_train)

    y_train_pred = model.predict(X_train_preprocessed)
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = model.score(X_train_preprocessed, y_train)  # R^2 for a regressor

    y_test_pred = model.predict(X_test_preprocessed)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = model.score(X_test_preprocessed, y_test)

    r2["train"].append(train_r2)
    r2["test"].append(test_r2)
    mae["train"].append(train_mae)
    mae["test"].append(test_mae)

    # Label the importances with the columns the model was actually trained on.
    # X.columns only matches when the preprocessor keeps the column layout, and
    # raises a length mismatch as soon as one-hot encoding adds columns.
    fold_importances = pd.Series(
        model.feature_importances_,
        index=X_train_preprocessed.columns,
        name=f"fold_{fold}",
    )
    importances_by_fold.append(fold_importances)

# Combine once after the loop instead of growing a DataFrame fold by fold.
# A feature that is absent in some fold (possible with one-hot columns) counts as 0 there.
feature_importances = pd.concat(importances_by_fold, axis=1).fillna(0.0)
feature_importances['mean_importance'] = feature_importances.mean(axis=1)
feature_importances = feature_importances[['mean_importance']].sort_values(by='mean_importance', ascending=False)

In [187]:
# Mean absolute error per fold: one row per fold, train vs. test.
pd.DataFrame(mae)

Unnamed: 0,train,test
0,6244.614433,16407.441815
1,6366.476635,16157.490412
2,6242.790455,18994.633711
3,6471.111511,16957.311375
4,6363.588,15571.492509


In [188]:
# R^2 per fold: one row per fold, train vs. test.
pd.DataFrame(r2)

Unnamed: 0,train,test
0,0.983463,0.895092
1,0.981943,0.91091
2,0.983808,0.863372
3,0.982933,0.878497
4,0.984175,0.880736


In [191]:
# Show the fold-averaged importances, largest first.
# The cross-validation cell already sorts this frame the same way, so the sort
# here is redundant but harmless.
feature_importances.sort_values(by="mean_importance", ascending=False)

Unnamed: 0,mean_importance
OverallQual,5.842843e-01
GrLivArea,1.145185e-01
TotalBsmtSF,4.053258e-02
BsmtFinSF1,3.089661e-02
1stFlrSF,2.446239e-02
...,...
Condition2,3.572230e-05
MiscFeature,2.630802e-05
Street,1.662733e-05
PoolQC,1.298109e-05
