In [151]:
# =========================
# Core Libraries
# =========================
# !pip install catboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# Scipy
# =========================
from scipy.sparse import issparse

# =========================
# Preprocessing
# =========================
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# =========================
# Model Selection
# =========================
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

# =========================
# Evaluation Metrics
# =========================
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# =========================
# State-of-the-Art Gradient Boosting Models
# =========================
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# =========================
# Ensemble Techniques
# =========================
from sklearn.ensemble import StackingRegressor, VotingRegressor

In [152]:
# Load the preprocessed house data; the path is relative to the current working directory.
df = pd.read_csv("processed_data/preprocessed_house_data.csv")

In [153]:
# Show (n_rows, n_columns) of the loaded frame.
df.shape

(1460, 14)

In [154]:
## Independent and dependent features
# Y uses double brackets, so it stays a one-column DataFrame rather than a Series.
Y = df[['SalePrice']]
# X is every remaining column.
X = df.drop(columns=['SalePrice'])

In [155]:
# Feature matrix dimensions (the target column has been dropped).
X.shape

(1460, 13)

In [156]:
# Target dimensions; two-dimensional because Y was selected with a list of columns.
Y.shape

(1460, 1)

In [157]:
# Partition the feature names by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude=['object']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

In [158]:
# Inspect the object-dtype (categorical) feature names.
categorical_cols

Index(['Neighborhood', 'ExterQual', 'KitchenQual', 'BsmtQual', 'Foundation',
       'GarageFinish', 'GarageType', 'SaleCondition', 'MSZoning',
       'HouseStyle'],
      dtype='object')

In [159]:
# Inspect the non-object (numerical) feature names.
numerical_cols

Index(['OverallQual', 'GrLivArea', 'GarageCars'], dtype='object')

In [160]:
# Hard-coded feature groups. This assignment replaces the `numerical_cols` Index
# computed with select_dtypes above by a plain list of column names.
# NOTE(review): the following cell assigns both lists again with the same values,
# so this cell looks redundant — confirm before removing it.
numerical_cols = ["OverallQual", "GrLivArea", "GarageCars"]

# Object-dtype features that are given no explicit category ordering here.
nominal_cols = [
    "Neighborhood",
    "Foundation",
    "GarageType",
    "SaleCondition",
    "MSZoning",
    "HouseStyle"
]

In [161]:


# Feature groups, one per preprocessing branch of the ColumnTransformer below.
numerical_cols = ["OverallQual", "GrLivArea", "GarageCars"]
ordinal_cols = ["ExterQual", "KitchenQual", "BsmtQual", "GarageFinish"]
nominal_cols = [
    "Neighborhood", "Foundation", "GarageType",
    "SaleCondition", "MSZoning", "HouseStyle",
]

# Category levels in the order the OrdinalEncoder will number them
# (presumably lowest to highest quality — confirm against the data dictionary).
quality_categories = ['Po', 'Fa', 'TA', 'Gd', 'Ex']
bsmt_categories = ['NoBsmt', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
garage_finish_categories = ['Unf', 'RFn', 'Fin']

# One level list per entry of `ordinal_cols`; the order here must match it.
ordinal_levels = [
    quality_categories,        # ExterQual
    quality_categories,        # KitchenQual
    bsmt_categories,           # BsmtQual
    garage_finish_categories,  # GarageFinish
]

# Numerical branch: fill gaps with the median, then standardise.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Ordinal branch: fill gaps with the mode, encode to integers using the level
# lists above (values outside those lists become -1), then standardise.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_levels,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder', ordinal_encoder),
    ('scaler', StandardScaler()),
]
ordinal_pipeline = Pipeline(steps=ordinal_steps)

# Nominal branch: fill gaps with the mode, then one-hot encode; categories not
# seen during fit are ignored at transform time.
nominal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
nominal_pipeline = Pipeline(steps=nominal_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('ordinal_pipeline', ordinal_pipeline, ordinal_cols),
    ('nominal_pipeline', nominal_pipeline, nominal_cols),
])

print("Preprocessing Pipeline Created Successfully!")

✅ Preprocessing Pipeline Created Successfully!


In [162]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=30)

# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to the test rows so no test information leaks into the fit.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Densify each matrix based on its own type instead of assuming that the test
# matrix is sparse whenever the train matrix is.
if issparse(X_train_transformed):
    X_train_transformed = X_train_transformed.toarray()
if issparse(X_test_transformed):
    X_test_transformed = X_test_transformed.toarray()

# Output column names as reported by the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()

# A length mismatch means names and columns are misaligned. Truncating the names
# would silently mislabel features, so stop instead.
if X_train_transformed.shape[1] != len(feature_names):
    raise ValueError(
        f"Preprocessor produced {X_train_transformed.shape[1]} columns "
        f"but reported {len(feature_names)} feature names"
    )

# Wrap the arrays in DataFrames with proper column names. Frames built from
# arrays already get a fresh 0..n-1 index, so they need no reset.
X_train = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names)

# Targets still carry their original row labels; renumber them so they line up
# positionally with the new feature frames.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Check shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Number of features:", len(feature_names))

X_train shape: (1022, 63)
X_test shape: (438, 63)
Number of features: 63


In [163]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,num_pipeline__OverallQual,num_pipeline__GrLivArea,num_pipeline__GarageCars,ordinal_pipeline__ExterQual,ordinal_pipeline__KitchenQual,ordinal_pipeline__BsmtQual,ordinal_pipeline__GarageFinish,nominal_pipeline__Neighborhood_Blmngtn,nominal_pipeline__Neighborhood_Blueste,nominal_pipeline__Neighborhood_BrDale,...,nominal_pipeline__MSZoning_RL,nominal_pipeline__MSZoning_RM,nominal_pipeline__HouseStyle_1.5Fin,nominal_pipeline__HouseStyle_1.5Unf,nominal_pipeline__HouseStyle_1Story,nominal_pipeline__HouseStyle_2.5Fin,nominal_pipeline__HouseStyle_2.5Unf,nominal_pipeline__HouseStyle_2Story,nominal_pipeline__HouseStyle_SFoyer,nominal_pipeline__HouseStyle_SLvl
0,1.37514,2.261913,0.332543,1.061022,2.274292,0.656312,1.512421,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.522577,-1.222343,0.332543,-0.662928,0.763026,-0.82765,-0.94301,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.073719,-0.072917,0.332543,-0.662928,-0.748239,0.656312,-0.94301,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.798148,-1.222343,0.332543,-0.662928,0.763026,-0.82765,0.284705,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.073719,-0.379683,0.332543,-0.662928,0.763026,-0.82765,0.284705,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [164]:
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true
        Ground-truth target values.
    predicted
        Predicted target values for the same samples.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: mean absolute error, root mean squared error
        (square root of the mean squared error) and the R^2 score.
    """
    root_mean_sq_error = np.sqrt(mean_squared_error(true, predicted))
    return (
        mean_absolute_error(true, predicted),
        root_mean_sq_error,
        r2_score(true, predicted),
    )

In [165]:
# Candidate regressors keyed by display name. Insertion order is the order in
# which they are trained and reported.
models = {}

models["CatBoost"] = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_state=42,
    verbose=0,
)

models["LightGBM"] = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42,
)

models["XGBoost"] = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

models["RandomForest"] = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
)

# Parallel result lists: fitted estimators, their display names and R^2 scores.
trained_model_list = []
model_list = []
r2_list = []

# y_train is a one-column DataFrame; flatten it once so estimators receive the
# 1-D target they expect instead of a column vector.
y_train_1d = np.asarray(y_train).ravel()

for model_name, model in models.items():
    model.fit(X_train, y_train_1d)
    # Keep the fitted estimator so it can be reused after the loop.
    trained_model_list.append(model)

    # Make predictions on the held-out test split
    y_pred = model.predict(X_test)

    # The metrics below are computed on the test split, not the training data.
    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score", r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')

CatBoost
Model Training Performance
RMSE: 31384.512673442994
MAE: 20062.259351629007
R2 score 86.62247581452934


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 358
[LightGBM] [Info] Number of data points in the train set: 1022, number of used features: 41
[LightGBM] [Info] Start training from score 179175.000000
LightGBM
Model Training Performance
RMSE: 33884.48428569611
MAE: 22443.93794974717
R2 score 84.40638791878025


XGBoost
Model Training Performance
RMSE: 31066.545350263845
MAE: 20624.064453125
R2 score 86.89216375350952




  return fit_method(estimator, *args, **kwargs)


RandomForest
Model Training Performance
RMSE: 36996.41037568971
MAE: 21630.47923420002
R2 score 81.41065266551767


