In [13]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from mlxtend.regressor import StackingCVRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import sklearn, mlxtend
import joblib

# Import the models for ensemble
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge, RidgeCV

In [None]:
# Load Your Data and Preprocessor

# Every artifact read by this cell lives in one folder. Forward slashes are used
# for all paths so they resolve on Windows, Linux and macOS alike (the model path
# previously used Windows-only backslashes).
MODEL_DIR = 'Output Files/Price Prediction model files'

X_train = pd.read_csv(f'{MODEL_DIR}/X_train_for_CatBoost.csv')
X_test = pd.read_csv(f'{MODEL_DIR}/X_test_for_CatBoost.csv')

# squeeze('columns') turns a single-column frame into a Series and, unlike a bare
# .squeeze(), still returns a Series (not a scalar) when the file has only one row.
# NOTE(review): the `_log` suffix suggests the targets are log-transformed prices;
# confirm against the notebook that wrote these files.
y_train_log = pd.read_csv(f'{MODEL_DIR}/y_train_for_CatBoost.csv').squeeze('columns')
y_test_log = pd.read_csv(f'{MODEL_DIR}/y_test_for_CatBoost.csv').squeeze('columns')

# CatBoost Model pipeline
# SECURITY: joblib.load unpickles arbitrary Python objects; only load model files
# that come from a trusted source.
model_pipeline = joblib.load(f'{MODEL_DIR}/predict_price_model_catBoost.joblib')

# Extract the pre-processing part of the pipeline (the step named 'preprocessor')
preprocessor = model_pipeline.named_steps['preprocessor']

##### Define Ensemble Architecture

In [15]:
# Level 0: Base Models ("The Specialists")
# Each stochastic base learner is seeded so that repeated runs give the same fit.
rf_reg = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
catboost_reg = CatBoostRegressor(random_state=42, verbose=0)
lgbm_reg = LGBMRegressor(random_state=42)
ridge_reg = Ridge(alpha=1.0) # A simple linear model

# Level 1: Meta-Model ("The Manager")
meta_model = RidgeCV()  # RidgeCV because it tunes its own regularization strength

# Build the StackingCVRegressor. This is the core of our custom model.
# The meta-model is trained on cross-validated (out-of-fold) predictions of the
# base models, which prevents data leakage from the base models' training fit.
stack = StackingCVRegressor(
    regressors=[rf_reg, catboost_reg, lgbm_reg, ridge_reg],
    meta_regressor=meta_model,
    cv=5,  # 5-fold cross-validation for training the meta-model
    # With an integer `cv` the data is shuffled before splitting. Without a seed the
    # folds (and therefore the reported metrics) change on every run, so the
    # splitter is seeded just like the base models.
    shuffle=True,
    random_state=42,
    use_features_in_secondary=False, # Meta-model only sees predictions of base models
    store_train_meta_features=True,
    n_jobs=-1,
    verbose=1
)

In [16]:
# Final end-to-end pipeline: raw features go through the preprocessor taken from
# the loaded CatBoost pipeline, and the transformed features feed the stacked
# ensemble. Bundling both steps guarantees that training and prediction apply the
# same preprocessing.
ensemble_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('stacking_regressor', stack),
    ]
)

In [21]:
# Train and Evaluate the Custom Ensemble
# Fit on the training split, then predict the (log-scale) target for the test split.
y_pred_log = ensemble_pipeline.fit(X_train, y_train_log).predict(X_test)

# Undo the log transform on both predictions and ground truth so that the metrics
# are expressed in actual prices.
# NOTE(review): np.exp is only the exact inverse if the targets were created with
# np.log (not np.log1p) — confirm against the preprocessing notebook.
y_pred, y_test = np.exp(y_pred_log), np.exp(y_test_log)

# Final metrics on the original price scale
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('---------------------------------')
print("\nCustom Ensemble Performance")
print(f"Mean Absolute Error (MAE): {mae:.4f} Crores")
print(f"R² Score: {r2:.4f}")
print("---------------------------------")

# Compare with the previous best.
# NOTE(review): the baseline is a hard-coded figure, presumably the test MAE of the
# standalone CatBoost model — verify it was measured on the same test split.
catboost_mae = 0.41
print(f"Previous Best MAE (CatBoost): {catboost_mae:.4f} Crores")

mae_gain = catboost_mae - mae                    # absolute gain, in Crores
improvement = mae_gain / catboost_mae * 100      # relative gain, in percent
gain_crores = round(mae_gain, 3)
gain_lakhs = round(mae_gain * 100, 2)            # 1 Crore = 100 Lakhs

if mae < catboost_mae:
    print(
        f"Mean Absolute Error shows an improvement of {gain_crores} Crores "
        f"equivalent to {gain_lakhs} Lakhs which is a meaningful amount in a "
        f"real estate transaction i.e an improvement of {improvement:.2f}%"
    )
else:
    print("The ensemble did not outperform the CatBoost model.")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   15.5s remaining:   23.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   21.4s remaining:   32.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   21.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.7s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002061 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2845
[LightGBM] [Info] Number of data points in the train set: 5024, number of used features: 52
[LightGBM] [Info] Start training from score 1.029228
---------------------------------

Custom Ensemble Performance
Mean Absolute Error (MAE): 0.3966 Crores
R² Score: 0.9307
---------------------------------
Previous Best MAE (CatBoost): 0.4100 Crores
Mean Absolute Error shows an improvement of 0.013 Crores equivalent to 1.34 Lakhs which is a meaningful amount in a real estate transaction i.e an improvement of 3.26%


In [None]:
# Persist the fitted end-to-end pipeline (preprocessor + stacked ensemble) so it
# can be reloaded later with joblib.load. joblib.dump stays the last expression so
# the cell still displays the list of written file names.
ensemble_model_path = 'Output Files/Price Prediction model files/custom_ensemble_price_predict_model.joblib'
joblib.dump(ensemble_pipeline, ensemble_model_path)

['Output Files/Price Prediction model files/custom_ensemble_price_predict_model.joblib']