# CatBoost Regressor

In [2]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import numpy as np

In [4]:
# Read the prepared housing dataset and print its schema (column dtypes and non-null counts).
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is mounted before running.
DATA_PATH = '/content/drive/MyDrive/ProjectCdac/TerraTrend-HousePricePrediction_DataExploration.csv'
df = pd.read_csv(DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141304 entries, 0 to 141303
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Price (in rupees)    141304 non-null  float64
 1   location             141304 non-null  object 
 2   Carpet Area in sqft  141304 non-null  float64
 3   Status               141304 non-null  int64  
 4   Transaction          141304 non-null  float64
 5   Furnishing           141304 non-null  float64
 6   facing               141304 non-null  int64  
 7   Bathroom             141304 non-null  int64  
 8   Balcony              141304 non-null  int64  
 9   Ownership            141304 non-null  float64
 10  Final Amount         141304 non-null  float64
 11  BHK                  141304 non-null  float64
 12  Super Area in sqft   141304 non-null  float64
dtypes: float64(8), int64(4), object(1)
memory usage: 14.0+ MB


In [5]:
# Split the frame into the prediction target (y) and every remaining column (X).
# NOTE(review): 'Price (in rupees)' stays in X; if it is derived from, or determines,
# 'Final Amount' it would leak the target into the features — TODO confirm.
TARGET = 'Final Amount'
y = df[TARGET]
X = df.drop(columns=[TARGET])

In [6]:
# One-hot encode categorical features.
# Called without `columns`, so pandas expands only the object/string/category columns;
# per the df.info() output above, `location` is the only object column, so the numeric
# columns pass through unchanged.
X_ohe = pd.get_dummies(X)

In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_ohe,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Baseline model: no hyperparameters overridden apart from a silent training log and a fixed seed.
cat = CatBoostRegressor(
    random_state=42,
    verbose=0,
)
# Kept as the cell's last expression so the fitted estimator is still displayed.
cat.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7b52344c4c90>

In [9]:
# Predict on both partitions with the baseline model so train and test errors can be compared.
y_pred_train, y_pred_test = (cat.predict(features) for features in (X_train, X_test))

In [10]:
def print_regression_metrics(title, y_true, y_pred):
    """Print MSE, RMSE, MAE, MAPE and R² for one set of predictions.

    Parameters
    ----------
    title : str
        Header line printed before the metrics.
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predictions aligned element-wise with ``y_true``.

    Returns
    -------
    None
        The metrics are only printed, each rounded to two decimals.
    """
    print(title)
    # MSE is computed once and reused for RMSE instead of calling the metric twice.
    mse = mean_squared_error(y_true, y_pred)
    print(f"MSE: {mse:.2f}")
    print(f"RMSE: {mse ** 0.5:.2f}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.2f}")
    # sklearn reports MAPE as a fraction (0.03 means 3%), not as a percentage.
    print(f"MAPE: {mean_absolute_percentage_error(y_true, y_pred):.2f}")
    print(f"R²: {r2_score(y_true, y_pred):.2f}")


# Evaluation on training set
print_regression_metrics("------ Training Set Metrics ------", y_train, y_pred_train)

# Evaluation on test set (leading newline separates it from the training report)
print_regression_metrics("\n------ Test Set Metrics ------", y_test, y_pred_test)

------ Training Set Metrics ------
MSE: 25.27
RMSE: 5.03
MAE: 2.05
MAPE: 0.03
R²: 0.99

------ Test Set Metrics ------
MSE: 29.70
RMSE: 5.45
MAE: 2.12
MAPE: 0.03
R²: 0.99


# CatBoost Regressor Hyperparameter Tuning + Evaluation

In [11]:
# Candidate values to search exhaustively: 2 x 2 x 2 = 8 combinations.
# NOTE(review): the recorded tuned test RMSE (6.61) is worse than the untuned baseline's (5.45),
# so this grid may be too narrow — e.g. `iterations` tops out at 200. TODO: compare with the
# library defaults the baseline ran with.
param_grid = dict(
    iterations=[100, 200],
    learning_rate=[0.05, 0.1],
    depth=[4, 6],
)

# Fresh, unfitted estimator for the search. Rebinding `cat` drops the reference to the
# baseline model fitted in the earlier cell.
cat = CatBoostRegressor(random_state=42, verbose=0)

In [13]:
# Exhaustive search over `param_grid` with 5-fold cross-validation.
# Candidates are ranked by negative MSE (higher is better); n_jobs=-1 runs fits in parallel.
grid_search = GridSearchCV(
    cat,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [14]:
# Run the search on the training split only, then keep the winning estimator.
# fit() returns the search object itself, so the attribute can be read off the call directly.
best_model = grid_search.fit(X_train, y_train).best_estimator_

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [15]:
# Predictions on test set
# Uses the best estimator selected by the grid search; X_test was not passed to the search.
y_pred = best_model.predict(X_test)

In [16]:
# Score the tuned model's test-set predictions; the values are printed in the next cell.
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)  # a fraction, not a percentage
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # derived from the MSE computed on the line above

In [17]:
# Report the chosen hyperparameters, then each test-set metric rounded to two decimals.
print("\n------ Tuned Model Test Set Metrics ------")
print("Best Parameters:", grid_search.best_params_)
for label, value in (
    ("MSE", mse),
    ("RMSE", rmse),
    ("MAE", mae),
    ("MAPE", mape),
    ("R² Score", r2),
):
    print(f"{label}: {value:.2f}")


------ Tuned Model Test Set Metrics ------
Best Parameters: {'depth': 6, 'iterations': 200, 'learning_rate': 0.1}
MSE: 43.65
RMSE: 6.61
MAE: 3.32
MAPE: 0.05
R² Score: 0.99


In [18]:
# Overfitting check: score the tuned model on the data it was fitted on and on the held-out data.
y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)  # same call as the earlier `y_pred`

# R² on each partition.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# RMSE on each partition, taken as the square root of the corresponding MSE.
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_rmse, test_rmse = np.sqrt(train_mse), np.sqrt(test_mse)


In [19]:
# Print train vs. test RMSE and R² at full precision; labels are padded so the colons line up.
print("\n------ Overfitting Check ------")
for label, value in (
    ("Train RMSE:", train_rmse),
    ("Test RMSE :", test_rmse),
    ("Train R²  :", train_r2),
    ("Test R²   :", test_r2),
):
    print(label, value)


------ Overfitting Check ------
Train RMSE: 6.681542083972667
Test RMSE : 6.606587140748166
Train R²  : 0.9873442237175623
Test R²   : 0.987469187451786


In [None]:
# Training R² > Testing R²
# Explanation: The model fits training data better than unseen data.
# In CatBoost, this can happen if too many iterations are allowed, or if learning_rate is too high for that number of iterations (the ensemble keeps fitting noise in the training data).

# Training RMSE < Testing RMSE
# Explanation: The model has low error on training data but fails to generalize on the test set.
# A big gap between RMSE values is a strong sign of overfitting.

# Training MAE < Testing MAE
# Explanation: Absolute error is much lower on training, indicating memorization of patterns rather than generalization.

# Training MAPE < Testing MAPE
# Explanation: Overfitting often results in a significant increase in percentage error on test data.

# Train R² ≈ 1.0 and Test R² ≪ 1.0
# Example: Train R² = 0.99, Test R² = 0.65
# Indicates the model is memorizing the training data.

# CatBoost-specific scenario:
# Overfitting may occur when:
# - `iterations` is high (e.g., >500)
# - `depth` is large (e.g., >8)
# - `learning_rate` is too high for the number of iterations (lowering it is the usual remedy, at the cost of needing more iterations)
# - Early stopping is not used

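# ➤ Conclusion:
# If most of the above conditions are TRUE → Model is overfitting.
# For the tuned model in this notebook they do not hold: Train RMSE (6.68) is slightly higher than
# Test RMSE (6.61) and Train R² (0.9873) is slightly lower than Test R² (0.9875), so the
# overfitting check above shows no sign of overfitting.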
