# AdaBoost Regressor

In [5]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np


# Load the house-price dataset into a DataFrame.
# NOTE(review): absolute Google Drive path — presumably requires Drive to be mounted at /content/drive; confirm.
csv_path = '/content/drive/MyDrive/ProjectCdac/TerraTrend-HousePricePrediction_DataExploration.csv'
df = pd.read_csv(csv_path)

In [6]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141304 entries, 0 to 141303
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Price (in rupees)    141304 non-null  float64
 1   location             141304 non-null  object 
 2   Carpet Area in sqft  141304 non-null  float64
 3   Status               141304 non-null  int64  
 4   Transaction          141304 non-null  float64
 5   Furnishing           141304 non-null  float64
 6   facing               141304 non-null  int64  
 7   Bathroom             141304 non-null  int64  
 8   Balcony              141304 non-null  int64  
 9   Ownership            141304 non-null  float64
 10  Final Amount         141304 non-null  float64
 11  BHK                  141304 non-null  float64
 12  Super Area in sqft   141304 non-null  float64
dtypes: float64(8), int64(4), object(1)
memory usage: 14.0+ MB


In [7]:
# 'Final Amount' is the regression target; every remaining column is a feature.
target_col = 'Final Amount'
y = df[target_col]
X = df.drop(columns=[target_col])

# One-hot encode the non-numeric features (per df.info(), only `location` has dtype object).
X_ohe = pd.get_dummies(X)

In [8]:
# Train-test split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_ohe,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Initialize and train AdaBoost regressor
# Baseline model: AdaBoost with library-default hyperparameters and a fixed seed.
ada = AdaBoostRegressor(random_state=42)
ada.fit(X=X_train, y=y_train)

In [12]:
# Predictions
# Score both partitions with the fitted baseline so train and test errors can be compared.
y_pred_train, y_pred_test = (ada.predict(features) for features in (X_train, X_test))

In [14]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score


def _print_metrics(heading, actual, predicted):
    """Print MSE, RMSE, MAE, MAPE and R² for one data partition under `heading`."""
    print(heading)
    mse = mean_squared_error(actual, predicted)
    print(f"MSE: {mse:.2f}")
    # RMSE is the square root of the MSE computed above.
    print(f"RMSE: {mse**0.5:.2f}")
    print(f"MAE: {mean_absolute_error(actual, predicted):.2f}")
    # MAPE is reported as a fraction (e.g. 0.29 means 29%), not scaled to 0-100.
    print(f"MAPE: {mean_absolute_percentage_error(actual, predicted):.2f}")
    print(f"R²: {r2_score(actual, predicted):.2f}")


# Evaluation on training set
_print_metrics("------ Training Set Metrics ------", y_train, y_pred_train)

# Evaluation on test set
_print_metrics("\n------ Test Set Metrics ------", y_test, y_pred_test)

------ Training Set Metrics ------
MSE: 463.44
RMSE: 21.53
MAE: 17.17
MAPE: 0.29
R²: 0.87

------ Test Set Metrics ------
MSE: 455.56
RMSE: 21.34
MAE: 17.03
MAPE: 0.29
R²: 0.87


In [None]:
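# Overfitting checklist (symptom -> interpretation):
# Train R² ≈ 1.0 and Test R² ≪ 1.0 -> Strong overfitting
# Train RMSE ≪ Test RMSE -> Likely overfitting
# Train MAPE ≪ Test MAPE -> Likely overfitting
# Observed for the baseline above: train and test metrics are nearly identical
# (R² 0.87 vs 0.87, RMSE 21.53 vs 21.34, MAPE 0.29 vs 0.29), so none of these symptoms is present.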

# AdaBoost Regressor Hyperparameter Tuning + Evaluation

In [15]:
# Fresh, unfitted estimator to hand to the grid search.
# Note: this rebinds `ada`, so the baseline model fitted earlier is no longer reachable under that name.
ada = AdaBoostRegressor(random_state=42)

In [16]:
# Search space for the tuning step: 3 ensemble sizes x 3 learning rates = 9 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1.0],
)

In [17]:
# GridSearchCV
# Hyperparameter search followed by a hold-out evaluation of the best candidate.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

# Exhaustive 5-fold search over `param_grid`; candidates are ranked by negative MSE
# (higher is better) and all available cores are used.
grid_search = GridSearchCV(
    ada,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The winning configuration, as exposed by the search object after fitting.
best_model = grid_search.best_estimator_

# Hold-out predictions and the metrics derived from them.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)  # a fraction, not a 0-100 percentage
r2 = r2_score(y_test, y_pred)

# Detailed report.
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# Summary: best hyperparameters plus the headline metrics.
# NOTE(review): apart from the first line, this repeats values already printed above (without MAPE).
print("Best Parameters:", grid_search.best_params_)
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.2f}")

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Mean Squared Error (MSE): 404.29
Root Mean Squared Error (RMSE): 20.11
Mean Absolute Error (MAE): 15.41
Mean Absolute Percentage Error (MAPE): 0.24
R-squared (R²): 0.88
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 50}
Mean Squared Error (MSE): 404.29
Root Mean Squared Error (RMSE): 20.11
Mean Absolute Error (MAE): 15.41
R² Score: 0.88


In [19]:
# Compare the tuned model on both partitions to look for a generalization gap.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictions of the tuned model on the training and test partitions.
y_train_pred, y_test_pred = (best_model.predict(part) for part in (X_train, X_test))

# Training-set errors
mse_train = mean_squared_error(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)

# Test-set errors
mse_test = mean_squared_error(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

# Report RMSE and R² side by side (MAE is computed above but not printed in this cell).
print("Train RMSE:", rmse_train)
print("Test RMSE :", rmse_test)
print("Train R²  :", r2_train)
print("Test R²   :", r2_test)

Train RMSE: 20.308193398463718
Test RMSE : 20.10689190705877
Train R²  : 0.8830829216966073
Test R²   : 0.8839313897842327


In [20]:
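# Overfitting checklist: each item below is a symptom to look for, not a result observed in this notebook.

# Training R² > Testing R²
# Explanation: If the model performs significantly better on training data than on test data, it is overfitting.
# In AdaBoost, this often happens if the model uses too many estimators or the base learners are too complex.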

# Training RMSE < Testing RMSE
# Explanation: The symptom is a Root Mean Squared Error that is much lower on training data than on test data.
# A large gap indicates the model is learning noise in the training set rather than generalizing.

# Training MAE < Testing MAE
# Explanation: Lower absolute error on training data can mean the model is overfitting.

# Training MAPE < Testing MAPE
# Explanation: A lower percentage error on training vs test data shows the model may be memorizing the training samples.

# Train R² ≈ 1.0 and Test R² ≪ 1.0
# Example: Train R² = 0.97, Test R² = 0.68
# Indicates a very close fit to the training data but poor generalization on unseen data.

# AdaBoost-specific scenario:
# Overfitting may occur when:
# - `n_estimators` is too high (e.g., >200 without early stopping)
# - `learning_rate` and `n_estimators` are poorly balanced: a high `learning_rate` increases each
#   learner's contribution, while a very low one needs many more weak learners stacked up to compensate
# - Base estimators like `DecisionTreeRegressor` have large depth (overly complex)

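# ➤ Conclusion:
# If most of the above conditions are TRUE → AdaBoost model is overfitting.
# Here they are not: the tuned model scores about the same on both partitions
# (Train RMSE ≈ 20.31 vs Test RMSE ≈ 20.11; Train R² ≈ 0.883 vs Test R² ≈ 0.884),
# so there is no sign of overfitting.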
