### Training the Advanced ML models on Preprocessed Dataset

In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset
earth_df = pd.read_csv('preprocessed_earthquake_data.csv')
earth_df.sample(5)

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,Day,Month_sin,Month_cos,Hour_sin,Hour_cos,Type_Explosion,Type_Nuclear Explosion,Type_Rock Burst,Magnitude Type_MD,Magnitude Type_MH,Magnitude Type_ML,Magnitude Type_MS,Magnitude Type_MW,Magnitude Type_MWB,Magnitude Type_MWC,Magnitude Type_MWR,Magnitude Type_MWW,Source_ATLAS,Source_CI,Source_GCMT,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
22050,-0.328323,0.676352,Earthquake,-0.425342,-0.904207,MWB,-0.472794,US,Reviewed,1.481199,-0.655174,1.218537,0.705254,-1.419204,0.006682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
14023,-1.993755,-0.425721,Earthquake,-0.495461,-0.667832,MWC,-0.841749,US,Reviewed,0.372066,0.844142,0.00141,-1.429558,-0.713894,-1.221272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
22615,-2.061345,-0.530221,Earthquake,-0.495461,-0.195082,MWB,0.265115,US,Reviewed,1.55052,1.651466,1.218537,-0.717954,0.696725,-1.221272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9901,-1.383113,0.048945,Earthquake,-0.495461,-0.195082,MS,1.740934,US,Reviewed,-0.182501,1.766798,1.406827,-0.00635,-1.230217,0.715642,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
10362,-1.151116,-1.744731,Earthquake,-0.307934,0.041293,MW,0.511085,US,Reviewed,-0.113181,-0.193846,1.218537,0.705254,0.356511,1.376288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [4]:
# Define target and categorical columns (update if needed)
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Prepare features and target variable
X = earth_df.drop(columns=[target]+categorical_cols)
y = earth_df[target]

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.head()

Unnamed: 0,Latitude,Longitude,Depth,Root Mean Square,Year,Day,Month_sin,Month_cos,Hour_sin,Hour_cos,Type_Explosion,Type_Nuclear Explosion,Type_Rock Burst,Magnitude Type_MD,Magnitude Type_MH,Magnitude Type_ML,Magnitude Type_MS,Magnitude Type_MW,Magnitude Type_MWB,Magnitude Type_MWC,Magnitude Type_MWR,Magnitude Type_MWW,Source_ATLAS,Source_CI,Source_GCMT,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
16953,-0.078008,0.755211,-0.495461,1.003024,0.787991,-1.116502,-1.215716,0.705254,-0.713894,1.234637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15800,-0.471466,1.014252,1.194724,-1.087719,0.649349,0.844142,1.218537,-0.717954,-1.006043,-0.995938,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
9014,0.213764,-0.62189,-0.495461,0.511085,-0.321143,0.498146,0.704119,-1.238884,-0.713894,-1.221272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
15516,-1.274852,0.310949,-0.495461,-0.534287,0.580028,0.036818,-1.215716,-0.717954,-1.419204,0.006682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
17837,0.01046,0.456978,-0.405774,-0.718764,0.926632,1.074806,1.218537,-0.717954,-1.419204,0.006682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### 1. Advanced Model-1 : GBM Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Initialize the Gradient Boosting Regressor
gbr = GradientBoostingRegressor(random_state=42)

# Hyperparameter grid for tuning
param_grid = {


}

# Setup GridSearchCV with 5-fold cross-validation using negative MAE scoring
grid_search = GridSearchCV(estimator=gbr,
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_absolute_error',
                           n_jobs=-1,
                           verbose=2)

# Fit GridSearchCV on training data
grid_search.fit(X_train, y_train)

# Best model from tuning
best_gbr = grid_search.best_estimator_

# Predict on test data
y_pred = best_gbr.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Cross-validation scores for best model on full data
cv_mae_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_absolute_error')
cv_mse_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='neg_mean_squared_error')
cv_r2_scores = cross_val_score(best_gbr, X, y, cv=5, scoring='r2')

# Print results
print("Best model parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")
print(f"5-Fold CV Mean MSE: {-np.mean(cv_mse_scores):.4f} ± {np.std(cv_mse_scores):.4f}")
print(f"5-Fold CV Mean R2: {np.mean(cv_r2_scores):.4f} ± {np.std(cv_r2_scores):.4f}")

# Save the best trained model
model_filename = 'gbm_regressor_model.pkl'
joblib.dump(best_gbr, model_filename)
print(f"Model saved to {model_filename}")


Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best model parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Test MAE: 0.6687
Test MSE: 0.8765
Test R2 score: 0.1563
5-Fold CV Mean MAE: 0.7288 ± 0.0498
5-Fold CV Mean MSE: 0.9771 ± 0.1018
5-Fold CV Mean R2: 0.0227 ± 0.0528
Model saved to gbm_regressor_model.pkl


#### Advanced Model-2: catGBM Regressor

In [11]:
from sklearn.preprocessing import LabelEncoder
target = 'Magnitude'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Prepare features and target
X = earth_df.drop(columns=[target])
y = earth_df[target]

# Label encode categorical columns
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from catboost import CatBoostRegressor, Pool

# Create CatBoost Pool for train and test sets (specify categorical features)
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_cols)

# Initialize CatBoost Regressor
catboost_model = CatBoostRegressor(
    random_seed=42,
    verbose=0
)

# Define hyperparameter grid for GridSearch
param_grid = {
    'iterations': [100, 200],
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3, 5]
}

# Set up GridSearchCV (CatBoost supports sklearn API)
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV - using DataFrame without Pool
# CatBoost handles categorical features by column name in dataframe during sklearn API use
grid_search.fit(X_train, y_train, cat_features=categorical_cols)

# Best model from grid search
best_model = grid_search.best_estimator_

# Predict on test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Cross-validation scores on full dataset
cv_mae_scores = cross_val_score(best_model, X, y, cv=5, scoring='neg_mean_absolute_error')

print("Best parameters:", grid_search.best_params_)
print(f"Test MAE: {mae:.4f}")
print(f"Test MSE: {mse:.4f}")
print(f"Test R2 score: {r2:.4f}")
print(f"5-Fold CV Mean MAE: {-np.mean(cv_mae_scores):.4f} ± {np.std(cv_mae_scores):.4f}")

# Save the trained model
model_filename = 'catboost_regressor_model.cbm'
best_model.save_model(model_filename)
print(f"Model saved to {model_filename}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 3, 'learning_rate': 0.1}
Test MAE: 0.6692
Test MSE: 0.8701
Test R2 score: 0.1625
5-Fold CV Mean MAE: 0.7062 ± 0.0434
Model saved to catboost_regressor_model.cbm
