<a href="https://colab.research.google.com/github/pyayivargitam/Infosys-Assignments/blob/main/Assign_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Assignment 5**

# Build, train, and save LightGBM and SVM classifiers with integrated cross-validation and hyperparameter tuning; evaluate these models using appropriate metrics, compare their performance, and identify which model performs best, with reasoning.
NOTE: Use the preprocessed dataset of earthquake predictions.

In [None]:
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Read the earthquake dataset from the Excel workbook
data = pd.read_excel("/content/synthetic_earthquake_data.xlsx")

# Positional convention: every column except the last is a feature,
# and the last column holds the class label
X, y = data.iloc[:, :-1], data.iloc[:, -1]

print("Dataset loaded. Shape of X:", X.shape, "Shape of y:", y.shape)
print("Label distribution:\n", y.value_counts())

# Shuffled, seeded 5-fold stratified splitter shared by the grid searches
skf = StratifiedKFold(5, shuffle=True, random_state=42)

# Hold out 20% of the rows for testing; stratify on y so both splits
# keep the label proportions of the full dataset
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


Dataset loaded. Shape of X: (1000, 7) Shape of y: (1000,)
Label distribution:
 Risk_Level
1    357
2    338
0    305
Name: count, dtype: int64


In [None]:
# Base LightGBM estimator (seeded for reproducibility)
lgbm_clf = lgb.LGBMClassifier(verbose=-1, random_state=42)

# Candidate hyperparameter values explored by the grid search
lgbm_param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    num_leaves=[31, 62],
    boosting_type=['gbdt'],
)

# Exhaustive search over the grid, scored by accuracy on the `skf` folds
lgbm_grid_search = GridSearchCV(
    lgbm_clf,
    lgbm_param_grid,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)

print("Starting LightGBM grid search...")
lgbm_grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its settings
best_lgbm_model = lgbm_grid_search.best_estimator_
print(f"Best LightGBM parameters: {lgbm_grid_search.best_params_}")

# Hard class predictions on the held-out test split
lgbm_preds = best_lgbm_model.predict(X_test)

# Multiclass ROC AUC is computed from the per-class probabilities using the
# one-vs-rest strategy with a weighted average across classes.
# If scikit-learn raises ValueError, a placeholder message is reported instead.
try:
    lgbm_proba_preds = best_lgbm_model.predict_proba(X_test)
    roc_auc = roc_auc_score(
        y_test,
        lgbm_proba_preds,
        average='weighted',
        multi_class='ovr',
    )
except ValueError:
    roc_auc = "N/A (Cannot calculate ROC AUC for this model/data)"

print("\n--- LightGBM Model Evaluation ---")

lgbm_accuracy = accuracy_score(y_test, lgbm_preds)
print("Accuracy:", lgbm_accuracy)

# Weighted F1 averages the per-class scores by class support
lgbm_f1 = f1_score(y_test, lgbm_preds, average='weighted')
print("F1 Score:", lgbm_f1)

print("ROC AUC Score:", roc_auc)

lgbm_report = classification_report(y_test, lgbm_preds)
print("\nClassification Report:\n", lgbm_report)

lgbm_confusion = confusion_matrix(y_test, lgbm_preds)
print("Confusion Matrix:\n", lgbm_confusion)

# Persist the tuned model to disk
joblib.dump(best_lgbm_model, 'best_lgbm_model.pkl')
print("\nLightGBM model saved as 'best_lgbm_model.pkl'")

Starting LightGBM grid search...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best LightGBM parameters: {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 31}

--- LightGBM Model Evaluation ---
Accuracy: 1.0
F1 Score: 1.0
ROC AUC Score: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        61
           1       1.00      1.00      1.00        71
           2       1.00      1.00      1.00        68

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Confusion Matrix:
 [[61  0  0]
 [ 0 71  0]
 [ 0  0 68]]

LightGBM model saved as 'best_lgbm_model.pkl'


In [None]:
# Define SVM classifier.
# RBF-kernel SVMs are distance-based and are not scale invariant: fitting on raw,
# unscaled features can collapse the model into predicting a single class.
# The features are therefore standardized inside a Pipeline, so that
#   * during cross-validation the scaler is fit on each training fold only
#     (no information from the validation fold leaks into the scaling), and
#   * the fitted scaler is saved together with the classifier, so the saved
#     model accepts raw (unscaled) feature rows at prediction time.
svm_clf = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, random_state=42)),
])

# Hyperparameter grid (keys are prefixed with the pipeline step name 'svc__')
svm_param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto'],
    'svc__kernel': ['rbf']
}

# Grid search with cross-validation
svm_grid_search = GridSearchCV(
    estimator=svm_clf,
    param_grid=svm_param_grid,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

print("\nStarting SVM grid search...")
svm_grid_search.fit(X_train, y_train)

# Best SVM model (a fitted Pipeline: scaler + SVC)
best_svm_model = svm_grid_search.best_estimator_
print(f"Best SVM parameters: {svm_grid_search.best_params_}")

# Predictions and evaluation
svm_preds = best_svm_model.predict(X_test)
# For multiclass, predict_proba returns one probability column per class.
# ROC AUC is computed one-vs-rest (OvR) and averaged across classes,
# weighted by class support.
try:
    svm_proba_preds = best_svm_model.predict_proba(X_test)
    roc_auc = roc_auc_score(y_test, svm_proba_preds, multi_class='ovr', average='weighted')
except ValueError:
    # Best-effort metric: report a placeholder rather than abort the evaluation
    roc_auc = "N/A (Cannot calculate ROC AUC for this model/data or insufficient classes)"


print("\n--- SVM Model Evaluation ---")
print("Accuracy:", accuracy_score(y_test, svm_preds))
print("F1 Score:", f1_score(y_test, svm_preds, average='weighted')) # Use 'weighted' for multiclass F1
print("ROC AUC Score:", roc_auc)
print("\nClassification Report:\n", classification_report(y_test, svm_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, svm_preds))

# Save model (the whole pipeline, so the scaler travels with the classifier)
joblib.dump(best_svm_model, 'best_svm_model.pkl')
print("\nSVM model saved as 'best_svm_model.pkl'")


Starting SVM grid search...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best SVM parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}

--- SVM Model Evaluation ---
Accuracy: 0.355
F1 Score: 0.1860147601476015
ROC AUC Score: 0.44964922353871656

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        61
           1       0.35      1.00      0.52        71
           2       0.00      0.00      0.00        68

    accuracy                           0.35       200
   macro avg       0.12      0.33      0.17       200
weighted avg       0.13      0.35      0.19       200

Confusion Matrix:
 [[ 0 61  0]
 [ 0 71  0]
 [ 0 68  0]]

SVM model saved as 'best_svm_model.pkl'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
