# Task
Build, train, and save `LightGBM` and `SVM` classifiers with integrated cross-validation and hyperparameter tuning using the data in "preprocessed_earthquake_data.csv". Evaluate these models using appropriate metrics, compare their performance, and identify which model performs best with reasoning.

## Load data

### Subtask:
Load the `preprocessed_earthquake_data.csv` file into a pandas DataFrame.


**Reasoning**:
Import pandas and load the data into a DataFrame.



In [13]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
import joblib

In [14]:
df = pd.read_csv('/content/preprocessed_earthquake_data.csv')
display(df.head())
display(df.info())

Unnamed: 0,Latitude,Longitude,Type,Depth,Magnitude,Magnitude Type,Root Mean Square,Source,Status,Year,...,Source_ISCGEM,Source_ISCGEMSUP,Source_NC,Source_NN,Source_OFFICIAL,Source_PR,Source_SE,Source_US,Source_UW,Status_Reviewed
0,0.583377,0.844368,Earthquake,0.495984,0.277668,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006109,0.698849,Earthquake,0.075272,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.739162,-1.701962,Earthquake,-0.413928,0.750418,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.017599,-0.503524,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.340688,0.691479,Earthquake,-0.454694,-0.195082,MW,-0.103839,ISCGEM,Automatic,-1.915523,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23409 entries, 0 to 23408
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Latitude                23409 non-null  float64
 1   Longitude               23409 non-null  float64
 2   Type                    23409 non-null  object 
 3   Depth                   23409 non-null  float64
 4   Magnitude               23409 non-null  float64
 5   Magnitude Type          23409 non-null  object 
 6   Root Mean Square        23409 non-null  float64
 7   Source                  23409 non-null  object 
 8   Status                  23409 non-null  object 
 9   Year                    23409 non-null  float64
 10  Day                     23409 non-null  float64
 11  Month_sin               23409 non-null  float64
 12  Month_cos               23409 non-null  float64
 13  Hour_sin                23409 non-null  float64
 14  Hour_cos                23409 non-null

None

In [15]:
#  (1 if Type == 'Earthquake', else 0)
if 'Type' not in df.columns:
    raise KeyError("Expected a 'Type' column in the dataset for labeling events. Columns: " + ", ".join(df.columns))

df['is_earthquake'] = (df['Type'] == 'Earthquake').astype(int)
print("Class distribution (is_earthquake):")
print(df['is_earthquake'].value_counts(normalize=False))
print(df['is_earthquake'].value_counts(normalize=True))

Class distribution (is_earthquake):
is_earthquake
1    23229
0      180
Name: count, dtype: int64
is_earthquake
1    0.992311
0    0.007689
Name: proportion, dtype: float64


In [16]:
drop_cols = ['Type', 'Source', 'Status']
for c in drop_cols:
    if c in df.columns:
        df = df.drop(columns=c)

y = df['is_earthquake'].values
X = df.drop(columns=['is_earthquake']).select_dtypes(include=[np.number])
print("Features matrix shape:", X.shape)

Features matrix shape: (23409, 36)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (18727, 36) Test shape: (4682, 36)


In [18]:
try:
    import lightgbm as lgb
    has_lgb = True
except Exception as e:
    has_lgb = False
    print("lightgbm not available in this environment. ")

from sklearn.metrics import roc_auc_score

lgb_pipe = None
if has_lgb:
    lgb_clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
    lgb_pipe = Pipeline([('scaler', StandardScaler()), ('model', lgb_clf)])

    lgb_param_grid = {
        'model__n_estimators': [100, 200],
        'model__num_leaves': [31, 63],
        'model__learning_rate': [0.1, 0.01]
    }

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    lgb_search = GridSearchCV(lgb_pipe, lgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
    lgb_search.fit(X_train, y_train)
    print("Best LightGBM params:", lgb_search.best_params_)
    lgb_best = lgb_search.best_estimator_
    joblib.dump(lgb_best, "lightgbm_best_model.joblib")
    print("Saved LightGBM model to lightgbm_best_model.joblib")

    y_pred = lgb_best.predict(X_test)
    y_proba = lgb_best.predict_proba(X_test)[:,1]
    lgb_metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, y_proba)
    }
    print("LightGBM metrics:", lgb_metrics)
else:
    print("Skipping LightGBM training because lightgbm is not installed in this environment.")

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 18583, number of negative: 144
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1130
[LightGBM] [Info] Number of data points in the train set: 18727, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.992311 -> initscore=4.860189
[LightGBM] [Info] Start training from score 4.860189
Best LightGBM params: {'model__learning_rate': 0.1, 'model__n_estimators': 100, 'model__num_leaves': 31}
Saved LightGBM model to lightgbm_best_model.joblib
LightGBM metrics: {'accuracy': 0.9997864160615122, 'precision': 0.9997848074026253, 'recall': 1.0, 'f1': 0.9998923921231034, 'roc_auc': np.float64(0.9999581479887119)}




In [19]:
# SVM classifier with GridSearchCV
svm_pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC(probability=True, random_state=42))])
svm_param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['rbf', 'linear'],
    'svc__gamma': ['scale', 'auto']
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
svm_search = GridSearchCV(svm_pipe, svm_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=1)
svm_search.fit(X_train, y_train)
print("Best SVM params:", svm_search.best_params_)
svm_best = svm_search.best_estimator_
joblib.dump(svm_best, "svm_best_model.joblib")
print("Saved SVM model to svm_best_model.joblib")

y_pred_svm = svm_best.predict(X_test)
y_proba_svm = svm_best.predict_proba(X_test)[:,1]
svm_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_svm),
    'precision': precision_score(y_test, y_pred_svm, zero_division=0),
    'recall': recall_score(y_test, y_pred_svm, zero_division=0),
    'f1': f1_score(y_test, y_pred_svm, zero_division=0),
    'roc_auc': roc_auc_score(y_test, y_proba_svm)
}
print("SVM metrics:", svm_metrics)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best SVM params: {'svc__C': 0.1, 'svc__gamma': 'scale', 'svc__kernel': 'rbf'}
Saved SVM model to svm_best_model.joblib
SVM metrics: {'accuracy': 0.9991456642460487, 'precision': 0.9991397849462366, 'recall': 1.0, 'f1': 0.9995697074010327, 'roc_auc': np.float64(0.985309944037882)}


In [20]:
results = {}
if 'lgb_metrics' in globals():
    results['LightGBM'] = lgb_metrics
if 'svm_metrics' in globals():
    results['SVM'] = svm_metrics

for model_name, mets in results.items():
    print(f"\n{model_name}------")
    for k,v in mets.items():
        print(f"{k}: {v:.4f}")



LightGBM------
accuracy: 0.9998
precision: 0.9998
recall: 1.0000
f1: 0.9999
roc_auc: 1.0000

SVM------
accuracy: 0.9991
precision: 0.9991
recall: 1.0000
f1: 0.9996
roc_auc: 0.9853
