In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [111]:
import warnings
warnings.filterwarnings('ignore')

In [112]:
df = pd.read_csv('..\Datasets\\2_earthquake_1995-2023.csv')

In [113]:
df.columns

Index(['Unnamed: 0', 'title', 'magnitude', 'cdi', 'mmi', 'alert', 'tsunami',
       'sig', 'net', 'nst', 'dmin', 'gap', 'magType', 'depth', 'latitude',
       'longitude', 'location', 'continent', 'country', 'day', 'month',
       'year'],
      dtype='object')

In [114]:
print(df.shape)
df.head()

(974, 22)


Unnamed: 0.1,Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,...,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,...,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,...,mww,171.371,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina,17,7,2023
3,3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,6,6,green,1,860,us,173,...,mww,32.571,54.3844,-160.699,"Sand Point, Alaska",North America,United States,16,7,2023
4,4,M 7.3 - Alaska Peninsula,7.3,0,5,No alert,1,820,at,79,...,Mi,21.0,54.49,-160.796,Alaska Peninsula,North America,United States,16,7,2023


In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  974 non-null    int64  
 1   title       974 non-null    object 
 2   magnitude   974 non-null    float64
 3   cdi         974 non-null    int64  
 4   mmi         974 non-null    int64  
 5   alert       858 non-null    object 
 6   tsunami     974 non-null    int64  
 7   sig         974 non-null    int64  
 8   net         974 non-null    object 
 9   nst         974 non-null    int64  
 10  dmin        974 non-null    float64
 11  gap         974 non-null    float64
 12  magType     974 non-null    object 
 13  depth       974 non-null    float64
 14  latitude    974 non-null    float64
 15  longitude   974 non-null    float64
 16  location    974 non-null    object 
 17  continent   974 non-null    object 
 18  country     974 non-null    object 
 19  day         974 non-null    i

## Preprocessing

### Handling Nulls

#### - Handling Alert Nulls

In [116]:
data = df[~df['alert'].isna()]
data.head(3)

Unnamed: 0.1,Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,...,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,...,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,...,mww,171.371,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina,17,7,2023


In [117]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 858 entries, 0 to 973
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  858 non-null    int64  
 1   title       858 non-null    object 
 2   magnitude   858 non-null    float64
 3   cdi         858 non-null    int64  
 4   mmi         858 non-null    int64  
 5   alert       858 non-null    object 
 6   tsunami     858 non-null    int64  
 7   sig         858 non-null    int64  
 8   net         858 non-null    object 
 9   nst         858 non-null    int64  
 10  dmin        858 non-null    float64
 11  gap         858 non-null    float64
 12  magType     858 non-null    object 
 13  depth       858 non-null    float64
 14  latitude    858 non-null    float64
 15  longitude   858 non-null    float64
 16  location    858 non-null    object 
 17  continent   858 non-null    object 
 18  country     858 non-null    object 
 19  day         858 non-null    int64 

# Alert imputation

### Training Model

In [118]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import pickle

In [119]:
import os
models_folder = 'Models'
os.makedirs(models_folder, exist_ok=True)

In [120]:
# Select features
# selected_features = [
#     'magnitude', 'latitude', 'longitude', 'depth', 'tsunami'
# ]

# Select features
selected_features = [
    'dmin', 'magnitude', 'tsunami', 'depth', 'latitude', 'longitude'
]

X = data[selected_features]
y = data['alert']

# Encode categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label encoding mapping
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Encoding Mapping:", label_mapping)

# Saving the Label Encoder
with open("..\\Notebooks\\Models\\label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)
print("Label Encoder saved to 'Models/label_encoder.pkl'")

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[selected_features])

# Saving the Scaler
with open("..\\Notebooks\\Models\\scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
print("Label Encoder saved to 'Models/scaler.pkl'")

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(pd.DataFrame(X_scaled, columns=selected_features), y_encoded)

# Saving the resampled data
X_resampled.to_csv(os.path.join(models_folder, 'X_resampled.csv'), index=False)
pd.DataFrame(y_resampled, columns=['alert']).to_csv(os.path.join(models_folder, 'y_resampled.csv'), index=False)
print("Resampled data saved to 'Models/X_resampled.csv' and 'Models/y_resampled.csv'")

print("Data encoding and resampling complete.")


Label Encoding Mapping: {'No alert': 0, 'green': 1, 'orange': 2, 'red': 3, 'yellow': 4}
Label Encoder saved to 'Models/label_encoder.pkl'
Label Encoder saved to 'Models/scaler.pkl'
Resampled data saved to 'Models/X_resampled.csv' and 'Models/y_resampled.csv'
Data encoding and resampling complete.


In [121]:
from sklearn.model_selection import StratifiedKFold

In [122]:
# Define StratifiedKFold
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create splits
splits = stratified_kfold.split(X_resampled, y_resampled)

In [123]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [124]:
# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 300, 500],         # Number of trees
    'max_depth': [3, 5, 10],                # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage
    'subsample': [0.8, 1.0],                # Subsample ratio of training samples
    'colsample_bytree': [0.8, 1.0],         # Subsample ratio of features
    'gamma': [0, 1],                        # Minimum loss reduction to make a split
}              


In [125]:
# Initialize XGBoost classifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Initialize GridSearchCV with reduced search space
# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=stratified_kfold,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
# grid_search_xgb.fit(X_resampled, y_resampled)

# Retrieve the best parameters
best_params_xgb = {
                    'colsample_bytree': 0.8, 
                    'gamma': 0, 
                    'learning_rate': 0.01, 
                    'max_depth': 10, 
                    'n_estimators': 300, 
                    'subsample': 0.8
                    }
print("Best Parameters for XGBoost:", best_params_xgb)

Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.8}


In [126]:
# Train XGBoost with the best parameters from GridSearchCV
best_xgb = XGBClassifier(
    n_estimators=best_params_xgb['n_estimators'],
    max_depth=best_params_xgb['max_depth'],
    learning_rate=best_params_xgb['learning_rate'],
    subsample=best_params_xgb['subsample'],
    colsample_bytree=best_params_xgb['colsample_bytree'],
    gamma=best_params_xgb['gamma'],
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

In [127]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_validate

In [128]:
# First, fit the model with the best parameters
best_xgb.fit(X_resampled, y_resampled)

# Create a new list to store the scores manually
scores = []

# Manually perform cross-validation
for train_idx, test_idx in stratified_kfold.split(X_resampled, y_resampled):
    # Split data
    X_train, X_test = X_resampled.iloc[train_idx], X_resampled.iloc[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]
    
    # Train model
    model = XGBClassifier(**best_params_xgb, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    
    # Get score
    score = model.score(X_test, y_test)
    scores.append(score)

# Convert scores to numpy array
scores = np.array(scores)

# Print results
print("\nCross-validation scores with StratifiedKFold (Tuned XGBoost):", scores)
print(f"Average CV score: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")


Cross-validation scores with StratifiedKFold (Tuned XGBoost): [0.89351852 0.91666667 0.89814815 0.9537037  0.92824074]
Average CV score: 0.9181 (+/- 0.0436)


In [129]:
# Feature Importance
feature_importance_xgb = pd.DataFrame({
    'feature': selected_features,
    'importance': best_xgb.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance for Tuned XGBoost:")
print(feature_importance_xgb)


Feature Importance for Tuned XGBoost:
     feature  importance
2    tsunami    0.336208
0       dmin    0.236899
4   latitude    0.129205
1  magnitude    0.108191
5  longitude    0.094914
3      depth    0.094582


In [130]:
with open("../Notebooks/Models/best_xgb_model.pkl", "wb") as f:
    pickle.dump(best_xgb, f)
print("XGBoost model saved to 'Models/best_xgb_model.pkl'")


XGBoost model saved to 'Models/best_xgb_model.pkl'
