In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [43]:
df = pd.read_csv('2_earthquake_1995-2023.csv')

df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [44]:
print(df.shape)
df.head()

(974, 21)


Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,dmin,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,7.177,...,mww,192.955,-13.88,167.16,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,0.679,...,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,1.634,...,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina,17,7,2023
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,6,6,green,1,860,us,173,0.907,...,mww,32.571,54.38,-160.7,"Sand Point, Alaska",North America,United States,16,7,2023
4,M 7.3 - Alaska Peninsula,7.3,0,5,No alert,1,820,at,79,0.879451,...,Mi,21.0,54.49,-160.8,Alaska Peninsula,North America,United States,16,7,2023


In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      974 non-null    object 
 1   magnitude  974 non-null    float64
 2   cdi        974 non-null    int64  
 3   mmi        974 non-null    int64  
 4   alert      858 non-null    object 
 5   tsunami    974 non-null    int64  
 6   sig        974 non-null    int64  
 7   net        974 non-null    object 
 8   nst        974 non-null    int64  
 9   dmin       974 non-null    float64
 10  gap        974 non-null    float64
 11  magType    974 non-null    object 
 12  depth      974 non-null    float64
 13  latitude   974 non-null    float64
 14  longitude  974 non-null    float64
 15  location   974 non-null    object 
 16  continent  974 non-null    object 
 17  country    974 non-null    object 
 18  day        974 non-null    int64  
 19  month      974 non-null    int64  
 20  year      

## Preprocessing

### Handling Nulls

#### - Handling Alert Nulls

In [46]:
data = df[~df['alert'].isna()]
data.head(3)

Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,dmin,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,7.177,...,mww,192.955,-13.88,167.16,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,0.679,...,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,1.634,...,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina,17,7,2023


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 858 entries, 0 to 973
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      858 non-null    object 
 1   magnitude  858 non-null    float64
 2   cdi        858 non-null    int64  
 3   mmi        858 non-null    int64  
 4   alert      858 non-null    object 
 5   tsunami    858 non-null    int64  
 6   sig        858 non-null    int64  
 7   net        858 non-null    object 
 8   nst        858 non-null    int64  
 9   dmin       858 non-null    float64
 10  gap        858 non-null    float64
 11  magType    858 non-null    object 
 12  depth      858 non-null    float64
 13  latitude   858 non-null    float64
 14  longitude  858 non-null    float64
 15  location   858 non-null    object 
 16  continent  858 non-null    object 
 17  country    858 non-null    object 
 18  day        858 non-null    int64  
 19  month      858 non-null    int64  
 20  year       858 

# Alert imputation

### Training Model

In [48]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import pickle

In [49]:
# Select features
# selected_features = [
#     'magnitude', 'latitude', 'longitude', 'depth', 'tsunami'
# ]

# Select features
selected_features = [
    'dmin', 'cdi', 'sig', 'tsunami', 'gap'
]

X = data[selected_features]
y = data['alert']

# Encode categorical target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Save label encoding mapping
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
print("Label Encoding Mapping:", label_mapping)

# Save the label encoder for the 'alert' target variable
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("Label Encoder saved to 'label_encoder.pkl'")

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[selected_features])

# Save scaler for future use
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(pd.DataFrame(X_scaled, columns=selected_features), y_encoded)

# Save the resampled data
X_resampled.to_csv('X_resampled.csv', index=False)
pd.DataFrame(y_resampled, columns=['alert']).to_csv('y_resampled.csv', index=False)

print("Data encoding and resampling complete.")


Label Encoding Mapping: {'No alert': 0, 'green': 1, 'orange': 2, 'red': 3, 'yellow': 4}
Label Encoder saved to 'label_encoder.pkl'
Data encoding and resampling complete.


In [50]:
from sklearn.model_selection import ShuffleSplit

In [51]:
# Define ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Create splits
splits = shuffle_split.split(X_resampled, y_resampled)

In [52]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [53]:
# Define parameter grid for XGBoost
param_grid = {
    'n_estimators': [100, 300, 500],         # Number of trees
    'max_depth': [3, 5, 10],                # Maximum depth of a tree
    'learning_rate': [0.01, 0.1, 0.2],      # Step size shrinkage
    'subsample': [0.8, 1.0],                # Subsample ratio of training samples
    'colsample_bytree': [0.8, 1.0],         # Subsample ratio of features
    'gamma': [0, 1],                        # Minimum loss reduction to make a split
}              


In [54]:
# Initialize XGBoost classifier
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Initialize GridSearchCV with reduced search space
# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=shuffle_split,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
# grid_search_xgb.fit(X_resampled, y_resampled)

# Retrieve the best parameters
best_params_xgb = {
                    'colsample_bytree': 0.8, 
                    'gamma': 0, 
                    'learning_rate': 0.01, 
                    'max_depth': 10, 
                    'n_estimators': 300, 
                    'subsample': 0.8
                    }
print("Best Parameters for XGBoost:", best_params_xgb)

Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.8}


In [55]:
# Train XGBoost with the best parameters from GridSearchCV
best_xgb = XGBClassifier(
    n_estimators=best_params_xgb['n_estimators'],
    max_depth=best_params_xgb['max_depth'],
    learning_rate=best_params_xgb['learning_rate'],
    subsample=best_params_xgb['subsample'],
    colsample_bytree=best_params_xgb['colsample_bytree'],
    gamma=best_params_xgb['gamma'],
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

In [56]:
from sklearn.metrics import classification_report, confusion_matrix

In [57]:
# Train and evaluate using ShuffleSplit
fold = 1
for train_idx, test_idx in splits:
    print(f"--- Fold {fold} ---")
    
    # Split data
    X_train, X_test = X_resampled.iloc[train_idx], X_resampled.iloc[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]
    
    # Train the best XGBoost model
    # Add early stopping to your training
    best_xgb.fit(
        X_train, 
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100
    )
    
    # Predict on test data
    y_pred = best_xgb.predict(X_test)
    
    # Evaluation
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    fold += 1

--- Fold 1 ---
[0]	validation_0-mlogloss:1.58942	validation_1-mlogloss:1.59010


[100]	validation_0-mlogloss:0.59282	validation_1-mlogloss:0.65945
[200]	validation_0-mlogloss:0.27476	validation_1-mlogloss:0.36439
[299]	validation_0-mlogloss:0.14573	validation_1-mlogloss:0.24457

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        88
           1       0.92      0.89      0.91        93
           2       0.97      0.98      0.97        87
           3       1.00      1.00      1.00        86
           4       0.86      0.88      0.87        78

    accuracy                           0.95       432
   macro avg       0.95      0.95      0.95       432
weighted avg       0.95      0.95      0.95       432


Confusion Matrix:
[[87  1  0  0  0]
 [ 1 83  0  0  9]
 [ 0  0 85  0  2]
 [ 0  0  0 86  0]
 [ 0  6  3  0 69]]
--- Fold 2 ---
[0]	validation_0-mlogloss:1.58932	validation_1-mlogloss:1.59035
[100]	validation_0-mlogloss:0.59448	validation_1-mlogloss:0.65378
[200]	validation_0-mlogloss:0.2761

In [58]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation using ShuffleSplit with the tuned model
cv_scores_xgb = cross_val_score(best_xgb, X_resampled, y_resampled, cv=shuffle_split, scoring='accuracy')

print("\nCross-validation scores with ShuffleSplit (Tuned XGBoost):", cv_scores_xgb)
print(f"Average CV score: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std() * 2:.4f})")


Cross-validation scores with ShuffleSplit (Tuned XGBoost): [0.94907407 0.93981481 0.95601852 0.94212963 0.93287037]
Average CV score: 0.9440 (+/- 0.0159)


In [59]:
# Feature Importance
feature_importance_xgb = pd.DataFrame({
    'feature': selected_features,
    'importance': best_xgb.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance for Tuned XGBoost:")
print(feature_importance_xgb)


Feature Importance for Tuned XGBoost:
   feature  importance
2      sig    0.375109
0     dmin    0.214870
1      cdi    0.181597
3  tsunami    0.165918
4      gap    0.062506


In [60]:
# Save the trained XGBoost model
with open('best_xgb_model.pkl', 'wb') as f:
    pickle.dump(best_xgb, f)

print("XGBoost model saved to 'best_xgb_model.pkl'")

XGBoost model saved to 'best_xgb_model.pkl'
