In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

In [2]:
import warnings

# Suppress every warning category for the remainder of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical
# warnings raised by later cells are hidden as well.
warnings.simplefilter('ignore')

In [3]:
# Read the earthquake CSV and keep only the columns whose name does not start
# with "Unnamed" (the label pandas assigns to header-less CSV columns).
df = (
    pd.read_csv('1_earthquake_1995-2023.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

In [4]:
# Report the (rows, columns) of the loaded frame, then preview its first five rows.
print(df.shape)
df.head(n=5)

(974, 21)


Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,dmin,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,7.177,...,mww,192.955,-13.88,167.16,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,0.679,...,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,1.634,...,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina,17,7,2023
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,6,6,green,1,860,us,173,0.907,...,mww,32.571,54.38,-160.7,"Sand Point, Alaska",North America,United States,16,7,2023
4,M 7.3 - Alaska Peninsula,7.3,0,5,,1,820,at,79,0.879451,...,Mi,21.0,54.49,-160.8,Alaska Peninsula,North America,United States,16,7,2023


In [5]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      974 non-null    object 
 1   magnitude  974 non-null    float64
 2   cdi        974 non-null    int64  
 3   mmi        974 non-null    int64  
 4   alert      426 non-null    object 
 5   tsunami    974 non-null    int64  
 6   sig        974 non-null    int64  
 7   net        974 non-null    object 
 8   nst        974 non-null    int64  
 9   dmin       974 non-null    float64
 10  gap        974 non-null    float64
 11  magType    974 non-null    object 
 12  depth      974 non-null    float64
 13  latitude   974 non-null    float64
 14  longitude  974 non-null    float64
 15  location   974 non-null    object 
 16  continent  974 non-null    object 
 17  country    974 non-null    object 
 18  day        974 non-null    int64  
 19  month      974 non-null    int64  
 20  year      

## Preprocessing

### Handling Nulls

#### Handling Alert Nulls

In [6]:
# Keep only the rows whose alert level is present; these labelled rows are
# what the alert classifier below is trained on.
has_alert = df['alert'].notna()
data = df[has_alert]
data.head(n=3)

Unnamed: 0,title,magnitude,cdi,mmi,alert,tsunami,sig,net,nst,dmin,...,magType,depth,latitude,longitude,location,continent,country,day,month,year
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,7,4,green,0,657,us,114,7.177,...,mww,192.955,-13.88,167.16,"Sola, Vanuatu",Oceania,Vanuatu,16,8,2023
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,8,6,yellow,0,775,us,92,0.679,...,mww,69.727,12.81,-88.13,"Intipucá, El Salvador",North America,El Salvador,19,7,2023
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,7,5,green,0,899,us,70,1.634,...,mww,171.371,-38.19,-70.37,"Loncopué, Argentina",South America,Argentina,17,7,2023


In [7]:
# Confirm the filtered frame has no missing values left in `alert`.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 426 entries, 0 to 517
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      426 non-null    object 
 1   magnitude  426 non-null    float64
 2   cdi        426 non-null    int64  
 3   mmi        426 non-null    int64  
 4   alert      426 non-null    object 
 5   tsunami    426 non-null    int64  
 6   sig        426 non-null    int64  
 7   net        426 non-null    object 
 8   nst        426 non-null    int64  
 9   dmin       426 non-null    float64
 10  gap        426 non-null    float64
 11  magType    426 non-null    object 
 12  depth      426 non-null    float64
 13  latitude   426 non-null    float64
 14  longitude  426 non-null    float64
 15  location   426 non-null    object 
 16  continent  426 non-null    object 
 17  country    426 non-null    object 
 18  day        426 non-null    int64  
 19  month      426 non-null    int64  
 20  year       426 

# Alert Imputation

### Training Model

In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import pickle

In [9]:
# Predictor columns fed to the alert classifier.
selected_features = ['magnitude', 'latitude', 'longitude', 'depth', 'tsunami']

X = data.loc[:, selected_features]
y = data['alert']

# Turn the string alert levels into integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Human-readable view of which integer stands for which alert level.
label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
print("Label Encoding Mapping:", label_mapping)

# Persist the fitted encoder so predictions can be decoded back to alert names.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

print("Label Encoder saved to 'label_encoder.pkl'")

# Standardise every predictor (zero mean, unit variance) and persist the scaler
# so the same transformation can be applied to new rows later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Oversample the minority alert classes with SMOTE.
# NOTE(review): scaling and SMOTE are both fitted on the full labelled set
# before any train/test split, so synthetic points derived from held-out rows
# can end up in the training folds and the downstream CV scores are likely
# optimistic. Confirm whether this should move inside a per-fold pipeline.
smote = SMOTE(random_state=42)
scaled_frame = pd.DataFrame(X_scaled, columns=selected_features)
X_resampled, y_resampled = smote.fit_resample(scaled_frame, y_encoded)

# Write the balanced training set to disk.
X_resampled.to_csv('X_resampled.csv', index=False)
pd.DataFrame({'alert': y_resampled}).to_csv('y_resampled.csv', index=False)

print("Data encoding and resampling complete.")


Label Encoding Mapping: {'green': 0, 'orange': 1, 'red': 2, 'yellow': 3}
Label Encoder saved to 'label_encoder.pkl'
Data encoding and resampling complete.


In [10]:
from sklearn.model_selection import ShuffleSplit

In [11]:
# Five random 80/20 train/test partitions; the fixed seed makes them reproducible.
shuffle_split = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Materialise the (train_idx, test_idx) pairs into a list. `split()` returns a
# one-shot generator: once a loop has consumed it, iterating again silently
# yields nothing, so any cell that loops over `splits` could only be run once
# per kernel session. A list can be iterated as many times as needed.
splits = list(shuffle_split.split(X_resampled, y_resampled))

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
# Hyper-parameter search space explored for the XGBoost classifier.
param_grid = dict(
    n_estimators=[100, 300, 500],      # number of boosted trees
    max_depth=[3, 5, 10],              # maximum depth of each tree
    learning_rate=[0.01, 0.1, 0.2],    # step-size shrinkage
    subsample=[0.8, 1.0],              # fraction of training rows sampled per tree
    colsample_bytree=[0.8, 1.0],       # fraction of features sampled per tree
    gamma=[0, 1],                      # minimum loss reduction required to split
)


In [14]:
# Base XGBoost classifier handed to the grid search as its estimator.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Exhaustive search over `param_grid`, scored by accuracy on the ShuffleSplit folds.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=shuffle_split,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# The search is expensive (every grid combination is fitted once per split),
# so it is skipped by default and the stored parameters below are used
# instead. Set RUN_GRID_SEARCH = True to re-run the search and take the
# parameters from its result.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    grid_search_xgb.fit(X_resampled, y_resampled)
    best_params_xgb = grid_search_xgb.best_params_
else:
    # Hard-coded parameter set — presumably recorded from an earlier run of
    # this search; re-run with RUN_GRID_SEARCH = True to verify.
    best_params_xgb = {
        'colsample_bytree': 0.8,
        'gamma': 0,
        'learning_rate': 0.01,
        'max_depth': 10,
        'n_estimators': 300,
        'subsample': 0.8
    }

print("Best Parameters for XGBoost:", best_params_xgb)

Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 300, 'subsample': 0.8}


In [15]:
# Build the classifier from the tuned hyper-parameters. Only the six tuned
# settings are taken from `best_params_xgb`; seed and metric stay fixed.
tuned_keys = (
    'n_estimators',
    'max_depth',
    'learning_rate',
    'subsample',
    'colsample_bytree',
    'gamma',
)
best_xgb = XGBClassifier(
    **{key: best_params_xgb[key] for key in tuned_keys},
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
# Train and evaluate the tuned model on each ShuffleSplit fold.
#
# Folds are drawn directly from `shuffle_split` so the cell gives the same
# result every time it is run: iterating a previously created split generator
# would silently yield nothing on a second run. The fixed random_state on
# `shuffle_split` makes every call to `split()` return the same partitions.
for fold, (train_idx, test_idx) in enumerate(
    shuffle_split.split(X_resampled, y_resampled), start=1
):
    print(f"--- Fold {fold} ---")

    # X is a DataFrame (positional .iloc), y is indexed directly with the fold indices.
    X_train, X_test = X_resampled.iloc[train_idx], X_resampled.iloc[test_idx]
    y_train, y_test = y_resampled[train_idx], y_resampled[test_idx]

    # Fit on this fold's training rows. `eval_set` only logs mlogloss for the
    # train and test partitions every 100 rounds; no early stopping is
    # configured, so all boosting rounds are always run.
    best_xgb.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=100
    )

    # Predict on the held-out rows of this fold.
    y_pred = best_xgb.predict(X_test)

    # Per-class precision/recall/F1 and the raw confusion counts.
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

--- Fold 1 ---
[0]	validation_0-mlogloss:1.37342	validation_1-mlogloss:1.37577
[100]	validation_0-mlogloss:0.62428	validation_1-mlogloss:0.77292
[200]	validation_0-mlogloss:0.33859	validation_1-mlogloss:0.53905
[299]	validation_0-mlogloss:0.20742	validation_1-mlogloss:0.43127

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.90        68
           1       0.87      0.86      0.87        64
           2       0.90      0.93      0.91        69
           3       0.81      0.81      0.81        64

    accuracy                           0.88       265
   macro avg       0.87      0.87      0.87       265
weighted avg       0.88      0.88      0.88       265


Confusion Matrix:
[[61  0  2  5]
 [ 0 55  3  6]
 [ 1  3 64  1]
 [ 5  5  2 52]]
--- Fold 2 ---
[0]	validation_0-mlogloss:1.37366	validation_1-mlogloss:1.37497
[100]	validation_0-mlogloss:0.63025	validation_1-mlogloss:0.72753
[200]	validation_0-mlogloss:0.34461	val

In [18]:
from sklearn.model_selection import cross_val_score

# Accuracy of the tuned model on each ShuffleSplit fold.
# NOTE(review): the folds are cut from data that was already SMOTE-resampled,
# so these scores are likely optimistic — confirm before reporting them.
cv_scores_xgb = cross_val_score(
    estimator=best_xgb,
    X=X_resampled,
    y=y_resampled,
    scoring='accuracy',
    cv=shuffle_split,
)

print("\nCross-validation scores with ShuffleSplit (Tuned XGBoost):", cv_scores_xgb)
# Mean accuracy with a two-standard-deviation spread across folds.
print(f"Average CV score: {cv_scores_xgb.mean():.4f} (+/- {cv_scores_xgb.std() * 2:.4f})")


Cross-validation scores with ShuffleSplit (Tuned XGBoost): [0.8754717  0.92075472 0.90943396 0.9245283  0.91320755]
Average CV score: 0.9087 (+/- 0.0349)


In [19]:
# Rank the predictors by the importance the fitted model assigns to them.
# NOTE(review): the values come from whichever fit was last performed on
# `best_xgb`, not from an average over folds.
feature_importance_xgb = (
    pd.DataFrame({'feature': selected_features})
    .assign(importance=best_xgb.feature_importances_)
    .sort_values(by='importance', ascending=False)
)

print("\nFeature Importance for Tuned XGBoost:")
print(feature_importance_xgb)


Feature Importance for Tuned XGBoost:
     feature  importance
4    tsunami    0.293228
1   latitude    0.201947
0  magnitude    0.187808
3      depth    0.172160
2  longitude    0.144858


In [20]:
# Refit on the complete resampled training set before persisting.
# Without this, `best_xgb` holds whatever the last manual fit left behind —
# i.e. a model trained on only one fold's 80% training partition
# (cross_val_score fits clones and never updates `best_xgb` itself) — and
# that partial model is what would be written to disk.
best_xgb.fit(X_resampled, y_resampled)

# Persist the final model next to the pickled label encoder and scaler.
with open('best_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(best_xgb, model_file)

print("XGBoost model saved to 'best_xgb_model.pkl'")

XGBoost model saved to 'best_xgb_model.pkl'
