Understand and implement model evaluation using cross-validation and improve model performance by hyperparameter tuning.

Step 1: Import Libraries and Load Data


In [9]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer   # ✅ added to handle NaN


Step 2: Load Dataset and Prepare Features and Target

In [10]:
# Step 2: Load the data
data = pd.read_csv('/content/preprocessed_earthquake_data.csv')

# Define target and categorical columns
target = 'Status_Reviewed'
categorical_cols = ['Type', 'Magnitude Type', 'Source', 'Status']

# Drop rows where target is NaN
data = data.dropna(subset=[target])

# Features and target
X = data.drop(columns=[target] + categorical_cols)
y = data[target]

# Handle missing values in features
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)


Step 3: Implement Cross-Validation

In [11]:
# Step 3: Define models
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=500, random_state=42)
}

# Perform 10-fold cross-validation
for name, model in models.items():
    print(f"\nCross-validation for {name}:")
    scores = cross_val_score(model, X, y, cv=10, scoring='accuracy')
    print(f"Accuracy Scores for each fold: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Accuracy Variance: {np.var(scores):.6f}")



Cross-validation for RandomForest:
Accuracy Scores for each fold: [0.9987185  1.         1.         1.         1.         1.
 1.         1.         1.         0.97350427]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000063

Cross-validation for LogisticRegression:
Accuracy Scores for each fold: [0.99700982 1.         1.         1.         1.         1.
 1.         1.         1.         0.97521368]
Mean Accuracy: 0.9972
Accuracy Variance: 0.000055


Step 4: Hyperparameter Tuning with GridSearchCV

In [12]:
# Step 4: Hyperparameter Tuning

# Parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # solver supporting l1 penalty
}

# GridSearchCV for each model with stratified 5-fold CV
grid_searches = {
    'RandomForest': GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy'),
    'LogisticRegression': GridSearchCV(LogisticRegression(max_iter=500, random_state=42), param_grid_lr, cv=5, scoring='accuracy')
}

# Fit grid search and find best params and scores
for name, gs in grid_searches.items():
    gs.fit(X, y)
    print(f"\nBest parameters for {name}: {gs.best_params_}")
    print(f"Best cross-validation accuracy for {name}: {gs.best_score_:.4f}")



Best parameters for RandomForest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validation accuracy for RandomForest: 0.9974

Best parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy for LogisticRegression: 0.9976


Step 5: Evaluate Best Model on Full Dataset

In [14]:
# Step 5: Evaluate Best Model on Full Dataset

# Collect the best models from grid searches
best_models = {name: gs.best_estimator_ for name, gs in grid_searches.items()}

# Train each best model on the full dataset and report performance
for name, model in best_models.items():
    model.fit(X, y)
    acc = model.score(X, y)
    print(f"\n{name} trained on full dataset:")
    print(f"Final Accuracy on entire dataset: {acc:.4f}")



RandomForest trained on full dataset:
Final Accuracy on entire dataset: 1.0000

LogisticRegression trained on full dataset:
Final Accuracy on entire dataset: 0.9997
