In [241]:
import pandas as pd
import numpy as np

## Load the Dataset

In [242]:
from dotenv import load_dotenv
import os
from pathlib import Path
import pickle

# Load environment variables (DATASET_DIR is expected to come from the .env file)
load_dotenv()

# Get dataset directory.
# An empty value is treated the same as a missing one: Path("") would silently
# resolve to the current working directory and read the wrong file (or fail
# later with a less helpful error).
dataset_dir = os.getenv("DATASET_DIR")
if not dataset_dir:
    raise ValueError("DATASET_DIR not found in .env file")

# Use pathlib for reliable path handling
data_file = Path(dataset_dir) / "3_earthquake_1995-2023.csv"

# Read the CSV and drop the unnecessary 'title' column in one non-mutating chain,
# so `df` is always built fresh from the file and never modified in place.
df = pd.read_csv(data_file).drop(columns=['title'])

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Print the frame summary (column dtypes and non-null counts)
df.info()

In [None]:
# Count missing values per column
df.isna().sum(axis=0)

## EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (30 bins) of the target column 'cdi' with a KDE overlay,
# drawn on an explicitly created axes object.
fig, ax = plt.subplots()
sns.histplot(df['cdi'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of CDI')
plt.show()

## Preprocessing

In [246]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [247]:
# Separate the predictors from the target column 'cdi'
X = df.drop(columns=['cdi'])
y = df['cdi']

# Identify categorical and numerical columns.
# 'number' selects every numeric dtype (int32, float32, nullable integers, ...)
# rather than only int64/float64, so no numeric column is silently left unscaled.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include='number').columns

# Fitted preprocessing objects, collected in one dict so they can be persisted together:
#   'label_encoders' -> one LabelEncoder per categorical column, keyed by column name
#   'scaler'         -> StandardScaler fitted on the numerical columns
#   'target_encoder' -> LabelEncoder fitted on the target
transformers = {
    'label_encoders': {},
    'scaler': StandardScaler(),
    'target_encoder': LabelEncoder()
}

# NOTE(review): every encoder/scaler below is fitted on the full dataset, while
# the train/test split only happens in a later cell. Test rows therefore
# contribute to the fitted statistics; consider fitting on the training
# partition only.

# Work on a copy so the raw feature frame X stays untouched
X_transformed = X.copy()

# Transform categorical features: integer-encode each column with its own encoder
for cat_col in categorical_features:
    le = LabelEncoder()
    X_transformed[cat_col] = le.fit_transform(X[cat_col])
    transformers['label_encoders'][cat_col] = le

# Transform numerical features (skipped when there are none, because the scaler
# cannot be fitted on an empty column selection)
if len(numerical_features) > 0:
    X_transformed[numerical_features] = transformers['scaler'].fit_transform(X[numerical_features])

# Transform target variable into consecutive integer class labels
y_transformed = transformers['target_encoder'].fit_transform(y)

### Splitting the dataset

In [None]:
# Hold out 20% of the rows as a test set, stratified on the encoded target,
# with a fixed seed so the partition is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_transformed)
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y_transformed, **split_options
)

In [None]:
# Oversample the training partition with SMOTE (the test partition is not resampled)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

# Keep the fitted sampler alongside the other preprocessing objects
transformers.update(smote=smote)

### Saving files

In [None]:
# Output directory "Models" under the current working directory (created if absent)
models_dir = Path.cwd().joinpath("Models")
models_dir.mkdir(exist_ok=True)

# Column names of each feature group, stored as plain lists
feature_info = {
    'categorical_features': categorical_features.tolist(),
    'numerical_features': numerical_features.tolist()
}

# Pickle the fitted transformers and the feature information side by side
for pickle_name, payload in (
    ('transformers.pkl', transformers),
    ('feature_info.pkl', feature_info),
):
    with (models_dir / pickle_name).open('wb') as f:
        pickle.dump(payload, f)

## Feature selection

In [250]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif

In [None]:
# Number of features each selection strategy keeps
N_FEATURES_TO_SELECT = 5


def seeded_mutual_info(features, target):
    """Mutual-information scores computed with a fixed random seed.

    mutual_info_classif is randomized unless random_state is set, so calling it
    unseeded can make SelectKBest choose different features from run to run.
    """
    return mutual_info_classif(features, target, random_state=42)


# Initialize RFC for feature selection
feature_selector_rf = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=feature_selector_rf, n_features_to_select=N_FEATURES_TO_SELECT)
kbest = SelectKBest(score_func=seeded_mutual_info, k=N_FEATURES_TO_SELECT)

# Fit feature selectors on the resampled training data
rfe.fit(X_train_resampled, y_train_resampled)
kbest.fit(X_train_resampled, y_train_resampled)

# Column names are taken from the same frame the selectors were fitted on,
# so the boolean support masks line up with it by construction.
feature_names = X_train_resampled.columns

# Get selected features
selected_features_rfe = feature_names[rfe.support_]
selected_features_kbest = feature_names[kbest.get_support()]

# Initialize RFC for feature importance and rank features by impurity importance
rf_importance = RandomForestClassifier(n_estimators=100, random_state=42)
rf_importance.fit(X_train_resampled, y_train_resampled)
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': rf_importance.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Top-ranked features according to the random forest
selected_features_rf = feature_importance.head(N_FEATURES_TO_SELECT)['Feature']

print("\nSelected Features:")
print(f"RFE: {list(selected_features_rfe)}")
print(f"SelectKBest: {list(selected_features_kbest)}")
print(f"Random Forest Importance: {list(selected_features_rf)}")

In [None]:
# The random-forest importance ranking is the one used for the final feature set
selected_features = selected_features_rf.tolist()

In [None]:
# Persist the chosen feature names next to the other artifacts in models_dir
selected_features_path = models_dir / 'selected_features.pkl'
with selected_features_path.open('wb') as f:
    pickle.dump(selected_features, f)

### Random Forest Classifier (RFC)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Restrict both partitions to the selected feature columns
X_train_final = X_train_resampled.loc[:, selected_features]
X_test_final = X_test.loc[:, selected_features]

# Hyperparameter grid explored for the random forest
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


## Training the model

In [253]:
# Base estimator whose hyperparameters are tuned
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid: 5-fold CV, all cores, scored by accuracy
search_settings = {
    'cv': 5,
    'n_jobs': -1,
    'scoring': 'accuracy',
    'verbose': 1,
}
grid_search = GridSearchCV(rfc, param_grid, **search_settings)

# NOTE(review): the folds are drawn from data that was already oversampled with
# SMOTE, so the cross-validation score is presumably optimistic — confirm
# against the held-out test accuracy.
grid_search.fit(X_train_final, y_train_resampled)

# Report the winning configuration and its mean cross-validation score
print(f"\nBest Parameters: {grid_search.best_params_}")
print(f"Best Cross-validation Score: {grid_search.best_score_:.4f}")

In [None]:
# Best estimator found by the grid search
best_rf = grid_search.best_estimator_

# Predict on the held-out test partition
y_pred_rf = best_rf.predict(X_test_final)

# Compute the per-class report and overall accuracy, then print both
report_text = classification_report(y_test, y_pred_rf)
test_accuracy = accuracy_score(y_test, y_pred_rf)

print("\nClassification Report (Random Forest):")
print(report_text)
print(f"Random Forest Test Accuracy: {test_accuracy:.4f}")


In [None]:
# Pickle the tuned random forest into models_dir
model_path = models_dir / 'cdi_model.pkl'
with model_path.open('wb') as f:
    pickle.dump(best_rf, f)

print(f"\nBest Random Forest model saved in {models_dir}")