# Import Libraries

In [7]:
import sys
import sqlite3
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import f1_score


np.random.seed(42)

# Data Loading & Preprocessing

##### 1. Load the data

In [8]:
DATABASE_PATH = "../database/database.db"
conn = sqlite3.connect(DATABASE_PATH)
data = pd.read_sql(sql='SELECT * FROM data', con=conn)
conn.close()

In [None]:
di = "SPEI"
di_scale = 1

SPEI_LABELS = ['ED', 'SD', 'MD', 'NN', 'MW', 'VW', 'EW']
SPEI_BINS = [-10, -2, -1.5, -1, 1, 1.5, 2, 10]

selected_columns = [
    'region_id',
    'region_name',
    'station_name',
    'station_id',
    'lat',
    'lon',
    'station_elevation',
    'date',
    'rrr24',
    'Hargreaves',
    f'{di}_{di_scale}',
    'GPM',
    'MOD16A2GF',
    'NDVI',
    'EVI',
    'LST_Day',
    'LST_Night',
    'LST',
    'PCI_GPM',
    'VCI',
    'TCI',
    'TCI_Day',
    'TCI_Night',
    'VHI',
    'VHI_Day',
    'VHI_Night',
    'CI_GPM',
    f'GPM_{di}_{di_scale}',
]

selected_stations = [
    'Ramsar',
    'Nowshahr',
    'Siahbisheh',
    'Kiyasar',
    'Kojur',
    'Baladeh',
    'Alasht',
    'Babolsar',
    'Gharakhil',
    'Sari',
    'Sari (dasht-e-naz airport)',
    'Galugah',
    'Bandar-e-amirabad',
    'Amol',
    'Polsefid'
]

# selected_stations_cat1 = ['Ramsar', 'Nowshahr']
# selected_stations_cat2 = ['Siahbisheh', 'Kiyasar', 'Kojur', 'Baladeh', 'Alasht']
# selected_stations_cat3 = ['Babolsar', 'Gharakhil', 'Sari', 'Sari (dasht-e-naz airport)', 'Galugah', 'Bandar-e-amirabad', 'Amol', 'Polsefid']

start_date = '2006-09'
end_date = '2025-09'

df = data\
    .filter(items=selected_columns)\
        .query("station_name in @selected_stations and date >= @start_date and date < @end_date")

df[f'{di}_{di_scale}_Class'] = pd.cut(df[f'{di}_{di_scale}'], bins=SPEI_BINS, labels=SPEI_LABELS)
df[f'{di}_{di_scale}_Class'] = df[f'{di}_{di_scale}_Class'].astype('category')

df['date'] = pd.to_datetime(df['date'], format='%Y-%m')
df['month_sin'] = np.sin(2 * np.pi * df['date'].dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df['date'].dt.month / 12)
        
df['LST_Diff'] = df['LST_Day'] - df['LST_Night']

df

Unnamed: 0,region_id,region_name,station_name,station_id,lat,lon,station_elevation,date,rrr24,Hargreaves,...,TCI_Night,VHI,VHI_Day,VHI_Night,CI_GPM,GPM_SPEI_1,SPEI_1_Class,month_sin,month_cos,LST_Diff
320,MASA,Mazandaran,Ramsar,40732,36.90,50.68,-20.0,2006-09-01,274.40,88.95,...,0.492,0.623,0.649,0.578,0.712,1.062,MW,-1.000000e+00,-1.836970e-16,0.180
321,MASA,Mazandaran,Ramsar,40732,36.90,50.68,-20.0,2006-10-01,285.52,63.69,...,0.071,0.503,0.560,0.472,0.484,-0.025,NN,-8.660254e-01,5.000000e-01,0.154
322,MASA,Mazandaran,Ramsar,40732,36.90,50.68,-20.0,2006-11-01,193.70,44.86,...,0.477,0.725,0.709,0.738,0.617,-0.122,NN,-5.000000e-01,8.660254e-01,0.162
323,MASA,Mazandaran,Ramsar,40732,36.90,50.68,-20.0,2006-12-01,97.11,29.97,...,1.000,0.940,0.927,0.949,0.835,1.016,NN,-2.449294e-16,1.000000e+00,0.156
324,MASA,Mazandaran,Ramsar,40732,36.90,50.68,-20.0,2007-01-01,64.52,38.24,...,0.593,0.676,0.623,0.797,0.493,-1.678,NN,5.000000e-01,8.660254e-01,0.190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8229,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0,2025-04-01,21.40,102.41,...,0.255,0.336,0.394,0.416,0.274,,MD,8.660254e-01,-5.000000e-01,0.362
8230,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0,2025-05-01,22.20,133.10,...,0.091,0.564,0.707,0.448,0.376,,SD,5.000000e-01,-8.660254e-01,0.311
8231,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0,2025-06-01,54.32,118.76,...,0.592,0.916,0.947,0.796,0.637,,NN,1.224647e-16,-1.000000e+00,0.337
8232,MASA,Mazandaran,Alasht,99361,36.07,52.84,1805.0,2025-07-01,19.20,135.82,...,0.428,0.537,0.534,0.494,0.397,,MD,-5.000000e-01,-8.660254e-01,0.362


#### 2. Identify feature columns (exclude the target and any non-predictive ID columns) 

In [10]:
target_col = 'SPEI_1_Class'
id_cols = ['region_id', 'region_name', 'station_name', 'station_id', 'date', 'rrr24', 'Hargreaves', 'SPEI_1', 'GPM_SPEI_1_Class']
feature_cols = [col for col in df.columns if col not in id_cols + [target_col]]

#### 3. Impute missing values in features (using median for numeric features)

In [11]:
numeric_cols = df[feature_cols].select_dtypes(include=['int64','float64']).columns
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
print("After imputation, total missing values:", df[feature_cols].isnull().sum().sum())
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

After imputation, total missing values: 0


#### 4. Encode categorical features if any (one-hot encoding for categorical predictors)

In [12]:
categorical_cols = df[feature_cols].select_dtypes(include=['category']).columns
if len(categorical_cols) > 0:
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    new_features = [c for c in df.columns if c not in id_cols + [target_col]]
    feature_cols = new_features

#### 5. Scale/standardize features

In [13]:
scaler = StandardScaler()
df[feature_cols] = scaler.fit_transform(df[feature_cols])
print("Feature means after scaling (should be ~0):", df[feature_cols].mean().round(3).tolist())
print("Feature std devs after scaling (should be 1):", df[feature_cols].std().round(3).tolist())

Feature means after scaling (should be ~0): [0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Feature std devs after scaling (should be 1): [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


# Feature Selection

#### 6. Feature selection using Random Forest importance

In [14]:
X_all = df[feature_cols]
y_all = df[target_col]
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_all, y_all)
n = 8
importances = pd.Series(rf.feature_importances_, index=X_all.columns).sort_values(ascending=False)
top_n_features = importances.iloc[:5].index.tolist()
print(f"Top {n} features by Random Forest importance:", top_n_features)

Top 8 features by Random Forest importance: ['GPM_SPEI_1', 'PCI_GPM', 'GPM', 'CI_GPM', 'TCI']


# Class Imbalance Handling

#### 7. Split the data into training and testing sets (80/20 split, stratified by class)

In [15]:
X = df[top_n_features]
y = df[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Training set size:", X_train.shape[0], " Testing set size:", X_test.shape[0])
print("Class distribution in training:", y_train.value_counts().to_dict())
print("Class distribution in test:", y_test.value_counts().to_dict())


Training set size: 2370  Testing set size: 593
Class distribution in training: {'NN': 1541, 'MD': 258, 'MW': 256, 'SD': 146, 'VW': 101, 'ED': 50, 'EW': 18}
Class distribution in test: {'NN': 385, 'MD': 65, 'MW': 64, 'SD': 37, 'VW': 25, 'ED': 13, 'EW': 4}


#### 8. Apply SMOTE oversampling on the training set to handle class imbalance

In [16]:
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
print("Training set size after SMOTE:", X_train_balanced.shape[0])
print("Class distribution after SMOTE:", pd.Series(y_train_balanced).value_counts().to_dict())

Training set size after SMOTE: 10787
Class distribution after SMOTE: {'ED': 1541, 'SD': 1541, 'MD': 1541, 'NN': 1541, 'MW': 1541, 'VW': 1541, 'EW': 1541}


# Model Architecture (MLP Classifier Initialization)


#### 9. Initialize the MLPClassifier with one hidden layer (100 neurons) as a baseline

In [17]:
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    max_iter=200,
    random_state=42
)

print("Initialized MLP:", mlp)

Initialized MLP: MLPClassifier(random_state=42)


# Hyperparameter Tuning


#### 10. Set up hyperparameter grid for tuning


In [18]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.001, 0.01]
}

base_mlp = MLPClassifier(max_iter=300, random_state=42)

grid_search = GridSearchCV(base_mlp, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_balanced, y_train_balanced)

print("Best hyperparameters:", grid_search.best_params_)

best_model = grid_search.best_estimator_

KeyboardInterrupt: 

# Train/Test Split and Model Training


#### 11. (Re)Train the best model on the full training set (in case GridSearchCV was not used)

In [None]:
# This step is optional because GridSearchCV already refit the best model on the training set.
best_model.fit(X_train_balanced, y_train_balanced)

# Evaluation & Visualization

#### 12. Make predictions on the test set

In [None]:
y_pred = best_model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.3f}")


report = classification_report(y_test, y_pred, target_names=best_model.classes_, digits=3)
print("Classification Report:\n", report)

#### 13. Plot the confusion matrix


In [None]:
cm = confusion_matrix(y_test, y_pred, labels=best_model.classes_)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=best_model.classes_, yticklabels=best_model.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Class")
plt.ylabel("Actual Class")
plt.show()

# Model Saving and Exporting Results


#### 14. Save the trained model to disk


In [None]:
joblib.dump(best_model, "mlp_model.joblib")
print("Model saved as mlp_model.joblib")

#### 15. Save predictions on test set to a CSV for further analysis


In [None]:
results_df = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
results_df.to_csv("test_predictions.csv", index=False)
print("Test predictions saved to test_predictions.csv")

#### 16. Save evaluation metrics to a CSV


In [None]:
metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'macro_F1': f1_score(y_test, y_pred, average='macro'),
    'weighted_F1': f1_score(y_test, y_pred, average='weighted')
}
pd.DataFrame([metrics]).to_csv("evaluation_metrics.csv", index=False)
print("Evaluation metrics saved to evaluation_metrics.csv")