# Import Libraries

In [1]:
import sys
import os
import sqlite3
import warnings
import json

import pandas as pd
import numpy as np

import optuna
import shap
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, ADASYN

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier # Changed from ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFECV


warnings.filterwarnings('ignore')
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


# Load Database

In [2]:
sys.path.append('/home/pooya/w/DroughtMonitoringIran/')

DATABASE_PATH = "./database/database.db"

RESULTS_DIR = 'results_smote'

os.makedirs(RESULTS_DIR, exist_ok=True)

conn = sqlite3.connect(DATABASE_PATH)

data = pd.read_sql(sql='SELECT * FROM data', con=conn)

conn.close()

# Generate Result

In [3]:
def run(di, di_scale, selected_stations, stations_group_name, start_date, end_date):
    
    group_dir = os.path.join(RESULTS_DIR, stations_group_name)
    os.makedirs(group_dir, exist_ok=True)
    
    scale_dir = os.path.join(group_dir, f'{di}_{di_scale}')
    os.makedirs(scale_dir, exist_ok=True)
    
    # Select Columns
    selected_columns = [
        'Station_Name', 'Station_ID',
        'Station_Latitude', 'Station_Longitude', 'Station_Elevation',
        'Date',
        f'{di}_{di_scale}',
        f'GPM_{di}_{di_scale}',
        f'ERA5_{di}_{di_scale}',
        'ERA5_Precipitation',
        'GPM_Precipitation',
        'PET_MOD16A2GF',
        'NDVI', 'EVI',
        'LSTDay', 'LSTNight', 'LST',
        'PCI_GPM', 'PCI_ERA5',
        'VCI', 'TCI', 'VHI',
        'CI_GPM', 'CI_ERA5',
    ]
    
    df = data\
    .filter(items=selected_columns)\
        .query("Station_Name in @selected_stations and Date >= @start_date and Date < @end_date")

    # Date, Year, Month
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m')
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month

    # SPI and SPEI Class
    df[f'{di}_{di_scale}_Class'] = pd.cut(df[f'{di}_{di_scale}'], bins=[-10, -2, -1.5, -1, 1, 1.5, 2, 10], labels=['ED', 'SD', 'MD', 'NN', 'MW', 'VW', 'EW'])

    # Month Sin & Cos
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
            
    # LST Diff
    df['LST_Diff'] = df['LSTDay'] - df['LSTNight']

    # Convert to Category
    df['Station_ID'] = df['Station_ID'].astype('category')
    df['Year'] = df['Year'].astype('category')
    df['Month'] = df['Month'].astype('category')
    df[f'{di}_{di_scale}_Class'] = df[f'{di}_{di_scale}_Class'].astype('category')
    
    df.dropna(inplace=True)
    df.sort_values(by=['Station_ID', 'Year', 'Month'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    selected_columns_lag_roll = [
        'ERA5_Precipitation',
        'GPM_Precipitation',
        f'GPM_{di}_{di_scale}',
        f'ERA5_{di}_{di_scale}',
        'PET_MOD16A2GF',
        'NDVI', 'EVI',
        'LSTDay', 'LSTNight', 'LST',
        'PCI_GPM', 'PCI_ERA5',
        'VCI', 'TCI', 'VHI',
        'CI_GPM', 'CI_ERA5',
    ]

    # Add Lag
    for lag in range(1, 4):
        for col in selected_columns_lag_roll:
            df[f'{col}_lag_{lag}'] = df.groupby('Station_ID', observed=False)[col].shift(lag)


    # # Add Mean and Std Roll
    # for r in [3, 6]:
    #     for col in selected_columns_lag_roll:
    #         df[f'{col}_roll_mean_{r}'] = df.groupby('Station_ID', observed=False)[col].transform(lambda x: x.rolling(window=r, min_periods=1).mean())
    #         df[f'{col}_roll_std_{r}'] = df.groupby('Station_ID', observed=False)[col].transform(lambda x: x.rolling(window=r, min_periods=1).std())
    
    
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    # Define columns to exclude from features
    EXCLUDE_COLS = ['Station_Name', 'Station_ID', 'Date', f'{di}_{di_scale}_Class', f'{di}_{di_scale}', 'Year', 'Month']
    FEATURES = [col for col in df.columns if col not in EXCLUDE_COLS]

    # Station-wise standardization
    df_scaled = df.copy()
    for station in df_scaled['Station_Name'].unique():
        station_mask = df_scaled['Station_Name'] == station
        scaler = StandardScaler()
        df_scaled.loc[station_mask, FEATURES] = scaler.fit_transform(df_scaled.loc[station_mask, FEATURES])


    class_counts = df_scaled[f'{di}_{di_scale}_Class'].value_counts()
    classes_to_remove = class_counts[class_counts < 2].index

    if not classes_to_remove.empty:
        print(f"Removing classes with fewer than 2 samples: {classes_to_remove.tolist()}")
        
        # Filter the dataframe to exclude these rare classes
        df_filtered = df_scaled[~df_scaled[f'{di}_{di_scale}_Class'].isin(classes_to_remove)].copy()
        
        # Redefine X and y from the filtered dataframe
        X = df_filtered[FEATURES]
        y = df_filtered[f'{di}_{di_scale}_Class']
        
        print(f"Data shape after removing rare classes: {X.shape}")
        print("\n--- Final Class Distribution ---")
        print(y.value_counts())
    else:
        print("No classes with fewer than 2 samples found.")
        X = df_scaled[FEATURES]
        y = df_scaled[f'{di}_{di_scale}_Class']

    # Encode the target variable
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_encoded,
        test_size=0.30,
        random_state=42,
        stratify=y_encoded
    )

    print(f"Target Classes: {le.classes_}")
    print(f"Training set shape: {X_train.shape}")
    print(f"Testing set shape: {X_test.shape}")


    # Check class distribution before SMOTE
    unique, counts = np.unique(y_train, return_counts=True)
    print(f"Class distribution before SMOTE: {dict(zip(unique, counts))}")
    
    # Apply SMOTE to balance the training data
    smote = BorderlineSMOTE(random_state=42)
    # smote = ADASYN(random_state=42)
    
    try:
        X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
        
        # Check class distribution after SMOTE
        unique_smote, counts_smote = np.unique(y_train_smote, return_counts=True)
        print(f"Class distribution after SMOTE: {dict(zip(unique_smote, counts_smote))}")
        print(f"Training set shape after SMOTE: {X_train_smote.shape}")
        
    except ValueError as e:
        print(f"SMOTE failed: {e}")
        print("Using original training data without SMOTE")
        X_train_smote, y_train_smote = X_train, y_train


    # The estimator that will be used by RFECV
    estimator = RandomForestClassifier(
        random_state=42,
        n_jobs=-1
    )

    # The RFECV object
    rfecv = RFECV(
        estimator=estimator,
        step=1,
        cv=StratifiedKFold(5),
        scoring='f1_macro',
        n_jobs=-1
    )

    # Fit RFECV on the training data
    rfecv.fit(X_train_smote, y_train_smote)

    print(f"Optimal number of features found: {rfecv.n_features_}")

    # Get the final selected features
    final_features = X_train_smote.columns[rfecv.support_]
    print(f"Selected features: {final_features.tolist()}")
    
    features_path = os.path.join(scale_dir, 'selected_features.txt')
    with open(features_path, 'w') as f:
        for feature in final_features:
            f.write(f"{feature}\n")
    print(f"Selected features saved to {features_path}")

    # --- Plot Feature Importances ---
    # The `rfecv.estimator_` attribute is the model trained on the full set of features
    # during the last step of the cross-validation process. We can use its feature importances.
    importances = rfecv.estimator_.feature_importances_
    feature_importance_df = pd.DataFrame({
        'Feature': final_features,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Plot only the selected features
    selected_feature_importances = feature_importance_df[feature_importance_df['Feature'].isin(final_features)]

    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=selected_feature_importances)
    plt.title('Feature Importances for Selected Features (from RFECV)')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.tight_layout()
    fig_path = os.path.join(scale_dir, 'feature_importances.png')
    plt.savefig(fig_path)
    plt.close()


    # Update training and testing sets with selected features
    X_train_final = X_train_smote[final_features]
    X_test_final = X_test[final_features]
    y_train_final = y_train_smote
    
    
    def objective(trial):
        """Define the objective function for Optuna to optimize."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_float('max_features', 0.1, 1.0),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'random_state': 42,
            'n_jobs': -1
        }
        
        model = RandomForestClassifier(**params)
        
        # Stratified K-Fold Cross-Validation
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        f1_scores = []
        
        for train_idx, val_idx in skf.split(X_train_final, y_train_final):
            X_train_fold, X_val_fold = X_train_final.iloc[train_idx], X_train_final.iloc[val_idx]
            y_train_fold, y_val_fold = y_train_final[train_idx], y_train_final[val_idx]
            
            model.fit(X_train_fold, y_train_fold)
            preds = model.predict(X_val_fold)
            f1_scores.append(f1_score(y_val_fold, preds, average='macro'))
            
        return np.mean(f1_scores)

    # Create a study object and optimize
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50, timeout=300) # 50 trials or 5 minutes

    # Save best parameters
    best_params = study.best_params 
    params_path = os.path.join(scale_dir, 'best_params.json')
    with open(params_path, 'w') as f:
        json.dump(best_params, f, indent=4)
    print(f"Best parameters saved to {params_path}")

    print(f"Best trial found: {study.best_trial.number}")
    print(f"Best Macro-F1 Score: {study.best_value}")
    print("Best Hyperparameters:")
    print(study.best_params)



    # Get best hyperparameters
    best_params = study.best_params
    best_params['random_state'] = 42
    best_params['n_jobs'] = -1

    # Train the final model with SMOTE data
    final_model = RandomForestClassifier(**best_params)
    final_model.fit(X_train_final, y_train_final)

    # Make predictions on the test set (original test set, not SMOTE)
    y_pred = final_model.predict(X_test_final)

    # Save evaluation metrics
    report = classification_report(y_test, y_pred, target_names=le.classes_)
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    kappa = cohen_kappa_score(y_test, y_pred)
    
    metrics_path = os.path.join(scale_dir, 'evaluation_metrics.txt')
    with open(metrics_path, 'w') as f:
        f.write(f"Overall Accuracy: {accuracy:.4f}\n")
        f.write(f"Macro-F1 Score: {macro_f1:.4f}\n")
        f.write(f"Cohen's Kappa: {kappa:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(report)
    print(f"Evaluation metrics saved to {metrics_path}")

    print(f"Overall Accuracy: {accuracy:.4f}")
    print(f"Macro-F1 Score: {macro_f1:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))


    # --- Plot Confusion Matrix ---
    print("\n--- Plotting Confusion Matrix ---")
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    fig_path = os.path.join(scale_dir, 'confusion_matrix.png')
    plt.savefig(fig_path)
    plt.close()

    
    return None



# for di in ['SPI', 'SPEI']:
for di in ['SPI']:
    # for di_scale in [1, 3, 6, 9, 12, 18, 24]:
    for di_scale in [12]:
        # for stations_group_name in ['ALL', 'C1', 'C2', 'C3']: 
        for stations_group_name in ['C2']: 

            if stations_group_name == 'ALL':
                selected_stations = ['Ramsar', 'Nowshahr', 'Siahbisheh', 'Kiyasar', 'Kojur', 'Baladeh', 'Alasht', 'Babolsar', 'Gharakhil', 'Sari', 'Sari (dasht-e-naz airport)', 'Galugah', 'Bandar-e-amirabad', 'Amol', 'Polsefid']
            elif stations_group_name == 'C3':
                selected_stations = ['Ramsar', 'Nowshahr']
            elif stations_group_name == 'C2':
                selected_stations = ['Siahbisheh', 'Kiyasar', 'Kojur', 'Baladeh', 'Alasht']
            elif stations_group_name == 'C1':
                selected_stations = ['Babolsar', 'Gharakhil', 'Sari', 'Sari (dasht-e-naz airport)', 'Galugah', 'Bandar-e-amirabad', 'Amol', 'Polsefid']
            else:
                selected_stations = ['Ramsar', 'Nowshahr', 'Siahbisheh', 'Kiyasar', 'Kojur', 'Baladeh', 'Alasht', 'Babolsar', 'Gharakhil', 'Sari', 'Sari (dasht-e-naz airport)', 'Galugah', 'Bandar-e-amirabad', 'Amol', 'Polsefid']

            ex_dir = os.path.join(RESULTS_DIR, stations_group_name, f'{di}_{di_scale}')
            if os.path.exists(ex_dir):
                continue 
            
            run(
                di=di,
                di_scale=di_scale,
                selected_stations=selected_stations,
                stations_group_name=stations_group_name,
                start_date='2006-09', 
                end_date='2023-10'
            )

No classes with fewer than 2 samples found.
Target Classes: ['ED' 'EW' 'MD' 'MW' 'NN' 'SD' 'VW']
Training set shape: (668, 74)
Testing set shape: (287, 74)
Class distribution before SMOTE: {np.int64(0): np.int64(25), np.int64(1): np.int64(15), np.int64(2): np.int64(37), np.int64(3): np.int64(57), np.int64(4): np.int64(484), np.int64(5): np.int64(20), np.int64(6): np.int64(30)}
Class distribution after SMOTE: {np.int64(0): np.int64(484), np.int64(1): np.int64(484), np.int64(2): np.int64(484), np.int64(3): np.int64(484), np.int64(4): np.int64(484), np.int64(5): np.int64(484), np.int64(6): np.int64(484)}
Training set shape after SMOTE: (3388, 74)
Optimal number of features found: 73
Selected features: ['Station_Latitude', 'Station_Longitude', 'Station_Elevation', 'GPM_SPI_12', 'ERA5_SPI_12', 'ERA5_Precipitation', 'GPM_Precipitation', 'PET_MOD16A2GF', 'NDVI', 'EVI', 'LSTDay', 'LSTNight', 'LST', 'PCI_GPM', 'PCI_ERA5', 'VCI', 'TCI', 'VHI', 'CI_GPM', 'CI_ERA5', 'Month_sin', 'Month_cos', 'LST_

[I 2025-08-24 21:23:44,664] A new study created in memory with name: no-name-c63604c2-fdfc-4bc8-9da4-2bbd614378ee
[I 2025-08-24 21:23:47,535] Trial 0 finished with value: 0.9015341080476352 and parameters: {'n_estimators': 170, 'max_depth': 24, 'min_samples_split': 19, 'min_samples_leaf': 18, 'max_features': 0.16464594939478006, 'criterion': 'gini'}. Best is trial 0 with value: 0.9015341080476352.
[I 2025-08-24 21:24:06,875] Trial 1 finished with value: 0.9473496667695468 and parameters: {'n_estimators': 272, 'max_depth': 34, 'min_samples_split': 19, 'min_samples_leaf': 1, 'max_features': 0.7024607231096078, 'criterion': 'gini'}. Best is trial 1 with value: 0.9473496667695468.
[I 2025-08-24 21:24:26,951] Trial 2 finished with value: 0.944890510806671 and parameters: {'n_estimators': 364, 'max_depth': 48, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 0.815085587241152, 'criterion': 'gini'}. Best is trial 1 with value: 0.9473496667695468.
[I 2025-08-24 21:24:32,409] Tria

Best parameters saved to results_smote/C2/SPI_12/best_params.json
Best trial found: 31
Best Macro-F1 Score: 0.968794571612567
Best Hyperparameters:
{'n_estimators': 318, 'max_depth': 46, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 0.32084110904643454, 'criterion': 'gini'}
Evaluation metrics saved to results_smote/C2/SPI_12/evaluation_metrics.txt
Overall Accuracy: 0.7247
Macro-F1 Score: 0.4660
Cohen's Kappa: 0.3528

Classification Report:
              precision    recall  f1-score   support

          ED       0.70      0.64      0.67        11
          EW       1.00      0.67      0.80         6
          MD       0.11      0.06      0.08        16
          MW       0.18      0.12      0.14        25
          NN       0.83      0.89      0.86       208
          SD       0.33      0.44      0.38         9
          VW       0.33      0.33      0.33        12

    accuracy                           0.72       287
   macro avg       0.50      0.45      0.47       2