In [1]:
import os
import sys
import sqlite3

import numpy as np
import pandas as pd
import geopandas as gp
import skill_metrics as sm

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import scipy. stats as scs
import xgboost as xgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score, f1_score, confusion_matrix

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

2025-01-17 14:14:13.054377: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 14:14:13.062164: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-17 14:14:13.083356: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737110653.117209   32921 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737110653.127132   32921 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Project root appended to the import path (presumably so project-local
# modules can be imported -- none are imported in the visible cells).
# The DROUGHT_PROJECT_ROOT environment variable overrides the location; the
# original machine-specific path stays as the default for backward compatibility.
PROJECT_ROOT = os.environ.get('DROUGHT_PROJECT_ROOT', '/home/pooya/w/DroughtMonitoringIran/')

# Re-running this cell must not pile up duplicate sys.path entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# SQLite database file, resolved relative to the current working directory.
DATABASE_PATH = "./database/database.db"

### Monthly Data

In [3]:
# Load the complete `data` table into a DataFrame.
# try/finally guarantees the connection is closed even when the query raises
# (for example if the table does not exist in the database file).
conn = sqlite3.connect(DATABASE_PATH)
try:
    data = pd.read_sql(sql='SELECT * FROM data', con=conn)
finally:
    conn.close()

### Select Columns

In [4]:
# Show every column name available in the loaded table.
list(data.columns)

['Station_ID',
 'Station_Name',
 'Province',
 'Station_Latitude',
 'Station_Longitude',
 'Station_Elevation',
 'Date',
 'Temperature_Maximum',
 'Temperature_Minimum',
 'Temperature',
 'Precipitation',
 'PET_Hargreaves',
 'SPI_1',
 'SPEI_1',
 'SPI_3',
 'SPEI_3',
 'SPI_6',
 'SPEI_6',
 'SPI_9',
 'SPEI_9',
 'SPI_12',
 'SPEI_12',
 'SPI_15',
 'SPEI_15',
 'SPI_18',
 'SPEI_18',
 'SPI_21',
 'SPEI_21',
 'SPI_24',
 'SPEI_24',
 'ERA5_Precipitation',
 'GPM_Precipitation',
 'TRMM_Precipitation',
 'TERRACLIMATE_Precipitation',
 'PERSIANNCDR_Precipitation',
 'PET_MOD16A2GF',
 'NDVI_MOD13A3',
 'LSTDay_MOD21C3',
 'LSTNight_MYD21C3',
 'EVI_MYD13A3',
 'LSTNight_MOD21C3',
 'NDVI_MYD13A3',
 'LSTDay_MYD21C3',
 'EVI_MOD13A3',
 'NDVI',
 'EVI',
 'LSTDay',
 'LSTNight',
 'LST',
 'PCI_ERA5',
 'PCI_GPM',
 'PCI_TRMM',
 'PCI_TerraClimate',
 'PCI_PERSIANNCDR',
 'VCI',
 'TCI',
 'VHI',
 'CI_GPM',
 'CI_ERA5',
 'CI_TRMM',
 'CI_TerraClimate',
 'CI_PERSIANNCDR',
 'ERA5_SPI_1',
 'ERA5_SPEI_1',
 'GPM_SPI_1',
 'GPM_SPEI_1',
 '

In [9]:
# Drought index family and time scale used to build the target column name.
di = 'SPI'
di_scale = 1

# Columns pulled from `data`, grouped by role. 'Year' and 'Month' are not
# listed because a later cell derives them from 'Date'.
selected_columns = (
    ['Station_ID']
    + ['Station_Latitude', 'Station_Longitude', 'Station_Elevation']
    + ['Date']
    + [f'{di}_{di_scale}']                      # station-based index (target)
    + ['GPM_Precipitation', 'PET_MOD16A2GF']
    + ['NDVI', 'EVI']
    + ['LSTDay', 'LSTNight', 'LST']
    + ['PCI_GPM', 'VCI', 'TCI', 'VHI', 'CI_GPM']
    + [f'GPM_{di}_{di_scale}']                  # GPM-based counterpart of the index
)

### Define Dataset

In [10]:
# Class boundaries and labels shared by the station and GPM index columns.
# pd.cut builds right-closed intervals by default, e.g. (-2, -1.5] -> 'SD'.
DI_CLASS_BINS = [-10, -2, -1.5, -1, 1, 1.5, 2, 10]
DI_CLASS_LABELS = ['ED', 'SD', 'MD', 'NN', 'MW', 'VW', 'EW']

# DataFrame.filter(items=...) silently ignores unknown names, so fail loudly
# instead of continuing with a reduced feature set.
missing_columns = [col for col in selected_columns if col not in data.columns]
if missing_columns:
    raise KeyError(f"Columns missing from `data`: {missing_columns}")

# 'Date' is stored as 'YYYY-MM'; it is only needed to derive Year and Month.
dates = pd.to_datetime(data['Date'], format='%Y-%m')

# Build the modelling frame in one chain (no in-place mutation), so re-running
# the cell always starts again from the untouched `data` frame.
df = (
    data
    .filter(items=selected_columns)
    .assign(
        Station_ID=lambda frame: frame['Station_ID'].astype('category'),
        Year=dates.dt.year,
        Month=dates.dt.month,
    )
    .drop(columns=['Date'])
    .dropna()
    .sort_values(by=['Station_ID', 'Year', 'Month'])
    .reset_index(drop=True)
)

# Categorical class for the station index and for its GPM counterpart.
for index_column in (f'{di}_{di_scale}', f'GPM_{di}_{di_scale}'):
    df[f'{index_column}_Class'] = pd.cut(
        df[index_column],
        bins=DI_CLASS_BINS,
        labels=DI_CLASS_LABELS,
    )

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only added a stray "None" line to the output.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4992 entries, 0 to 4991
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   Station_ID         4992 non-null   category
 1   Station_Latitude   4992 non-null   float64 
 2   Station_Longitude  4992 non-null   float64 
 3   Station_Elevation  4992 non-null   float64 
 4   SPI_1              4992 non-null   float64 
 5   GPM_Precipitation  4992 non-null   float64 
 6   PET_MOD16A2GF      4992 non-null   float64 
 7   NDVI               4992 non-null   float64 
 8   EVI                4992 non-null   float64 
 9   LSTDay             4992 non-null   float64 
 10  LSTNight           4992 non-null   float64 
 11  LST                4992 non-null   float64 
 12  PCI_GPM            4992 non-null   float64 
 13  VCI                4992 non-null   float64 
 14  TCI                4992 non-null   float64 
 15  VHI                4992 non-null   float64 
 16  CI_GPM

Unnamed: 0,Station_ID,Station_Latitude,Station_Longitude,Station_Elevation,SPI_1,GPM_Precipitation,PET_MOD16A2GF,NDVI,EVI,LSTDay,...,PCI_GPM,VCI,TCI,VHI,CI_GPM,GPM_SPI_1,Year,Month,SPI_1_Class,GPM_SPI_1_Class
0,40709,38.365,48.855,-21.1,0.577,115.920,125.725,0.510,0.308,27.62,...,0.456,0.271,0.562,0.416,0.430,0.308,2006,9,NN,NN
1,40709,38.365,48.855,-21.1,0.182,164.424,80.775,0.599,0.318,24.01,...,0.492,0.832,0.393,0.613,0.573,0.658,2006,10,NN,NN
2,40709,38.365,48.855,-21.1,-0.405,101.520,44.000,0.554,0.286,17.30,...,0.411,0.651,0.377,0.514,0.480,-0.171,2006,11,NN,NN
3,40709,38.365,48.855,-21.1,-0.303,77.376,35.025,0.462,0.236,10.15,...,0.522,0.460,0.916,0.688,0.633,-0.206,2006,12,NN,NN
4,40709,38.365,48.855,-21.1,-1.251,26.040,60.400,0.418,0.195,9.84,...,0.000,0.486,0.213,0.350,0.233,-1.997,2007,1,MD,SD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4987,99361,36.071,52.843,1805.0,-0.839,20.088,239.275,0.420,0.219,34.26,...,0.222,0.537,0.274,0.406,0.345,0.763,2023,8,NN,NN
4988,99361,36.071,52.843,1805.0,0.694,35.280,176.650,0.380,0.175,26.76,...,0.455,0.375,0.948,0.662,0.593,-0.142,2023,9,NN,NN
4989,99361,36.071,52.843,1805.0,0.585,64.728,123.012,0.394,0.185,22.45,...,0.522,0.445,0.699,0.572,0.555,0.673,2023,10,NN,NN
4990,99361,36.071,52.843,1805.0,0.574,46.080,98.800,0.395,0.187,17.29,...,0.331,0.706,0.000,0.353,0.346,-0.580,2023,11,NN,NN


### X, y

In [11]:
# Name of the station-based index column that serves as regression target.
target_column = f'{di}_{di_scale}'

# Everything that must not be fed to the model: the station identifier, the
# target itself, the GPM version of the index, and both derived class columns.
non_feature_columns = [
    'Station_ID',
    target_column,
    f'{target_column}_Class',
    f'GPM_{target_column}',
    f'GPM_{target_column}_Class',
]

X = df.drop(columns=non_feature_columns)
y = df[target_column]

###  Scale the Features

In [8]:
# Standardise all feature columns of X.
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test split; the Random Forest section below builds its own scaler
# from the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Random Forest

### Split the data into training and test sets

In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Step 1: Apply Scaling to the features

In [13]:
# Scaling statistics are learned from the training rows only and then applied
# unchanged to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Step 2: Feature selection using Random Forest

In [14]:
# Baseline forest used to rank the scaled training features.
rf = RandomForestRegressor(random_state=1)
rf.fit(X_train_scaled, y_train)

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # positions ordered from most to least important

print("Feature Importance:")
for idx in indices:
    print(f"{X_train.columns[idx]}: {importances[idx]}")

# Selector configured with the mean importance as threshold and a cap of
# five features.
sfm = SelectFromModel(estimator=rf, threshold="mean", max_features=5)
sfm.fit(X_train_scaled, y_train)

# Apply the same column selection to both splits.
X_train_selected, X_test_selected = (
    sfm.transform(split) for split in (X_train_scaled, X_test_scaled)
)

Feature Importance:
PCI_GPM: 0.3973645607549658
PET_MOD16A2GF: 0.0664966355082667
GPM_Precipitation: 0.05373104417039949
LSTDay: 0.053149753891575904
TCI: 0.0479037920724396
Year: 0.0438243029061867
LSTNight: 0.03952892537620254
LST: 0.03944846770432121
NDVI: 0.03679412625626445
CI_GPM: 0.036440212478689074
VCI: 0.032327357674628555
VHI: 0.03127930693279001
EVI: 0.031260815366401726
Station_Longitude: 0.024699963477739954
Station_Elevation: 0.024398964691129902
Station_Latitude: 0.021156187224782706
Month: 0.020195583513215723


### Step 3: Hyperparameter tuning for Random Forest

In [15]:
# Candidate hyperparameter values explored exhaustively by the grid search.
rf_param_grid = dict(
    n_estimators=[100, 200, 400],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 5-fold cross-validated search scored by negative mean squared error.
rf_grid_search = GridSearchCV(
    RandomForestRegressor(random_state=1),
    rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
rf_grid_search.fit(X_train_selected, y_train)

print("Best Random Forest Hyperparameters:", rf_grid_search.best_params_)

Best Random Forest Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 400}


### Step 4: Evaluate the tuned model

In [16]:
# Score the best estimator found by the grid search on the held-out rows.
rf_best_model = rf_grid_search.best_estimator_
rf_preds = rf_best_model.predict(X_test_selected)

rf_mse = mean_squared_error(y_test, rf_preds)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_preds)

print(
    f"Random Forest RMSE: {rf_rmse}",
    f"Random Forest R²: {rf_r2}",
    sep="\n",
)

Random Forest RMSE: 1.2510871883478212
Random Forest R²: 0.2658055236939467


# AdaBoost Regressor

### Step 1: Data Preprocessing and Model Training

In [None]:
# Split first, then scale: fitting the scaler on all rows before the split
# would leak test-set statistics into training. The Random Forest section
# follows the same split-then-scale order.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.2,
    random_state=1
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Decision tree with library-default settings used as the boosted weak learner.
base_estimator = DecisionTreeRegressor()

adaboost_model = AdaBoostRegressor(
    estimator=base_estimator,
    n_estimators=50,
    random_state=1
)

adaboost_model.fit(X_train, y_train)

### Get Feature Importances

In [None]:
feature_importances = adaboost_model.feature_importances_

# Pair each importance with its feature name. The names are taken from
# X.columns (the model was trained on X's columns in this order) instead of a
# hand-maintained list that would silently mislabel features if the selected
# columns ever change. A length mismatch raises a ValueError here.
feature_importance_df = pd.DataFrame({
    'Feature': X.columns.to_list(),
    'Importance': feature_importances
})

# Sort features by importance, most important first
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the sorted feature importances
print(feature_importance_df)

### Select the Top Features

In [40]:
# Select top N features (e.g., top 5) from the importance ranking
top_n = 5
top_features = feature_importance_df['Feature'].head(top_n).values

# Restrict the dataset to the top features
X_top = df[top_features].values

# Split BEFORE scaling so the scaler never sees the test rows.
X_train_top_raw, X_test_top_raw, y_train, y_test = train_test_split(
    X_top, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train_top = scaler.fit_transform(X_train_top_raw)
X_test_top = scaler.transform(X_test_top_raw)


###  Retrain AdaBoost with Selected Features

In [None]:
# Second AdaBoost ensemble, fitted on the reduced (top-feature) training set.
adaboost_model_top = AdaBoostRegressor(
    estimator=base_estimator,
    n_estimators=50,
    random_state=42,
).fit(X_train_top, y_train)


In [None]:
# mean_squared_error and r2_score are already imported in the notebook's
# import cell, so the redundant mid-notebook import was removed.

# Predict on the held-out rows with the top-feature model
y_pred_top = adaboost_model_top.predict(X_test_top)

# Root mean squared error and coefficient of determination on the test split
rmse_top = np.sqrt(mean_squared_error(y_test, y_pred_top))
r2_top = r2_score(y_test, y_pred_top)

print(f"RMSE with Top Features: {rmse_top}")
print(f"R² with Top Features: {r2_top}")

# LSTM

In [None]:
# Add a length-1 middle axis so the array becomes (rows, 1, features), the
# 3-D layout Keras LSTM layers take (each row acts as a single timestep).
# The ndim guard keeps the cell idempotent: without it, a second run would try
# to reshape the already 3-D array to (rows, 1, 1) and raise a ValueError.
if X_scaled.ndim == 2:
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

In [None]:
# `y_class` and `selected_features` were used here without being defined in
# any earlier cell, so the cell failed on a fresh kernel.
# Target: the drought class of the station index, computed with pd.cut when
# `df` was built.
y_class = df[f'{di}_{di_scale}_Class']

# Features: those kept by the Random Forest SelectFromModel step (`sfm`).
# NOTE(review): the original comment mentioned RFE as the intended source;
# swap this line if an RFE-based selection is reintroduced.
selected_features = X.columns[sfm.get_support()]
final_features = list(selected_features)

# Subset the original data to only include the selected features
X_selected = X[final_features]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_selected, y_class, test_size=0.2, random_state=42)

print("Selected Features for Final Model:", final_features)

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Classification metrics on the held-out rows; 'weighted' averages the
# per-class scores by class support.
y_pred = rf.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the metrics
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-Score: {f1:.4f}')

# Bar chart of precision, recall and F1-score
metrics = [precision, recall, f1]
metric_names = ['Precision', 'Recall', 'F1-Score']

plt.figure(figsize=(8, 6))
sns.barplot(x=metric_names, y=metrics, palette='viridis')
plt.title('Random Forest Classifier: Weighted Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.show()

In [None]:
# Fixed class order, used for the matrix rows/columns and for both tick axes.
class_labels = ['ED', 'SD', 'MD', 'NN', 'MW', 'VW', 'EW']

cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Annotated heatmap of the counts: rows are actual classes, columns predicted.
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# `rf` is the RandomForestClassifier fitted in the cell above, so .score()
# reports mean accuracy, not R^2; the printed label is corrected to match.
# The stand-alone pd.cut(...) expression that used to open this cell was
# dropped: its result was neither stored nor displayed.
print(f"Model Accuracy: {rf.score(X_test, y_test):.4f}")

In [None]:
# Support-weighted precision, recall and F1 for the classifier predictions,
# computed in that order.
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)