In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, GRU, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Add, GlobalAveragePooling1D
from statsmodels.tsa.statespace.sarimax import SARIMAX
import tensorflow as tf
from tensorflow.keras import layers
import warnings
import json
from datetime import datetime
from typing import Dict, List, Tuple, Any
import itertools

warnings.filterwarnings('ignore')

2025-06-12 11:06:02.819103: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-12 11:06:03.303434: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-12 11:06:03.303545: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-12 11:06:03.374356: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-12 11:06:03.544875: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# ------------------ CONFIGURATION ------------------ #
# Forecaster used by the experiment loop.
MODEL_TYPE = 'lstm'  # Options: 'lstm', 'sarima'

# 'fixed_seed' runs every configuration with a single seed;
# any other value ('multi_seed') runs it once per seed in the longer list.
TRIAL_MODE = 'fixed_seed'  # Options: 'fixed_seed', 'multi_seed'
if TRIAL_MODE == 'fixed_seed':
    SEEDS = [42]
else:
    SEEDS = [42, 123, 456, 11, 245, 56712, 23467, 98, 38, 1506]

# Number of repeated runs per (seed, look_back, batch size, epochs) combination.
TRIALS_PER_CONFIG = 30

# Hyper-parameter grid explored by the experiment loop.
LOOKBACKS = [3, 5, 7, 9, 11, 12]   # input window lengths (time steps)
BATCH_SIZES = [8, 16, 32]
EPOCHS_LIST = [50, 100]

# Input workbook and root directory for the per-trial CSV outputs.
DATA_PATH = 'data/state_month_overdose.xlsx'
RESULTS_DIR = 'results'

In [3]:
# data_path = 'data/state_month_overdose.xlsx'
# df = pd.read_excel(data_path)

# results_dir = './results'

# models = {}
# results = []

# os.makedirs(f'{results_dir}/individual_runs', exist_ok=True)
# os.makedirs(f'{results_dir}/aggregated', exist_ok=True)

In [4]:
def load_and_preprocess_data():
    """Read the workbook at ``DATA_PATH`` and return total deaths per month.

    Processing steps:
      * ``Deaths`` cells equal to the string 'Suppressed' are counted as 0;
        every other cell is converted with ``int``.
      * ``Month`` is parsed with ``pd.to_datetime``.
      * Rows are grouped by ``Month`` and their ``Deaths`` summed.

    Returns:
        DataFrame with the columns ``Month`` and ``Deaths``, one row per
        distinct month.
    """
    def _to_count(cell):
        # 'Suppressed' is the only non-numeric marker handled here.
        if cell == 'Suppressed':
            return 0
        return int(cell)

    raw = pd.read_excel(DATA_PATH)
    raw['Deaths'] = raw['Deaths'].apply(_to_count)
    raw['Month'] = pd.to_datetime(raw['Month'])

    monthly_totals = raw.groupby('Month').agg({'Deaths': 'sum'})
    return monthly_totals.reset_index()

In [5]:
# Load the series aggregated to one row per Month; the cells below split and model it.
data = load_and_preprocess_data()
data

Unnamed: 0,Month,Deaths
0,2015-01-01,2771
1,2015-02-01,2627
2,2015-03-01,2907
3,2015-04-01,2822
4,2015-05-01,2850
...,...,...
67,2020-08-01,6165
68,2020-09-01,5683
69,2020-10-01,5613
70,2020-11-01,5613


In [6]:
def create_train_val_test_split_lstm(df: pd.DataFrame, 
                                  train_end: str = '2019-01-01',
                                  val_end: str = '2020-01-01', 
                                  test_end: str = '2020-12-01'):
    """Split ``df`` chronologically on its ``Month`` column.

    Args:
        df: Frame with a ``Month`` column comparable to the cut-off values.
        train_end: Exclusive upper bound of the training window.
        val_end: Exclusive upper bound of the validation window
            (which starts at ``train_end``).
        test_end: Inclusive upper bound of the test window (which starts at
            ``val_end``). Pass ``None`` to keep every row from ``val_end``
            onward.

    Returns:
        Tuple ``(train, validation, test)`` of DataFrames with all original
        columns and the original index labels.
    """
    months = df['Month']

    train = df[months < train_end]
    validation = df[(months >= train_end) & (months < val_end)]

    # test_end used to be accepted but ignored; apply it so the argument
    # actually bounds the test window (inclusive).
    in_test = months >= val_end
    if test_end is not None:
        in_test &= months <= test_end
    test = df[in_test]

    return train, validation, test

In [7]:
# Chronological split with the default cut-offs: train is before 2019-01-01,
# validation covers 2019-01-01 up to (not including) 2020-01-01, and test
# starts at 2020-01-01.
train_data, validation_data, test_data = create_train_val_test_split_lstm(data)

In [8]:
validation_data

Unnamed: 0,Month,Deaths
48,2019-01-01,3931
49,2019-02-01,3727
50,2019-03-01,4285
51,2019-04-01,3934
52,2019-05-01,4100
53,2019-06-01,4112
54,2019-07-01,4296
55,2019-08-01,4371
56,2019-09-01,4252
57,2019-10-01,4529


In [9]:
# def create_dataset(dataset, look_back=3):
#     """Create dataset for sequence models"""
#     dataX, dataY = [], []
#     for i in range(len(dataset) - look_back):
#         a = dataset.iloc[i:(i + look_back)]
#         dataX.append(a)
#         dataY.append(dataset.iloc[i + look_back])
#     return np.array(dataX), np.array(dataY)

def create_dataset(dataset, look_back=3):
    """Turn the ``Deaths`` column into (window, next value) supervised pairs.

    Args:
        dataset: Frame with a ``Deaths`` column, in chronological order.
        look_back: Number of consecutive values in each input window.

    Returns:
        ``(X, y)`` as NumPy arrays: row ``i`` of ``X`` holds values
        ``i .. i + look_back - 1`` and ``y[i]`` is the value that follows.
        Both arrays are empty when the series has no more than ``look_back``
        values.
    """
    series = dataset['Deaths'].values  # numeric target only
    n_windows = len(series) - look_back

    windows = [series[start:start + look_back] for start in range(n_windows)]
    targets = [series[start + look_back] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [10]:
def evaluate_metrics(y_true, y_pred):
    """Compute point-forecast error metrics.

    Both inputs are converted to flat float arrays first, so lists, pandas
    Series (compared by position, not by index label) and ``(n, 1)`` column
    vectors all behave the same in every metric.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same number of elements as ``y_true``.

    Returns:
        Dict with 'RMSE', 'MAE', 'MAPE' (in percent) and 'MSE', plus the
        placeholder keys 'PI Width', 'CI Coverage' and 'PI Overlap', which
        are always 0.
    """
    # Coerce once: the raw `y_true - y_pred` used for MAPE would otherwise
    # fail on lists, align Series on index labels, or broadcast (n,) against
    # (n, 1) into an (n, n) matrix.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Basic metrics (MSE is computed once and reused for RMSE)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)

    # MAPE: where the true value is 0 the error is divided by 1 instead,
    # i.e. the absolute error itself is used for that point.
    safe_denominator = np.where(y_true != 0, y_true, 1)
    mape = np.mean(np.abs((y_true - y_pred) / safe_denominator)) * 100

    return {
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape,
        'MSE': mse,
        'PI Width': 0,
        'CI Coverage': 0,
        'PI Overlap': 0
    }

In [11]:
# def build_lstm_model(self, look_back: int, units: int = 50, dropout: float = 0.0):
#     """Build LSTM model"""
#     model = Sequential([
#         LSTM(units, activation='relu', input_shape=(look_back, 1), return_sequences=False),
#         Dropout(dropout),
#         Dense(1)
#     ])
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     return model

In [12]:
# def generate_forecast(self, model, initial_sequence, num_predictions, look_back, model_type='lstm'):
#     """Generate forecasts for different model types"""
#     predictions = []
    
#     # if model_type in ['lstm', 'tcn', 'transformer']:
#     if model_type in ['lstm']:
#         current_sequence = initial_sequence.copy()
#         for _ in range(num_predictions):
#             next_prediction = model.predict(current_sequence, verbose=0)
#             predictions.append(next_prediction[0][0])
            
#             # Update sequence
#             new_sequence = np.append(current_sequence[0, 1:], [[next_prediction[0][0]]], axis=0)
#             current_sequence = new_sequence.reshape((1, look_back, 1))
            
#     elif model_type == 'seq2seq':
#         # Simplified seq2seq forecasting
#         encoder_input = initial_sequence
#         for _ in range(num_predictions):
#             decoder_input = np.zeros((1, 1, 1))  # Start token
#             prediction = model.predict([encoder_input, decoder_input], verbose=0)
#             predictions.append(prediction[0][0][0])
#             encoder_input = np.roll(encoder_input, -1, axis=1)
#             encoder_input[0, -1, 0] = prediction[0][0][0]
    
#     return np.array(predictions)

In [13]:
# Sanity check: build look_back=3 windows from the training split and inspect the inputs.
X_train, y_train = create_dataset(train_data, 3)
X_train

array([[2771, 2627, 2907],
       [2627, 2907, 2822],
       [2907, 2822, 2850],
       [2822, 2850, 2634],
       [2850, 2634, 2734],
       [2634, 2734, 2810],
       [2734, 2810, 2925],
       [2810, 2925, 2970],
       [2925, 2970, 2739],
       [2970, 2739, 2718],
       [2739, 2718, 3098],
       [2718, 3098, 3416],
       [3098, 3416, 3721],
       [3416, 3721, 3608],
       [3721, 3608, 3475],
       [3608, 3475, 3384],
       [3475, 3384, 3668],
       [3384, 3668, 3624],
       [3668, 3624, 3518],
       [3624, 3518, 3695],
       [3518, 3695, 3745],
       [3695, 3745, 4044],
       [3745, 4044, 4332],
       [4044, 4332, 4006],
       [4332, 4006, 4252],
       [4006, 4252, 4056],
       [4252, 4056, 4129],
       [4056, 4129, 3950],
       [4129, 3950, 3980],
       [3950, 3980, 4000],
       [3980, 4000, 4022],
       [4000, 4022, 3832],
       [4022, 3832, 3820],
       [3832, 3820, 4017],
       [3820, 4017, 4008],
       [4017, 4008, 3671],
       [4008, 3671, 4155],
 

In [14]:
y_train

array([2822, 2850, 2634, 2734, 2810, 2925, 2970, 2739, 2718, 3098, 3416,
       3721, 3608, 3475, 3384, 3668, 3624, 3518, 3695, 3745, 4044, 4332,
       4006, 4252, 4056, 4129, 3950, 3980, 4000, 4022, 3832, 3820, 4017,
       4008, 3671, 4155, 3922, 4079, 4063, 4085, 4061, 3882, 3999, 3692,
       3976])

In [15]:
# Prepend the last 3 training rows to the validation split so that the first
# validation month has a full 3-step input window.
pd.concat([train_data[-3:], validation_data])

Unnamed: 0,Month,Deaths
45,2018-10-01,3999
46,2018-11-01,3692
47,2018-12-01,3976
48,2019-01-01,3931
49,2019-02-01,3727
50,2019-03-01,4285
51,2019-04-01,3934
52,2019-05-01,4100
53,2019-06-01,4112
54,2019-07-01,4296


In [16]:
# NOTE: despite the names, X_test / y_test here are built from the validation
# split (preceded by the last 3 training rows), not from test_data.
X_test, y_test = create_dataset(pd.concat([train_data[-3:], validation_data]), 3)

y_test

array([3931, 3727, 4285, 3934, 4100, 4112, 4296, 4371, 4252, 4529, 4560,
       4645])

In [27]:
def run_lstm(train, test, look_back, batch_size, epochs, seed):
    """Fit a one-layer LSTM on ``train`` and forecast ``len(test)`` steps recursively.

    The forecast is seeded with the last ``look_back`` values of ``train``;
    each prediction is fed back as the newest input, so no value from
    ``test`` is ever used as model input. ``test`` only supplies the targets.

    Args:
        train: Frame with a ``Deaths`` column used for fitting.
        test: Frame with a ``Deaths`` column holding the values to forecast.
        look_back: Input window length (time steps).
        batch_size: Batch size passed to ``model.fit``.
        epochs: Number of epochs passed to ``model.fit``.
        seed: Seed passed to ``np.random.seed``.

    Returns:
        ``(y_true, y_pred)``: the ``Deaths`` values of ``test`` and the array
        of recursive predictions, both of length ``len(test)``.
    """
    # NOTE(review): only NumPy's global RNG is seeded here; TensorFlow's RNG
    # is not touched in this function, so runs with the same seed presumably
    # still differ. Confirm whether that is intended for the variability study.
    np.random.seed(seed)
    X_train, y_train = create_dataset(train, look_back)
    # Prepending the training tail makes X_test[0] the window that ends on
    # the last training row and y_test equal to the full test series.
    X_test, y_test = create_dataset(pd.concat([train.iloc[-look_back:], test]), look_back)
    X_train = X_train.reshape((X_train.shape[0], look_back, 1))
    X_test = X_test.reshape((X_test.shape[0], look_back, 1))
    
    model = Sequential([LSTM(50, activation='relu', input_shape=(look_back, 1)), Dense(1)])
    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    
    preds = []
    window = X_test[0].reshape((1, look_back, 1))
    for _ in range(len(y_test)):
        # verbose=0: without it predict() prints a progress bar for every
        # single forecast step, flooding the cell output.
        pred = model.predict(window, verbose=0)[0][0]
        preds.append(pred)
        # Slide the window: drop the oldest step, append the new prediction.
        window = np.append(window[:, 1:, :], [[[pred]]], axis=1)
    return y_test, np.array(preds)

# def run_sarima(train, test):
#     model = SARIMAX(train, order=(1,1,1), seasonal_order=(1,1,1,12),
#                     enforce_stationarity=False, enforce_invertibility=False)
#     results = model.fit(disp=False)
#     forecast = results.predict(start=len(train), end=len(train)+len(test)-1)
#     return test, forecast

def run_sarima(train_df, test_df):
    """Fit SARIMA(1,1,1)x(1,1,1,12) on ``train_df`` and forecast ``len(test_df)`` steps.

    Args:
        train_df: Frame with a ``Deaths`` column used for fitting.
        test_df: Frame with a ``Deaths`` column holding the values to forecast.

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays: the ``Deaths`` values of
        ``test_df`` as floats and the model's predictions for the steps
        immediately following the training series.
    """
    y_train = train_df['Deaths'].astype(float)
    y_test = test_df['Deaths'].astype(float)

    fitted = SARIMAX(
        y_train,
        order=(1, 1, 1),
        seasonal_order=(1, 1, 1, 12),
        enforce_stationarity=False,
        enforce_invertibility=False,
    ).fit(disp=False)

    # Out-of-sample positions are counted from the end of the training series.
    first_step = len(y_train)
    last_step = first_step + len(y_test) - 1
    predicted = fitted.predict(start=first_step, end=last_step)
    return y_test.values, predicted.values

In [18]:
# def train_sarima_model(self, train_data, validation_data=None, 
#                           order=(1,1,1), seasonal_order=(1,1,1,12)):
#     """Train SARIMA model"""
#     try:
#         if validation_data is not None:
#             # Train on train + validation for final model
#             combined_data = pd.concat([train_data, validation_data])['Deaths']
#         else:
#             combined_data = train_data['Deaths']
            
#         model = SARIMAX(combined_data, order=order, seasonal_order=seasonal_order,
#                        enforce_stationarity=False, enforce_invertibility=False)
#         fitted_model = model.fit(disp=False)
#         return fitted_model
#     except Exception as e:
#         print(f"SARIMA training failed: {e}")
#         return None

In [19]:
# X_train, y_train = create_dataset(train_data, 5)
# X_test, y_test = create_dataset(pd.concat([train_data[-5:], validation_data]), 5)

In [20]:
# y_train

In [21]:
# Single LSTM run scored on the validation split: look_back=5, batch_size=8, epochs=50, seed=42.
y_true, y_pred = run_lstm(train_data, validation_data, 5, 8, 50, 42)

2025-06-12 11:06:31.250511: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:274] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected




In [22]:
y_true

array([3931, 3727, 4285, 3934, 4100, 4112, 4296, 4371, 4252, 4529, 4560,
       4645])

In [23]:
y_pred

array([4009.4856, 4004.3435, 4014.3464, 4012.4036, 4082.277 , 4106.6885,
       4125.1406, 4148.3706, 4175.4453, 4209.7085, 4235.959 , 4261.9233],
      dtype=float32)

In [26]:
train_data

Unnamed: 0,Month,Deaths
0,2015-01-01,2771
1,2015-02-01,2627
2,2015-03-01,2907
3,2015-04-01,2822
4,2015-05-01,2850
5,2015-06-01,2634
6,2015-07-01,2734
7,2015-08-01,2810
8,2015-09-01,2925
9,2015-10-01,2970


In [28]:
# SARIMA run on the same validation split; this rebinds y_true / y_pred from the LSTM run.
y_true, y_pred = run_sarima(train_data, validation_data)

In [29]:
y_true

array([3931., 3727., 4285., 3934., 4100., 4112., 4296., 4371., 4252.,
       4529., 4560., 4645.])

In [30]:
y_pred

array([4066.45787179, 3925.69393519, 4280.73621334, 4105.93074076,
       4158.23709375, 4057.74799477, 4154.3414548 , 4150.25899724,
       4076.11643433, 4127.18425768, 3976.68625048, 4209.91517304])

In [23]:
# Fail fast on a bad MODEL_TYPE, before any result directory is created.
if MODEL_TYPE not in ('lstm', 'sarima'):
    raise ValueError("Unknown model type")

# Both run_lstm and run_sarima forecast the steps that come right after the
# end of the frame they are fitted on. test_data starts where validation_data
# ends, so the fitting frame must be train + validation; fitting on
# train_data alone would compare forecasts for the validation period with
# test-period actuals.
final_train_data = pd.concat([train_data, validation_data])

# Same iteration order as nested loops: seed outermost, epochs innermost.
for seed, look_back, bs, ep in itertools.product(SEEDS, LOOKBACKS, BATCH_SIZES, EPOCHS_LIST):
    config_name = f'lookback_{look_back}_bs_{bs}_epochs_{ep}'
    mode_dir = ('fixed_seed_variability' if TRIAL_MODE == 'fixed_seed'
                else f'multi_seed_variability/seed_{seed}')
    base_dir = os.path.join(RESULTS_DIR, mode_dir, config_name)
    os.makedirs(base_dir, exist_ok=True)

    metrics_all = []
    for trial in range(TRIALS_PER_CONFIG):
        if MODEL_TYPE == 'lstm':
            y_true, y_pred = run_lstm(final_train_data, test_data, look_back, bs, ep, seed)
        else:  # 'sarima' (validated above)
            y_true, y_pred = run_sarima(final_train_data, test_data)

        metrics_all.append(evaluate_metrics(y_true, y_pred))
        # Keep the raw forecast of every trial next to the summary.
        pd.DataFrame({"True": y_true, "Pred": y_pred}).to_csv(
            os.path.join(base_dir, f'trial_{trial}.csv'), index=False)

    # Mean and standard deviation of each metric across the trials.
    df_metrics = pd.DataFrame(metrics_all)
    summary = df_metrics.agg(['mean', 'std'])
    summary.to_csv(os.path.join(base_dir, 'summary_metrics.csv'))

