In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load and preprocess the data
df = pd.read_excel('data/state_month_overdose.xlsx')
df['Deaths'] = df['Deaths'].apply(lambda x: 0 if x == 'Suppressed' else int(x))
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.groupby(['Month']).agg({'Deaths': 'sum'}).reset_index()

# Define validation periods
validation_periods = [
    ('2019-11-01', '2020-01-01'),
    ('2019-09-01', '2020-01-01'),
    ('2019-07-01', '2020-01-01'),
    ('2019-01-01', '2020-01-01'),
    ('2018-07-01', '2020-01-01'),
    ('2018-01-01', '2020-01-01')
]

# Define look-back periods
look_back_periods = range(3, 12, 2)  # 3, 5, 7, 9, 11 months look-back

# Helper function to create datasets
def create_dataset(dataset, look_back):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        dataX.append(dataset[i:(i + look_back)])
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)

# Function to calculate confidence intervals
def calculate_confidence_intervals(predictions, alpha=0.05):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # For 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate overlap percentage
def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = sum(1 for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2) if u1 >= l2 and l1 <= u2)
    return (overlap_count / len(lower1)) * 100

# Define hyperparameter grid
batch_sizes = [1, 16, 32]  # Batch sizes to test
optimizers = ['adam', 'sgd']  # Optimizers to test
epochs_list = [50, 100]  # Number of epochs to test

# Initialize results
results = []

for val_start, val_end in validation_periods:
    for look_back in look_back_periods:
        for batch_size in batch_sizes:
            for optimizer in optimizers:
                for epochs in epochs_list:
                    # Split data into training, validation, and test sets
                    train = df[df['Month'] < val_start]
                    validation = df[(df['Month'] >= val_start) & (df['Month'] < val_end)]
                    test = df[df['Month'] >= val_end]

                    # Include last look-back rows from train in validation
                    extended_validation = pd.concat([train.iloc[-look_back:], validation])
                    # Include last look-back rows from validation in test
                    extended_test = pd.concat([validation.iloc[-look_back:], test])

                    # Prepare LSTM datasets
                    trainX, trainY = create_dataset(train['Deaths'].values, look_back)
                    valX, valY = create_dataset(extended_validation['Deaths'].values, look_back)
                    testX, testY = create_dataset(extended_test['Deaths'].values, look_back)

                    trainX = trainX.reshape((trainX.shape[0], look_back, 1))
                    valX = valX.reshape((valX.shape[0], look_back, 1))
                    testX = testX.reshape((testX.shape[0], look_back, 1))

                    # Train LSTM model with hyperparameters
                    model = Sequential()
                    model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
                    model.add(Dense(1))
                    model.compile(loss='mean_squared_error', optimizer=optimizer)
                    model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=0)

                    # Evaluate LSTM on validation
                    valPred = model.predict(valX).flatten()
                    lstm_mse = mean_squared_error(valY, valPred)
                    lstm_rmse = np.sqrt(lstm_mse)
                    lstm_mape = mean_absolute_percentage_error(valY, valPred)

                    # Save results for this combination of hyperparameters
                    results.append({
                        'Validation Period': f"{val_start} to {val_end}",
                        'Look-back': look_back,
                        'Batch Size': batch_size,
                        'Optimizer': optimizer,
                        'Epochs': epochs,
                        'LSTM MAPE': lstm_mape,
                        'LSTM MSE': lstm_mse,
                        'LSTM RMSE': lstm_rmse
                    })

# Save cross-validation results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('hyperparameter_cross_validation_results.csv', index=False)

# Identify the best model
best_model = results_df.loc[results_df['LSTM MSE'].idxmin()]
best_val_start, best_val_end = best_model['Validation Period'].split(' to ')
best_look_back = best_model['Look-back']
best_batch_size = best_model['Batch Size']
best_optimizer = best_model['Optimizer']
best_epochs = best_model['Epochs']

# Train best model on full training + validation data and evaluate on test
full_train = df[df['Month'] < best_val_end]

# Include last look-back rows from train in test
extended_test = pd.concat([full_train.iloc[-best_look_back:], test])

trainX, trainY = create_dataset(full_train['Deaths'].values, best_look_back)
testX, testY = create_dataset(extended_test['Deaths'].values, best_look_back)

trainX = trainX.reshape((trainX.shape[0], best_look_back, 1))
testX = testX.reshape((testX.shape[0], best_look_back, 1))

# Train the best model configuration
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(best_look_back, 1)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer=best_optimizer)
model.fit(trainX, trainY, epochs=best_epochs, batch_size=best_batch_size, verbose=0)

# Evaluate on test set
testPred = model.predict(testX).flatten()
final_test_results = {
    'Best Validation Period': f"{best_val_start} to {best_val_end}",
    'Best Look-back': best_look_back,
    'Best Batch Size': best_batch_size,
    'Best Optimizer': best_optimizer,
    'Best Epochs': best_epochs,
    'Test MAPE': mean_absolute_percentage_error(testY, testPred),
    'Test MSE': mean_squared_error(testY, testPred),
    'Test RMSE': np.sqrt(mean_squared_error(testY, testPred))
}

# Save test results to a separate CSV
pd.DataFrame([final_test_results]).to_csv('test_results_hyperparameter_optimization.csv', index=False)

2024-12-19 10:20:07.117698: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 10:20:07.144956: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-19 10:20:07.178266: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-19 10:20:07.186840: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 10:20:07.288126: I tensorflow/core/platform/cpu_feature_guar

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m-2s[0m -1643690us/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step


ValueError: Input contains NaN.

In [None]:
print("Cross-validation and hyperparameter optimization completed.")

In [2]:
valY

array([4560, 4645])

In [3]:
valPred

array([nan, nan], dtype=float32)

In [4]:
model.predict(valX).flatten()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


array([nan, nan], dtype=float32)