In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load and preprocess the data
df = pd.read_excel('data/state_month_overdose.xlsx')
df['Deaths'] = df['Deaths'].apply(lambda x: 0 if x == 'Suppressed' else int(x))
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.groupby(['Month']).agg({'Deaths': 'sum'}).reset_index()

# Define validation periods
validation_periods = [
    ('2019-11-01', '2020-01-01'),
    ('2019-09-01', '2020-01-01'),
    ('2019-07-01', '2020-01-01'),
    ('2019-01-01', '2020-01-01'),
    ('2018-07-01', '2020-01-01'),
    ('2018-01-01', '2020-01-01')
]

# Helper function to create datasets
def create_dataset(dataset, look_back):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        dataX.append(dataset[i:(i + look_back)])
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)

# Function to calculate confidence intervals
def calculate_confidence_intervals(predictions, alpha=0.05):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # For 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate overlap percentage
def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = sum(1 for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2) if u1 >= l2 and l1 <= u2)
    return (overlap_count / len(lower1)) * 100

# Initialize results
results = []
look_back = 3

for val_start, val_end in validation_periods:
    # Adjust validation and test start dates to include look-back rows
    adjusted_val_start = pd.to_datetime(val_start) - pd.DateOffset(months=look_back)
    adjusted_test_start = pd.to_datetime(val_end) - pd.DateOffset(months=look_back)

    # Split data into training, validation, and test sets
    train = df[df['Month'] <= adjusted_val_start]
    validation = df[(df['Month'] >= adjusted_val_start) & (df['Month'] <= val_end)]
    test = df[(df['Month'] >= adjusted_test_start)]

    # Prepare LSTM datasets
    trainX, trainY = create_dataset(train['Deaths'].values, look_back)
    valX, valY = create_dataset(validation['Deaths'].values, look_back)
    testX, testY = create_dataset(test['Deaths'].values, look_back)

    trainX = trainX.reshape((trainX.shape[0], look_back, 1))
    valX = valX.reshape((valX.shape[0], look_back, 1))
    testX = testX.reshape((testX.shape[0], look_back, 1))

    # Train LSTM model
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

    # Evaluate LSTM on validation
    valPred = model.predict(valX).flatten()
    lstm_mse = mean_squared_error(valY, valPred)
    lstm_rmse = np.sqrt(lstm_mse)
    lstm_mape = mean_absolute_percentage_error(valY, valPred)

    # Prepare SARIMA model
    sarima_model = SARIMAX(train['Deaths'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
                           enforce_stationarity=False, enforce_invertibility=False)
    sarima_result = sarima_model.fit(disp=False)

    # Evaluate SARIMA on validation
    sarima_val_pred = sarima_result.predict(start=len(train), end=len(train) + len(validation) - 1, dynamic=False)
    sarima_mse = mean_squared_error(validation['Deaths'], sarima_val_pred)
    sarima_rmse = np.sqrt(sarima_mse)
    sarima_mape = mean_absolute_percentage_error(validation['Deaths'], sarima_val_pred)

    # Calculate confidence intervals and overlap
    lower_bound_val, upper_bound_val = calculate_confidence_intervals(valPred)
    lower_bound_sarima, upper_bound_sarima = calculate_confidence_intervals(sarima_val_pred)
    ci_overlap = calculate_overlap(lower_bound_val, upper_bound_val, lower_bound_sarima, upper_bound_sarima)

    # Save results for this validation period
    results.append({
        'Validation Period': f"{val_start} to {val_end}",
        'Look-back': look_back,
        'LSTM MAPE': lstm_mape,
        'LSTM MSE': lstm_mse,
        'LSTM RMSE': lstm_rmse,
        'SARIMA MAPE': sarima_mape,
        'SARIMA MSE': sarima_mse,
        'SARIMA RMSE': sarima_rmse,
        'CI Overlap %': ci_overlap
    })

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('kfold_cross_validation_results.csv', index=False)

# Train best model on full training + validation data and evaluate on test
best_model_period = results_df.loc[results_df['LSTM MSE'].idxmin()]
best_val_start, best_val_end = best_model_period['Validation Period'].split(' to ')
full_train = df[df['Month'] < best_val_end]

trainX, trainY = create_dataset(full_train['Deaths'].values, look_back)
testX, testY = create_dataset(test['Deaths'].values, look_back)

trainX = trainX.reshape((trainX.shape[0], look_back, 1))
testX = testX.reshape((testX.shape[0], look_back, 1))

model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)
testPred = model.predict(testX).flatten()

# Save test results
test_mse = mean_squared_error(testY, testPred)
test_rmse = np.sqrt(test_mse)
test_mape = mean_absolute_percentage_error(testY, testPred)
print(f"Test Results:\nMAPE: {test_mape:.4f}, MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}")

  df['Month'] = pd.to_datetime(df['Month'])
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step


  warn('Too few observations to estimate starting parameters%s.'


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
Test Results:
MAPE: 0.0824, MSE: 507502.5185, RMSE: 712.3921


In [20]:
best_model_period

Validation Period    2019-09-01 to 2020-01-01
Look-back                                   3
LSTM MAPE                            0.022304
LSTM MSE                         11090.543981
LSTM RMSE                          105.311652
SARIMA MAPE                          0.426371
SARIMA MSE                     4604725.039945
SARIMA RMSE                       2145.862307
CI Overlap %                             20.0
Name: 1, dtype: object

TEST

In [16]:
val_start, val_end = validation_periods[0]

adjusted_val_start = pd.to_datetime(val_start) - pd.DateOffset(months=look_back)
adjusted_test_start = pd.to_datetime(val_end) - pd.DateOffset(months=look_back)


train = df[df['Month'] <= val_start]
validation = df[(df['Month'] >= adjusted_val_start) & (df['Month'] <= val_end)]
test = df[df['Month'] >= adjusted_test_start]

# Prepare LSTM datasets
trainX, trainY = create_dataset(train['Deaths'].values, look_back)
valX, valY = create_dataset(validation['Deaths'].values, look_back)
testX, testY = create_dataset(test['Deaths'].values, look_back)

trainX = trainX.reshape((trainX.shape[0], look_back, 1))
valX = valX.reshape((valX.shape[0], look_back, 1))
testX = testX.reshape((testX.shape[0], look_back, 1))

In [17]:
validation

Unnamed: 0,Month,Deaths
55,2019-08-01,4371
56,2019-09-01,4252
57,2019-10-01,4529
58,2019-11-01,4560
59,2019-12-01,4645
60,2020-01-01,4727
