In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.statespace.sarimax import SARIMAX

df = pd.read_excel('data/state_month_overdose.xlsx')
df['Deaths'] = df['Deaths'].apply(lambda x: 0 if x == 'Suppressed' else int(x))
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.groupby(['Month']).agg({'Deaths': 'sum'}).reset_index()

validation_periods = [
    ('2019-11-01', '2020-01-01'),
    ('2019-09-01', '2020-01-01'),
    ('2019-07-01', '2020-01-01'),
    ('2019-01-01', '2020-01-01'),
    ('2018-07-01', '2020-01-01'),
    ('2018-01-01', '2020-01-01')
]

# Define look-back periods
look_back_periods = range(3, 12, 2)  # 3, 5, 7, 9, 11 months look-back

# Helper function to create datasets (properly format for input with lookback)
def create_dataset(dataset, look_back):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        dataX.append(dataset[i:(i + look_back)])
        dataY.append(dataset[i + look_back])
    return np.array(dataX), np.array(dataY)

# Function to calculate confidence intervals
def calculate_confidence_intervals(predictions, alpha=0.05):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # For 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate overlap percentage
def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = sum(1 for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2) if u1 >= l2 and l1 <= u2)
    return (overlap_count / len(lower1)) * 100

results = []

for val_start, val_end in validation_periods:
    for look_back in look_back_periods:
        # Split data into training, validation, and test sets
        train = df[df['Month'] < val_start]
        validation = df[(df['Month'] >= val_start) & (df['Month'] < val_end)]
        test = df[df['Month'] >= val_end]

        # Include last look-back rows from train in validation
        extended_validation = pd.concat([train.iloc[-look_back:], validation])
        # Include last look-back rows from validation in test
        extended_test = pd.concat([validation.iloc[-look_back:], test])

        # Prepare LSTM datasets
        trainX, trainY = create_dataset(train['Deaths'].values, look_back)
        valX, valY = create_dataset(extended_validation['Deaths'].values, look_back)
        testX, testY = create_dataset(extended_test['Deaths'].values, look_back)

        trainX = trainX.reshape((trainX.shape[0], look_back, 1))
        valX = valX.reshape((valX.shape[0], look_back, 1))
        testX = testX.reshape((testX.shape[0], look_back, 1))

        # Train LSTM model
        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

        # Evaluate LSTM on validation
        valPred = model.predict(valX).flatten()
        lstm_mse = mean_squared_error(valY, valPred)
        lstm_rmse = np.sqrt(lstm_mse)
        lstm_mape = mean_absolute_percentage_error(valY, valPred)

        # Prepare SARIMA model
        sarima_model = SARIMAX(train['Deaths'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
                               enforce_stationarity=False, enforce_invertibility=False)
        sarima_result = sarima_model.fit(disp=False)

        # Evaluate SARIMA on validation
        sarima_val_pred = sarima_result.predict(start=len(train), end=len(train) + len(validation) - 1, dynamic=False)
        sarima_mse = mean_squared_error(validation['Deaths'], sarima_val_pred)
        sarima_rmse = np.sqrt(sarima_mse)
        sarima_mape = mean_absolute_percentage_error(validation['Deaths'], sarima_val_pred)

        # Calculate confidence intervals and overlap
        lower_bound_val, upper_bound_val = calculate_confidence_intervals(valPred)
        lower_bound_sarima, upper_bound_sarima = calculate_confidence_intervals(sarima_val_pred)
        ci_overlap = calculate_overlap(lower_bound_val, upper_bound_val, lower_bound_sarima, upper_bound_sarima)

        # Save results for this combination of validation period and look-back
        results.append({
            'Validation Period': f"{val_start} to {val_end}",
            'Look-back': look_back,
            'LSTM MAPE': lstm_mape,
            'LSTM MSE': lstm_mse,
            'LSTM RMSE': lstm_rmse,
            'SARIMA MAPE': sarima_mape,
            'SARIMA MSE': sarima_mse,
            'SARIMA RMSE': sarima_rmse,
            'CI Overlap %': ci_overlap
        })

2024-12-19 10:14:32.717408: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-19 10:14:32.772468: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-19 10:14:32.807471: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-19 10:14:32.817533: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-19 10:14:32.864710: I tensorflow/core/platform/cpu_feature_guar

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step


  warn('Too few observations to estimate starting parameters%s.'
  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step


  warn('Too few observations to estimate starting parameters%s.'


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 188ms/step
Cross-validation and test evaluation completed.


In [22]:
def generate_forecast(model, initial_sequence, look_back, num_predictions=12):
    predictions = []
    for_model = initial_sequence
    
    for _ in range(num_predictions):
        # Generate the next prediction
        pred = model.predict(for_model)
        predictions.append(pred[0][0])
        
        # Update the input for the next prediction
        # We need to construct a new input array of the same shape as the original input
        new_input = np.append(for_model[:, 1:], pred[0][0])  # Shift and append the new prediction
        for_model = new_input.reshape((1, look_back, 1))

    return np.array(predictions)

In [23]:
# Save cross-validation results
results_df = pd.DataFrame(results)
results_df.to_csv('kfold_cross_validation_lookback_results.csv', index=False)

# Identify the best model
best_model = results_df.loc[results_df['LSTM MSE'].idxmin()]
best_val_start, best_val_end = best_model['Validation Period'].split(' to ')
best_look_back = best_model['Look-back']

# Train best model on full training + validation data and evaluate on test
full_train = df[df['Month'] < best_val_end]

# Include last look-back rows from train in test
extended_test = pd.concat([full_train.iloc[-best_look_back:], test])

trainX, trainY = create_dataset(full_train['Deaths'].values, best_look_back)
testX, testY = create_dataset(extended_test['Deaths'].values, best_look_back)

trainX = trainX.reshape((trainX.shape[0], best_look_back, 1))
testX = testX.reshape((testX.shape[0], best_look_back, 1))

model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)
testPred = model.predict(testX).flatten()
trainPred = model.predict(trainX).flatten()

# Save test results
final_test_results = {
    'Best Validation Period': f"{best_val_start} to {best_val_end}",
    'Best Look-back': best_look_back,
    'Train MAPE': mean_absolute_percentage_error(trainY, trainPred),
    'Train MSE': mean_squared_error(trainY, trainPred),
    'Train RMSE': np.sqrt(mean_squared_error(trainY, trainPred)),
    'Test MAPE': mean_absolute_percentage_error(testY, testPred),
    'Test MSE': mean_squared_error(testY, testPred),
    'Test RMSE': np.sqrt(mean_squared_error(testY, testPred))
}
pd.DataFrame([final_test_results]).to_csv('test_results.csv', index=False)

print("Cross-validation and test evaluation completed.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Cross-validation and test evaluation completed.


In [24]:
final_test_results

{'Best Validation Period': '2019-09-01 to 2020-01-01',
 'Best Look-back': 5,
 'Train MAPE': 0.04199491141506799,
 'Train MSE': 37815.43041051734,
 'Train RMSE': 194.46189963722287,
 'Test MAPE': 0.07576518751761822,
 'Test MSE': 434127.9887722929,
 'Test RMSE': 658.8838962763416}

In [27]:
# # Save LSTM results
# results_df = pd.DataFrame(results)
# results_df.to_csv('kfold_cross_validation_lookback_results_2.csv', index=False)

# # Identify best LSTM model
# best_model = results_df.loc[results_df['LSTM MSE'].idxmin()]
# best_val_start, best_val_end = best_model['Validation Period'].split(' to ')
# best_look_back = best_model['Look-back']

# full_train = df[df['Month'] < best_val_end]
# extended_test = pd.concat([full_train.iloc[-best_look_back:], test])

# trainX, trainY = create_dataset(full_train['Deaths'].values, best_look_back)
# testX, testY = create_dataset(extended_test['Deaths'].values, best_look_back)

# trainX = trainX.reshape((trainX.shape[0], best_look_back, 1))
# testX = testX.reshape((testX.shape[0], best_look_back, 1))

# model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)
# testPred = model.predict(testX).flatten()
# trainPred = model.predict(trainX).flatten()

# # Add LSTM predictions to DataFrame
# initial_sequence = np.array([trainPred[-1]])
# testPredict = generate_forecast(model, initial_sequence, best_look_back, num_predictions=len(test))
# trainPredictlst = trainPred.flatten().tolist()
# testPredictlst = testPredict.flatten().tolist()
# combined_array = [0] + trainPredictlst + testPredictlst
# df['LSTM Predictions'] = combined_array

# # Train SARIMA Model
# sarima_model = SARIMAX(full_train['Deaths'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
#                        enforce_stationarity=False, enforce_invertibility=False)
# sarima_result = sarima_model.fit(disp=False)
# sarima_predictions = sarima_result.predict(start=0, end=df.shape[0] - 1, dynamic=False)
# df['SARIMA Predictions'] = sarima_predictions

# sarimaTestPredict = df[df['Month'] >= best_val_end]['SARIMA Predictions']

# # Metrics Calculation
# final_test_results = {
#     'Best Validation Period': f"{best_val_start} to {best_val_end}",
#     'Best Look-back': best_look_back,
#     'LSTM Train MAPE': mean_absolute_percentage_error(trainY, trainPred),
#     'LSTM Train MSE': mean_squared_error(trainY, trainPred),
#     'LSTM Test MAPE': mean_absolute_percentage_error(testY, testPred),
#     'LSTM Test MSE': mean_squared_error(testY, testPred),
#     'SARIMA Test MAPE': mean_absolute_percentage_error(testY, sarimaTestPredict),
#     'SARIMA Test MSE': mean_squared_error(testY, sarimaTestPredict)
# }

# pd.DataFrame([final_test_results]).to_csv('test_results_2.csv', index=False)

# # Plot Results
# plottable = df.iloc[1:]  # Exclude rows used for the first lookback
# plottable.set_index('Month', inplace=True)
# plt.figure(figsize=(10, 6))
# plt.plot(plottable.index, plottable['Deaths'], label='Actual Data', color='blue')
# plt.plot(plottable.index, plottable['SARIMA Predictions'], label='SARIMA Predictions', color='green')
# plt.plot(plottable.index, plottable['LSTM Predictions'], label='LSTM Predictions', color='red')
# plt.title('Deaths: Actual vs LSTM vs SARIMA Predictions')
# plt.xlabel('Date')
# plt.ylabel('Deaths')
# plt.legend()
# plt.show()

In [None]:
df