In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.statespace.sarimax import SARIMAX
import os

# Load and preprocess data
df = pd.read_excel('data/state_month_overdose.xlsx')
df['Deaths'] = df['Deaths'].apply(lambda x: 0 if x == 'Suppressed' else int(x))
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.groupby(['Month']).agg({'Deaths': 'sum'}).reset_index()

# Create output directory
os.makedirs('tables', exist_ok=True)

# Define functions
def create_dataset(dataset, look_back):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset.iloc[i:(i + look_back)]
        dataX.append(a)
        dataY.append(dataset.iloc[i + look_back])
    return np.array(dataX), np.array(dataY)

def generate_forecast(model, initial_sequence, num_predictions, look_back):
    predictions = []
    for _ in range(num_predictions):
        next_prediction = model.predict(initial_sequence)
        predictions.append(next_prediction[0][0])
        new_sequence = np.append(initial_sequence[0, 1:], [[next_prediction[0][0]]], axis=0)
        initial_sequence = new_sequence.reshape((1, look_back, 1))
    return np.array(predictions)

def calculate_confidence_intervals(predictions):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # for 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = 0
    for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2):
        if u1 >= l2 and l1 <= u2:
            overlap_count += 1
    percent_overlap = (overlap_count / len(lower1)) * 100
    return percent_overlap

# Iterate over look-back periods
look_back_periods = range(3, 12, 2)
results = []

for look_back in look_back_periods:
    print(f"Processing look-back period: {look_back}")

    # Split data
    train = df[df['Month'] <= '2020-02-01']
    test = df[df['Month'] >= '2020-02-01']

    # Create datasets
    trainX, trainY = create_dataset(train['Deaths'], look_back)
    testX, testY = create_dataset(test['Deaths'], look_back)

    # Reshape for LSTM
    trainX = trainX.reshape((trainX.shape[0], look_back, 1))
    testX = testX.reshape((testX.shape[0], look_back, 1))

    # Build LSTM model
    model = Sequential()
    model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

    # Generate forecasts
    initial_sequence = trainX[-1].reshape((1, look_back, 1))
    testPredict = generate_forecast(model, initial_sequence, len(testY), look_back)
    trainPredict = model.predict(trainX).flatten()

    # SARIMA model
    sarima_model = SARIMAX(train['Deaths'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
                           enforce_stationarity=False, enforce_invertibility=False)
    sarima_result = sarima_model.fit(disp=False)
    sarima_predictions = sarima_result.predict(start=0, end=len(train) + len(test) - 1, dynamic=False)

    # Error metrics
    lstm_mse = mean_squared_error(testY, testPredict)
    lstm_rmse = np.sqrt(lstm_mse)
    lstm_mape = mean_absolute_percentage_error(testY, testPredict)

    sarima_mse = mean_squared_error(testY, sarima_predictions[len(train):len(train)+len(test)])
    sarima_rmse = np.sqrt(sarima_mse)
    sarima_mape = mean_absolute_percentage_error(testY, sarima_predictions[len(train):len(train)+len(test)])

    # Confidence intervals and overlap
    lower_bound_test, upper_bound_test = calculate_confidence_intervals(testPredict)
    sarimaTestPredict = sarima_predictions[len(train):len(train)+len(test)]
    lower_bound_sarima, upper_bound_sarima = calculate_confidence_intervals(sarimaTestPredict)
    ci_overlap = calculate_overlap(lower_bound_test, upper_bound_test, lower_bound_sarima, upper_bound_sarima)

    # Save results
    results.append({
        'Look-back': look_back,
        'LSTM MAPE': lstm_mape,
        'LSTM MSE': lstm_mse,
        'LSTM RMSE': lstm_rmse,
        'SARIMA MAPE': sarima_mape,
        'SARIMA MSE': sarima_mse,
        'SARIMA RMSE': sarima_rmse,
        'CI Overlap %': ci_overlap
    })

    # Save predictions to CSV
    df[f'LSTM Predictions ({look_back})'] = [0] * look_back + list(trainPredict) + list(testPredict)
    df[f'SARIMA Predictions ({look_back})'] = sarima_predictions
    df.to_csv(f'tables/{look_back}_month_predictionresults.csv', index=False)

    # Plot results
    plt.figure(figsize=(10, 6))
    plt.plot(df['Month'], df['Deaths'], label='Actual Data', color='blue')
    plt.plot(df['Month'], df[f'LSTM Predictions ({look_back})'], label=f'LSTM Predictions ({look_back} months)', color='red')
    plt.plot(df['Month'], df[f'SARIMA Predictions ({look_back})'], label=f'SARIMA Predictions ({look_back} months)', color='green')
    plt.title(f'Deaths: Actual vs Predictions (Look-back: {look_back})')
    plt.xlabel('Date')
    plt.ylabel('Deaths')
    plt.legend()
    plt.savefig(f'tables/{look_back}_month_prediction_plot.png')
    plt.close()

# Save summary results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('tables/summary_metrics.csv', index=False)

print("Processing complete. Results saved to 'tables' directory.")

  df['Month'] = pd.to_datetime(df['Month'])
  super().__init__(**kwargs)


Processing look-back period: 3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 220ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


ValueError: Found input variables with inconsistent numbers of samples: [8, 11]