In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
import tensorflow as tf

# Read data
df = pd.read_excel('data/state_month_overdose.xlsx')
df['Deaths'] = df['Deaths'].apply(lambda x: 0 if x == 'Suppressed' else int(x))
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
df = df.groupby(['Month']).agg({'Deaths': 'sum'}).reset_index()

# Create dataset function
def create_dataset(dataset, look_back=3):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset.iloc[i:(i + look_back)]
        dataX.append(a)
        dataY.append(dataset.iloc[i + look_back])
    return np.array(dataX), np.array(dataY)

# Function for generating forecast with LSTM
def generate_forecast(model, initial_sequence, num_predictions=12, look_back=3):
    predictions = []
    for _ in range(num_predictions):
        next_prediction = model.predict(initial_sequence)
        predictions.append(next_prediction[0][0])
        new_sequence = np.append(initial_sequence[0, 1:], [[next_prediction[0][0]]], axis=0)
        initial_sequence = new_sequence.reshape((1, look_back, 1))
    return np.array(predictions)

# Function for calculating confidence intervals
def calculate_confidence_intervals(predictions, alpha=0.05):
    mean_pred = np.mean(predictions)
    std_pred = np.std(predictions)
    z_score = 1.96  # for 95% confidence
    margin_of_error = z_score * (std_pred / np.sqrt(len(predictions)))
    lower_bound = predictions - margin_of_error
    upper_bound = predictions + margin_of_error
    return lower_bound, upper_bound

# Function to calculate overlap between two sets of confidence intervals
def calculate_overlap(lower1, upper1, lower2, upper2):
    overlap_count = 0
    for l1, u1, l2, u2 in zip(lower1, upper1, lower2, upper2):
        if u1 >= l2 and l1 <= u2:
            overlap_count += 1
    percent_overlap = (overlap_count / len(lower1)) * 100
    return percent_overlap

# Cross-validation loop for different lookbacks and validation periods
validation_periods = [('2015-01-01', '2017-01-01'), ('2017-01-01', '2019-01-01'), ('2019-01-01', '2020-01-01')]  # Example periods
look_back_periods = [3, 6, 12]  # Example lookback periods
results = []

# Loop through validation periods and lookbacks
for val_start, val_end in validation_periods:
    for look_back in look_back_periods:
        train = df[df['Month'] < val_start]
        validation = df[(df['Month'] >= val_start) & (df['Month'] < val_end)]
        test = df[df['Month'] >= val_end]

        extended_validation = pd.concat([train.iloc[-look_back:], validation])
        extended_test = pd.concat([validation.iloc[-look_back:], test])

        trainX, trainY = create_dataset(train['Deaths'], look_back)
        valX, valY = create_dataset(extended_validation['Deaths'], look_back)
        testX, testY = create_dataset(extended_test['Deaths'], look_back)

        trainX = trainX.reshape((trainX.shape[0], look_back, 1))
        valX = valX.reshape((valX.shape[0], look_back, 1))
        testX = testX.reshape((testX.shape[0], look_back, 1))

        # Build and train LSTM model
        model = Sequential()
        model.add(LSTM(50, activation='relu', input_shape=(look_back, 1)))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')
        model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)

        # Generate validation predictions with LSTM
        valPred = model.predict(valX).flatten()
        lstm_mse = mean_squared_error(valY, valPred)
        results.append({'Validation Period': f"{val_start} to {val_end}",
                        'Look-back': look_back, 'LSTM MSE': lstm_mse})

# Save LSTM results
results_df = pd.DataFrame(results)
results_df.to_csv('eval_test_kfold_cross_validation_lookback_results.csv', index=False)

# Identify best model
best_model = results_df.loc[results_df['LSTM MSE'].idxmin()]
best_val_start, best_val_end = best_model['Validation Period'].split(' to ')
best_look_back = best_model['Look-back']

# Train and test with best model
full_train = df[df['Month'] < best_val_end]
extended_test = pd.concat([full_train.iloc[-best_look_back:], test])

trainX, trainY = create_dataset(full_train['Deaths'], best_look_back)
testX, testY = create_dataset(extended_test['Deaths'], best_look_back)

trainX = trainX.reshape((trainX.shape[0], best_look_back, 1))
testX = testX.reshape((testX.shape[0], best_look_back, 1))

model.fit(trainX, trainY, epochs=50, batch_size=1, verbose=0)
testPred = model.predict(testX).flatten()
trainPred = model.predict(trainX).flatten()

# Add LSTM predictions to DataFrame
initial_sequence = np.array([trainPred[-1]])
testPredict = generate_forecast(model, initial_sequence, num_predictions=len(test))
trainPredictlst = trainPred.flatten().tolist()
testPredictlst = testPredict.flatten().tolist()
combined_array = [0] + trainPredictlst + testPredictlst
df['LSTM Predictions'] = combined_array

# Train SARIMA Model
sarima_model = SARIMAX(full_train['Deaths'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12),
                       enforce_stationarity=False, enforce_invertibility=False)
sarima_result = sarima_model.fit(disp=False)
sarima_predictions = sarima_result.predict(start=0, end=df.shape[0] - 1, dynamic=False)
df['SARIMA Predictions'] = sarima_predictions

sarimaTestPredict = df[df['Month'] >= best_val_end]['SARIMA Predictions']

# Calculate metrics for best model
final_test_results = {
    'Best Validation Period': f"{best_val_start} to {best_val_end}",
    'Best Look-back': best_look_back,
    'LSTM Train MAPE': mean_absolute_percentage_error(trainY, trainPred),
    'LSTM Train MSE': mean_squared_error(trainY, trainPred),
    'LSTM Test MAPE': mean_absolute_percentage_error(testY, testPred),
    'LSTM Test MSE': mean_squared_error(testY, testPred),
    'SARIMA Test MAPE': mean_absolute_percentage_error(testY, sarimaTestPredict),
    'SARIMA Test MSE': mean_squared_error(testY, sarimaTestPredict)
}

pd.DataFrame([final_test_results]).to_csv('eval_test_results.csv', index=False)

# Plot Results
plottable = df.iloc[1:]  # Exclude rows used for the first lookback
plottable.set_index('Month', inplace=True)
plt.figure(figsize=(10, 6))
plt.plot(plottable.index, plottable['Deaths'], label='Actual Data', color='blue')
plt.plot(plottable.index, plottable['SARIMA Predictions'], label='SARIMA Predictions', color='green')
plt.plot(plottable.index, plottable['LSTM Predictions'], label='LSTM Predictions', color='red')
plt.title('Deaths: Actual vs LSTM vs SARIMA Predictions')
plt.xlabel('Date')
plt.ylabel('Deaths')
plt.legend()
plt.show()

  df['Month'] = pd.to_datetime(df['Month'])
I0000 00:00:1734641886.321060  146104 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-12-19 12:58:06.679096: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
  super().__init__(**kwargs)
2024-12-19 12:58:07.885084: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(typ, value, traceback)
2024-12-19 12:58:07.921940: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting w

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 119ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)
2024-12-19 12:58:10.158888: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step


  super().__init__(**kwargs)
  self.gen.throw(typ, value, traceback)
2024-12-19 12:58:12.422565: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step  


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(1,), dtype=float32). Expected shape (None, 12, 1), but input has incompatible shape (1,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1,), dtype=float32)
  • training=False
  • mask=None

In [3]:
print("Finished")

Finished


In [4]:
initial_sequence

array([4478.923], dtype=float32)