In [0]:
# Treating the categorical variables as continuous

# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Input, Dense, Reshape
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [0]:
# Prepare the data

# Read the ILI activity data, and use only the relavant columns (Remember that the data is already sorted by year, week and then state)
ILI_dataframe = read_csv('final_project.csv', usecols = ['STATENAME', 'ACTIVITY LEVEL', 'WEEK', 'YEAR'])
print(ILI_dataframe)

           STATENAME ACTIVITY LEVEL  YEAR  WEEK
0            Alabama        Level 1  2008    40
1             Alaska        Level 1  2008    40
2            Arizona        Level 1  2008    40
3           Arkansas        Level 6  2008    40
4         California        Level 1  2008    40
...              ...            ...   ...   ...
31865       Virginia       Level 10  2020    12
31866     Washington       Level 10  2020    12
31867  West Virginia       Level 10  2020    12
31868      Wisconsin       Level 10  2020    12
31869        Wyoming        Level 7  2020    12

[31870 rows x 4 columns]


In [0]:
# Get the list of states 
list_of_states_complete = ILI_dataframe['STATENAME'].values # Get entire STATENAME column
states_unique = list(set(list_of_states_complete)) # Unique list of states, but not in alphabetical order
states_unique.sort() # Now in alphabetical order
print(states_unique)

['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Commonwealth of the Northern Mariana Islands', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'New York City', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [0]:
# Get the list of activity levels
activity_levels_complete = ILI_dataframe['ACTIVITY LEVEL'].values # Get entire ACTIVITY LEVEL column
activity_levels = list(set(activity_levels_complete)) # Unique list of acitivity levels, not in order
activity_levels.sort() # Activity levels in order
activity_levels.append(activity_levels[2]) # Level 10 was incorrectly sorted, sort it manually
activity_levels.pop(2) # Now correctly sorted
print(activity_levels)

['Level 0', 'Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5', 'Level 6', 'Level 7', 'Level 8', 'Level 9', 'Level 10']


In [0]:
# Loop over all the states, and perform state regression time series 

us_states_predictions = []
state_rmse_train = []
state_rmse_test = []

for state_name in states_unique:

  ILI_Data_state = ILI_dataframe[ILI_dataframe['STATENAME'] == state_name]

  # Then filter the rows that do not have year 2020 (We want to predict for the 2018-2019 season)
  ILI_Data_state = ILI_Data_state[ILI_Data_state['YEAR'] != 2020]

  # Finally, filter out the rows that are in the flu season (Rows with Week 40 to 52)
  ILI_Data_state = ILI_Data_state[ILI_Data_state['WEEK'] >= 40]

  ILI_Data_state_vals = ILI_Data_state.values

  if len(ILI_Data_state_vals) != 158:
    continue

  # Manually remove the duplicates in 2011 and 2016
  ILI_Data_state_vals = np.delete(ILI_Data_state_vals, 52, 0)
  ILI_Data_state_vals = np.delete(ILI_Data_state_vals, 117, 0)

  ILI_state_activities = list(ILI_Data_state_vals[:, 1])

  # Integer code the activity labels
  def activity_to_int(activity_lev):
    activity_int = activity_levels.index(activity_lev)
    return activity_int

  ILI_state_activities_int = []

  for activity_lev in ILI_state_activities:
    ILI_state_activities_int.append(activity_to_int(activity_lev))

  # Plot the state activity as function of weeks 40 to 52 over the years 2008 to 2019
  plt.plot(ILI_state_activities_int)
  plt.xlabel('weeks 40 to 52 over the years 2008 to 2019')
  plt.ylabel('Activity Level')
  plt.title('ILI Activity for the state of ' + state_name)
  plt.show()

  # Convert the list of Activities to a numpy array for scaling
  ILI_state_activities_int = np.asarray(ILI_state_activities_int, dtype = np.float32)
  ILI_state_activities_int = np.reshape(ILI_state_activities_int, (len(ILI_state_activities_int), 1))

  # Scale the Activity to a 0-1 range 
  scaler = MinMaxScaler(feature_range=(0,1))
  ILI_state_activities_scaled = scaler.fit_transform(ILI_state_activities_int)

  # Split into train and test sets
  # 142 is train size if only last flu season to be predicted
  train_size = 142 # int(len(ILI_state_activities_scaled) * 0.67)
  test_size = len(ILI_state_activities_scaled) - train_size

  train, test = ILI_state_activities_scaled[0:train_size, :], ILI_state_activities_scaled[train_size:len(ILI_state_activities_scaled), :]

  # Prepare the data for LSTM, using the sliding window approach. The input X must be of shape (num_samples, num_time_steps, num_features) and the output y must be the same as well.
  num_lag_weeks = 1
  num_step_weeks = 1

  def prepare_LSTM_data(dataset, n_lag, n_step):
    X_list = []
    y_list = []

    for i in range(len(dataset) - (n_lag + n_step) + 1): 
      X_slice = dataset[i:(i+n_lag), :]
      y_slice = dataset[(i+n_lag): (i+n_lag+n_step), 0]

      X_list.append(X_slice)
      y_list.append(y_slice)

    return np.array(X_list), np.array(y_list)

  trainX, trainY = prepare_LSTM_data(train, num_lag_weeks, num_step_weeks)
  testX, testY = prepare_LSTM_data(test, num_lag_weeks, num_step_weeks)

  # create and fit the LSTM network
  model = Sequential()
  model.add(LSTM(4, input_shape=(1, num_lag_weeks)))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer='adam')
  model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

  # make predictions
  trainPredict = model.predict(trainX)
  testPredict = model.predict(testX)

  # # invert predictions
  trainPredict = scaler.inverse_transform(trainPredict)
  trainY = scaler.inverse_transform(trainY)
  testPredict = scaler.inverse_transform(testPredict)
  testY = scaler.inverse_transform(testY)

  # calculate root mean squared error
  trainScore = math.sqrt(mean_squared_error(trainY, trainPredict))
  print('Train Score: %.2f RMSE' % (trainScore))
  testScore = math.sqrt(mean_squared_error(testY, testPredict))
  print('Test Score: %.2f RMSE' % (testScore))

  state_rmse_train.append(trainScore)
  state_rmse_test.append(testScore)

  # shift train predictions for plotting
  trainPredictPlot = np.empty_like(ILI_state_activities_scaled)
  trainPredictPlot[:, :] = np.nan
  # trainPredictPlot[num_lag_weeks:len(trainPredict)+num_lag_weeks, :] = trainPredict
  trainPredictPlot[num_lag_weeks:(len(trainPredict) + 1), :] = trainPredict

  # shift test predictions for plotting
  testPredictPlot = np.empty_like(ILI_state_activities_scaled)
  testPredictPlot[:, :] = np.nan
  # testPredictPlot[-(test_size-1):, :] = testPredict
  start_test_point = len(trainPredict) + 2
  testPredictPlot[start_test_point:, :] = testPredict

  # plot baseline and predictions
  plt.plot(scaler.inverse_transform(ILI_state_activities_scaled))
  plt.plot(trainPredictPlot)
  plt.plot(testPredictPlot)
  plt.xlabel('weeks 40 to 52 over the years 2008 to 2019')
  plt.ylabel('Activity Level')
  plt.title('ILI Activity for the state of ' + state_name)
  # plt.legend(['True Activity', 'Train Prediction', 'Test Prediction'])
  plt.show()

  testPredictRound = np.around(testPredict)

  activity_prediction_list = []

  for i in range(len(testPredictRound)):
    activity_prediction_list.append(activity_levels[int(testPredictRound[i, 0])])

  state_result = [state_name, activity_prediction_list]
  us_states_predictions.append(state_result)

  

In [0]:
print(state_rmse_train)
print(state_rmse_test)
print(us_states_predictions)

[2.4592203598167464, 1.6483940371889878, 1.451855997526935, 2.152576136289254, 1.970988923020883, 1.801267082562208, 1.114188537316426, 1.273100311009944, 1.5137000690418607, 1.8947195687050402, 1.3971738593745955, 1.8045325274623558, 1.9717867606320127, 2.1234471874563443, 1.3061275643823584, 2.0241904017864343, 1.8450174662685939, 2.1804941523735293, 0.9137221786918461, 1.7330671998657434, 1.3489291164616835, 1.5660235539171625, 1.7877562972704573, 2.3360393591269255, 2.026449781825772, 0.9872514387764086, 1.91780496740952, 1.6647715602566484, 0.9115732647322649, 1.730317395482192, 2.236462558220573, 1.5931164847468524, 1.5594655899307233, 1.9042077928883454, 1.3181373691837646, 1.6877690559930876, 2.135813073062047, 1.4559335127311455, 1.681735896691482, 1.2292225657969118, 2.050823102251512, 1.2405451828194147, 1.9798191065952155, 1.9795354265365726, 1.9158677288063537, 1.4375718140286078, 1.8389169743990375, 1.5686094243869744, 1.9370119649215245, 1.5783022035122336, 1.65221503728

In [0]:
# Save results in a csv
import pandas as pd

dataframe_rmse_train = pd.DataFrame(state_rmse_train)
dataframe_rmse_train.to_csv('state_rmse_train.csv', header = False, index = False)

dataframe_rmse_test = pd.DataFrame(state_rmse_test)
dataframe_rmse_test.to_csv('state_rmse_test.csv', header = False, index = False)

In [0]:
dataframe_activity_predictions = pd.DataFrame(us_states_predictions)
print(dataframe_activity_predictions)

dataframe_activity_predictions.to_csv('us_states_predictions.csv', header = False, index=False)

                 0                                                  1
0          Alabama  [Level 8, Level 3, Level 3, Level 4, Level 4, ...
1           Alaska  [Level 3, Level 1, Level 1, Level 1, Level 1, ...
2          Arizona  [Level 6, Level 1, Level 1, Level 1, Level 1, ...
3         Arkansas  [Level 6, Level 2, Level 2, Level 3, Level 2, ...
4       California  [Level 5, Level 1, Level 1, Level 1, Level 1, ...
5         Colorado  [Level 7, Level 2, Level 3, Level 3, Level 3, ...
6      Connecticut  [Level 5, Level 2, Level 3, Level 3, Level 3, ...
7         Delaware  [Level 1, Level 1, Level 1, Level 1, Level 1, ...
8          Florida  [Level 4, Level 2, Level 1, Level 2, Level 2, ...
9          Georgia  [Level 8, Level 3, Level 4, Level 4, Level 5, ...
10          Hawaii  [Level 1, Level 1, Level 1, Level 2, Level 2, ...
11           Idaho  [Level 3, Level 1, Level 1, Level 1, Level 1, ...
12        Illinois  [Level 6, Level 1, Level 2, Level 2, Level 3, ...
13         Indiana  

In [0]:
# Calculate avg global rmse 
global_train_rmse = 0.0
global_test_rmse = 0.0

for i in range(len(state_rmse_train)):
  global_train_rmse += state_rmse_train[i]
  global_test_rmse += state_rmse_test[i]

global_train_rmse = global_train_rmse/len(state_rmse_train)
global_test_rmse = global_test_rmse/len(state_rmse_train)

print('Train RMSE across all states: ', global_train_rmse)
print('Test RMSE across all states: ', global_test_rmse)


Train RMSE across all states:  1.7026974298199011
Test RMSE across all states:  1.9715078518548663
