In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import datetime
import shutil
import os
import re
import random

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
import keras
import pickle

import requests
from google.cloud import storage
from bs4 import BeautifulSoup

In [None]:
# Arguments
# Forecast horizon, in weeks.
# NOTE(review): `lookahead` is not referenced anywhere else in the visible part
# of this notebook -- presumably consumed by a later prediction step; confirm.
lookahead = 8 # Lookahead weeks. Set to 8 by default.

In [None]:
# Reference dates used throughout the notebook.
# The short string form is month/day/two-digit-year without zero padding.

# First day of the data window. Must be a Monday, and the day before it must
# exist in the data.
begin_date_str = '1/27/20'
begin_date = datetime.datetime.strptime(begin_date_str, "%m/%d/%y").date()

# Date on which the notebook is being run.
today = datetime.date.today()
today_date = f'{today.month}/{today.day}/{today.year-2000}'

# Most recent Sunday strictly before today (weekday() is 0 for Monday).
days_since_last_sunday = today.weekday() + 1
end_date = today - datetime.timedelta(days=days_since_last_sunday)
end_date_str = f'{end_date.month}/{end_date.day}/{end_date.year-2000}'

print(f'Begin date : {begin_date_str}')
print(f'Date today : {today_date}')
print(f'End of last week : {end_date_str}')

In [None]:
# Derive the folder names for the current week (Monday through Sunday).
# Nothing is created on disk here; only the path strings are built.
days_into_week = today.weekday()  # 0 on Monday
week_begin = today - datetime.timedelta(days=days_into_week)
week_end = week_begin + datetime.timedelta(days=6)

# Identifier of the form "<m>-<d>-<yy>-to-<m>-<d>-<yy>".
cur_week_name = f'{week_begin.month}-{week_begin.day}-{week_begin.year-2000}-to-{week_end.month}-{week_end.day}-{week_end.year-2000}'

output_folder = f'./output/weekly/{cur_week_name}'
preds_folder = f'./preds/weekly/{cur_week_name}'
print(f'Current identifier: *{cur_week_name}*')

In [None]:
# Load the country-level daily table written to this week's output folder.
country_csv_path = f"{output_folder}/country_daily.csv"
df = pd.read_csv(country_csv_path)

# Load the pickled dictionary of column-name lists (read back, not written).
# NOTE: pickle.load can execute arbitrary code; only open files this pipeline
# produced itself.
with open(f"{output_folder}/cols.pkl","rb") as f:
    cols = pickle.load(f)

In [None]:
# Unpack the two column-name lists stored in the `cols` dictionary.
base_cols, day_cols = cols['base_cols'], cols['day_cols']

In [None]:
# Load the region-level table; the first CSV column becomes the index.
region_csv_path = f'{output_folder}/all_region_df.csv'
region_data = pd.read_csv(region_csv_path, index_col=0)
region_data.head(n=3)

In [None]:
# Peek at the last three rows of the region table.
region_data.tail(3)

In [None]:
# Stack the country rows and the region rows into one table with a fresh
# 0..n-1 row index, then show the total number of rows.
df_all = pd.concat([df, region_data], ignore_index=True)
len(df_all)

# Weekly or daily preds

In [None]:
# Granularity of the modelled series: 'weekly' or 'daily'.
pred_type = 'weekly'

# Number of trailing samples reserved for the validation set and, separately,
# for the test set. Set to 14 if days, 2 if weeks.
test_data_size = 2

In [None]:
# Build the modelling table: the base columns followed by one column per
# period (week or day).
if pred_type == 'weekly':
    # Each week is the row-wise sum of 7 consecutive day columns and is named
    # after the first day of that week. Trailing days that do not fill a
    # complete week are dropped.
    n_weeks = len(day_cols) // 7
    select_cols = [day_cols[week * 7] for week in range(n_weeks)]
    period_data = pd.DataFrame(
        {
            week_col: df_all[day_cols[week * 7:(week * 7) + 7]].sum(axis=1)
            for week, week_col in enumerate(select_cols)
        },
        index=df_all.index,
    )
elif pred_type == 'daily':
    select_cols = day_cols
    period_data = df_all[day_cols]
else:
    # Fail here instead of with a NameError on `select_cols` in a later cell.
    raise ValueError(f"Unknown pred_type {pred_type!r}; expected 'weekly' or 'daily'")

# A single concat replaces column-by-column insertion, which fragments the
# frame and triggers pandas' PerformanceWarning on long series.
df_select = pd.concat([df_all[base_cols], period_data], axis=1)

In [None]:
# Display the assembled table (base columns followed by the per-period columns).
df_select

In [None]:
# Chronological split over the period columns: the last `test_data_size`
# periods are the test set, the `test_data_size` periods before them are the
# validation set, and everything earlier is training data.
holdout_size = 2 * test_data_size

# Column names per split
train_cols = select_cols[:-holdout_size]
val_cols = select_cols[-holdout_size:-test_data_size]
test_cols = select_cols[-test_data_size:]
print('train cols:', train_cols)
print('val cols:', val_cols)
print('test cols:', test_cols)

# Matching data slices (same boundaries, applied by column position)
period_values = df_select[select_cols]
train_data = period_values.iloc[:, :-holdout_size]
val_data = period_values.iloc[:, -holdout_size:-test_data_size]
test_data = period_values.iloc[:, -test_data_size:]
print(train_data.shape, val_data.shape, test_data.shape)

In [None]:
# One global scale factor: the standard deviation over every training value
# (all rows and all training periods pooled together).
std_val = train_data.to_numpy().std()
std_val

In [None]:
# Scale all three splits by the training standard deviation.
# NOTE: this rebinds the same names, so running the cell twice divides twice;
# re-run the split cell first if this cell has to be repeated.
train_data, val_data, test_data = (
    split / std_val for split in (train_data, val_data, test_data)
)

In [None]:
# (rows, training periods) of the scaled training split.
train_data.shape

In [None]:
# Positional row number of the first row whose Country is 'United Kingdom'.
# Raises IndexError if there is no such row.
is_uk = (df_select['Country'] == 'United Kingdom').to_numpy()
uk_idx = np.flatnonzero(is_uk)[0]
uk_idx

In [None]:
# Plot the UK training series, multiplied by 100000 to match the axis label.
plot_data = df_select.loc[uk_idx, train_cols].values
week_numbers = list(range(len(train_cols)))
plt.plot(week_numbers, 100000 * plot_data)
plt.ylabel('weekly cases per 100k')
plt.xlabel('week')

# Find hyperparameters for a good predictive model

In [None]:
def create_model(input_shape, layer_count, units, dropout, training=None):
    """Build and compile a stacked-LSTM regressor.

    Args:
        input_shape: Number of time steps per sample; the network input has
            shape (input_shape, 1).
        layer_count: Total number of LSTM layers. Must be at least 2.
        units: Number of units in every LSTM layer.
        dropout: Rate of the Dropout applied after each LSTM layer except the
            first one.
        training: Forwarded as the `training` argument when each Dropout layer
            is called.

    Returns:
        The model compiled with the Adam optimizer and mean-squared-error
        loss, or None when `layer_count` is below 2.
    """
    if layer_count < 2:
        return None

    inputs = keras.Input(shape=(input_shape, 1))

    # First LSTM layer: always emits the full sequence and has no dropout.
    x = LSTM(units=units, return_sequences=True)(inputs)

    # Remaining LSTM layers: all of them emit full sequences except the last,
    # which collapses the sequence to a single vector for the Dense head.
    sequence_flags = [True] * (layer_count - 2) + [False]
    for keep_sequences in sequence_flags:
        x = LSTM(units=units, return_sequences=keep_sequences)(x)
        x = Dropout(dropout)(x, training=training)

    # Single non-negative output value.
    outputs = Dense(1, activation='relu')(x)

    model = keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [None]:
def _make_windows(values, input_steps, first_target):
    """Slice a 2-D array into (history window, next value) samples.

    For every row and every target column j in [first_target, n_cols), the
    input is the `input_steps` values immediately before column j and the
    label is the value at column j. Samples are ordered row by row, then by
    target column.

    Returns:
        X with shape (n_samples, input_steps, 1) and y with shape (n_samples,).
    """
    target_cols = range(first_target, values.shape[1])
    windows = np.stack([values[:, j - input_steps:j] for j in target_cols], axis=1)
    targets = values[:, first_target:]
    return windows.reshape(-1, input_steps, 1), targets.reshape(-1)


def train_model(model, train_data, val_data, input_steps):
    """Fit `model` on sliding windows cut from each row of `train_data`.

    Args:
        model: Compiled Keras model taking inputs of shape (input_steps, 1).
        train_data: DataFrame with one series per row and one period per
            column.
        val_data: DataFrame holding the periods that directly follow
            `train_data` for the same rows, or None. When given, each of its
            columns becomes a validation target whose input window is taken
            from the periods just before it. When None, `validation_split=0.1`
            is passed to `fit` instead.
        input_steps: Number of consecutive periods in each input window.

    Returns:
        Tuple `(min_val_loss, history)`: the smallest validation loss seen
        during the fit, and the History object returned by `model.fit`.

    Raises:
        ValueError: If `input_steps` is smaller than 1 or `train_data` does
            not have more than `input_steps` columns, since no training
            sample could be built.
    """
    batch_size = 128
    epochs = 150
    num_train_cols = train_data.shape[1]
    if input_steps < 1 or input_steps >= num_train_cols:
        raise ValueError(
            f"input_steps must be between 1 and {num_train_cols - 1} "
            f"for data with {num_train_cols} columns, got {input_steps}"
        )
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Training samples: every window of `input_steps` periods and the period
    # that follows it.
    X_train, y_train = _make_windows(train_data.to_numpy(), input_steps, input_steps)

    # Thin out the samples whose window and label are all zero: each one is
    # dropped when its uniform draw exceeds 0.4, i.e. about 60% are removed.
    zero_samples = np.all(X_train == 0, axis=1)[:, 0] & (y_train == 0)
    valid_samples = ~(zero_samples & (np.random.rand(len(zero_samples)) > 0.4))
    X_train = X_train[valid_samples]
    y_train = y_train[valid_samples]
    print('Train', X_train.shape, y_train.shape)

    fit_kwargs = dict(epochs=epochs, verbose=2, batch_size=batch_size, callbacks=[callback])
    if val_data is not None:
        # Validation windows reach back into the training periods.
        combined_data = pd.concat([train_data, val_data], axis=1)
        print(combined_data.shape)
        X_val, y_val = _make_windows(
            combined_data.to_numpy()[:len(val_data)], input_steps, num_train_cols
        )
        print('Val', X_val.shape, y_val.shape)
        fit_kwargs['validation_data'] = (X_val, y_val)
    else:
        fit_kwargs['validation_split'] = 0.1

    # Fit exactly once, with early stopping active from the first epoch, so
    # the returned history covers the whole training run.
    history = model.fit(X_train, y_train, **fit_kwargs)
    return min(history.history['val_loss']), history
                

In [None]:
# # Perform a grid search for parameters (disabled).
# # train_model returns (min_val_loss, history), so the result is unpacked
# # and only the loss value is stored in `stats`.
# stats = {}
# for input_steps in [15, 10, 8, 5]:
#     for units in [10, 20, 50, 100]:
#         for dropout in [0.2, 0.3, 0.5]:
#             for layers in [3, 5, 8]:
#                 print('Training ', input_steps, units, dropout, layers)
#                 model = create_model(input_steps, layers, units, dropout)
#                 min_val_loss, _ = train_model(model, train_data, val_data, input_steps)
#                 stats[input_steps, units, dropout, layers] = min_val_loss
#                 print('Min val loss for ', input_steps, units, dropout, layers, ':', min_val_loss)

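Outcome of the grid search: best model = create_model(5, 3, 50, 0.2), i.e. input_steps=5, layer_count=3, units=50, dropout=0.2. Note that the training cell below uses input_steps=8 and dropout=0.4 instead.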

In [None]:
# Hyperparameters of the final model.
input_steps, layer_count, unit_count, dropout = 8, 3, 50, 0.4

# The weights file name encodes the hyperparameters.
best_model_path = f"{output_folder}/best_{input_steps}_{layer_count}_{unit_count}_{dropout}.h5"

# The final fit uses every available period (train + val + test columns).
# Passing None as val_data makes train_model use validation_split=0.1.
all_periods = pd.concat([train_data, val_data, test_data], axis=1)
model = create_model(input_steps, layer_count, unit_count, dropout)
min_val_loss, history = train_model(model, all_periods, None, input_steps)

model.save_weights(best_model_path)

In [None]:
# Loss per epoch of the recorded fit: validation loss is drawn first,
# training loss second.
loss_curves = history.history
x_vals = list(range(len(loss_curves['val_loss'])))
plt.plot(x_vals, loss_curves['val_loss'])
plt.plot(x_vals, loss_curves['loss'])

In [None]:
# Re-join the scaled train/val/test splits column-wise into one table covering
# every period. (No lookahead is computed in this cell.)
combined_data = pd.concat([train_data, val_data, test_data], axis=1)

In [None]:
# Bundle everything a later prediction step needs -- scaling factor, split
# settings, the data tables and the model hyperparameters / weights path --
# and pickle it into this week's output folder.
pred_vars = {
    'std_val': std_val,
    'test_data_size': test_data_size,
    'pred_type': pred_type,
    'base_cols': base_cols,
    'combined_data': combined_data,
    'df_select': df_select,
    'input_steps': input_steps,
    'layer_count': layer_count,
    'unit_count': unit_count,
    'dropout': dropout,
    'best_model_path': f"{output_folder}/best_{input_steps}_{layer_count}_{unit_count}_{dropout}.h5",
}

with open(f"{output_folder}/pred_vars.pkl","wb") as f:
    pickle.dump(pred_vars,f)

# Sanity check

In [None]:
# # Train with the best model
# input_steps = 5
# model = create_model(input_steps, 3, 50, 0.2)
# min_val_loss = train_model(model, pd.concat([train_data], axis=1), None, input_steps)

In [None]:
# tmp_lookahead = 4

# all_preds = []
# cur_data = train_data.values[:, -input_steps:]
# # Get preds for all future days
# for day in range(tmp_lookahead):
#     tmp_data = np.reshape(cur_data, (cur_data.shape[0], cur_data.shape[1], 1))
#     preds = model.predict(tmp_data)
#     all_preds.append(preds)
#     cur_data = np.concatenate([cur_data, preds], axis=1)[:, -input_steps:]
# all_preds = np.concatenate(all_preds, axis=1)

In [None]:
# real_data = np.concatenate([train_data, all_preds], axis=1) * std_val
# pred_data = np.concatenate([train_data, val_data, test_data], axis=1) * std_val

In [None]:
# show_cols = 20
# buffer = 0.2
# ybuffer = 0.5
# future_weeks = val_data.shape[1] + test_data.shape[1]
# if os.path.exists(f'{output_folder}/sanity_check'):
#     shutil.rmtree(f'{output_folder}/sanity_check')
# os.makedirs(f'{output_folder}/sanity_check')
# for idx in range(len(df_select)):
#     country_name = df_select.loc[idx, 'Country']
#     code_name = df_select.loc[idx, 'CCODE']
#     data_to_show = np.maximum(0.0, 100000 * real_data[idx, -show_cols:])
#     real_data_to_show = np.maximum(0.0, 100000 * pred_data[idx, -show_cols:])

#     plt.figure(figsize=(10, 5))
#     plt.title(f'Weekly Covid-19 data for {country_name}')
#     plt.plot([x for x in range(len(data_to_show))], data_to_show, 'c--o')
#     plt.plot([x for x in range(len(real_data_to_show))], real_data_to_show, 'b-o')
#     plt.plot([0, show_cols-1], [20, 20], 'r--')
#     plt.ylabel('weekly cases per 100k')
#     plt.xlabel('week')
#     plt.xlim([-buffer, (show_cols-1)+buffer])
#     plt.gcf().subplots_adjust(bottom=0.3, left=0.1, right=0.96)
#     plt.savefig(f'{output_folder}/sanity_check/{code_name}.jpg', dpi=200)
#     plt.show()
#     plt.close()