In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import datetime
import shutil
import os
import re
import random

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
import keras
import pickle

import requests
from bs4 import BeautifulSoup

In [None]:
# Drop any previously downloaded copy of the datasets so the download
# step starts from an empty ./data directory.
stale_data_present = os.path.exists('./data')
if stale_data_present:
    shutil.rmtree('./data')

In [None]:
# Get the relevant statistics
!git clone https://github.com/CSSEGISandData/COVID-19.git data/COVID-19
!git clone https://github.com/datasets/population.git data/population

In [None]:
# Arguments
# Number of weeks to look ahead; the default is 8.
lookahead: int = 8

In [None]:
# Hard-coded paths (relative to the notebook's working directory)
# Population table from the cloned `population` repository.
population_csv = './data/population/data/population.csv'
# Global confirmed-case time series from the cloned COVID-19 repository.
_covid_series_dir = './data/COVID-19/csse_covid_19_data/csse_covid_19_time_series'
confirmed_csv = f'{_covid_series_dir}/time_series_covid19_confirmed_global.csv'

In [None]:
# Helper functions
def moving_average(x, w):
    """Return the moving average of `x` over a window of `w` samples.

    Each output element is the sum of `w` consecutive inputs divided by `w`.
    The convolution runs in 'valid' mode, so only fully overlapping windows
    are produced.
    """
    window = np.ones(w)
    window_sums = np.convolve(x, window, mode='valid')
    return window_sums / w

In [None]:
# Important dates
def _short_date(d):
    """Format a date as M/D/YY without zero padding (e.g. 1/27/20)."""
    return f'{d.month}/{d.day}/{d.year-2000}'

# First day of the series. It must be a Monday, and the day before it must
# exist in the data.
begin_date_str = '1/27/20'
begin_date = datetime.datetime.strptime(begin_date_str, "%m/%d/%y").date()
print(f'Begin date : {begin_date_str}')

today = datetime.date.today()
today_date = _short_date(today)
print(f'Date today : {today_date}')

# Step back to the most recent Sunday strictly before today.
days_since_last_sunday = today.weekday() + 1
end_date = today - datetime.timedelta(days=days_since_last_sunday)
end_date_str = _short_date(end_date)
print(f'End of last week : {end_date_str}')

In [None]:
# Build the identifier of the current Monday-to-Sunday week and derive the
# prediction/output folder paths from it.
def _dashed_date(d):
    """Format a date as M-D-YY without zero padding."""
    return f'{d.month}-{d.day}-{d.year-2000}'

week_begin = today - datetime.timedelta(days=today.weekday())
week_end = week_begin + datetime.timedelta(days=6)
cur_week_name = f'{_dashed_date(week_begin)}-to-{_dashed_date(week_end)}'
preds_folder = f'./preds/weekly/{cur_week_name}'
output_folder = f'./output/weekly/{cur_week_name}'
print(f'Current identifier: *{cur_week_name}*')

### Recreate the prediction and output folders for the current week

In [None]:
# Start each run with empty folders: delete whatever a previous run left
# behind, create the folder again, and echo its path.
for week_folder in (preds_folder, output_folder):
    if os.path.exists(week_folder):
        shutil.rmtree(week_folder)
    os.makedirs(week_folder)
    print(week_folder)

# Read data from locally downloaded datasets and pre-process the data.

In [None]:
# Country names as spelled in the case data -> spelling used when joining
# with the population table.
fix_case_country_names = dict([
    ('Taiwan*', 'Taiwan'),
    ('Congo (Brazzaville)', 'Congo, Rep.'),
    ('Congo (Kinshasa)', 'Congo, Dem. Rep.'),
    ('Bahamas', 'Bahamas, The'),
    ('Czechia', 'Czech Republic'),
    ('US', 'United States'),
])

# Country names as spelled in the population table -> spelling used when
# joining with the case data.
fix_country_names = dict([
    ('Brunei Darussalam', 'Brunei'),
    ('Egypt, Arab Rep.', 'Egypt'),
    ('Gambia, The', 'Gambia'),
    ('Iran, Islamic Rep.', 'Iran'),
    ('Korea, Rep.', 'Korea, South'),
    ('Russian Federation', 'Russia'),
    ('St. Kitts and Nevis', 'Saint Kitts and Nevis'),
    ('St. Lucia', 'Saint Lucia'),
    ('St. Vincent and the Grenadines', 'Saint Vincent and the Grenadines'),
    ('Syrian Arab Republic', 'Syria'),
    ('Venezuela, RB', 'Venezuela'),
    ('Yemen, Rep.', 'Yemen'),
    ('Kyrgyz Republic', 'Kyrgyzstan'),
])

In [None]:
# Covid data
# Load the cumulative confirmed-case table and reduce it to one row per country.
df = pd.read_csv(confirmed_csv)
last_col = df.columns[-1]
df = df.rename(columns={'Country/Region': 'Country'})
df = df.sort_values(['Country', last_col], ascending=[True, True])
# After the sort above, the last row of each country is the one with the
# highest value in the most recent column; keep only that row.
# `keep` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# NOTE(review): for a country that appears only as several 'Province/State'
# rows this keeps the single largest row, not a national total -- confirm
# that this is intended.
df = df.drop_duplicates(subset=['Country'], keep='last')
df = df.drop(columns=['Province/State', 'Lat', 'Long']).reset_index(drop=True)

# Get day columns
# Labels use the non-padded M/D/YY form. The list starts one day before
# begin_date so that a daily difference can be computed for begin_date itself.
day_cols = []
cur_date = begin_date - datetime.timedelta(1)
while cur_date <= end_date:
    cur_date_str = f'{cur_date.month}/{cur_date.day}/{cur_date.year-2000}'
    day_cols.append(cur_date_str)
    cur_date += datetime.timedelta(1)

# Fail early, with a readable message, when the downloaded CSV does not cover
# the requested date range (the column selection below would raise a KeyError anyway).
missing_days = [col for col in day_cols if col not in df.columns]
if missing_days:
    raise KeyError(
        f'{confirmed_csv} has no column for {len(missing_days)} requested day(s), '
        f'from {missing_days[0]} to {missing_days[-1]}'
    )

# Get daily numbers
day_idx = day_cols.index(begin_date_str) - 1
day_cols = day_cols[day_idx:]
# Snapshot of the cumulative values: df is overwritten column by column below.
cum_df = df[day_cols].copy()
for prev_day, day in zip(day_cols, day_cols[1:]):
    # Daily count = difference of consecutive cumulative values, floored at 0
    # so that downward corrections in the source do not produce negatives.
    df[day] = np.maximum(0, cum_df[day] - cum_df[prev_day])

# Remove the first day (it is a Sunday!)
day_cols = day_cols[1:]

# Fix the actual names to be used
# Names without an entry in fix_case_country_names are kept unchanged.
df['Country'] = [fix_case_country_names.get(name, name) for name in df['Country']]

print(len(df))
display(df.tail(3))
df_cases = df.copy()

# Population data
# Keep, for every country, the row with the highest 'Year'.
df_pop = pd.read_csv(population_csv)
df_pop = df_pop.sort_values(['Country Name', 'Year'], ascending=[True, True])
df_pop = df_pop.drop_duplicates(subset=['Country Name'], keep='last').reset_index(drop=True)
df_pop = df_pop.rename(columns={'Country Name':'Country', 'Country Code': 'CCODE', 'Value':'Population'})[['Country', 'CCODE', 'Population']].reset_index(drop=True)
# Manually added entries (hard-coded population figures).
df_pop.loc[len(df_pop)] = ['Taiwan', 'TWN', 22894384]
df_pop.loc[len(df_pop)] = ['Slovakia', 'SVK', 5455000]
# Names without an entry in fix_country_names are kept unchanged.
df_pop['Country'] = [fix_country_names.get(name, name) for name in df_pop['Country']]
print(len(df_pop))
display(df_pop.tail(3))

# Join both
# Default (inner) merge: countries missing from either table are dropped.
df = df.merge(df_pop, on='Country').reset_index(drop=True)
df['is_country'] = 1
# Lat/Lon are filled with the placeholder value -1.
df['Lat'] = -1
df['Lon'] = -1
df['Name'] = df['Country']
base_cols = ['Name', 'Country', 'is_country', 'CCODE', 'Population', 'Lat', 'Lon']
df = df[base_cols + day_cols]

print(len(df))
display(df.tail(3))

In [None]:
# "Countries" that have cases in the case dataset but were dropped by the join
# because they do not exist in the population dataset.
set(df_cases['Country']).difference(df['Name'])

In [None]:
# Turn the daily counts into per-capita values (row-wise division by the
# country's population), then remove the Population column and its entry
# in base_cols.
df[day_cols] = df[day_cols].div(df['Population'], axis=0)
df = df.drop(columns=['Population'])
base_cols = [col for col in base_cols if col != 'Population']
base_cols

In [None]:
# Preview the first three rows of the table.
df.head(n=3)

In [None]:
# Save the country-level daily table to this week's output folder.
df.to_csv(f"{output_folder}/country_daily.csv")

# Save the two column-name lists (base columns and day columns) next to it.
cols = dict(base_cols=base_cols, day_cols=day_cols)
with open(f"{output_folder}/cols.pkl", "wb") as cols_file:
    pickle.dump(cols, cols_file)