In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import datetime
import shutil
import os
import re

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
import keras
import pickle

import requests
from bs4 import BeautifulSoup

In [None]:
# Raise pandas' display limits so frames with many columns/rows are not elided.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

In [None]:
# Input locations (hard-coded, relative to the notebook's working directory).
# population_csv = './data/population/data/population.csv'
_csse_root = './data/COVID-19/csse_covid_19_data'  # prefix shared by both CSSE inputs
daily_reports_path = f'{_csse_root}/csse_covid_19_daily_reports'
us_df_path = f'{_csse_root}/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
us_meta = './US_state_pop_coords.csv'

In [None]:
# Read the US confirmed-cases time series, dropping its own coordinate columns.
df_us = (
    pd.read_csv(us_df_path)
    .rename(columns={'Province_State': 'State'})
    .drop(columns=['Lat', 'Long_'])
)
# Read the per-state metadata and attach it to every row of the matching state;
# the join column is then exposed under the name 'Region'.
df_us_meta = pd.read_csv(us_meta)
df_us = (
    df_us
    .merge(df_us_meta, on='State')
    .rename(columns={'State': 'Region'})
)

In [None]:
# Preview the first three rows of the US frame.
df_us.head(3)

In [None]:
# Important dates
# begin_date_str must be a Monday, and the day before it must exist in the data.
begin_date_str = '1/27/20'
begin_date = datetime.datetime.strptime(begin_date_str, "%m/%d/%y").date()
print(f'Begin date : {begin_date_str}')

# Today's date in un-padded month/day/two-digit-year form.
today = datetime.date.today()
today_date = '/'.join(str(part) for part in (today.month, today.day, today.year - 2000))
print(f'Date today : {today_date}')

# Step back to the most recent Sunday strictly before today
# (date.weekday() is 0 for Monday, so weekday() + 1 days ago is a Sunday).
last_sunday = today - datetime.timedelta(days=today.weekday() + 1)
end_date = '/'.join(
    str(part) for part in (last_sunday.month, last_sunday.day, last_sunday.year - 2000)
)
print(f'End of last week : {end_date}')

In [None]:
# Create output folder
# The folder is named after the current Monday-to-Sunday week
# (date.weekday() is 0 for Monday, so subtracting it lands on this week's Monday).
week_begin = today - datetime.timedelta(today.weekday())
week_end = week_begin + datetime.timedelta(days=6)
cur_week_name = f'{week_begin.month}-{week_begin.day}-{week_begin.year-2000}-to-{week_end.month}-{week_end.day}-{week_end.year-2000}'
output_folder = f'./output/weekly/{cur_week_name}'
# Actually create the directory (including missing parents) so that files can be
# written into it later; exist_ok=True keeps re-running this cell harmless.
os.makedirs(output_folder, exist_ok=True)
print(f'Current identifier: *{cur_week_name}*')

In [None]:
# Walk forward one day at a time from begin_date, recording which dates are
# available and merging each daily report's incidence rate into one wide frame
# (one column per date).
cur_date = begin_date
df = None
daily_cols = []     # dates whose daily report was merged into `df`
us_daily_cols = []  # dates that have both a daily report file and a df_us column
coords = {}
prev_df_daily = None

# Columns a daily report must provide to be usable.
required_report_cols = ('Combined_Key', 'Lat', 'Long_', 'Incidence_Rate')
# Columns identifying a row; their order here is the order used as merge keys.
region_key_cols = ['Province_State', 'Country_Region', 'Lat', 'Lon', 'Combined_Key']

while cur_date < today:
    cur_date_str = cur_date.strftime('%m-%d-%Y')
    # Strip the zero padding so the string has the same form as begin_date_str
    # (e.g. '1/27/20'); it is looked up among the df_us column names below.
    cur_date_str_no_padding = cur_date.strftime('%m/%d/%y').lstrip("0").replace("/0", "/")
    # Advance first so that the `continue` below cannot stall the loop.
    cur_date += datetime.timedelta(1)
    print(cur_date_str_no_padding)
    daily_report = f'{daily_reports_path}/{cur_date_str}.csv'

    # Stop at the first date missing from either source.
    report_missing = not os.path.exists(daily_report)
    if report_missing or cur_date_str_no_padding not in df_us.columns:
        break
    us_daily_cols.append(cur_date_str_no_padding)

    df_daily_org = pd.read_csv(daily_report)
    if any(col not in df_daily_org for col in required_report_cols):
        # This report lacks a required column: the date stays in us_daily_cols
        # but contributes nothing to `df`.
        continue

    # Round coordinates to 3 decimals before they are used as merge keys.
    df_daily_org['Lat'] = df_daily_org['Lat'].round(3)
    df_daily_org['Lon'] = df_daily_org['Long_'].round(3)

    # Save: keep the identifying columns and store the rate under the date's name.
    df_daily = df_daily_org[
        ['Province_State', 'Country_Region', 'Combined_Key', 'Lat', 'Lon', 'Incidence_Rate']
    ].rename(columns={'Incidence_Rate': cur_date_str_no_padding})
    daily_cols.append(cur_date_str_no_padding)
    df = deepcopy(df_daily) if df is None else df.merge(df_daily, on=region_key_cols, how='outer')
# week_begin = today - datetime.timedelta(today.weekday())

In [None]:
# Clean the dataset
last_weeks_valid = 10  # number of most recent weeks that must be fully populated
df_clean = deepcopy(df)
print(len(df_clean))

# Drop rows that have no province/state name.
has_region_name = ~df_clean['Province_State'].isna()
df_clean = df_clean[has_region_name]
print('After None name cleaning:', len(df_clean))

# Drop rows with any missing value in the last 7 * last_weeks_valid daily columns.
recent_cols = daily_cols[-(7 * last_weeks_valid):]
has_recent_gap = np.isnan(df_clean[recent_cols].values).any(axis=1)
df_clean = df_clean[~has_recent_gap]
print('After None value cleaning:', len(df_clean))

# Drop every row whose country is 'US'.
is_not_us = df_clean['Country_Region'] != 'US'
df_clean = df_clean[is_not_us]
print('After US city cleaning:', len(df_clean))

df_clean = df_clean.rename(columns={'Province_State': 'Region', 'Country_Region': 'Country'})

# Calculate daily incidence rate, remove first date.
# Each date column becomes the change from the previous date's column, divided
# by 100000.0; the differences are taken from an untouched snapshot so earlier
# overwrites do not feed into later ones.
df_clean_org = deepcopy(df_clean)
for prev_col, cur_col in zip(daily_cols, daily_cols[1:]):
    df_clean[cur_col] = (df_clean_org[cur_col] - df_clean_org[prev_col]) / 100000.0
# The first date has no predecessor to difference against, so it is dropped.
df_clean = df_clean.drop(columns=[daily_cols[0]])

In [None]:
# Preview the first five rows of the cleaned frame.
df_clean.head(5)

In [None]:
# Sum the df_us rows of each Region into one row per Region.
# The date columns are selected *before* aggregating so that only they are
# summed; summing every column first also aggregates text and metadata columns,
# which is wasted work and can fail on pandas versions that no longer silently
# drop non-numeric columns.
state_cases = df_us.groupby('Region')[us_daily_cols].sum().reset_index()

# Per-state metadata with the Country / Combined_Key labels added
# (rename returns a new frame, so df_us_meta itself is left untouched).
df_us_meta_full = df_us_meta.rename(columns={'State': 'Region'})
df_us_meta_full['Country'] = 'United States'
df_us_meta_full['Combined_Key'] = df_us_meta_full['Region'] + ', ' + df_us_meta_full['Country']

state_df = df_us_meta_full.merge(state_cases, on='Region')
state_df['Lat'] = np.round(state_df['Lat'], 3)
state_df['Lon'] = np.round(state_df['Lon'], 3)

# Replace each date column with the change from the previous date divided by
# the state's Population. Differences are taken from an untouched snapshot so
# earlier overwrites do not feed into later ones.
state_df_org = deepcopy(state_df)
for prev_col, cur_col in zip(us_daily_cols, us_daily_cols[1:]):
    state_df[cur_col] = (state_df_org[cur_col] - state_df_org[prev_col]) / state_df['Population']
# The first date has no predecessor to difference against, so it is dropped
# together with the no-longer-needed Population column.
state_df = state_df.drop(columns=['Population', us_daily_cols[0]])
state_df.head(3)

In [None]:
# Display the Combined_Key column of the state metadata frame.
df_us_meta_full['Combined_Key']

In [None]:
# Combine the state_df rows with the df_clean rows via an outer merge on every
# column the two frames share.
# The key list follows state_df's column order instead of being derived from a
# set: iteration order of a set of strings can differ between interpreter runs,
# and pandas documents outer merges as sorting the result by the key columns in
# the order given, so a set-derived key list makes the row order irreproducible.
common_cols = [col for col in state_df.columns if col in df_clean.columns]
all_region_df = state_df.merge(df_clean, on=common_cols, how='outer')
all_region_df['CCODE'] = -1
all_region_df['is_country'] = 0
# Keep the identifying columns followed by every date column except the first.
all_region_df = all_region_df[['Region', 'Country', 'is_country', 'CCODE', 'Combined_Key', 'Lat', 'Lon'] + us_daily_cols[1:]]

In [None]:
# Expose the region column as 'Name' and discard the Combined_Key label.
all_region_df = (
    all_region_df
    .rename(columns={'Region': 'Name'})
    .drop(columns=['Combined_Key'])
)

In [None]:
# Replace every missing value in the combined frame with 0.0.
all_region_df = all_region_df.fillna(0.0)

In [None]:
# Preview the last three rows of the combined frame.
all_region_df.tail(3)

In [None]:
# Number of rows in the combined frame.
len(all_region_df)

In [None]:
# Number of rows per country in the combined frame.
all_region_df['Country'].value_counts()

In [None]:
# Write the combined frame as CSV inside the output folder.
all_region_csv = f'{output_folder}/all_region_df.csv'
all_region_df.to_csv(all_region_csv)