In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import datetime
import shutil
import os
import re

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
import keras
import pickle
from scipy import stats

import requests
from google.cloud import storage
from bs4 import BeautifulSoup

In [None]:
# Let pandas show up to 500 rows when a frame or series is displayed.
pd.set_option('display.max_rows', 500)

In [None]:
# Make sure the GOOGLE_APPLICATION_CREDENTIALS_PATH environment variable is set!
# It must hold the path of a service-account JSON key file; indexing os.environ
# raises KeyError when the variable is missing.
creds_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS_PATH']
gcs_client = storage.Client.from_service_account_json(creds_path)
# Bucket handle; the response file is uploaded through it at the end of the notebook.
bucket = gcs_client.get_bucket('covid-19-forecaster-data')

In [None]:
# Params
begin_date_str = '1/27/20' # Make sure it's a Monday, and that the day before exists in the data
begin_date = datetime.datetime.strptime(begin_date_str, "%m/%d/%y")

# Today's date written as month/day/year without zero padding, the year being year - 2000.
today = datetime.date.today()
today_date = f'{today.month}/{today.day}/{today.year-2000}'
today_date

In [None]:
# Hard-coded paths and values
# Base URL of the per-country advice pages; a country slug is appended to it when fetching.
travel_advice_pre_url = 'https://www.gov.uk/foreign-travel-advice'
# Page that is scraped for the travel-corridor list.
travel_advice_url = "https://www.gov.uk/guidance/coronavirus-covid-19-travel-corridors"

In [None]:
# Create output folder
# The current week runs from Monday (date.weekday() == 0) to the following Sunday.
week_begin = today - datetime.timedelta(today.weekday())
week_end = week_begin + datetime.timedelta(days=6)
cur_week_name = f'{week_begin.month}-{week_begin.day}-{week_begin.year-2000}-to-{week_end.month}-{week_end.day}-{week_end.year-2000}'
output_folder = f'./preds/weekly/{cur_week_name}'
# Actually create the folder (and any missing parents): the response file is
# written into it later and open(..., 'w') does not create directories.
os.makedirs(output_folder, exist_ok=True)
print(f'Current identifier: *{cur_week_name}*')

In [None]:
# Helper functions
def moving_average(x, w):
    """Return the simple moving average of ``x`` over a window of ``w`` samples.

    Only positions where the window fully overlaps the input are kept
    ('valid' convolution), so for ``w <= len(x)`` the result has
    ``len(x) - w + 1`` entries.
    """
    window = np.ones(w)
    window_sums = np.convolve(x, window, mode='valid')
    return window_sums / w

# Read existing data

In [None]:
# All inputs for this week are read from a single per-week folder.
weekly_output_dir = f'./output/weekly/{cur_week_name}'

df_final = pd.read_csv(f'{weekly_output_dir}/df_final.csv', index_col=0)

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles from a trusted source.
with open(f"{weekly_output_dir}/config.pkl", "rb") as config_file:
    config = pickle.load(config_file)
with open(f"{weekly_output_dir}/all_preds.pkl", "rb") as preds_file:
    pred_weeks_data = pickle.load(preds_file)
with open(f"{weekly_output_dir}/risk_preds.pkl", "rb") as risk_file:
    risk_preds = pickle.load(risk_file)

# First, calculate restriction likelihood for last week, along with future weeks

In [None]:
# The last `lookahead` week names are the predicted weeks; the name just
# before them is what the rest of the notebook treats as "last week".
lookahead = config['lookahead']
week_names = config['week_names']
pred_weeks = week_names[-lookahead:]
last_week_name = week_names[-(lookahead + 1)]

In [None]:
# Show the week name that sits just before the predicted weeks.
last_week_name

In [None]:
# Shape of the loaded predictions; the third axis (axis=2) is the one the mode is taken over below.
pred_weeks_data.shape

In [None]:
# UK-based limit is 20 in 100k, we don't need to learn it
# Each entry is a [lower, upper) interval; its position in the list is the category id.
category_limits = [[-np.inf, 0.00020], [0.00020, np.inf]]
# [lower, upper) intervals applied to the count of scenarios that agree with the modal category.
# NOTE(review): the 95/100 bounds presumably assume 100 scenarios per prediction - confirm.
confidence_limits = [[0, 95], [95, 100], [100, np.inf]]

In [None]:
# Calculate category-based predictions and confidence values

def _bucketize(values, limits):
    """Return a copy of ``values`` with each entry replaced by the index of the
    [lower, upper) interval in ``limits`` that contains it.

    Entries that fall in no interval (e.g. NaN) are left unchanged.
    """
    bucketed = values.copy()
    for bucket_id, (lower, upper) in enumerate(limits):
        # The mask is built from the untouched input, so earlier assignments
        # cannot influence later ones.
        bucketed[(values >= lower) & (values < upper)] = bucket_id
    return bucketed


# Start with last week
last_week_data = df_final[last_week_name].values
last_week_category_preds = _bucketize(last_week_data, category_limits)

# Get category prediction for every predicted scenario
category_counts = _bucketize(pred_weeks_data, category_limits)

# Get singular predictions and confidence: the most frequent category over the
# scenario axis, together with how many scenarios agree with it.
mode = stats.mode(category_counts, axis=2)

# Depending on the SciPy version, stats.mode either keeps the reduced axis with
# length 1 or drops it. Reshape explicitly to (rows, weeks) instead of calling
# squeeze(), which would also drop a genuine length-1 row or week axis and
# break the 2-D indexing done when the response is assembled.
n_rows, n_weeks = category_counts.shape[:2]
category_preds = np.asarray(mode.mode).reshape(n_rows, n_weeks)

# Calculate confidence
category_probs = np.asarray(mode.count).reshape(n_rows, n_weeks)
category_conf = _bucketize(category_probs, confidence_limits)

In [None]:
# Spot check of the category computed for the row at position 84.
# NOTE(review): presumably this is the Japan row inspected in the following cell - confirm.
last_week_category_preds[84]

In [None]:
# Raw last-week value(s) for the rows whose Country is Japan.
df_final.loc[df_final['Country'] == 'Japan', last_week_name]

In [None]:
# Tabulate how many predictions fall into each confidence bucket
# (one row per bucket: bucket id, number of occurrences).
unique, counts = np.unique(category_conf, return_counts=True)
bucket_table = np.column_stack((unique, counts)).astype(np.int32)
print(bucket_table)

In [None]:
# Largest predicted value over all entries, as a sanity check.
pred_weeks_data.max()

In [None]:
# Shape of the per-row, per-week category predictions.
category_preds.shape

# Get current UK-based travel advice

## Get URLs

In [None]:
# Maps country names as they appear in df_final['Country'] to the names used
# for the gov.uk advice URL slug and for matching against the corridor list.
# Countries that are not listed here are used under their own name.
travel_country_name_mapping = {
    'Cabo Verde': 'Cape Verde',
    'Bahamas, The': 'Bahamas',
    'Congo, Rep.': 'Congo',
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    "Cote d'Ivoire": 'Cote d Ivoire',
    'Korea, South': 'South Korea',
    'Saint Kitts and Nevis': 'St Kitts and Nevis',
    'Saint Lucia': 'St Lucia',
    'Saint Vincent and the Grenadines': 'St Vincent and the Grenadines',
    'United States': 'USA',
}

In [None]:
# Preview the first rows of the loaded frame.
df_final.head(5)

In [None]:
# First, get the list of countries which have specific travel advice.
country_lst = df_final['Country'].drop_duplicates()
responses = {}    # country -> HTTP response of its advice page
advice_urls = {}  # country -> URL that was requested

# Request headers. NOTE(review): the Access-Control-* entries are normally
# response headers; they are kept unchanged here.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
for country in country_lst:
    # The URL slug is the lower-cased name with dashes instead of spaces;
    # some countries go by a different name on the site.
    site_name = travel_country_name_mapping.get(country, country)
    formatted_name = site_name.lower().replace(' ', '-')
    new_url = f'{travel_advice_pre_url}/{formatted_name}'
    # The dict must be passed as `headers=`: the second positional argument of
    # requests.get is `params`, which would send it as a query string instead.
    # The timeout keeps one unresponsive request from hanging the whole loop.
    req = requests.get(new_url, headers=headers, timeout=30)
    responses[country] = req
    advice_urls[country] = new_url

In [None]:
# Unique country names that were requested.
country_lst

In [None]:
# Get summary of the advice.
# update_dates and early_break are not read anywhere in the visible notebook;
# they are kept so that the set of notebook globals stays the same.
update_dates = {}
summaries = {}           # country -> summary text joined into one string
summary_htmls = {}       # country -> HTML source of the elements that were kept
countries_no_advice = [] # countries whose summary section yielded no text
# Paragraphs / bullets containing any of these snippets are dropped.
removed_str = ['stay up to date', 'Find out how to return to', 'See ', 'Sign up ', 'Preparing for your return journey']
early_break = False
for country in country_lst:
    req = responses[country]
    soup = BeautifulSoup(req.content, 'html.parser')
    start = soup.find('h1', class_='part-title', text=re.compile('Summary'))
    if start is None:
        # No "Summary" heading on the page: the country gets no entry at all.
        continue
    el = start.find_next_sibling("div", class_='gem-c-govspeak govuk-govspeak direction-ltr')
    div_stop = False
    message_started = False
    message = []
    elements = []
    # A heading without the expected body div is handled like an empty
    # summary instead of failing on `None.find_all()`.
    fields = el.find_all() if el is not None else []
    for field in fields:
        # A nested div after some text has been collected ends the summary.
        if (field.name == 'div') and (len(message) > 0):
            break
        if field.name not in ['p', 'li']:
            continue
        text = field.text.replace('\n', '')
        if field.name == 'li':
            text = '-' + text
        if not text:
            # An empty paragraph carries no information, and an empty string
            # would break the punctuation check (x[-1]) further down.
            continue
        # Skip the bare 'FCDO' label and immediate repetitions of the previous entry.
        if (text == 'FCDO') or ((len(message) > 0) and (message[-1] == text)):
            continue
        if any(snippet in text for snippet in removed_str):
            continue
        # Collection only starts at the first kept paragraph; bullets that
        # come before it are ignored.
        if field.name == 'p':
            message_started = True
        if message_started:
            message.append(text)
            elements.append(str(field))
    # Terminate every entry with punctuation plus a space before joining.
    message = [(x + ". ") if x[-1] not in ':.,;' else (x + ' ') for x in message]
    if len(message) == 0:
        countries_no_advice.append(country)
    message = "".join(message)
    summaries[country] = message
    summary_htmls[country] = elements


In [None]:
# Countries whose summary section produced no text.
countries_no_advice

In [None]:
# Inspect the extracted summary text per country.
summaries

In [None]:
# Check which country names are valid
# A name is reported as invalid when its advice page did not answer with HTTP 200.
invalid = [
    country_name
    for country_name in country_lst
    if responses[country_name].status_code != 200
]
invalid

## Get list of exempted countries

In [None]:
# Fetch and parse the travel-corridor page.
# The dict must be passed as `headers=`: the second positional argument of
# requests.get is `params`, which would send it as a query string instead.
# The timeout keeps an unresponsive server from hanging the notebook.
req = requests.get(travel_advice_url, headers=headers, timeout=30)
soup = BeautifulSoup(req.content, 'html.parser')

In [None]:
# Collect the lower-cased link texts found in the lists that follow the
# travel-corridor heading.
fo_listed_countries = []
# Link texts containing any of these words are skipped
# (presumably guidance links rather than country names - confirm).
exempt_words = {'coronavirus', 'travel', 'guidance', 'advice', 'england', 'scotland', 'wales', 'northern ireland'}
start = soup.find('h2', id='countries-territories-and-regions-on-the-travel-corridor-list')
if start is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError(
        'Travel-corridor heading not found on the guidance page; '
        'the page layout may have changed or the request may have failed.'
    )
for tr in start.find_next_siblings("ul"):
    # Get lists of countries
    for td in tr.find_all("li"):
        for tl in td.find_all('a', class_="govuk-link"):
            name = tl.get_text().lower()
            if any(word in name for word in exempt_words):
                continue
            fo_listed_countries.append(name)
fo_listed_countries = set(fo_listed_countries)

In [None]:
# Try and reconcile the two country lists
# A country is allowed when its site name, or the same name written with
# '&' instead of ' and', appears in the corridor list (compared in lower case).
allowed_countries = []
for country in country_lst:
    site_name = travel_country_name_mapping.get(country, country)
    candidates = (
        site_name.lower(),
        site_name.replace(' and', ' &').lower(),
    )
    if any(candidate in fo_listed_countries for candidate in candidates):
        allowed_countries.append(country)

In [None]:
# Number of unique countries.
country_lst.shape

## Check the following list to ensure its integrity.

In [None]:
# Countries that were matched against the corridor list.
allowed_countries

## Check the following list and make sure we are not missing any important countries

In [None]:
# Corridor-list names that could not be matched to any country in our data.
matched_site_names = {
    travel_country_name_mapping.get(country_name, country_name).lower()
    for country_name in allowed_countries
}
fo_listed_countries - matched_site_names

## Add these extra bits of info to a dataframe.

In [None]:
# Map every country to last week's category, read from its country-level row
# (the row label is used as a position in last_week_category_preds).
country_last_week_category_preds = {
    country_name: last_week_category_preds[row_label]
    for row_label, is_country, country_name in zip(
        df_final.index, df_final['is_country'], df_final['Country']
    )
    if is_country
}

In [None]:
# Build the advice frame: one row per df_final row, keyed by country name.
df_advice = deepcopy(df_final[['Country']])
countries = df_advice['Country']

# category: 0 when the country is on the corridor list, 1 otherwise.
df_advice['category'] = (~countries.isin(allowed_countries)).astype(int)
# URL and summary default to an empty string when nothing was collected.
df_advice['advice_url'] = countries.map(advice_urls).fillna("")
df_advice['summary'] = countries.map(summaries).fillna('')
# 1 when last week's category for the country is above 0. Countries without a
# country-level row have no entry in the lookup; they get 0 instead of raising
# KeyError (only country-level rows are used in the consistency check).
last_week_category = countries.map(country_last_week_category_preds)
df_advice['last_week_predicted_category'] = (last_week_category > 0).astype(int)
# Filled in by the consistency check that follows.
df_advice['is_consistent'] = None
df_advice.tail(3)

## Compare last week's category prediction to the current category.
If they do not match, we should not trust our predictions for that country. Let's take note of this.

In [None]:
# Only country-level rows take part in the consistency check.
df_tmp = df_advice[df_final['is_country'] == 1]
# A country is consistent when it is currently in category 0, or when its
# current category equals the category predicted for last week.
currently_zero = df_tmp['category'] == 0
matches_prediction = df_tmp['category'] == df_tmp['last_week_predicted_category']
consistent_countries = list(df_tmp[currently_zero | matches_prediction]['Country'].values)
# Flag every row (country-level or not) by its country name.
for row_label, country_name in df_advice['Country'].items():
    df_advice.loc[row_label, 'is_consistent'] = 1 if country_name in consistent_countries else 0
        

# Prepare response

In [None]:
# Preview the first rows before assembling the response.
df_final.head(3)

In [None]:
# Shape check: the response loop indexes category_preds as [row, :].
category_preds.shape

In [None]:
# Display the frame (pandas truncates the output according to display.max_rows).
df_final

In [None]:
# Assemble the response: one entry per df_final row, addressed by position.
response = {}
fine_response = {}
for itr in range(len(df_final)):
    # Cast everything to plain ints before it is put into the response.
    risk_vals = list(map(int, risk_preds[itr, :]))
    vals = list(map(int, category_preds[itr, :]))
    conf = list(map(int, category_conf[itr, :]))
    country_response = {"risk_preds": risk_vals}
    ccode = df_final.loc[itr, 'CCODE']

    if df_final.loc[itr, 'is_country'] == 0:
        # Non-country rows only carry location info and are keyed by
        # "<Name> <CCODE>".
        country_response.update({
            'lat': df_final.loc[itr, "Lat"],
            'lon': df_final.loc[itr, "Lon"],
            'ccode': ccode,
            'country': df_final.loc[itr, 'Country'],
            'graph_name': df_final.loc[itr, 'graph_name'],
        })
        response[df_final.loc[itr, 'Name'] + ' ' + ccode] = country_response
        continue

    # Country rows are keyed by CCODE and carry the advice information.
    current_category = int(df_advice.loc[itr, "category"]) * 2 # (2*)Necessary to align UK and US
    summary_text = df_advice.loc[itr, "summary"]
    country_response["category"] = current_category
    if df_advice.loc[itr, 'is_consistent']:
        country_response["summary"] = summary_text
        country_response["category_preds"] = [2 * x for x in vals] # (2*)Necessary to align UK and US
    else:
        # Inconsistent countries: repeat the current category for every
        # predicted week and prefix the summary with a notice.
        country_response["summary"] = '[CTP: Non-Covid factors may be at play. Check link for details.] ' + summary_text
        country_response["category_preds"] = [current_category] * len(vals)
    country_response["confidence_preds"] = conf
    country_response["advice_url"] = 'N/A' if ccode == 'GBR' else df_advice.loc[itr, "advice_url"]
    response[ccode] = country_response

In [None]:
# Write the response as tab-indented JSON into this week's output folder.
response_path = f'{output_folder}/response-UK.json'
with open(response_path, 'w') as out_file:
    json.dump(response, out_file, indent='\t')

In [None]:
# Upload to the cloud
# The object name matches the local path below ./ so the bucket mirrors the local layout.
blob = bucket.blob(f"preds/weekly/{cur_week_name}/response-UK.json")
blob.upload_from_filename(f'{output_folder}/response-UK.json')

In [None]:
# Countries that have at least one row whose CCODE is the string '-1'.
c1 = set(df_final[df_final['CCODE'] == '-1']['Country'])

In [None]:
# Countries that have at least one row with any other CCODE.
c2 = set(df_final[df_final['CCODE'] != '-1']['Country'])

In [None]:
# Make sure this is empty: a non-empty result lists countries whose rows all have CCODE '-1'.
c1 - c2

In [None]:
# Final inspection of the response that was written and uploaded.
response