# This notebook scrapes US travel restrictions and travel advice. 
Bugra Ozkan (bugraozkan@live.com) contributed to this codebase by scraping the US categories and summaries from each country's web page; those parts have been re-implemented in Python so that they can be included in this codebase.

In [None]:
import pandas as pd
import numpy as np
from copy import deepcopy
import json
import matplotlib.pyplot as plt
import datetime
import shutil
import os
import re

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import *
import keras
import pickle
import seaborn as sns

import requests
from google.cloud import storage
from bs4 import BeautifulSoup

In [None]:
# The service-account JSON path is read from GOOGLE_APPLICATION_CREDENTIALS_PATH;
# the lookup raises KeyError when that environment variable is not set.
creds_path = os.environ['GOOGLE_APPLICATION_CREDENTIALS_PATH']
gcs_client = storage.Client.from_service_account_json(
    creds_path,
)
# Bucket handle used at the end of the notebook to upload the response file.
bucket = gcs_client.get_bucket(
    'covid-19-forecaster-data',
)

In [None]:
# CDC page with the COVID-19 travel notices; this is the page that is
# downloaded with requests.get and parsed with BeautifulSoup further down.
travel_advice_url = "https://www.cdc.gov/coronavirus/2019-ncov/travelers/map-and-travel-notices.html"

In [None]:
# Params
# Start date in M/D/YY form. Make sure it's a Monday, and that the day before
# exists in the data.
begin_date_str = '1/27/20'
begin_date = datetime.datetime.strptime(begin_date_str, "%m/%d/%y")

today = datetime.date.today()
# Today's date in the same un-padded M/D/YY form (year given as an offset from 2000).
today_date = '/'.join(
    str(part) for part in (today.month, today.day, today.year - 2000)
)
today_date

In [None]:
# Create output folder
# date.weekday() is 0 on Monday, so subtracting it from today gives the Monday
# of the current week; the week identifier spans Monday..Sunday.
week_begin = today - datetime.timedelta(today.weekday())
week_end = week_begin + datetime.timedelta(days=6)
cur_week_name = f'{week_begin.month}-{week_begin.day}-{week_begin.year-2000}-to-{week_end.month}-{week_end.day}-{week_end.year-2000}'
output_folder = f'./preds/weekly/{cur_week_name}'
# Actually create the folder: response-US.json is later opened for writing
# inside it, which fails with FileNotFoundError if the directory is missing.
# exist_ok=True keeps re-runs within the same week harmless.
os.makedirs(output_folder, exist_ok=True)
print(f'Current identifier: *{cur_week_name}*')

In [None]:
# Load this week's inputs from ./output/weekly/<week>/ : the main table plus
# three pickled objects (config, all predictions, risk predictions).
df_final_path = f'./output/weekly/{cur_week_name}/df_final.csv'
config_path = f"./output/weekly/{cur_week_name}/config.pkl"
all_preds_path = f"./output/weekly/{cur_week_name}/all_preds.pkl"
risk_preds_path = f"./output/weekly/{cur_week_name}/risk_preds.pkl"

# The first CSV column is used as the index.
df_final = pd.read_csv(df_final_path, index_col=0)

# NOTE(review): pickle.load executes arbitrary code from the file; these files
# are assumed to be produced locally by this project - confirm.
with open(config_path, "rb") as config_file:
    config = pickle.load(config_file)
with open(all_preds_path, "rb") as all_preds_file:
    pred_weeks_data = pickle.load(all_preds_file)
with open(risk_preds_path, "rb") as risk_preds_file:
    risk_preds = pickle.load(risk_preds_file)

# # Load US scrape
# with open('./us-covid-scraper/output.json') as json_file: 
#     us_data = json.load(json_file)

## Get top-down US COUNTRY-CCODE MAPPING

In [None]:
# Maps a country name as spelled in df_final (key) to the name it is looked up
# under in the scraped country list (value). Names without an entry are looked
# up unchanged.
country_name_mapping = {
    "Cabo Verde": "Cape Verde",
    "Congo, Rep.": "Congo, Republic of the",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    # NOTE(review): the trailing space is kept on purpose; presumably it
    # mirrors the scraped text - confirm.
    "Cote d'Ivoire": "Ivory Coast ",
    "Eswatini": "Eswatini (Swaziland)",
    "Korea, South": "South Korea",
    "Netherlands": "Netherlands, The",
    "Sao Tome and Principe": "São Tomé and Príncipe",
    # Two source names share one scraped entry.
    "Israel": "Israel, including the West Bank and Gaza",
    "West Bank and Gaza": "Israel, including the West Bank and Gaza",
}

In [None]:
# Header fields handed to requests.get when downloading the advice pages.
# NOTE(review): the Access-Control-* fields are CORS *response* headers;
# sending them in a request presumably has no effect - confirm before removing.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    # Browser-like user agent string.
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}

In [None]:
# Category numbers assigned to scraped countries.
classes = list(range(1, 4))
# For each category, the CSS class names of the <div> containers on the
# travel-notices page whose list items belong to that category.
div_classes = {
    1: ["ral3", "ral4"],
    2: ["ral2"],
    3: ["ral1", "ral5"],
}

In [None]:
# Download the travel-notices page and parse it.
# `headers` must be passed by keyword: the second positional parameter of
# requests.get() is `params` (query-string values), so passing the dict
# positionally never sent it as request headers.
# The timeout keeps the notebook from hanging forever on a stalled connection.
req = requests.get(travel_advice_url, headers=headers, timeout=10)
# Stop here on an HTTP error instead of parsing an error page into an empty
# country list that only fails further down.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

In [None]:
# Collect every country listed on the page, together with its category and the
# link to its own advice page.
# Result: {country name: {'category': 1|2|3, 'advice_url': href or "N/A"}}.
# A name that appears in several containers keeps the entry written last.
country_scraped_data = {}
for cls in classes:
    for div in div_classes[cls]:
        for container in soup.find_all('div', class_=div):
            for country_entry in container.find_all('li'):
                link = country_entry.a
                # Explicit check instead of a bare `except:` so that unrelated
                # errors (and KeyboardInterrupt) are no longer swallowed.
                if link is not None and link.has_attr('href'):
                    country_name = link.text
                    country_url = link['href']
                else:
                    # No usable link: fall back to the list item's own text.
                    country_name = country_entry.text
                    country_url = "N/A"
                country_scraped_data[country_name] = {
                    'category': cls,
                    'advice_url': country_url,
                }

In [None]:
# Attach the scraped entry to every country row of df_final, keyed by CCODE.
us_data = {}
# df_final country names for which no scraped entry was found.
invalid_countries = []
country_rows = df_final[df_final['is_country'] == 1]
for _, country_row in country_rows.iterrows():
    original_name = country_row['Country']
    # Use the mapped spelling when one is registered, otherwise the name as-is.
    lookup_name = country_name_mapping.get(original_name, original_name)
    if lookup_name not in country_scraped_data:
        invalid_countries.append(original_name)
        continue
    us_data[country_row['CCODE']] = country_scraped_data[lookup_name]

In [None]:
# Display the df_final country names that had no match in the scraped list
# (candidates for a new country_name_mapping entry).
invalid_countries

In [None]:
# First, get the list of countries which have specific travel advice.
# Downloads each country's advice page; `responses` maps CCODE -> HTTP response
# for every page that could be fetched.
responses = {}
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
for ccode, entry in us_data.items():
    new_url = entry['advice_url']
    if new_url == "N/A":
        continue
    try:
        # `headers` must be a keyword argument: positionally it would be
        # interpreted as `params` and end up in the query string.
        responses[ccode] = requests.get(new_url, headers=headers, timeout=4)
    except requests.exceptions.RequestException as err:
        # Still best-effort (the country simply gets no summary), but the
        # failure is now visible and unrelated exceptions are not swallowed.
        print(f'Could not fetch advice page for {ccode} ({new_url}): {err}')

In [None]:
# Get summary of the advice.
# For every country in us_data, set us_data[ccode]['summary'] to the text of
# all list items found inside the first <div class="card-body"> of its advice
# page, joined by single spaces. Countries without a fetched page, or whose
# page has no such div, get an empty summary.
summaries = {}
for ccode in us_data:
    us_data[ccode]['summary'] = ''
    if ccode not in responses:
        continue
    soup = BeautifulSoup(responses[ccode].content, 'html.parser')
    start = soup.find('div', class_='card-body')
    if start is None:
        # find() returns None when the page has no matching div; keep the
        # empty summary instead of failing with AttributeError.
        continue
    bullet_texts = [
        el.text.replace('\xa0', '')  # drop non-breaking spaces
        for lst in start.find_all('ul')
        for el in lst.find_all('li')
    ]
    summary = ' '.join(bullet_texts)
    # Restore the space that is lost in "illnessfrom" once the non-breaking
    # spaces have been removed.
    us_data[ccode]['summary'] = summary.replace('illnessfrom', 'illness from')
In [None]:
# Spot check: display the scraped entry stored under CCODE 'GBR'
# (raises KeyError if no entry was matched for that code).
us_data['GBR']

## RUN US SCRAPER

In [None]:
# Highest category number among the matched countries.
max_category = max(entry['category'] for entry in us_data.values())
max_category

## First, calculate restriction likelihood for future weeks
### Update: US does not seem to have a numbers-based way of classifying countries in tiers. As we will not be able to make risk predictions considering external factors, we will simply keep existing categories.

In [None]:
# Values of the week column located lookahead+1 positions from the end of
# config['week_names'] (presumably the last observed week - confirm).
week_column = config['week_names'][-(config['lookahead'] + 1)]
last_week = df_final[week_column].values

# cat_info[k] collects those values for countries whose category is k+1.
cat_info = {}
for cat_idx in range(max_category):
    cat_info[cat_idx] = []

for row_pos in range(len(df_final)):
    ccode = df_final.loc[row_pos, 'CCODE']
    # Only country rows that have scraped data contribute.
    if df_final.loc[row_pos, 'is_country'] and ccode in us_data:
        cat_idx = us_data[ccode]['category'] - 1
        cat_info[cat_idx].append(last_week[row_pos])
# print(cat_info)
# sns.distplot(cat_info[0], hist=True, norm_hist=True, kde=True)
# sns.distplot(cat_info[1], hist=True, norm_hist=True)
# Distribution of the values for category 3 (index 2).
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the
# installed version still provides it.
sns.distplot(cat_info[2], hist=True, norm_hist=True, kde=True, bins=50)

# US Special: Repeat current categories with low confidence
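We cannot predict categories from the data we have, so the current category of each country is repeated for every predicted week with a confidence of 0.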

In [None]:
cur_cat_info = {}
# One row per df_final row; every row starts with category 2, and country rows
# with scraped data get their scraped category shifted to be zero-based.
df_cat = df_final[['CCODE']].copy(deep=True)
df_cat['category'] = 2
for row_pos in range(len(df_final)):
    ccode = df_final.loc[row_pos, 'CCODE']
    if df_final.loc[row_pos, 'is_country'] and ccode in us_data:
        df_cat.loc[row_pos, 'category'] = us_data[ccode]['category'] - 1

In [None]:
# Calculate category-based predictions and confidence values

# The current category is repeated once per predicted week, giving an array
# with one row per df_cat row and config['lookahead'] columns.
current_categories = df_cat['category'].values
category_preds = np.stack(
    [current_categories for _ in range(config['lookahead'])],
    axis=1,
)

# Confidence has the same shape and dtype as the predictions and is 0 everywhere.
category_conf = np.zeros_like(category_preds)

## Add these extra bits of info to a dataframe.

In [None]:
# Per-row advice table. Defaults: category 2, no URL, summary 'N/A'.
df_advice = df_final[['CCODE', 'Country', 'is_country']].copy(deep=True)
df_advice['category'] = 2
df_advice['advice_url'] = "N/A"
df_advice['summary'] = 'N/A'

for row_label, advice_row in df_advice.iterrows():
    ccode = advice_row['CCODE']
    if ccode not in us_data:
        # No scraped data for this CCODE: only the summary text is adjusted.
        if advice_row['Country'] != 'United States':
            fallback_summary = 'Travel advice not available.'
        elif advice_row['is_country']:
            # The United States country row gets an empty summary.
            fallback_summary = ''
        else:
            # Remaining United States rows (non-country entries).
            fallback_summary = 'State-level travel advice not available.' 
        df_advice.loc[row_label, 'summary'] = fallback_summary
        continue

    # Get travel advice for countries and regions
    country_data = us_data[ccode]
    df_advice.loc[row_label, 'category'] = country_data['category'] - 1
    df_advice.loc[row_label, 'advice_url'] = country_data['advice_url']
    df_advice.loc[row_label, 'summary'] = country_data['summary']

df_advice.head(3)

# Prepare response

In [None]:
# Build the JSON-serialisable response: one entry per df_final row.
# Country rows are keyed by CCODE and carry category/advice information;
# all other rows are keyed by "<Name> <CCODE>" and carry location information.
response = {}
fine_response = {}
for row_pos in range(len(df_final)):
    # Convert to built-in ints so json can serialise the values.
    risk_vals = list(map(int, risk_preds[row_pos, :]))
    vals = list(map(int, category_preds[row_pos, :]))
    conf = list(map(int, category_conf[row_pos, :]))

    if df_final.loc[row_pos, 'is_country']:
        entry_key = df_final.loc[row_pos, 'CCODE']
        country_response = {
            "risk_preds": risk_vals,
            "category": int(df_advice.loc[row_pos, "category"]),
            "category_preds": vals,
            "confidence_preds": conf,
            "summary": df_advice.loc[row_pos, "summary"],
            "advice_url": df_advice.loc[row_pos, "advice_url"],
        }
    else:
        country_response = {
            "risk_preds": risk_vals,
            'lat': df_final.loc[row_pos, "Lat"],
            'lon': df_final.loc[row_pos, "Lon"],
            'country': df_final.loc[row_pos, "Country"],
            'ccode': df_final.loc[row_pos, 'CCODE'],
            'graph_name': df_final.loc[row_pos, 'graph_name'],
        }
        entry_key = df_final.loc[row_pos, 'Name'] + ' ' + df_final.loc[row_pos, 'CCODE']

    response[entry_key] = country_response

In [None]:
# Write the response as tab-indented JSON into this week's output folder.
response_path = f'{output_folder}/response-US.json'
with open(response_path, 'w') as out_file:
    json.dump(response, out_file, indent='\t')

In [None]:
# Upload to the cloud
# The object name in the bucket mirrors the local path (without the leading "./").
remote_name = f"preds/weekly/{cur_week_name}/response-US.json"
local_name = f'{output_folder}/response-US.json'
blob = bucket.blob(remote_name)
blob.upload_from_filename(local_name)

In [None]:
# Country names that have at least one row whose CCODE is the string '-1'.
c1 = set(df_final.loc[df_final['CCODE'] == '-1', 'Country'])

In [None]:
# Country names that have at least one row whose CCODE is not the string '-1'.
c2 = set(df_final.loc[df_final['CCODE'] != '-1', 'Country'])

In [None]:
# Make sure this is empty: an empty set means every Country that has a row
# with CCODE '-1' also has at least one row with a different CCODE.
c1 - c2

In [None]:
# Display the full response dict that was written to response-US.json.
response