# Detailed CA data

This notebook pulls in detailed data for Canada from the [ViriHealth](https://virihealth.com) website.

This notebook requires `lxml` and `html5lib`.

In [None]:
import datetime
import io

import pandas as pd
import pycountry
import pytz
import requests
from bs4 import BeautifulSoup

In [None]:
# papermill parameters
# Folder the exported CSV is written to. The export cell appends the file name
# by plain string concatenation, so the value must end with a path separator.
output_folder = "../output/"

In [None]:
# Download the ViriHealth landing page and parse it so the tables can be
# extracted by the cells below.
url = "https://virihealth.com"
# A timeout keeps the notebook from hanging indefinitely if the site does not respond.
html = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page and
# hitting a confusing IndexError when the tables are looked up later.
html.raise_for_status()
soup = BeautifulSoup(html.content, 'html5lib')

## National level data

In [None]:
# The first <table> on the page is parsed as the national summary.
summary_table = soup.find_all("table")[0]
# Wrap the markup in StringIO: pandas 2.1+ deprecates passing a literal HTML
# string to read_html, while a file-like object works on every version.
summary_df = pd.read_html(io.StringIO(str(summary_table)), header=None)[0]

In [None]:
# Capture the second column label before the columns are renamed; presumably it
# holds the site's "last updated" text — TODO confirm against the live table.
last_update = summary_df.columns[1]

In [None]:
# Give the two summary columns stable, descriptive names.
summary_df.columns = ["indicator", "value"]

In [None]:
# Show the national summary table as this cell's output.
summary_df

## Province level data

In [None]:
# The second <table> on the page is parsed as the per-province breakdown.
province_level_data = soup.find_all("table")[1]
# Wrap the markup in StringIO: pandas 2.1+ deprecates passing a literal HTML
# string to read_html, while a file-like object works on every version.
province_level_df = pd.read_html(io.StringIO(str(province_level_data)), header=None)[0]
# Assigning exactly four names also acts as a shape check: pandas raises if the
# scraped table does not have four columns.
province_level_df.columns = ["Province/State", "Cases", "Deaths", "Cases_per_million"]

### Replacements for ISO-3166-2 compliance

In [None]:
# Remove the row with the highest index label (presumably a totals row —
# TODO confirm). Note that re-running this cell drops one more row each time.
last_row_label = province_level_df.index.max()
province_level_df.drop(index=last_row_label, inplace=True)

In [None]:
# Map the short names used on the site to the full subdivision names that the
# pycountry lookup expects. The replacement matches whole cell values anywhere
# in the frame.
# NOTE(review): recent ISO 3166-2 data may list CA-YT as plain "Yukon"; confirm
# the installed pycountry still uses "Yukon Territory", otherwise this rename
# leaves Yukon without a subdivision code.
province_name_fixes = {
    "Newfoundland": "Newfoundland and Labrador",
    "PEI": "Prince Edward Island",
    "Yukon": "Yukon Territory",
}
province_level_df.replace(province_name_fixes, inplace=True)

In [None]:
# Lookup table from Canadian subdivision name to its code with the "CA-"
# country prefix removed.
subdivisions = {
    subdivision.name: subdivision.code.replace("CA-", "")
    for subdivision in pycountry.subdivisions.get(country_code="CA")
}

In [None]:
# Constant country-level columns shared by every row.
province_level_df["State/Region"] = "Canada"
province_level_df["ISO3166_1"] = "CA"
# Look up each province's subdivision code; names missing from the table get
# an empty string. Mapping the single column avoids a row-wise apply over the
# whole frame and still yields a Series when the frame has no rows.
province_level_df["ISO3166_2"] = province_level_df["Province/State"].map(
    lambda name: subdivisions.get(name, "")
)

### Add time stamp for last update

In [None]:
# update_timestamp = datetime.datetime.strptime(last_update, "%B %d, %Y %I:%M%p %Z").astimezone(pytz.UTC)

In [None]:
# province_level_df["Last_Update_Date"] = update_timestamp

# Export to CSV

In [None]:
# Write the province-level table to VH_CAN_DETAILED.csv in the output folder,
# keeping only the listed columns (in this order) and omitting the index.
export_columns = [
    "State/Region",
    "Province/State",
    "Cases",
    "Deaths",
    "Cases_per_million",
    "ISO3166_1",
    "ISO3166_2",
]
province_level_df.to_csv(
    path_or_buf=output_folder + "VH_CAN_DETAILED.csv",
    columns=export_columns,
    header=True,
    index=False,
)