diff --git a/notebooks/WHO_DAILY_REPORT.ipynb b/notebooks/WHO_DAILY_REPORT.ipynb new file mode 100644 index 0000000..ba04718 --- /dev/null +++ b/notebooks/WHO_DAILY_REPORT.ipynb @@ -0,0 +1,222 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "unique-cookie", + "metadata": {}, + "source": [ + "## WHO Coronavirus disease (COVID-2019) 24 hour reports\n", + "\n", + "24 hour report from https://covid19.who.int/table\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abandoned-lender", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pycountry\n", + "import requests\n", + "import os\n", + "import re\n", + "import numpy\n", + "from datetime import datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "talented-mason", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# papermill parameters\n", + "output_folder = \"../output/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "instructional-guinea", + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://covid19.who.int/WHO-COVID-19-global-table-data.csv\"\n", + "df = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rocky-europe", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Date\"] = datetime.utcnow().strftime(\"%Y-%m-%d\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "musical-cholesterol", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Name\"] = df[\"Name\"].str.replace(\"\\[1\\]\", \"\")\n", + "df[\"Name\"] = df[\"Name\"].replace(r\"(.*)\\s+\\(.*\\)\", r\"\\1\", regex=True)\n", + "df[\"ISO3166_1\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "every-stamp", + "metadata": {}, + "outputs": [], + "source": [ + "countries = list(df[\"Name\"].unique())\n", + "for name in countries:\n", + " search_name = name\n", + " if name == \"Global\":\n", + " continue\n", + " elif name == \"The United Kingdom\":\n", + " search_name = \"United Kingdom\"\n", + " elif name == \"United States of America\":\n", + " search_name = \"United States\"\n", + " elif name == \"occupied Palestinian territory, including east Jerusalem\":\n", + " search_name = \"Jerusalem\"\n", + " elif name == \"Pitcairn Islands\":\n", + " search_name = \"Pitcairn\"\n", + " elif name == \"Côte d’Ivoire\":\n", + " search_name = \"Côte d'Ivoire\"\n", + " elif name == \"Democratic Republic of the Congo\":\n", + " search_name = \"Congo, The Democratic Republic of the\"\n", + " elif name == \"United States Virgin Islands\":\n", + " search_name = \"Virgin Islands, U.S.\"\n", + " \n", + " \n", + " try:\n", + " pyc = pycountry.countries.get(name=search_name)\n", + " \n", + " if pyc:\n", + " df[\"ISO3166_1\"].loc[name == df[\"Name\"]] = pyc.alpha_2\n", + " df[\"Name\"].loc[name == df[\"Name\"]] = pyc.name\n", + " continue\n", + " try:\n", + " pyc_list = pycountry.countries.search_fuzzy(search_name)\n", + " if len(pyc_list):\n", + "\n", + " df[\"ISO3166_1\"].loc[name == df[\"Name\"]] = pyc_list[0].alpha_2\n", + " df[\"Name\"].loc[name == df[\"Name\"]] = pyc_list[0].name\n", + " continue\n", + " except:\n", + " pass\n", + " pass\n", + " except LookupError:\n", + " try:\n", + " pyc_list = pycountry.countries.search_fuzzy(search_name)\n", + " if len(pyc_list):\n", + " df[\"ISO3166_1\"].loc[name == df[\"Name\"]] = pyc_list[0].alpha_2\n", + " df[\"Name\"].loc[name == df[\"Name\"]] = pyc_list[0].name\n", + " continue\n", + " except:\n", + " pass\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "hispanic-learning", + "metadata": {}, + "outputs": [], + "source": [ + "column_map = {\n", + " \"Name\": \"COUNTRY_REGION\",\n", + " \"Cases - cumulative total\": \"CASES_TOTAL\",\n", + " \"Cases - cumulative total per 100000 population\": \"CASES_TOTAL_PER_100000\",\n", + " \"Cases - newly reported in last 24 hours\": \"CASES\",\n", + " \"Deaths - cumulative total\": \"DEATHS_TOTAL\",\n", + " \"Deaths - cumulative total per 100000 population\": \"DEATHS_TOTAL_PER_100000\",\n", + " \"Deaths - newly reported in last 24 hours\": \"DEATHS\",\n", + " \"Transmission Classification\": \"TRANSMISSION_CLASSIFICATION\",\n", + " \"Date\": \"DATE\",\n", + " \"ISO3166_1\": \"ISO3166_1\"\n", + "}\n", + "df = df.rename(columns=column_map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baking-eclipse", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "phantom-mountain", + "metadata": {}, + "source": [ + "```sql\n", + "CREATE TABLE WHO_DAILY_REPORT (\n", + " COUNTRY_REGION varchar,\n", + " CASES_TOTAL int,\n", + " CASES_TOTAL_PER_100000 float,\n", + " CASES int,\n", + " DEATHS_TOTAL int,\n", + " DEATHS_TOTAL_PER_100000 float,\n", + " DEATHS int,\n", + " TRANSMISSION_CLASSIFICATION varchar,\n", + " DATE timestamp_ntz,\n", + " ISO3166_1 VARCHAR(2)\n", + ")\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "after-former", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(output_folder + \"WHO_DAILY_REPORT.csv\", index=False, columns=column_map.values())" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/snowflake/sql/WHO_DAILY_REPORT_timeseries.sql b/snowflake/sql/WHO_DAILY_REPORT_timeseries.sql new file mode 100644 index 0000000..7f783f9 --- /dev/null +++ b/snowflake/sql/WHO_DAILY_REPORT_timeseries.sql @@ -0,0 +1,40 @@ +CREATE TABLE IF NOT EXISTS WHO_TIMESERIES ( + COUNTRY_REGION VARCHAR, + CASES_TOTAL int, + CASES_TOTAL_PER_100000 float, + CASES int, + DEATHS_TOTAL int, + DEATHS_TOTAL_PER_100000 float, + DEATHS int, + TRANSMISSION_CLASSIFICATION varchar, + DATE timestamp_ntz + ISO3166_1 VARCHAR(2), +); + +INSERT INTO WHO_TIMESERIES ( + COUNTRY_REGION, + CASES_TOTAL, + CASES_TOTAL_PER_100000, + CASES, + DEATHS_TOTAL, + DEATHS_TOTAL_PER_100000, + DEATHS, + TRANSMISSION_CLASSIFICATION, + DATE, + ISO3166_1 +) +SELECT COUNTRY_REGION, + CASES_TOTAL, + CASES_TOTAL_PER_100000, + CASES, + DEATHS_TOTAL, + DEATHS_TOTAL_PER_100000, + DEATHS, + TRANSMISSION_CLASSIFICATION, + DATE, + ISO3166_1, + FROM WHO_DAILY_REPORT + WHERE WHO_DAILY_REPORT.DATE NOT IN ( + SELECT MAX(DATE) FROM WHO_TIMESERIES.DATE + ) +; \ No newline at end of file