# European Centre for Disease Prevention and Control Dataset

In [1]:
import pandas as pd
import datetime
import pycountry
import re
import os
import numpy as np

In [2]:
# papermill parameters
output_folder = "../output/"

### Fetch data

In [3]:
df = pd.read_csv("https://opendata.ecdc.europa.eu/covid19/casedistribution/csv")

In [13]:
df

Unnamed: 0,DATE,cases_weekly,deaths_weekly,COUNTRY_REGION,ISO3166_1,popData2019,continentExp,notification_rate_per_100000_population_14-days,CASES_SINCE_PREV_WEEK,DEATHS_SINCE_PREV_WEEK,LAST_UPDATE_DATE,LAST_REPORTED_FLAG
0,2020-12-21,740,111,Afghanistan,AF,38041757.0,Asia,6.56,0,0,2020-12-29 12:43:38.524091,True
1,2020-12-14,1757,71,Afghanistan,AF,38041757.0,Asia,9.01,1017,-40,2020-12-29 12:43:38.524091,False
2,2020-12-07,1672,137,Afghanistan,AF,38041757.0,Asia,7.22,-85,66,2020-12-29 12:43:38.524091,False
3,2020-11-30,1073,68,Afghanistan,AF,38041757.0,Asia,6.42,-599,-69,2020-12-29 12:43:38.524091,False
4,2020-11-23,1368,69,Afghanistan,AF,38041757.0,Asia,6.66,295,1,2020-12-29 12:43:38.524091,False
...,...,...,...,...,...,...,...,...,...,...,...,...
9146,2020-04-20,11,0,Zimbabwe,ZW,14645473.0,Africa,0.11,5,-1,2020-12-29 12:43:38.524091,False
9147,2020-04-13,5,2,Zimbabwe,ZW,14645473.0,Africa,0.05,-6,2,2020-12-29 12:43:38.524091,False
9148,2020-04-06,2,0,Zimbabwe,ZW,14645473.0,Africa,0.05,-3,-2,2020-12-29 12:43:38.524091,False
9149,2020-03-30,5,1,Zimbabwe,ZW,14645473.0,Africa,0.05,3,1,2020-12-29 12:43:38.524091,False


### Parse date

In [4]:
df["dateRep"] = pd.to_datetime(df["dateRep"], format="%d/%m/%Y")

### Add difference

In [7]:
df['CASES_SINCE_PREV_WEEK'] = df.groupby(['countriesAndTerritories','continentExp'])['cases_weekly'].diff().fillna(0).astype(int)
df['DEATHS_SINCE_PREV_WEEK'] = df.groupby(['countriesAndTerritories','continentExp'])['deaths_weekly'].diff().fillna(0).astype(int)

### Drop cols

In [8]:
df = df.drop(columns=["year_week", "countryterritoryCode"])

In [9]:
int_conveyance = df["geoId"].loc["JPG11668" == df["geoId"]].index
df["geoId"].iloc[int_conveyance] = np.nan
df["popData2019"].iloc[int_conveyance] = np.nan
df["continentExp"].iloc[int_conveyance] = np.nan
df["countriesAndTerritories"].iloc[int_conveyance] = "Cases on an international conveyance Japan"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


### Resolve Country/Region name

In [10]:
country_codes = df["geoId"].unique()
for code in country_codes:
    try:
        pyc = pycountry.countries.get(alpha_2=code)
        if pyc:
            df["countriesAndTerritories"].loc[code == df["geoId"]] = pyc.name
    except LookupError:
        df["countriesAndTerritories"].loc[code == df["geoId"]] = None

### Set Last Update Date and Last Reported Flag

In [11]:
df["LAST_UPDATE_DATE"] = datetime.datetime.utcnow()
df["LAST_REPORTED_FLAG"] = df["dateRep"].max() == df["dateRep"]

### Rename Cols

In [12]:
df = df.rename(columns={
    "dateRep": "DATE", 
    "countriesAndTerritories": "COUNTRY_REGION", 
    "geoId": "ISO3166_1", 
    "popData2018": "POPULATION",
})

### Save dataframe

In [15]:
df.to_csv(output_folder + "ECDC_GLOBAL_WEEKLY.csv", index=False, columns=[
    "COUNTRY_REGION",
    "continentExp",
    "ISO3166_1",
    "cases_weekly",
    "deaths_weekly",
    "CASES_SINCE_PREV_WEEK",
    "DEATHS_SINCE_PREV_WEEK",
    "popData2019",
    "DATE",
    "LAST_UPDATE_DATE",
    "LAST_REPORTED_FLAG"
])