In [None]:
import pandas as pd
import datetime

In [None]:
# Download the four Sciensano Epistat COVID-19 datasets (cases by age/sex/province,
# hospitalisations, mortality, tests). All files are read as ISO-8859-1.
df_age_sex_province, df_hospitalisations, df_mortality, df_tests = (
    pd.read_csv(dataset_url, encoding="ISO-8859-1")
    for dataset_url in (
        "https://epistat.sciensano.be/Data/COVID19BE_CASES_AGESEX.csv",
        "https://epistat.sciensano.be/Data/COVID19BE_HOSP.csv",
        "https://epistat.sciensano.be/Data/COVID19BE_MORT.csv",
        "https://epistat.sciensano.be/Data/COVID19BE_tests.csv",
    )
)

In [None]:
# papermill parameters
# Destination folder for the exported CSV files. The export cells build each
# path as `output_folder + "<file name>"`, so the value must end with a path
# separator.
# NOTE(review): presumably overridden by papermill at execution time, which
# requires this cell to carry the `parameters` tag — confirm.
output_folder = "../output/"

# Data quality (DQ) fixes

## Filtering out rows that have no dates

In [None]:
# Keep only the rows that carry a DATE value.
# `.notna()` replaces the `isna() == False` comparison, and `.copy()` detaches
# each filtered frame from the frame it was sliced from, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_age_sex_province = df_age_sex_province[df_age_sex_province["DATE"].notna()].copy()
df_hospitalisations = df_hospitalisations[df_hospitalisations["DATE"].notna()].copy()
df_mortality = df_mortality[df_mortality["DATE"].notna()].copy()
df_tests = df_tests[df_tests["DATE"].notna()].copy()

## Recoding to ISO-3166-1, ISO-3166-2 and ISO-3166-3

In [None]:
# Lookup table from the PROVINCE value used in the source data to its codes:
#   "iso3166_2" -> code of the region the province belongs to (BRU / VLG / WAL)
#   "iso3166_3" -> code of the province itself
# NOTE: despite the key name, the province codes are the subdivision codes of
# ISO 3166-2:BE (e.g. BE-VAN -> "VAN"); the key names are kept because the
# exported column names are derived from them.
province_codes = {
    "Liège": {"iso3166_2": "WAL", "iso3166_3": "WLG"},
    "WestVlaanderen": {"iso3166_2": "VLG", "iso3166_3": "VWV"},
    "Hainaut": {"iso3166_2": "WAL", "iso3166_3": "WHT"},
    "BrabantWallon": {"iso3166_2": "WAL", "iso3166_3": "WBR"},
    "Namur": {"iso3166_2": "WAL", "iso3166_3": "WNA"},
    "Antwerpen": {"iso3166_2": "VLG", "iso3166_3": "VAN"},
    "Brussels": {"iso3166_2": "BRU", "iso3166_3": "BRU"},
    "OostVlaanderen": {"iso3166_2": "VLG", "iso3166_3": "VOV"},
    "VlaamsBrabant": {"iso3166_2": "VLG", "iso3166_3": "VBR"},
    # Limburg is BE-VLI; it was previously mapped to "VBR", which collided
    # with Vlaams-Brabant and merged the two provinces in every grouping.
    "Limburg": {"iso3166_2": "VLG", "iso3166_3": "VLI"},
    "Luxembourg": {"iso3166_2": "WAL", "iso3166_3": "WLX"},
}

### Age stratified case counts by sex and province

In [None]:
# Every row of this dataset belongs to Belgium.
df_age_sex_province["ISO3166_1"] = "BE"

# Split the rows by whether PROVINCE is known: only rows with a province can be
# recoded to region/province codes. The mask is computed once, and both halves
# are explicit copies so that adding columns to them later does not raise
# SettingWithCopyWarning.
has_province = df_age_sex_province["PROVINCE"].notna()
codable_df_asp = df_age_sex_province.loc[has_province].copy()
uncodable_df_asp = df_age_sex_province.loc[~has_province].copy()

In [None]:
# Attach the region (ISO3166_2) and province (ISO3166_3) codes looked up from
# `province_codes`. `.assign` builds a new frame instead of writing columns into
# a slice of `df_age_sex_province`, which avoids SettingWithCopyWarning.
# A PROVINCE value missing from `province_codes` still raises KeyError, so new
# or renamed provinces in the source data are noticed immediately.
codable_df_asp = codable_df_asp.assign(
    ISO3166_2=codable_df_asp["PROVINCE"].map(
        lambda province: province_codes[province].get("iso3166_2")),
    ISO3166_3=codable_df_asp["PROVINCE"].map(
        lambda province: province_codes[province].get("iso3166_3")),
)

In [None]:
# Recombine the recoded rows with the rows that have no province.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0;
# `pd.concat` is the equivalent (original indexes kept, columns unioned, so the
# rows without a province get missing ISO3166_2 / ISO3166_3 values).
df_age_sex_province = pd.concat([codable_df_asp, uncodable_df_asp])

In [None]:
# Add a running total of cases per (country, region, province, age group, sex)
# stratum next to the daily count.
stratum_keys = ["ISO3166_1", "ISO3166_2", "ISO3166_3", "AGEGROUP", "SEX"]
daily_keys = stratum_keys + ["DATE"]

# Only CASES is aggregated. Summing the whole frame also sums the string columns
# (PROVINCE, REGION) on pandas >= 2.0, which adds unwanted *_x / *_y columns to
# the merge result and breaks the export.
# groupby sorts by key, so the cumulative sum runs in DATE order within each
# stratum (assumes DATE sorts chronologically, e.g. ISO formatted strings —
# TODO confirm).
total_cases = (
    df_age_sex_province
    .groupby(daily_keys)["CASES"].sum()
    .groupby(level=stratum_keys).cumsum()
    .rename("TOTAL_CASES")
    .reset_index()
)

# NOTE(review): groupby drops rows whose key columns are missing (pandas default
# dropna=True), so rows without e.g. a province code keep a missing TOTAL_CASES
# after this left merge. This matches the previous behaviour.
df_age_sex_province = (
    df_age_sex_province
    .merge(total_cases, how="left", on=daily_keys)
    .rename(columns={"CASES": "NEW_CASES"})
)

In [None]:
# Store TOTAL_CASES as pandas' nullable integer type so the column stays
# integer-valued even when some rows have no total.
df_age_sex_province = df_age_sex_province.astype({"TOTAL_CASES": pd.Int64Dtype()})

### Hospitalisations by province

In [None]:
# Tag every hospitalisation row with the country code, then look up the region
# and province codes for its PROVINCE in `province_codes`.
hospital_province = df_hospitalisations["PROVINCE"]
df_hospitalisations["ISO3166_1"] = "BE"
df_hospitalisations["ISO3166_2"] = hospital_province.map(
    lambda province: province_codes[province].get("iso3166_2"))
df_hospitalisations["ISO3166_3"] = hospital_province.map(
    lambda province: province_codes[province].get("iso3166_3"))

### Mortality by age, sex and region

In [None]:
# Mortality is reported per region only, so it gets a country code and a region
# code but no province code. Regions not listed here map to None.
region_to_iso = {"Brussels": "BRU", "Flanders": "VLG", "Wallonia": "WAL"}
df_mortality["ISO3166_1"] = "BE"
df_mortality["ISO3166_2"] = df_mortality["REGION"].map(region_to_iso.get)

## Adding last update date

In [None]:
# Take the timestamp once so all four datasets exported by this run carry the
# exact same LAST_UPDATED_DATE; separate now() calls gave each dataset a
# slightly different value.
# The value is a naive datetime in the local time of the machine running the
# notebook (datetime.now() without a timezone).
last_updated = datetime.datetime.now()

df_age_sex_province["LAST_UPDATED_DATE"] = last_updated
df_hospitalisations["LAST_UPDATED_DATE"] = last_updated
df_mortality["LAST_UPDATED_DATE"] = last_updated
df_tests["LAST_UPDATED_DATE"] = last_updated

## Export to CSV files

In [None]:
# Export the case counts: one row per province / sex / age group / date with
# the daily and cumulative counts, written without the index.
case_counts_path = output_folder + "SCS_BE_DETAILED_PROVINCE_CASE_COUNTS.csv"
case_counts_columns = [
    "PROVINCE",
    "REGION",
    "SEX",
    "AGEGROUP",
    "DATE",
    "ISO3166_1",
    "ISO3166_2",
    "ISO3166_3",
    "NEW_CASES",
    "TOTAL_CASES",
    "LAST_UPDATED_DATE",
]
df_age_sex_province.to_csv(case_counts_path, columns=case_counts_columns, index=False)

In [None]:
# Export the hospitalisation figures per province and date, written without
# the index.
hospitalisations_path = output_folder + "SCS_BE_DETAILED_HOSPITALISATIONS.csv"
hospitalisations_columns = [
    "PROVINCE",
    "REGION",
    "DATE",
    "NR_REPORTING",
    "TOTAL_IN",
    "TOTAL_IN_ICU",
    "TOTAL_IN_RESP",
    "TOTAL_IN_ECMO",
    "NEW_IN",
    "NEW_OUT",
    "ISO3166_1",
    "ISO3166_2",
    "ISO3166_3",
    "LAST_UPDATED_DATE",
]
df_hospitalisations.to_csv(
    hospitalisations_path, columns=hospitalisations_columns, index=False)

In [None]:
# Export the mortality figures per region / sex / age group / date, written
# without the index.
mortality_path = output_folder + "SCS_BE_DETAILED_MORTALITY.csv"
mortality_columns = [
    "REGION",
    "SEX",
    "AGEGROUP",
    "DATE",
    "DEATHS",
    "ISO3166_1",
    "ISO3166_2",
    "LAST_UPDATED_DATE",
]
df_mortality.to_csv(mortality_path, columns=mortality_columns, index=False)

In [None]:
# Export the daily test counts, written without the index.
tests_path = output_folder + "SCS_BE_DETAILED_TESTS.csv"
tests_columns = ["DATE", "TESTS", "LAST_UPDATED_DATE"]
df_tests.to_csv(tests_path, columns=tests_columns, index=False)