### Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

import altair as alt

from covidviz import data

In [3]:
# Case threshold passed to the data preparation step below.
MIN_N_CASES = 50
# Data files are expected in a "data" folder below the current working directory.
DATA_PATH = Path.cwd().joinpath("data")
# Name of the day-counter column for this threshold ("days_since_50_cases" for 50).
day_counter = "days_since_{}_cases".format(MIN_N_CASES)

### Data preparation

#### Infections

In [4]:
# Unfortunately, the Berliner Morgenpost ("mopo") data seems to be refreshed at most
# once a week, so the download below is left disabled.
# mopo = pd.read_csv(
#     "https://interaktiv.morgenpost.de/corona-virus-karte-infektionen-deutschland-weltweit/data/Coronavirus.history.v2.csv"
# ).query("parent == 'Deutschland'")
# mopo["date"].max()

In [5]:
# We use the RKI data instead, passing the states listed in bundeslaender.csv.
df = data.get_data(
    states=pd.read_csv(DATA_PATH.joinpath("bundeslaender.csv"))
    .loc[:, "Bundesland"]
    .unique(),
    out_path=DATA_PATH,
)

In [6]:
# Run the two covidviz.data preparation steps: prepare_daily_infections first,
# then add_measures with the measure data read from DATA_PATH.
daily_infections = data.add_measures(
    data.prepare_daily_infections(df, n_cases=MIN_N_CASES),
    measures=data.read_measure_data(DATA_PATH),
)

In [7]:
daily_infections.head(2)

Unnamed: 0,Bundesland,Meldedatum,Neuinfektionen,infections_cumulative,date_50_cases,days_since_50_cases,Maßnahmen
0,Baden-Württemberg,2020-02-16,1,1,2020-03-03,-16,
1,Baden-Württemberg,2020-02-23,1,2,2020-03-03,-9,


#### Google's mobility reports

In [8]:
# Translates the state names found in the mobility report's `sub_region_1` column
# into the German names used in the "Bundesland" column of the infection data.
STATE_MAPPER = dict(
    [
        ("Baden-Württemberg", "Baden-Württemberg"),
        ("Bavaria", "Bayern"),
        ("Berlin", "Berlin"),
        ("Brandenburg", "Brandenburg"),
        ("Bremen", "Bremen"),
        ("Hamburg", "Hamburg"),
        ("Hesse", "Hessen"),
        ("Lower Saxony", "Niedersachsen"),
        ("Mecklenburg-Vorpommern", "Mecklenburg-Vorpommern"),
        ("North Rhine-Westphalia", "Nordrhein-Westfalen"),
        ("Rhineland-Palatinate", "Rheinland-Pfalz"),
        ("Saarland", "Saarland"),
        ("Saxony", "Sachsen"),
        ("Saxony-Anhalt", "Sachsen-Anhalt"),
        ("Schleswig-Holstein", "Schleswig-Holstein"),
        ("Thuringia", "Thüringen"),
    ]
)

In [9]:
# Load Google's mobility report, keep the German rows that carry a state name
# (`sub_region_1`), and align column and state names with the infection data so
# both frames can be merged on ["Bundesland", "Meldedatum"].
mobility = (
    pd.read_csv(
        DATA_PATH / "Global_Mobility_Report.csv",
        low_memory=False,
        # `parse_dates` alone is sufficient; the former `infer_datetime_format=True`
        # is deprecated since pandas 2.0 and only triggers a warning there.
        parse_dates=["date"],
    )
    .query("country_region_code == 'DE' and not sub_region_1.isna()")
    .rename(columns={"sub_region_1": "Bundesland", "date": "Meldedatum"})
    .drop(columns=["country_region_code", "country_region", "sub_region_2"])
    # State names missing from STATE_MAPPER become NaN here.
    .assign(Bundesland=lambda frame: frame["Bundesland"].map(STATE_MAPPER))
)

### Visualization

In [11]:
# Map every mobility column containing "percent_change" to a readable title built
# from the part of the column name before "_percent_change"
# (e.g. "Retail And Recreation").
percentage_change_cols_mapper = {
    c: c[: c.find("percent_change") - 1].replace("_", " ").title()
    for c in mobility.columns
    if "percent_change" in c
}

plot_df = (
    daily_infections.assign(
        # Day-over-day change of the cumulative case count within each state.
        absolute_growth=lambda frame: frame.groupby("Bundesland")[
            "infections_cumulative"
        ].transform(lambda s: s.diff()),
        # Day-over-day difference of log(cumulative cases) within each state.
        cases_logratio=lambda frame: frame.groupby("Bundesland")[
            "infections_cumulative"
        ].transform(lambda s: np.log(s).diff()),
        # Count the measures in a "Maßnahmen" text as (number of standalone "und"
        # tokens + 1). Splitting on the bare substring "und" also split inside
        # words that merely contain it, which inflated the count.
        # NOTE(review): assumes separate measures are joined by the word "und"
        # - confirm against data.add_measures.
        num_measures=lambda frame: frame["Maßnahmen"].apply(
            lambda x: x.split().count("und") + 1 if isinstance(x, str) else 0
        ),
    )
    # Outer merge keeps rows that exist in only one of the two sources.
    .merge(mobility, on=["Bundesland", "Meldedatum"], how="outer")
    .assign(
        # Sum of all category changes as a fraction. `min_count=1` keeps rows
        # without any mobility value as NaN instead of reporting a change of 0.
        total_activity=lambda frame: frame[
            list(percentage_change_cols_mapper.keys())
        ].sum(axis=1, min_count=1)
        / 100
    )
    # .query(f"{day_counter} >= 0")
    .query("Meldedatum >= '2020-03-01'")
    .rename(columns=percentage_change_cols_mapper)
)

In [12]:
plot_df.head(2)

Unnamed: 0,Bundesland,Meldedatum,Neuinfektionen,infections_cumulative,date_50_cases,days_since_50_cases,Maßnahmen,absolute_growth,cases_logratio,num_measures,Retail And Recreation,Grocery And Pharmacy,Parks,Transit Stations,Workplaces,Residential,total_activity
8,Baden-Württemberg,2020-03-01,2.0,25.0,2020-03-03,-2.0,,2.0,0.083382,0.0,9.0,20.0,40.0,6.0,-1.0,0.0,0.74
9,Baden-Württemberg,2020-03-02,11.0,36.0,2020-03-03,-1.0,,11.0,0.364643,0.0,0.0,8.0,-2.0,-3.0,-1.0,1.0,0.03


#### Relative increase

In [None]:
# Use the day-counter column derived from MIN_N_CASES instead of a hard-coded name.
X_VARIABLE = day_counter
Y_VARIABLE = "daily_increase"

# Per-metric settings: the Vega expression deriving the metric from `cases_logratio`,
# the y-axis title/format/domain, the y level at which measure markers are drawn,
# and the chart title.
if Y_VARIABLE == "daily_increase":
    expression = "pow(E, datum.cases_logratio) - 1"
    y_title = "Daily Increase in Cumulative Cases"
    y_format = "%"
    y_domain = (0, 0.9)
    measure_level = "0.85"
    title = "Daily Increase of COVID-19 Cases in German States"
elif Y_VARIABLE == "doubling_time":
    expression = "log(2) / datum.cases_logratio"
    y_title = "Doubling Time (Days)"
    y_format = ""
    y_domain = (50, 0)  # domain is given in reversed order (50 first, 0 last)
    measure_level = "45"
    title = "Doubling Time of COVID-19 Cases in German States"
elif Y_VARIABLE == "absolute_growth":
    # Not supported by this cell; absolute growth is plotted in the
    # "Absolute increase" section instead.
    raise NotImplementedError("This doesn't work yet.")
else:
    raise NotImplementedError(f"y variable {Y_VARIABLE} is not implemented.")

combined_charts = []
line_charts = []
# dropna(): the outer merge can leave rows without a state name.
for state in plot_df["Bundesland"].dropna().unique():
    base = alt.Chart(plot_df.query(f"Bundesland == '{state}'"), title=state).encode(
        x=alt.X(
            X_VARIABLE,
            axis=alt.Axis(title=X_VARIABLE.replace("_", " ").title(), offset=5),
        ),
        y=alt.Y("cases_logratio:Q"),
    )
    # Raw daily values of the selected metric.
    points = (
        base.transform_calculate(as_=Y_VARIABLE, calculate=expression)
        .mark_point()
        .encode(
            y=alt.Y(
                f"{Y_VARIABLE}:Q",
                scale=alt.Scale(domain=y_domain),
                axis=alt.Axis(format=y_format, title=y_title),
            ),
            color="Bundesland:N",
        )
    )
    # Diamonds at a fixed y level on days with at least one measure, sized by the
    # number of measures. The size field must be the `num_measures` column;
    # "Anzahl Maßnahmen" is only used as the legend title.
    measure_points = (
        base.mark_point(size=300, shape="diamond", color="grey", fill=None)
        .transform_filter("datum.num_measures > 0")
        .transform_calculate(y_level=measure_level)
        .encode(
            y="y_level:Q",
            size=alt.Size("num_measures:Q", title="Anzahl Maßnahmen"),
            tooltip=["Meldedatum", "Maßnahmen"],
        )
        .interactive()
    )
    # LOESS-smoothed trend of the metric.
    lines = (
        points.transform_loess(
            on=X_VARIABLE,
            loess=Y_VARIABLE,
            as_=[X_VARIABLE, f"{Y_VARIABLE}_loess"],
            groupby=["Bundesland"],
        )
        .mark_line()
        .encode(
            y=alt.Y(
                f"{Y_VARIABLE}_loess:Q",
                scale=alt.Scale(domain=y_domain),
                axis=alt.Axis(format=y_format, title=y_title),
            ),
            tooltip=[X_VARIABLE],
        )
    )
    line_charts.append(lines.properties(width=900, height=300, title=title))
    combined_charts.append(
        (points + measure_points + lines).properties(width=900, height=300)
    )

In [None]:
alt.layer(*line_charts)

In [None]:
alt.vconcat(*combined_charts)

#### Absolute increase

In [None]:
# Plot settings for the absolute-growth charts.
# The x column follows MIN_N_CASES via `day_counter` instead of a hard-coded name.
X_VARIABLE = day_counter
Y_VARIABLE = "absolute_growth"
y_title = "Absolute Growth in Cumulative Cases"
y_format = ""
title = "Absolute Growth of COVID-19 Cases in German States"

In [None]:
# One layered chart per state: raw points, measure markers and a LOESS trend line.
combined_charts = []
# dropna(): the outer merge can leave rows without a state name.
for state in plot_df["Bundesland"].dropna().unique():
    base = alt.Chart(plot_df.query(f"Bundesland == '{state}'"), title=state).encode(
        x=alt.X(
            X_VARIABLE, axis=alt.Axis(title=X_VARIABLE.replace("_", " ").title(), offset=5)
        ),
        y=alt.Y(f"{Y_VARIABLE}:Q"),
    )
    points = base.mark_point().encode(y=alt.Y(f"{Y_VARIABLE}:Q"), color="Bundesland:N")
    # Diamonds on the zero line for days with at least one measure, sized by the
    # number of measures. The size field must be the `num_measures` column;
    # "Anzahl Maßnahmen" is only used as the legend title.
    measure_points = (
        base.mark_point(size=600, shape="diamond", color="grey", fill=None)
        .transform_filter("datum.num_measures > 0")
        .transform_calculate(y_level="0")
        .encode(
            y="y_level:Q",
            size=alt.Size("num_measures:Q", title="Anzahl Maßnahmen"),
            tooltip=["Meldedatum", "Maßnahmen"],
        )
        .interactive()
    )
    lines = (
        points.transform_loess(
            on=X_VARIABLE,
            loess=Y_VARIABLE,
            as_=[X_VARIABLE, f"{Y_VARIABLE}_loess"],
            groupby=["Bundesland"],
        )
        .mark_line()
        .encode(
            y=alt.Y(
                f"{Y_VARIABLE}_loess:Q", axis=alt.Axis(format=y_format, title=y_title)
            ),
            tooltip=[X_VARIABLE],
        )
    )
    combined_charts.append(
        (points + measure_points + lines).properties(width=900, height=300)
    )

In [None]:
alt.vconcat(*combined_charts)

#### Absolute increase + mobility data

In [None]:
# Plot settings for the mobility overlay: the x-axis uses the "Meldedatum" column.
X_VARIABLE, Y_VARIABLE = "Meldedatum", "absolute_growth"
y_title, y_format = "Absolute Growth in Cumulative Cases", ""
title = "Absolute Growth of COVID-19 Cases in German States"

In [None]:
combined_charts = []

# Symmetric y-limit for the mobility axis: take the largest per-row sum of negative
# (or positive) category changes in percentage points, step up to the next multiple
# of 50 above it, and express it as a fraction (e.g. 120 -> 1.5).
activity_fields = plot_df[list(percentage_change_cols_mapper.values())]
max_activity = (
    max(
        abs(activity_fields[activity_fields < 0].sum(axis=1).min()),
        abs(activity_fields[activity_fields > 0].sum(axis=1).max()),
    )
    // 50
    + 1
) * 0.5

# dropna(): the outer merge can leave rows without a state name.
for state in plot_df["Bundesland"].dropna().unique():
    base = alt.Chart(plot_df.query(f"Bundesland == '{state}'"), title=state).encode(
        x=alt.X(
            X_VARIABLE,
            axis=alt.Axis(title=X_VARIABLE.replace("_", " ").title(), offset=5),
        ),
        y=alt.Y(f"{Y_VARIABLE}:Q"),
    )
    points = base.mark_point(color="DarkSlateBlue").encode(
        y=alt.Y(f"{Y_VARIABLE}:Q"),
        # dict.fromkeys removes duplicates but, unlike set(), keeps a stable order.
        tooltip=list(dict.fromkeys(["Meldedatum", X_VARIABLE])),
    )
    # Filled diamonds on the zero line for days with at least one measure.
    measure_points = (
        base.mark_point(
            size=400, shape="diamond", color="DarkSlateGrey", fill="DarkSlateGrey"
        )
        .transform_calculate(y_level="0")
        .encode(y="y_level:Q", tooltip=["Meldedatum", "Maßnahmen"])
        .transform_filter("datum.num_measures > 0")
    )
    lines = (
        points.transform_loess(
            on=X_VARIABLE,
            loess=Y_VARIABLE,
            as_=[X_VARIABLE, f"{Y_VARIABLE}_loess"],
            groupby=["Bundesland"],
        )
        .mark_line(color="DarkSlateBlue")
        .encode(
            y=alt.Y(f"{Y_VARIABLE}_loess:Q", axis=alt.Axis(format="", title=y_title))
        )
    )
    # Area per mobility category: fold the category columns into long form and
    # convert percentage points to fractions for the "%" axis format.
    activity = (
        base.mark_area()
        .transform_fold(
            fold=list(percentage_change_cols_mapper.values()),
            as_=["Mobility Category", "mobility_change_percent"],
        )
        .transform_calculate(
            as_="Mobility Change", calculate="datum.mobility_change_percent / 100"
        )
        .encode(
            y=alt.Y(
                "Mobility Change:Q",
                axis=alt.Axis(format="%", orient="right"),
                scale=alt.Scale(domain=(-max_activity, max_activity)),
            ),
            color=alt.Color("Mobility Category:N", scale=alt.Scale(scheme="blues")),
            opacity=alt.value(0.5),
        )
    )
    # Independent y scales: mobility on the right axis, infections on the left.
    combined_charts.append(
        (activity + (points + measure_points + lines))
        .resolve_scale(y="independent")
        .properties(width=900, height=300)
    )

In [None]:
alt.vconcat(*combined_charts)

#### Absolute increase + interactive activity data

In [13]:
# Symmetric y-limit for the mobility axis, as a fraction: the larger of the biggest
# per-row sum of negative changes and the biggest per-row sum of positive changes
# (percentage points), stepped up to the next multiple of 50 above it and halved
# per 50 (e.g. 120 -> 1.5).
activity_df = plot_df[list(percentage_change_cols_mapper.values())]
max_activity = 0.5 * (
    1
    + max(
        abs(activity_df.where(activity_df < 0).sum(axis=1).min()),
        abs(activity_df.where(activity_df > 0).sum(axis=1).max()),
    )
    // 50
)

In [18]:
def plot_infection_activity_summary(
    df, state, x_var, x_title, y_var, y_title, max_activity, width=900, height=300
):
    """Layer one state's infection curve over its aggregate mobility index.

    The chart stacks four layers on a shared x-axis: the ``total_activity`` area
    (right-hand percent axis), diamond markers on rows with ``num_measures > 0``,
    the raw ``y_var`` points, and a LOESS-smoothed line of ``y_var``. The y scales
    of the layers are resolved independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Bundesland``, ``Meldedatum``, ``Maßnahmen``,
        ``num_measures``, ``total_activity``, ``Neuinfektionen`` and
        ``infections_cumulative`` plus ``x_var`` and ``y_var``.
    state : str
        Value of ``Bundesland`` to plot.
    x_var : str
        Column for the x-axis. Tick values are weekly dates taken from
        ``Meldedatum``, so this is presumably a date column.
    x_title : str
        Title of the x-axis.
    y_var : str
        Column plotted as points and smoothed with LOESS.
    y_title : str
        Title of the (left) infection axis.
    max_activity : float
        The mobility axis spans ``(-max_activity, max_activity)``.
    width, height : int
        Chart size in pixels.

    Returns
    -------
    alt.LayerChart

    Raises
    ------
    ValueError
        If ``df`` contains no rows for ``state``.
    """
    # Filter once with a boolean mask; this also works for state names containing
    # quote characters, which would break a string-built query.
    state_df = df[df["Bundesland"] == state]
    if state_df.empty:
        raise ValueError(f"No rows found for Bundesland {state!r}.")

    # One axis tick per week (pandas "W" frequency) within the state's date range.
    date_range = [
        str(tick.date())
        for tick in pd.date_range(
            start=state_df["Meldedatum"].min(),
            end=state_df["Meldedatum"].max(),
            freq="W",
        )
    ]
    base = alt.Chart(
        state_df, title=f"{state}: Infections and Aggregate Mobility",
    ).encode(
        x=alt.X(
            x_var,
            axis=alt.Axis(
                title=x_title, offset=0, grid=False, values=date_range, format="%b %d"
            ),
        ),
        y=alt.Y(f"{y_var}:Q"),
    )
    points = base.mark_point(color="DarkSlateBlue").encode(
        y=alt.Y(f"{y_var}:Q"),
        # dict.fromkeys removes duplicates (x_var may be "Meldedatum") while keeping
        # a stable field order; set() made the tooltip order vary between runs.
        tooltip=list(
            dict.fromkeys(
                ["Meldedatum", x_var, "Neuinfektionen", "infections_cumulative"]
            )
        ),
    )
    lines = (
        points.transform_loess(
            on=x_var, loess=y_var, as_=[x_var, f"{y_var}_loess"], groupby=["Bundesland"]
        )
        .mark_line(color="DarkSlateBlue")
        .encode(y=alt.Y(f"{y_var}_loess:Q", axis=alt.Axis(format="", title=y_title)))
    )

    total_activity = base.mark_area(color="#5ba3cf").encode(
        y=alt.Y(
            "total_activity:Q",
            axis=alt.Axis(format="%", orient="right", title="Google Mobility Index"),
            scale=alt.Scale(domain=(-max_activity, max_activity)),
        ),
        opacity=alt.value(0.2),
    )
    measures = (
        base.mark_point(size=400, shape="diamond", color="#125ca4", fill="#125ca4")
        .transform_calculate(y_level="0")
        .encode(
            # Empty title hides the helper field name "y_level"; set here rather
            # than by patching a layer index after the chart has been assembled.
            y=alt.Y("y_level:Q", title="", axis=alt.Axis(orient="right")),
            tooltip=["Meldedatum", "Maßnahmen"],
        )
        .transform_filter("datum.num_measures > 0")
    )
    return (
        ((total_activity + measures) + (points + lines))
        .resolve_scale(y="independent")
        .properties(width=width, height=height)
    )

In [19]:
def plot_activity_details(
    df, state, x_var, x_title, activity_cols, max_activity, width=900, height=300
):
    """Plot one state's mobility change per category with a clickable legend.

    The left chart shows an area per entry of ``activity_cols`` (values divided by
    100 and formatted as percent). The right chart lists the categories as points
    and carries a multi-selection: selected categories keep their colour in both
    charts, the others turn light gray.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``Bundesland`` and ``Meldedatum`` plus ``x_var`` and all
        ``activity_cols``.
    state : str
        Value of ``Bundesland`` to plot.
    x_var : str
        Column for the x-axis. Tick values are weekly dates taken from
        ``Meldedatum``, so this is presumably a date column.
    x_title : str
        Title of the x-axis.
    activity_cols : list of str
        Mobility columns folded into the "Mobility Category" field.
    max_activity : float
        The mobility axis spans ``(-max_activity, max_activity)``.
    width, height : int
        Size of the area chart in pixels.

    Returns
    -------
    alt.HConcatChart

    Raises
    ------
    ValueError
        If ``df`` contains no rows for ``state``.
    """
    # Filter once with a boolean mask; this also works for state names containing
    # quote characters, which would break a string-built query.
    state_df = df[df["Bundesland"] == state]
    if state_df.empty:
        raise ValueError(f"No rows found for Bundesland {state!r}.")

    # One axis tick per week (pandas "W" frequency) within the state's date range.
    date_range = [
        str(tick.date())
        for tick in pd.date_range(
            start=state_df["Meldedatum"].min(),
            end=state_df["Meldedatum"].max(),
            freq="W",
        )
    ]
    # NOTE(review): `selection_multi` / `add_selection` are deprecated in Altair 5
    # (replaced by `selection_point` / `add_params`); kept for the Altair version
    # this notebook was written against.
    selection = alt.selection_multi(fields=["Mobility Category"])
    color = alt.condition(
        selection,
        alt.Color("Mobility Category:N", scale=alt.Scale(scheme="blues"), legend=None),
        alt.value("lightgray"),
    )

    # Long-form data: one row per (date, category), with the change as a fraction.
    activity_base = (
        alt.Chart(state_df, title=f"{state}: Detailed Mobility Report")
        .transform_fold(
            fold=activity_cols, as_=["Mobility Category", "mobility_change_percent"]
        )
        .transform_calculate(
            as_="Google Mobility Index", calculate="datum.mobility_change_percent / 100"
        )
    )

    activity = activity_base.mark_area().encode(
        x=alt.X(
            x_var,
            axis=alt.Axis(
                title=x_title, offset=0, grid=False, values=date_range, format="%b %d"
            ),
        ),
        y=alt.Y(
            "Google Mobility Index:Q",
            axis=alt.Axis(format="%", orient="right"),
            scale=alt.Scale(domain=(-max_activity, max_activity)),
        ),
        color=color,
        opacity=alt.value(0.8),
    )

    # Hand-built legend: a point per category that hosts the click selection.
    legend = (
        activity_base.mark_point()
        .encode(
            y=alt.Y(
                "Mobility Category:N",
                axis=alt.Axis(orient="right", grid=False, ticks=False, offset=2),
                title="Click to select",
            ),
            color=color,
        )
        .add_selection(selection)
        .properties(title="Category")
    )
    return (activity.properties(width=width, height=height) | legend).configure_axis(
        grid=True
    )

In [20]:
plot_infection_activity_summary(
    df=plot_df,
    state="Bayern",
    x_var="Meldedatum",
    x_title="Date",
    y_var="absolute_growth",
    y_title="Absolute Growth in Cumulative Cases",
    max_activity=max_activity,
)

In [21]:
plot_activity_details(
    df=plot_df,
    state="Bayern",
    x_var="Meldedatum",
    x_title="Date",
    activity_cols=list(percentage_change_cols_mapper.values()),
    max_activity=max_activity,
)

In [25]:
# TODO: Create small plots with the infections + activity summary for all states
combined_plots = []
sub_combined = []
# enumerate() yields (index, name) pairs; unpack them so that `state` is the state
# name rather than a tuple once the loop body is implemented.
for state_idx, state in enumerate(plot_df["Bundesland"].unique()):
    pass