In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Bind `plt` to the pyplot API. `import matplotlib as plt` would bind the top-level
# package instead, which has no title()/subplots()/show() functions.
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization

import os

# Print the full path of every file below /kaggle/input so that the dataset
# paths used by the read_csv calls further down can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# also look into https://github.com/GoogleCloudPlatform/covid-19-open-data

# Cases and Deaths (RKI)
Official COVID-19 data for Germany, published by the Robert Koch Institute (RKI).

In [None]:
# Reading the low-cardinality columns as 'category' would be more memory friendly,
# but this has not been made to work yet:
# https://www.youtube.com/watch?v=RlIiVeig3hc&ab_channel=DataSchool
#dtypes= {'IdBundesland':'category','Bundesland':'category','Landkreis':'category','Altersgruppe':'category','Geschlecht':'category','IdLandkreis':'category','NeuerFall':'category','NeuGenesen':'category','IstErkrankungsbeginn':'category','Altersgruppe2':'category',}

# Load the RKI case table; the three date columns are parsed while reading.
cov = pd.read_csv(
    '../input/covid19-data-germany-robert-koch-institute/dd4580c810204019a7b8eb3e0b329dd6_0.csv',
    parse_dates=['Meldedatum', 'Datenstand', 'Refdatum'],
)
cov.head()

In [None]:
# How to group data: a dictionary of column name -> aggregation function can be passed into .agg({})
# agg_dict = {'IdBundesland':'count','Bundesland':'count'}

# Drop the numeric id/flag columns whose sums carry no meaning.
cov_drop = cov.drop(columns=['ObjectId','IdBundesland', 'IdLandkreis','NeuerFall','NeuerTodesfall','IstErkrankungsbeginn','NeuGenesen'])

# Group by the calendar day of Meldedatum and sum the counts.
# numeric_only=True restricts the sum to numeric columns. Without it, pandas >= 2.0
# also tries to "sum" the remaining text columns (string concatenation) and the
# parsed date columns (TypeError); older pandas versions dropped them silently.
tsCov = cov_drop.groupby(cov['Meldedatum'].dt.date).sum(numeric_only=True)

# latest data
tsCov.tail()

In [None]:
# 7-day running averages of cases and deaths. rolling() uses center=False by
# default, so each window ends on (is right-aligned to) the current day.
tsCov['7dRollingMean_Cases'] = tsCov['AnzahlFall'].rolling(window=7).mean()
tsCov['7dRollingMean_Deaths'] = tsCov['AnzahlTodesfall'].rolling(window=7).mean()

# Case fatality in percent for each day: smoothed deaths divided by smoothed cases, times 100.
tsCov['7dRollingMean_DeathsPerCases_percent'] = (
    tsCov['7dRollingMean_Deaths'].div(tsCov['7dRollingMean_Cases']).mul(100)
)

# The inverse ratio: smoothed cases per smoothed death.
# A high value means many reported cases for each reported death.
tsCov['7dRollingMean_CasesPerDeaths'] = tsCov['7dRollingMean_Cases'].div(tsCov['7dRollingMean_Deaths'])

tsCov.tail()

In [None]:
# Overview for all of Germany: one stacked-area panel per derived metric.
tsCov[[
    '7dRollingMean_Cases',
    '7dRollingMean_Deaths',
    '7dRollingMean_DeathsPerCases_percent',
    '7dRollingMean_CasesPerDeaths',
]].plot.area(subplots=True, figsize=(20, 10))

*So we can already make some interesting observations:*
> 1. The 7-day mean of cases is now as high as during the peak of the winter wave.
> 2. The 7-day mean of deaths and the "case fatality" are now about as low as before the winter.
> 3. "Case fatality" varies over time and was highest in early 2020.
> 4. Case waves have grown, while "case fatality" waves have been flattening and stretching.
> 5. Cases per death were high in the summer and have recently shot up steeply (GameStop-style).

# Bundesländer

In [None]:
# Next, look at the waves in more regional detail: which Bundesland values occur in the data?
cov['Bundesland'].unique()

In [None]:
def groupSum(_data,_groups):
    """Group a case table, sum its numeric columns and add 7-row rolling metrics.

    Parameters
    ----------
    _data : DataFrame
        Case table containing at least 'AnzahlFall' and 'AnzahlTodesfall'.
    _groups : list (or anything accepted by DataFrame.groupby)
        Grouping keys. When several keys are given, the last one is treated as
        the time axis and the preceding ones as categories, e.g.
        ['Bundesland', 'Meldedatum'].

    Returns
    -------
    DataFrame
        One row per group with the summed numeric columns plus
        '7dRollingMean_Cases', '7dRollingMean_Deaths',
        '7dRollingMean_DeathsPerCases_percent' and '7dRollingMean_CasesPerDeaths'.
        The first six rows of every category are NaN in the rolling columns.
        The two ratio columns are inf/NaN wherever the denominator is zero.
    """
    # Id and flag columns whose sums are meaningless. They are dropped before
    # aggregating (less work); columns missing from _data are simply skipped.
    uselessSums = ['ObjectId','IdBundesland', 'IdLandkreis','NeuerFall','NeuerTodesfall','IstErkrankungsbeginn','NeuGenesen']
    dataDropped = _data.drop(columns=uselessSums, errors='ignore')

    # numeric_only=True keeps text and date columns out of the sum; pandas >= 2.0
    # would otherwise concatenate strings or raise on datetime columns.
    dfGrouped = dataDropped.groupby(_groups).sum(numeric_only=True)

    def _rollingMean(series):
        # With a MultiIndex the rows of all categories are stacked on top of each
        # other, so a plain rolling window would mix the last days of one category
        # into the first days of the next. Roll within each category instead.
        if series.index.nlevels > 1:
            outerLevels = list(range(series.index.nlevels - 1))
            return series.groupby(level=outerLevels).transform(lambda s: s.rolling(7).mean())
        return series.rolling(7).mean()

    # Derived columns based on cases and deaths.
    dfGrouped['7dRollingMean_Cases'] = _rollingMean(dfGrouped['AnzahlFall'])
    dfGrouped['7dRollingMean_Deaths'] = _rollingMean(dfGrouped['AnzahlTodesfall'])
    dfGrouped['7dRollingMean_DeathsPerCases_percent'] = (dfGrouped['7dRollingMean_Deaths'] / dfGrouped['7dRollingMean_Cases']) * 100
    dfGrouped['7dRollingMean_CasesPerDeaths'] = (dfGrouped['7dRollingMean_Cases'] / dfGrouped['7dRollingMean_Deaths'])

    # Idea for later: .resample might be a good alternative to the rolling mean.

    return dfGrouped

In [None]:
# Sums and rolling metrics per Bundesland and reporting date (Meldedatum).
tsLand = groupSum(_data=cov, _groups=['Bundesland', 'Meldedatum'])
tsLand.tail(n=3)

In [None]:
# Ways to filter a MultiIndex frame by its first level, kept for reference:
# df[df.index.isin(['...'], level=0)])
# tsLand[tsLand.index.isin([])]

# Look at the cases of all Bundesländer.
# unstack(level=0) moves the first index level (the Bundesland, given how tsLand
# was grouped) into the columns, leaving one row per Meldedatum.
tsLand_un = tsLand.unstack(level=0)
# The original line ended in an empty subscript `[]`, which is a SyntaxError.
# To plot a subset, index with a list of Bundesland names instead.
tsLand_un['7dRollingMean_Cases'].plot.area(figsize=(20,6), subplots=True)

In [None]:
# One area panel per Bundesland showing the smoothed case fatality in percent.
tsLand_un['7dRollingMean_DeathsPerCases_percent'].plot.area(
    subplots=True,
    figsize=(20, 16),
)

# Altersgruppen

In [None]:
# Sums and rolling metrics per age group (Altersgruppe) and reporting date.
tsAge = groupSum(_data=cov, _groups=['Altersgruppe', 'Meldedatum'])
tsAge.tail(n=3)

In [None]:
# Move the first index level into the columns so that every row is one reporting date.
tsAge_un = tsAge.unstack(0)
tsAge_un.tail(n=3)

In [None]:
# Column totals of cases and deaths over the whole period.
tsAge_un[[
    'AnzahlFall',
    'AnzahlTodesfall',
]].sum(axis=0)

In [None]:
# How does the latest week compare with the worst week, in percent of the
# worst week's value of the '7dRollingMean_Deaths' column?
# NOTE: this rebinds tsAge_un to weekly sums of every column; cells that run
# afterwards see the weekly frame, not the daily one.
tsAge_un = tsAge_un.resample(rule='W').sum()
nowVsWorst = (
    tsAge_un['7dRollingMean_Deaths'].iloc[-1]
    .div(tsAge_un['7dRollingMean_Deaths'].max())
    .mul(100)
)
nowVsWorst

In [None]:
# Deaths as a percentage of cases, per column of the two frames.
tsAgeFatality = tsAge_un['7dRollingMean_Deaths'].div(tsAge_un['7dRollingMean_Cases']).mul(100)

# The case fatality can vary a lot depending on the resample window!
tsAgeFatality = tsAgeFatality.resample(rule='W').sum()
# tsAgeFatality.describe().loc['mean']
tsAgeFatality.tail(n=3)

In [None]:
# Highest weekly value per column.
tsAgeFatality.max(axis=0)

In [None]:
# latest: the last row of tsAgeFatality, i.e. the most recent period, one value per column
tsAgeFatality.iloc[-1]

In [None]:
# Case numbers per age group
tsAge_un['7dRollingMean_Cases'].plot.area(
    subplots=True,
    figsize=(20, 10),
)

In [None]:
# Deaths per age group, same layout as the case plot.
tsAge_un['7dRollingMean_Deaths'].plot.area(
    subplots=True,
    figsize=(20, 10),
)

In [None]:
#filt_alt = tsAge_un['2020':'2021'].resample('M').sum()
# Resample the age group data into weekly chunks and plot the weekly deaths as bars.
# NOTE(review): an earlier cell already rebinds tsAge_un to weekly sums, so when the
# notebook is run top to bottom the input here is weekly rather than raw daily data.
tsAge_w = tsAge_un.resample(rule='W').sum()
tsAge_w['AnzahlTodesfall'].plot.bar(subplots=True, figsize=(20, 10))

In [None]:
# Summary statistics of the fatality-percent column, per age group.
# NOTE(review): an earlier cell replaces tsAge_un with weekly sums of every column,
# so on a top-to-bottom run these are statistics of weekly *sums* of daily
# percentages, not of percentages themselves — confirm this is intended.
tsAge_un['7dRollingMean_DeathsPerCases_percent'].describe()

In [None]:
# Sums and rolling metrics per gender (Geschlecht) and reporting date.
gesch_ts = groupSum(_data=cov, _groups=['Geschlecht', 'Meldedatum'])
gesch_ts.tail(n=10)

In [None]:
# Move the first index level into the columns: one row per reporting date.
gesch_ts_un = gesch_ts.unstack(0)
gesch_ts_un.tail(n=10)

In [None]:
# Smoothed cases per death, one panel per Geschlecht value.
gesch_ts_un['7dRollingMean_CasesPerDeaths'].plot.area(
    subplots=True,
    figsize=(20, 10),
)

In [None]:
# Scratch cell for a planned seaborn strip plot (nothing is drawn yet).
# Example code (source reference missing):
# Import Data
# df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/mpg_ggplot2.csv")
# df_counts = df.groupby(['hwy', 'cty']).size().reset_index(name='counts')

# Draw Stripplot (unfinished draft: the `size=` argument still has no value)
#sns.stripplot(x=tsAge_un['Altersgruppe'], y=cov['Bundesland'], size=)
# Prints the seaborn documentation for stripplot into the cell output.
help(sns.stripplot)

# Decorations
# plt.title('Counts Plot - Size of circle is bigger as more points overlap', fontsize=22)


**Landkreise**

In [None]:
# All distinct Landkreis names that occur in the data.
cov.Landkreis.unique()

In [None]:
# Landkreise picked for the detailed weekly plot further down.
selected = [
    'LK Würzburg',
    'SK Würzburg',
    'LK Neustadt a.d.Aisch-Bad Windsheim',
    'LK Schwäbisch Hall',
    'LK Main-Tauber-Kreis',
    'LK Heilbronn',
    'SK Heilbronn',
]

**let's define a function for grouping and calculating rolling averages and case fatalities...**

In [None]:
# Group by Landkreis and reporting date, with sums, rolling means and case fatalities.
# The original call used `reformat`, which is not defined anywhere in this notebook
# and therefore raises a NameError on a fresh kernel. groupSum is the notebook's
# function for exactly this grouping step.
cov_ref = groupSum(cov, ['Landkreis','Meldedatum'])
cov_ref.head()

In [None]:
# Move the first index level into the columns.
# NOTE: cov_ref is rebound to its own unstacked form, so this cell must run exactly once per fresh cov_ref.
cov_ref = cov_ref.unstack(0)

In [None]:
# Weekly sums of every column ("weakly" in the variable name means weekly).
cov_ref_weakly = cov_ref.resample(rule='W').sum()

In [None]:
# Weekly deaths as bar charts, one panel per selected Landkreis.
cov_ref_weakly['AnzahlTodesfall'].loc[:, selected].plot.bar(
    subplots=True,
    figsize=(20, 10),
)

# ICU Data (DIVI)

**...now let's look at the situation in the ICUs - presumably this is where most COVID patients perish.**

In [None]:
# DIVI intensive care register (Intensivregister) figures
divi = pd.read_csv("../input/divi-bundesland-zeitreihe/bundesland-zeitreihe.csv",  parse_dates = ["Datum"])
# Use the date as index; the "Datum" column itself is kept as well.
divi.index = divi["Datum"]
# sort_index() returns a new frame instead of sorting in place, so the result has
# to be assigned (the original call discarded it and left the frame unsorted).
# mergesort is stable: rows sharing a date keep their order from the file.
divi = divi.sort_index(kind="mergesort")
divi.head(2)

In [None]:
# Keep only the rows whose 'Bundesland' value is 'DEUTSCHLAND'.
divi_de = divi.loc[divi["Bundesland"].eq('DEUTSCHLAND')]
# An alternative mask would be divi["Bundesland"].str.match("DEUTSCHLAND")
divi_de.head(n=2)

In [None]:
# Area chart of the 'Aktuelle_COVID_Faelle_Erwachsene_ITS' column for the DEUTSCHLAND rows.
divi_de['Aktuelle_COVID_Faelle_Erwachsene_ITS'].plot.area(
    figsize=(20, 2),
)

In [None]:
# Exact date of the peak: idxmax() returns the index label of the largest value.
print(
    divi_de['Aktuelle_COVID_Faelle_Erwachsene_ITS'].idxmax()
)

In [None]:
# Boolean row mask for HESSEN and the two columns to compare.
location = divi['Bundesland'].eq('HESSEN')
columns = [
    'Aktuelle_COVID_Faelle_Erwachsene_ITS',
    'Belegte_Intensivbetten_Erwachsene',
]
# Select rows and columns in one step and draw one area panel per column.
divi.loc[location, columns].plot.area(subplots=True, figsize=(20, 4))

# Vaccination Data ()

In [None]:
# imf = pd.read_excel('../input/covid-impfzahlen/COVID_Impfzahlen.xlsx')
