In [None]:
# init: attach the tidyverse (dplyr verbs and ggplot2 are used below) and
# list the contents of the input directory
library(tidyverse)
list.files(path = "../input")

In [None]:
# read the COVID-19 case records and the demographics table from CSV
df <- read.csv(file = '../input/covid19-tracking-germany/covid_de.csv')
df_demo <- read.csv(file = '../input/covid19-tracking-germany/demographics_de.csv')

In [None]:
# convert the date column from text (ISO year-month-day) to Date, then
# inspect the resulting structure
df[['date']] <- as.Date(df[['date']], format = '%Y-%m-%d')
str(df)

In [None]:
# inspect the structure (columns and their types) of the demographics table
str(df_demo)

### Be careful: Values for cases and deaths are incremental (new counts per row), not cumulative!

In [None]:
# calc and add aggregated (cumulative) versions of cases and deaths.
# Rows are ordered by date, then the incremental counts are accumulated
# within each state/county/gender/age_group combination.
# There is deliberately no blanket suppressWarnings() here: a warning raised
# by this pipeline points at a data problem and should stay visible.
# Note: df remains grouped by state, county, gender and age_group afterwards.
df <- df %>%
    group_by(state, county, gender, age_group) %>%
    arrange(date) %>%
    mutate(cases_agg = cumsum(cases), deaths_agg = cumsum(deaths))

In [None]:
# check if aggregation is plausible by drilling down to a single
# county / gender / age group.
# filter() needs element-wise conditions: comma-separated conditions are
# combined with a vectorised AND. The scalar `&&` operator only inspects a
# single element and raises an error for longer vectors in R >= 4.3.
check1 <- dplyr::filter(
    df,
    county == 'SK Muenchen',
    gender == 'M',
    age_group == '60-79'
)
check1

In [None]:
# cumulative case counts over time for the drill-down subset, with grid lines
plot(
    x = check1$date,
    y = check1$cases_agg,
    main = 'Cases in Munich/Bavaria - Male 60-79'
)
grid()

In [None]:
# different cross section: all rows of one county on a single day
select_date <- '2020-03-29'
check2 <- df %>%
    dplyr::filter(county == 'SK Muenchen') %>%
    dplyr::filter(date == select_date)
check2

In [None]:
# cases in Munich on 2020-03-29 (cumulative counts summed over the selected rows):
sum(check2[['cases_agg']])

In [None]:
# check one day later: same county, next date
select_date <- '2020-03-30'
check2 <- df %>%
    dplyr::filter(county == 'SK Muenchen') %>%
    dplyr::filter(date == select_date)
check2

In [None]:
# cases in Munich on 2020-03-30 (cumulative counts summed over the selected rows):
sum(check2[['cases_agg']])

In [None]:
# check availability of data per date: number of rows recorded for each date
check_dates <- df %>%
    dplyr::group_by(date) %>%
    dplyr::summarise(n_values = dplyr::n())
check_dates

In [None]:
# row count per date, drawn as points joined by lines, with grid lines
plot(
    x = check_dates$date,
    y = check_dates$n_values,
    type = 'b',
    main = 'Number of values per date'
)
grid()

### The most recent date is incomplete. But there are further recurring dips, e.g. on 2020-03-22...

In [None]:
# get most recent available date per county
date_stats <- df %>%
    dplyr::group_by(county) %>%
    dplyr::summarise(md = max(date))
date_stats

In [None]:
# overview of data timeliness: how many counties share each most-recent date
table(date_stats[['md']])

In [None]:
# calc summary by state: total cases, total deaths and the deaths/cases
# ratio rounded to 4 decimals
df_stats_by_state <- df %>%
    dplyr::group_by(state) %>%
    dplyr::summarise(total_cases = sum(cases), total_deaths = sum(deaths)) %>%
    dplyr::mutate(death_ratio = round(total_deaths / total_cases, 4))
df_stats_by_state

# Demographic Data

In [None]:
# display the demographics table
df_demo

In [None]:
# check overall population: sum over all rows of the demographics table
population_germany <- sum(df_demo[['population']])
print(population_germany)

In [None]:
# aggregation by state: total population and share of the overall population
df_demo_state <- df_demo %>%
    dplyr::group_by(state) %>%
    dplyr::summarise(s = sum(population))
df_demo_state$percentage <- df_demo_state$s / population_germany
df_demo_state

# sorted vector of the state names found in the COVID data.
# read.csv() returns character columns by default since R 4.0, and levels()
# of a character vector is NULL; sort(unique(...)) works for both character
# and factor columns.
states <- sort(unique(as.character(df$state)))

In [None]:
# bar chart of the total population per state (bar height = value of s)
ggplot(df_demo_state, aes(x = state, y = s, fill = state)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle('Population per State')

In [None]:
# combine COVID stats + demographics per state.
# The two tables are matched on the state name instead of by row position,
# and the numeric columns stay numeric: cbind() together with a character
# vector coerces every column to text, which makes the bar chart of
# cases_vs_pop_percent treat the values as discrete labels.
df_combined_state <- df_demo_state %>%
    dplyr::inner_join(df_stats_by_state, by = 'state') %>%
    dplyr::transmute(
        state,
        population = s,
        cases = total_cases,
        cases_vs_pop_percent = round(100 * total_cases / s, 4)
    ) %>%
    as.data.frame()
df_combined_state

In [None]:
# bar chart of total cases relative to population, one bar per state
ggplot(df_combined_state, aes(x = state, y = cases_vs_pop_percent, fill = state)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle('Total cases per population in %')

#### => Bayern (Bavaria) not only has the most cases but also the highest percentage of cases relative to its population.

In [None]:
# summary by state and gender: head count per state/gender plus its share of
# the state's population.
# The join brings in df_demo_state's own `percentage` column; it is
# overwritten with the in-state share before the columns are renamed.
df_demo_state_gender <- df_demo %>%
    dplyr::group_by(state, gender) %>%
    dplyr::summarise(count = sum(population)) %>%
    dplyr::left_join(df_demo_state, by = 'state') %>%
    dplyr::mutate(percentage = count / s) %>%
    dplyr::rename(state_population = s, percentage_in_state = percentage)
df_demo_state_gender

In [None]:
# summary by state and age-group: head count per state/age group plus its
# share of the state's population.
# The join brings in df_demo_state's own `percentage` column; it is
# overwritten with the in-state share before the columns are renamed.
df_demo_state_age <- df_demo %>%
    dplyr::group_by(state, age_group) %>%
    dplyr::summarise(count = sum(population)) %>%
    dplyr::left_join(df_demo_state, by = 'state') %>%
    dplyr::mutate(percentage = count / s) %>%
    dplyr::rename(state_population = s, percentage_in_state = percentage)
df_demo_state_age

In [None]:
# plot age distribution for each state.
# The loop iterates directly over the state names present in the table being
# plotted, so it needs neither a manual index counter nor a separately built
# name vector (which is empty when the state column is not a factor).
for (sel_state in unique(as.character(df_demo_state_age$state))) {
    state_rows <- dplyr::filter(df_demo_state_age, state == sel_state)
    # state_population is repeated on every row of a state; take the first
    my_title <- paste0(sel_state, '; population = ', state_rows$state_population[1])
    g <- ggplot(state_rows, aes(x = age_group, y = percentage_in_state)) +
        geom_bar(stat = 'identity') +
        ggtitle(my_title) +
        theme(axis.text.x = element_text(angle = 90))
    # ggplot objects are not auto-printed inside a loop
    print(g)
}

In [None]:
# percentage of oldest group (80-99) per state: keep only that age group's rows
df_demo_state_oldest <- df_demo_state_age %>%
    dplyr::filter(age_group == '80-99')
df_demo_state_oldest

In [None]:
# bar chart of the 80-99 age group's share of the population, one bar per state
ggplot(df_demo_state_oldest, aes(x = state, y = percentage_in_state, fill = state)) +
    geom_col() +
    theme(axis.text.x = element_text(angle = 90)) +
    ggtitle('Percentage of age group 80-99 per state')

=> Sachsen (Saxony) has the highest percentage of people aged 80 and over, Berlin the lowest.