# Table of Contents
* [Load / Data Preparation](#1)
* [Select and Explore Subset](#2)
* [p-Scores by Age Group](#3)
* [Historical Development (looking for trends)](#4)

In [None]:
# PACKAGES

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt

<a id='1'></a>
# Load / Data Preparation

In [None]:
# Load the excess-mortality CSV into a DataFrame and print its column overview
csv_path = '../input/excess-mortality-during-the-covid19-pandemic/excess_mortality.csv'
df = pd.read_csv(csv_path)
df.info()

In [None]:
# Parse the date column once, store it back, and derive calendar features from it
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month
df['day'] = parsed_dates.dt.day

In [None]:
# Location stats: one row per location with its time unit (via the 'first'
# aggregation) and the number of non-null time_unit entries.
loc_stats = df.groupby('location', as_index=False).agg(
    time_unit = pd.NamedAgg(column='time_unit', aggfunc='first'),
    n_values = pd.NamedAgg(column='time_unit', aggfunc='count'))

# Lift the row limit only while printing the full table. The context manager
# restores the option even if printing fails or the cell is interrupted, so the
# unlimited setting cannot leak into later cells.
with pd.option_context('display.max_rows', None):
    print(loc_stats)

# Compact display for the remaining cells
pd.set_option('display.max_rows', 10)

<a id='2'></a>
# Select and Explore Subset

In [None]:
# Choose the location and calendar year to analyse
my_loc = 'Germany'
my_year = 2020
# Keep only the rows matching both criteria; copy so new columns can be added
# to the subset without affecting df
row_mask = df['location'].eq(my_loc) & df['year'].eq(my_year)
df_select = df.loc[row_mask].copy()
# display the selected rows
df_select

In [None]:
# Scatter the 2020 deaths against the 2015-2019 average over the selected dates
my_title = 'Location {}'.format(my_loc)
fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(df_select['date'], df_select['average_deaths_2015_2019_all_ages'], label='Avg 2015-2019')
ax.scatter(df_select['date'], df_select['deaths_2020_all_ages'], label='2020')
ax.set_title(my_title)
ax.grid()
ax.legend(loc='upper left')
plt.show()

In [None]:
# Absolute excess = 2020 deaths minus the 2015-2019 average;
# relative excess = absolute excess divided by that same average
baseline = df_select['average_deaths_2015_2019_all_ages']
df_select['excess_deaths'] = df_select['deaths_2020_all_ages'] - baseline
df_select['excess_deaths_rel'] = df_select['excess_deaths'] / baseline

In [None]:
# Scatter the absolute excess deaths over time
my_title = 'Location {} - Nominal Excess Deaths (absolute)'.format(my_loc)
fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(df_select['date'], df_select['excess_deaths'])
ax.set_title(my_title)
ax.grid()
plt.show()

In [None]:
# Scatter the relative excess deaths over time
my_title = 'Location {} - Nominal Excess Deaths (relative)'.format(my_loc)
fig, ax = plt.subplots(figsize=(12,6))
ax.scatter(df_select['date'], df_select['excess_deaths_rel'])
ax.set_title(my_title)
ax.grid()
plt.show()

In [None]:
# Sum the per-period figures of the selected rows into annual totals
deaths_2020 = df_select['deaths_2020_all_ages'].sum()
deaths_prev = df_select['average_deaths_2015_2019_all_ages'].sum()
annual_excess = deaths_2020 - deaths_prev

# Report the totals, their difference and the deviation in percent
print('Annual View:')
print('============')
print('Deaths 2020          :', int(deaths_2020))
print('Avg.Deaths 2015-2019 :', deaths_prev)
print('Difference           :', annual_excess)
print('Relative Deviation % :', np.round(100*annual_excess/deaths_prev,2))

<a id='3'></a>
# p-Scores by Age Group

#### The p-Score is the relative deviation between the number of deaths in 2020–2021 (weekly or monthly) and the average number of deaths in the benchmark period 2015–2019 (weekly or monthly).

In [None]:
# One scatter series per age-group p-score column, labelled with the column name
my_title = 'Location {} | p-scores by age group'.format(my_loc)
fig, ax = plt.subplots(figsize=(12,6))
for score_col in ('p_scores_15_64', 'p_scores_65_74', 'p_scores_75_84', 'p_scores_85plus'):
    ax.scatter(df_select['date'], df_select[score_col], label=score_col)
ax.set_title(my_title)
ax.grid()
ax.legend(loc='upper left')
plt.show()

<a id='4'></a>
# Historical Development (looking for trends)

In [None]:
# Names of the all-ages death columns for the years 2010 through 2019
history = ['deaths_{}_all_ages'.format(year) for year in range(2010, 2020)]

In [None]:
# Column-wise totals over the selected rows: 2010-2019 followed by 2020
yearly_columns = history + ['deaths_2020_all_ages']
deaths_development = df_select[yearly_columns].sum()
print(deaths_development)

In [None]:
# plot development: one point per year column (2010-2020)
plt.scatter(range(2010,2021), deaths_development.values)
# Anchor the y-axis at zero but let the upper limit follow the data. A fixed
# upper limit of 1e6 hides the points for locations with more than a million
# deaths per year and flattens the series for small locations.
plt.ylim(bottom=0)
plt.title(my_loc + ' - Deaths per year')
plt.xlabel('Year')
plt.ylabel('Deaths')
plt.grid()

### We see a general upward trend in 2010-2019 (probably driven by an aging population), so we have to be somewhat careful when interpreting the nominal excess deaths! However, we are using only the years 2015-2019 as the benchmark for comparison, which reduces the impact of the trend:

In [None]:
# Drop the first five entries (2010-2014) by position, keeping 2015 onwards
deaths_development_short_term = deaths_development.iloc[5:]
deaths_development_short_term

In [None]:
# Cross-check the 2015-2019 average used above: mean of the first five entries
deaths_development_short_term.iloc[:5].mean()

In [None]:
# plot development for 2015+ only (used for average figures above)
plt.scatter(range(2015,2021), deaths_development_short_term.values)
# Anchor the y-axis at zero but let the upper limit follow the data. A fixed
# upper limit of 1e6 hides the points for locations with more than a million
# deaths per year and flattens the series for small locations.
plt.ylim(bottom=0)
plt.title(my_loc + ' - Deaths per year')
plt.xlabel('Year')
plt.ylabel('Deaths')
plt.grid()

In [None]:
# Same 2015-2020 series with automatic y-limits, so differences between years show
fig, ax = plt.subplots()
ax.scatter(range(2015,2021), deaths_development_short_term.values)
ax.set_title('{} - Deaths per year'.format(my_loc))
ax.grid()