In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
# The generator flattens the (directory, files) pairs yielded by os.walk so that
# a single loop can print them in the same order as a nested walk would.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Explore the data**

In [None]:
# Load the provisional COVID-19 death counts (by age in years) into a DataFrame.
# The path literal is split in two for readability; adjacent string literals are
# concatenated, so the file that gets read is unchanged.
df = pd.read_csv(
    "/kaggle/input/provisional-covid19-deaths-counts-by-age-in-years/"
    "Provisional_COVID-19_Deaths_Counts_by_Age_in_Years.csv"
)
df

In [None]:
# Print the column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

# **Cleaning the data**
**Changes to the "Age Years" column**
* Remove "Years"
* Change "<1 year" to "0"
* Change "85 Years and over" to "86" (a numeric placeholder for the open-ended oldest group)
* Convert column data type to integers


In [None]:
# Strip the " Years" suffix from every age label so that mostly bare numbers remain.
df["Age Years"] = df["Age Years"].str.replace(pat=" Years", repl="")
df

In [None]:
# Map the two non-numeric age labels to numeric text in one chained expression:
#   "<1 year"     -> "0"
#   "85 and over" -> "86"  (the " Years" part of this label was removed in the previous cell)
df["Age Years"] = (
    df["Age Years"]
    .str.replace("<1 year", "0")
    .str.replace("85 and over", "86")
)
df

# **Analyse the data**
Deaths per age group

In [None]:
# Convert the cleaned age strings to a numeric dtype, then re-check the dtypes.
# With pd.to_numeric's default error handling, any value that is still not a
# number makes this cell raise instead of being silently coerced.
df["Age Years"] = pd.to_numeric(df["Age Years"])
df.info()

In [None]:
# Summary statistics (count, mean, spread, quantiles) for the frame's columns.
df.describe()

In [None]:
# Store bins in DataFrame: equal-frequency age bins, once with 4 bins and once
# with 10. precision=0 controls how many decimals are used for the bin edges.
for bin_column, bin_count in (("quartile", 4), ("decitile", 10)):
    df[bin_column] = pd.qcut(df["Age Years"], q=bin_count, precision=0)
df

In [None]:
# How the rows are distributed across the bins: quartile counts first, then a
# 50-character separator line, then the counts for the ten-bin split.
print(
    df["quartile"].value_counts(),
    "=" * 50,
    df["decitile"].value_counts(),
    sep="\n",
)

In [None]:
# Name bins: repeat the quantile cuts, this time attaching readable labels.
# NOTE(review): the label texts are hard-coded — presumably read off the bin edges
# shown by the unlabeled qcut columns; confirm they still match if the data changes.

# Four equal-frequency bins (cut points at every 25% of the rows).
quartile_edges = [0, 0.25, 0.5, 0.75, 1]
quartile_bin_labels = [
    "Age 0 - 21",
    "Age 22 - 42",
    "Age 43 - 64",
    "Age 65 and over",
]
df["quartile_category"] = pd.qcut(
    df["Age Years"],
    q=quartile_edges,
    labels=quartile_bin_labels,
)

# Ten equal-frequency bins (cut points at every 10% of the rows).
decile_edges = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
decitile_bin_labels = [
    "Age 0 - 8",
    "Age 9 - 17",
    "Age 18 - 25",
    "Age 26 - 34",
    "Age 35 - 42",
    "Age 43 - 51",
    "Age 52 - 60",
    "Age 61 - 68",
    "Age 69 - 77",
    "Age 78 and over",
]
df["decitile_category"] = pd.qcut(
    df["Age Years"],
    q=decile_edges,
    labels=decitile_bin_labels,
)

df

# **Plot the data**
* 4 different age categories
* 10 different age categories
* % of all COVID-19 deaths

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style
# Imported explicitly because later cells call matplotlib.ticker.StrMethodFormatter;
# without this line the attribute only resolves as a side effect of importing pyplot.
import matplotlib.ticker

# Apply the "bmh" style sheet to the figures drawn after this cell.
matplotlib.style.use("bmh")

In [None]:
# Sum the numeric columns within each of the four named age categories.
# numeric_only=True keeps non-numeric columns (e.g. the categorical bin columns
# added earlier) out of the aggregation: pandas >= 2.0 no longer drops them
# silently and raises a TypeError when asked to sum a categorical column.
# observed=False spells out the current default for categorical group keys.
quartile_df = df.groupby("quartile_category", observed=False).sum(numeric_only=True)

# Grouped bars: total deaths next to COVID-19 deaths for every age category.
ax = quartile_df.plot(
    kind="bar",
    figsize=(18, 8),
    y=["Total deaths", "COVID-19 Deaths"],
    rot=45,  # tilt the category labels on the x axis
)
ax.set_xlabel("Age category")
ax.set_ylabel("Number of deaths")
ax.set_title("Total deaths vs COVID-19 deaths for USA : 4 age categories")

# Format y tick labels as whole numbers with thousands separators.
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))


In [None]:
# Sum the numeric columns within each of the ten named age categories.
# numeric_only=True keeps non-numeric columns (e.g. the categorical bin columns
# added earlier) out of the aggregation: pandas >= 2.0 no longer drops them
# silently and raises a TypeError when asked to sum a categorical column.
# observed=False spells out the current default for categorical group keys.
decitile_df = df.groupby("decitile_category", observed=False).sum(numeric_only=True)

# Grouped bars: total deaths next to COVID-19 deaths for every age category.
ax = decitile_df.plot(
    kind="bar",
    figsize=(18, 8),
    y=["Total deaths", "COVID-19 Deaths"],
    rot=45,  # tilt the category labels on the x axis
)
ax.set_xlabel("Age category")
ax.set_ylabel("Number of deaths")
ax.set_title("Total deaths vs COVID-19 deaths for USA : 10 age categories")

# Format y tick labels as whole numbers with thousands separators.
ax.yaxis.set_major_formatter(matplotlib.ticker.StrMethodFormatter('{x:,.0f}'))


In [None]:
# Each age category shown as a percentage of total COVID-19 deaths : 4 age categories
# (category sum divided by the sum over all categories, rounded to 2 decimals).
covid_deaths_by_quartile = quartile_df["COVID-19 Deaths"]
quartile_share = covid_deaths_by_quartile / covid_deaths_by_quartile.sum() * 100
quartile_df["% of all COVID-19 deaths"] = quartile_share.round(2)
quartile_df[["% of all COVID-19 deaths"]]

In [None]:
# Bar chart of each age category's share of all COVID-19 deaths (4 categories).
ax = quartile_df.plot(
    figsize=(18, 10),
    kind="bar",
    y="% of all COVID-19 deaths",
    rot=45,  # tilt the category labels on the x axis
)
ax.set_xlabel("Age category")
ax.set_ylabel("Percentage of deaths")  # label previously misspelled "Perenctage"
# Trailing semicolon keeps the returned Text object from being echoed as cell output.
ax.set_title("Percentage of deaths per age category");

In [None]:
# Each age category shown as a percentage of total COVID-19 deaths : 10 age categories
# (category sum divided by the sum over all categories, rounded to 2 decimals).
covid_deaths_by_decile = decitile_df["COVID-19 Deaths"]
decile_share = covid_deaths_by_decile / covid_deaths_by_decile.sum() * 100
decitile_df["% of all COVID-19 deaths"] = decile_share.round(2)
decitile_df[["% of all COVID-19 deaths"]]

In [None]:
# Bar chart of each age category's share of all COVID-19 deaths (10 categories).
ax = decitile_df.plot(
    figsize=(18, 10),
    kind="bar",
    y="% of all COVID-19 deaths",
    rot=45,  # tilt the category labels on the x axis
)
ax.set_xlabel("Age category")
ax.set_ylabel("Percentage of deaths")  # label previously misspelled "Perenctage"
# Trailing semicolon keeps the returned Text object from being echoed as cell output.
ax.set_title("Percentage of deaths per age category");