In this notebook, I will continue from the last part and do some data visualisation.

In [None]:
# import modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import datetime

In [None]:
# Read the cleaned dataframe saved by the previous part of this analysis.
# Note: the date column comes back from the CSV unparsed.
df = pd.read_csv("../input/covid-cases-deaths-vaccine-doses-23082021/covid_data_cleaned.csv")

Saving the dataframe to CSV last time turned the dates back into plain objects (strings), so I need to parse them again.

In [None]:
# The CSV round trip left the dates unparsed, so convert them back to
# datetimes before doing any EDA.
df["date_parsed"] = pd.to_datetime(df["date_parsed"], format="%Y/%m/%d")

# Every distinct country name that appears in the data.
country_list = df["Country"].unique()

Firstly, I am going to create a few functions that will help compare the data from different countries.

In [None]:
# Function, create a dataframe for new cases in given countries
def new_cases_df(countries):
    """Build a table of new cases per 100 with one column per country.

    The rows are labelled with the unique values of ``df.date_parsed``.
    Each country's values are taken in the order its rows appear in ``df``,
    so every country must contribute as many values as there are unique
    dates.
    """
    per_country = {
        name: list(df.loc[df["Country"] == name, "New_cases_per_100"])
        for name in countries
    }
    return pd.DataFrame(per_country, index=df["date_parsed"].unique())

# Give the plot for the dataframe above 
def new_cases_timeplot(countries):
    """Draw one line per country showing new cases per 100 over time."""
    _, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=new_cases_df(countries), ax=ax)
    ax.set_title("New cases per 100 in selected countries")
    ax.set_xlabel("Months")
    ax.set_ylabel("New cases per 100")
    plt.show()

In [None]:
# do the same for new deaths
def new_deaths_df(countries):
    """Build a table of new deaths per 100 with one column per country.

    The rows are labelled with the unique values of ``df.date_parsed``;
    each country's values keep the order in which its rows appear in ``df``.
    """
    dates = df["date_parsed"].unique()
    deaths_by_country = {}
    for name in countries:
        country_rows = df[df["Country"] == name]
        deaths_by_country[name] = list(country_rows["New_deaths_per_100"])
    return pd.DataFrame(deaths_by_country, index=dates)

def new_deaths_timeplot(countries):
    """Draw one line per country showing new deaths per 100 over time."""
    _, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=new_deaths_df(countries), ax=ax)
    ax.set(
        title="New deaths per 100 in selected countries",
        xlabel="Months",
        ylabel="New deaths per 100",
    )
    plt.show()

In [None]:
# cumulative cases
def cul_cases_df(countries):
    """Build a table of cumulative cases per 100 with one column per country.

    The rows are labelled with the unique values of ``df.date_parsed``;
    each country's values keep the order in which its rows appear in ``df``.
    """
    column = "Cumulative_cases_per_100"
    totals = {
        name: list(df.loc[df["Country"] == name, column])
        for name in countries
    }
    return pd.DataFrame(totals, index=df["date_parsed"].unique())

def cul_cases_timeplot(countries):
    """Draw one line per country showing cumulative cases per 100 over time."""
    _, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=cul_cases_df(countries), ax=ax)
    ax.set_title("Cumulative cases per 100 in selected countries")
    ax.set_xlabel("Months")
    ax.set_ylabel("Cumulative cases per 100")
    plt.show()

In [None]:
# cumulative cases
def cul_cases_df(countries):
    """Return cumulative cases per 100 as a frame with one column per country.

    The index is the array of unique ``df.date_parsed`` values; each column
    holds that country's ``Cumulative_cases_per_100`` values in row order.
    """
    dates = df["date_parsed"].unique()
    by_country = {}
    for name in countries:
        is_country = df["Country"] == name
        by_country[name] = list(df[is_country]["Cumulative_cases_per_100"])
    return pd.DataFrame(by_country, index=dates)

def cul_cases_timeplot(countries):
    """Plot cumulative cases per 100 over time, one line per country."""
    plt.figure(figsize=(16, 6))
    axis = sns.lineplot(data=cul_cases_df(countries))
    axis.set(
        title="Cumulative cases per 100 in selected countries",
        xlabel="Months",
        ylabel="Cumulative cases per 100",
    )
    plt.show()

In [None]:
# cumulative deaths
def cul_deaths_df(countries):
    """Build a table of cumulative deaths per 100 with one column per country.

    The rows are labelled with the unique values of ``df.date_parsed``;
    each country's values keep the order in which its rows appear in ``df``.
    """
    column = "Cumulative_deaths_per_100"
    totals = {
        name: list(df[df["Country"] == name][column])
        for name in countries
    }
    return pd.DataFrame(totals, index=df["date_parsed"].unique())

def cul_deaths_timeplot(countries):
    """Draw one line per country showing cumulative deaths per 100 over time."""
    _, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=cul_deaths_df(countries), ax=ax)
    ax.set_title("Cumulative deaths per 100 in selected countries")
    ax.set_xlabel("Months")
    ax.set_ylabel("Cumulative deaths per 100")
    plt.show()

In [None]:
# and cumulative vaccinations
def vacc_df(countries):
    """Build a table of total vaccinations per hundred, one column per country.

    The rows are labelled with the unique values of ``df.date_parsed``;
    each country's values keep the order in which its rows appear in ``df``.
    """
    dates = df["date_parsed"].unique()
    doses = {}
    for name in countries:
        country_rows = df.loc[df["Country"] == name]
        doses[name] = list(country_rows["total_vaccinations_per_hundred"])
    return pd.DataFrame(doses, index=dates)

def vacc_timeplot(countries):
    """Draw one line per country showing vaccine doses given per 100 over time."""
    _, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=vacc_df(countries), ax=ax)
    ax.set(
        title="Vaccine doses given per 100 in selected countries",
        xlabel="Months",
        ylabel="Vaccine doses given per 100",
    )
    plt.show()

As a test of the functions, I will compare the UK, India, Italy, and Malaysia, but the functions should work for any list of countries.

In [None]:
# Example: compare UK, India, Italy, and Malaysia (can be replaced by any list of countries in country_list)
selected_countries = ["United Kingdom", "India", "Italy","Malaysia"]
# New cases per 100 over time for the selected countries
new_cases_timeplot(selected_countries)

In [None]:
# New deaths per 100 over time for the selected countries
new_deaths_timeplot(selected_countries)

In [None]:
# Cumulative cases per 100 over time for the selected countries
cul_cases_timeplot(selected_countries)

In [None]:
# Cumulative deaths per 100 over time for the selected countries
cul_deaths_timeplot(selected_countries)

In [None]:
# Vaccine doses given per 100 over time for the selected countries
vacc_timeplot(selected_countries)

Now, I want to analyse the relationship between new cases and new deaths in a single country at different vaccination stages. The new cases and new deaths columns will be useful here. For clarity, I will standardise the distributions of the new cases and new deaths.

In [None]:
# combines new cases and deaths of a single country
def cases_deaths(country):
    """Return standardised new cases and new deaths for a single country.

    The result has the columns ``<country>_Cases`` and ``<country>_Deaths``.
    Each column has its own mean subtracted and is divided by its own
    standard deviation.
    """
    cases = new_cases_df([country])
    deaths = new_deaths_df([country])
    combined = cases.join(deaths, lsuffix='_Cases', rsuffix='_Deaths')
    centred = combined - combined.mean()
    return centred / combined.std()


The following function was created by André Araújo. It adds time-lagged copies of each column of the given dataframe. An increase in the number of cases does not immediately cause an increase in the number of deaths; the effect only shows up after a time lag. The function is therefore useful for finding a better (lagged) correlation between the two.

Source: https://www.kaggle.com/dedecu/cross-correlation-time-lag-with-pandas

In [None]:
# try to shift cases to achieve better correlation
def df_derived_by_shift(df, lag=0, NON_DER=None):
    """Return a copy of ``df`` extended with lagged copies of its columns.

    For every column that is not listed in ``NON_DER`` the result gains the
    columns ``<name>_1`` ... ``<name>_<lag>``, where ``<name>_i`` is the
    original column shifted down by ``i`` rows (the first ``i`` rows become
    missing values).

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is never modified.
    lag : int, default 0
        Largest shift to generate. With 0, or any value below 1, a plain
        copy of ``df`` is returned.
    NON_DER : iterable of column labels, optional
        Columns that must not be lagged. ``None`` means every column is
        lagged.

    Returns
    -------
    pandas.DataFrame
        The original columns, followed by the lagged columns grouped by
        source column (``a_1``, ``a_2``, ..., ``b_1``, ``b_2``, ...).
    """
    df = df.copy()
    if not lag:
        return df

    # A None default avoids sharing one mutable list between calls.
    excluded = () if NON_DER is None else NON_DER

    lagged = []
    for name in df.columns:
        if name in excluded:
            continue
        source = df[name]
        for i in range(1, lag + 1):
            lagged.append(source.shift(periods=i).rename('{}_{}'.format(name, i)))

    # One concat for all new columns instead of growing the frame per column.
    return pd.concat([df, *lagged], axis=1)


As an experiment, I'm using a specific country to test the code (UK), and attempt to draw a graph.

In [None]:
# Standardised UK cases/deaths, with the date index moved into a "Date" column
df_uk = cases_deaths("United Kingdom").reset_index().rename(columns={"index": "Date"})
# Add lags 1..30 of every column except "Date", then make "Date" the index again
df_uk_lagged = df_derived_by_shift(df_uk, 30, ["Date"]).set_index("Date")
# Display the lagged frame
df_uk_lagged

In [None]:
# Now consider the correlation matrix between all original and lagged columns
corr = df_uk_lagged.corr()
# Display the matrix
corr

In [None]:
# Want the maximum correlation between deaths and a lagged case number:
# take the deaths column of the matrix for the rows from lag 1 to lag 30 of
# cases, sorted so that the strongest correlation comes first
case_lag = corr.loc["United Kingdom_Cases_1":"United Kingdom_Cases_30", "United Kingdom_Deaths"].sort_values(ascending=False)

In [None]:
# the lag that produces the highest correlation
# (take the text after the last underscore, so a single-digit lag is shown
# as "5" rather than "_5")
case_lag.index[0].rsplit("_", 1)[-1]

In [None]:
# Bring the UK vaccination column in alongside the lagged cases/deaths and
# give it a descriptive name.
uk_vaccinations = vacc_df(["United Kingdom"]).rename(columns={"United Kingdom": "Vaccine per 100"})
df_uk_lagged_with_vacc = df_uk_lagged.join(uk_vaccinations)

The following plot allows us to see the relationship between case and death number in the UK, under different vaccination stages.

In [None]:
# Use the 21st lag (best correlation) to produce a plot: one regression line
# of deaths against lagged cases for each vaccination stage.
uk_doses = df_uk_lagged_with_vacc["Vaccine per 100"]
uk_stages = [
    ("no vaccine", uk_doses == 0),
    ("low vaxxed population", (uk_doses > 0) & (uk_doses <= 10)),
    ("medium vaxxed population", (uk_doses > 10) & (uk_doses <= 60)),
    ("high vaxxed population", (uk_doses > 60) & (uk_doses <= 100)),
    ("very high vaxxed population", uk_doses > 100),
]

plt.figure(figsize=(16, 16))
for stage_label, in_stage in uk_stages:
    stage_rows = df_uk_lagged_with_vacc.loc[in_stage]
    sns.regplot(
        x=stage_rows["United Kingdom_Cases_21"],
        y=stage_rows["United Kingdom_Deaths"],
        label=stage_label,
        scatter=False,
    )
plt.xlabel("Cases per 100 (lagged)")
plt.ylabel("Deaths per 100")
plt.title("Cases versus Deaths in Different Vaccination Stage")
plt.legend()
plt.show()

Now I will do it in the general case.

In [None]:
# First compute the lagged correlation in the given country by introduce a function
def lag_corr(country, max_lag=60):
    """Find the case lag that correlates best with deaths for one country.

    The standardised case series from ``cases_deaths(country)`` is shifted by
    1 .. ``max_lag`` rows, and each shifted copy is correlated with the
    unshifted death series.

    Parameters
    ----------
    country : str
        Country name as used in the ``Country`` column of the data.
    max_lag : int, default 60
        Largest lag (in rows) to try; must be at least 1.

    Returns
    -------
    str
        The best lag written as a decimal string (for example ``"21"``),
        ready to be appended to ``<country>_Cases_``.

    Raises
    ------
    ValueError
        If ``max_lag`` is smaller than 1.
    """
    if max_lag < 1:
        raise ValueError("max_lag must be at least 1")

    cases_col = country + "_Cases"
    deaths_col = country + "_Deaths"

    # Only the case series needs lagged copies. Leaving the death column
    # un-lagged halves the correlation matrix without changing any of the
    # correlations that are read below.
    lagged = df_derived_by_shift(cases_deaths(country), max_lag, [deaths_col])
    corr = lagged.corr()

    lag_labels = ["{}_{}".format(cases_col, i) for i in range(1, max_lag + 1)]
    case_lag = corr.loc[lag_labels, deaths_col].sort_values(ascending=False)

    # The lag is whatever follows the last underscore of the winning label;
    # this works for any number of digits.
    return case_lag.index[0].rsplit("_", 1)[1]


In [None]:
# use the lag to define the next function
def case_death_plot(country):
    """Plot deaths against lagged cases, one regression line per vaccination stage.

    The standardised case series of ``country`` is shifted by the lag that
    ``lag_corr`` reports as best correlated with deaths. The rows are then
    split by their "Vaccine per 100" value into five stages (0, up to 10,
    up to 60, up to 100, above 100), and a regression line of deaths on
    lagged cases is drawn for each stage that has data.

    Parameters
    ----------
    country : str
        Country name as used in the ``Country`` column of the data.
    """
    deaths_col = country + "_Deaths"
    lagged_col = country + "_Cases_lagged"

    # Ask for the best lag once and shift by exactly that amount. Building a
    # fixed set of 30 lag columns would fail whenever the best lag is larger
    # than 30.
    best_lag = int(lag_corr(country))

    data = cases_deaths(country)
    data[lagged_col] = data[country + "_Cases"].shift(periods=best_lag)
    vaccinations = vacc_df([country]).rename(columns={country: "Vaccine per 100"})
    data = data.join(vaccinations)

    doses = data["Vaccine per 100"]
    stages = [
        ("no vaccine", doses == 0),
        ("low vaxxed population", (doses > 0) & (doses <= 10)),
        ("medium vaxxed population", (doses > 10) & (doses <= 60)),
        ("high vaxxed population", (doses > 60) & (doses <= 100)),
        ("very high vaxxed population", doses > 100),
    ]

    plt.figure(figsize=(16, 16))
    plt.xlim(0, None)
    for label, in_stage in stages:
        stage_rows = data.loc[in_stage, [lagged_col, deaths_col]].dropna()
        # No line can be fitted to a stage the country never reached (or for
        # which every row is missing a value), so skip it.
        if stage_rows.empty:
            continue
        sns.regplot(
            x=stage_rows[lagged_col],
            y=stage_rows[deaths_col],
            label=label,
            truncate=False,
            scatter=False,
        )
    plt.xlabel("Cases per 100 (lagged)")
    plt.ylabel("Deaths per 100")
    plt.title("Cases versus Deaths in Different Vaccination Stage for "+country)
    plt.legend()
    plt.show()
    
    

The function is now complete. I will apply it to a few countries to try to see a trend.

In [None]:
# A few examples: cases-versus-deaths regression plot for India
case_death_plot("India")

In [None]:
# Cases-versus-deaths regression plot for Italy
case_death_plot("Italy")

In [None]:
# Cases-versus-deaths regression plot for the United Kingdom
case_death_plot("United Kingdom")

In [None]:
# Cases-versus-deaths regression plot for the United States
case_death_plot("United States")

In [None]:
# Cases-versus-deaths regression plot for South Africa
case_death_plot("South Africa")

In [None]:
# Cases-versus-deaths regression plot for Canada
case_death_plot("Canada")

In [None]:
# Cases-versus-deaths regression plot for China
case_death_plot("China")

In [None]:
# Cases-versus-deaths regression plot for Australia
case_death_plot("Australia")

For all these countries, we can see that as vaccination increases, the gradient of deaths against cases decreases in most cases. This suggests that a high vaccination level tends to go with a lower death rate. However, exceptions tend to appear at low or medium vaccination levels. This is likely because most infected people were not vaccinated, because quarantine rules in the country were eased, or because of an outbreak originating in another country. Nevertheless, we can conclude that once the vaccination rate is high enough, it can effectively lower the death rate even when the case number is high.