In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Check data

In [None]:
# Read the raw CSV (the 'date' column is parsed as datetimes) and preview the first rows.
RAW_DATA_CSV = '../input/impact-of-covid19-pandemic-on-the-global-economy/raw_data.csv'
covid_impact = pd.read_csv(RAW_DATA_CSV, parse_dates=['date'])
covid_impact.head()

In [None]:
# Display the column labels of the freshly loaded frame.
covid_impact.columns

In [None]:
# Remove every column whose label matches the regex 'Unnamed', then show what is left.
unnamed_columns = covid_impact.filter(regex='Unnamed').columns
covid_impact = covid_impact.drop(columns=unnamed_columns)
covid_impact.columns

In [None]:
# Check date range.
# Series.min()/.max() are vectorised and skip missing dates (NaT); the builtin
# min()/max() iterate row by row and give order-dependent results when NaT is present.
covid_impact['date'].min(), covid_impact['date'].max()

# Explore data

In [None]:
# Print the frame summary (column dtypes and non-null counts) to spot missing data.
covid_impact.info()

# By Country

## Total_cases

### Raw numbers

In [None]:
# Colour the five locations that rank first by total_cases; draw all others in light grey.
cases_ranked = covid_impact.sort_values(by='total_cases', ascending=False)
ranked_locations = cases_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.lineplot(data=cases_ranked, x='date', y='total_cases', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: As reported in the news, USA was the highest, followed by India and Brazil

### Normalized by population

In [None]:
# Cases per capita: total_cases divided by the location's population.
covid_impact['total_cases_pop'] = covid_impact['total_cases'].div(covid_impact['population'])

# Colour the five locations that rank first by total_cases_pop; draw all others in light grey.
cases_pop_ranked = covid_impact.sort_values(by='total_cases_pop', ascending=False)
ranked_locations = cases_pop_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.lineplot(data=cases_pop_ranked, x='date', y='total_cases_pop', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: In countries with a small population, even a low number of cases translates into a high share of the population

## Total_deaths

### Raw numbers

In [None]:
# Colour the five locations that rank first by total_deaths; draw all others in light grey.
deaths_ranked = covid_impact.sort_values(by='total_deaths', ascending=False)
ranked_locations = deaths_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.lineplot(data=deaths_ranked, x='date', y='total_deaths', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: USA had the highest deaths, followed by Brazil and India

### Normalized by population

In [None]:
# Deaths per capita: total_deaths divided by the location's population.
covid_impact['total_deaths_pop'] = covid_impact['total_deaths'].div(covid_impact['population'])

# Colour the five locations that rank first by total_deaths_pop; draw all others in light grey.
deaths_pop_ranked = covid_impact.sort_values(by='total_deaths_pop', ascending=False)
ranked_locations = deaths_pop_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.lineplot(data=deaths_pop_ranked, x='date', y='total_deaths_pop', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: Again, countries with low population showed higher deaths per capita even when the deaths were low

## Death rate (total_death / total_cases)
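The death rate is computed from running totals: total deaths to date divided by total cases to date, counted since the first day of COVID-19 in each country. It therefore decreases when new cases accumulate faster than new deaths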

In [None]:
# Death rate: total_deaths divided by total_cases on the same row.
covid_impact['death_rate'] = covid_impact['total_deaths'].div(covid_impact['total_cases'])

# Colour the five locations that rank first by death_rate; draw all others in light grey.
death_rate_ranked = covid_impact.sort_values(by='death_rate', ascending=False)
ranked_locations = death_rate_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.lineplot(data=death_rate_ranked, x='date', y='death_rate', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

The "top 5" were hit hardest in the beginning and the death rate dropped down in a month. Either they had figured out how to give better treatment or the new patients were younger and more likely to recover

In [None]:
# Summary statistics of death_rate across all rows (every location and date pooled together).
covid_impact['death_rate'].describe()

Comment: When checking statistics, the average death rate was at 3.7%, with the 75th percentile at 4.6%

# Stringency index

What is Stringency Index?

*     It is among the metrics being used by the Oxford COVID-19 Government Response Tracker.
*     The Tracker involves a team of 100 Oxford community members who have continuously updated a database of 17 indicators of government response.
*     These indicators examine containment policies such as school and workplace closings, public events, public transport, stay-at-home policies.
*     The Stringency Index is a number from 0 to 100 that reflects these indicators. A higher index score indicates a higher level of stringency.

(Source: https://www.civilsdaily.com/news/what-is-stringency-index/)

In [None]:
# Summary statistics of stringency_index across all rows (every location and date pooled together).
covid_impact['stringency_index'].describe()

## Overlay stringency_index with total_cases (1 country)
Red = total cases, blue = stringency index

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def plot_func(country_name):
    """Plot total_cases (red, left axis) and stringency_index (blue, right axis) over time for one location."""
    country_rows = covid_impact[covid_impact['location'] == country_name]

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=country_rows, x='date', y='total_cases', ax=ax, color='red')
    plt.xticks(rotation=45)

    # Second y-axis sharing the same x-axis, because the two series use different scales.
    ax2 = ax.twinx()
    sns.lineplot(data=country_rows, x='date', y='stringency_index', ax=ax2, color='blue')
    plt.show()

# Build a dropdown of every location and redraw the figure on selection.
location_choices = covid_impact['location'].unique().tolist()
interact(plot_func, country_name=location_choices)

## Overlay stringency_index with total_deaths (1 country)
Red = total deaths, blue = stringency index

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def plot_func(country_name):
    """Plot total_deaths (red, left axis) and stringency_index (blue, right axis) over time for one location."""
    country_rows = covid_impact[covid_impact['location'] == country_name]

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.lineplot(data=country_rows, x='date', y='total_deaths', ax=ax, color='red')
    plt.xticks(rotation=45)

    # Second y-axis sharing the same x-axis, because the two series use different scales.
    ax2 = ax.twinx()
    sns.lineplot(data=country_rows, x='date', y='stringency_index', ax=ax2, color='blue')
    plt.show()

# Build a dropdown of every location and redraw the figure on selection.
location_choices = covid_impact['location'].unique().tolist()
interact(plot_func, country_name=location_choices)

Comment: Even with a higher stringency index, the total cases and total deaths still increased. Note that both totals are cumulative, so they can only flatten, never fall. Maybe people were not following the rules? Maybe it takes at least 2 months to see the effect?

# Economy

In [None]:
# Lift the pandas row-display limit (None = unlimited) for every later cell in this session.
pd.set_option('display.max_rows',None)

In [None]:
# One row per location: peak total cases/deaths plus the first non-null value
# of each location-level attribute (GDP per capita, HDI, population).
agg_input_columns = ['location', 'total_cases', 'total_deaths',
                     'gdp_per_capita', 'human_development_index', 'population']
per_location_aggs = {
    'max_cases': ('total_cases', 'max'),
    'max_deaths': ('total_deaths', 'max'),
    'gdp_per_capita': ('gdp_per_capita', 'first'),
    'human_development_index': ('human_development_index', 'first'),
    'population': ('population', 'first'),
}
covid_agg = (
    covid_impact[agg_input_columns]
    .groupby('location')
    .agg(**per_location_aggs)
    .reset_index()
)

In [None]:
# Quick check if each country has only 1 gdp_per_capita value.
# This must look at the raw rows in covid_impact: covid_agg already holds exactly one
# row per location, so grouping it could never reveal conflicting values.
# nunique() ignores NaN, matching the 'first' aggregation which also skips NaN.
gdp_value_counts = covid_impact.groupby('location')['gdp_per_capita'].nunique()
gdp_checking = gdp_value_counts[gdp_value_counts > 1]
# Locations with more than one distinct value; an empty list means the data is consistent.
print(gdp_checking.index.tolist())

In [None]:
# Quick check if each country has only 1 human_development_index value.
# This must look at the raw rows in covid_impact: covid_agg already holds exactly one
# row per location, so grouping it could never reveal conflicting values.
# nunique() ignores NaN, matching the 'first' aggregation which also skips NaN.
hdi_value_counts = covid_impact.groupby('location')['human_development_index'].nunique()
hdi_checking = hdi_value_counts[hdi_value_counts > 1]
# Locations with more than one distinct value; an empty list means the data is consistent.
print(hdi_checking.index.tolist())

## GDP per capita

In [None]:
# Summary statistics of gdp_per_capita over the per-location table (one value per location).
covid_agg['gdp_per_capita'].describe()

In [None]:
# Colour the five locations with the largest max_cases; draw all others in light grey.
# Locations without a gdp_per_capita value are excluded first.
gdp_cases_ranked = covid_agg.dropna(subset=['gdp_per_capita']).sort_values(by='max_cases', ascending=False)
ranked_locations = gdp_cases_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=gdp_cases_ranked, x='gdp_per_capita', y='max_cases', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

In [None]:
# Colour the five locations with the largest max_deaths; draw all others in light grey.
# Locations without a gdp_per_capita value are excluded first.
gdp_deaths_ranked = covid_agg.dropna(subset=['gdp_per_capita']).sort_values(by='max_deaths', ascending=False)
ranked_locations = gdp_deaths_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=gdp_deaths_ranked, x='gdp_per_capita', y='max_deaths', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

## human_development_index

In [None]:
# Summary statistics of human_development_index over the per-location table (one value per location).
covid_agg['human_development_index'].describe()

In [None]:
# Colour the five locations with the largest max_cases; draw all others in light grey.
# Locations without a human_development_index value are excluded first.
hdi_cases_ranked = covid_agg.dropna(subset=['human_development_index']).sort_values(by='max_cases', ascending=False)
ranked_locations = hdi_cases_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=hdi_cases_ranked, x='human_development_index', y='max_cases', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

In [None]:
# Colour the five locations with the largest max_deaths; draw all others in light grey.
# Locations without a human_development_index value are excluded first.
hdi_deaths_ranked = covid_agg.dropna(subset=['human_development_index']).sort_values(by='max_deaths', ascending=False)
ranked_locations = hdi_deaths_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=hdi_deaths_ranked, x='human_development_index', y='max_deaths', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: The "top 5" countries had low to middle GDP per capita and high human_development_index

## Population
What is the relationship between population and cases (deaths)

### Total cases

In [None]:
# Colour the five locations with the largest max_cases; draw all others in light grey.
# Locations without a population value are excluded first.
pop_cases_ranked = covid_agg.dropna(subset=['population']).sort_values(by='max_cases', ascending=False)
ranked_locations = pop_cases_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=pop_cases_ranked, x='population', y='max_cases', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

### Total deaths

In [None]:
# Colour the five locations with the largest max_deaths; draw all others in light grey.
# Locations without a population value are excluded first.
pop_deaths_ranked = covid_agg.dropna(subset=['population']).sort_values(by='max_deaths', ascending=False)
ranked_locations = pop_deaths_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=pop_deaths_ranked, x='population', y='max_deaths', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

## Log the population for more normal distribution

In [None]:
# Add the natural logarithm of population as a new column on covid_agg (np.log is base e).
# NOTE(review): a population of 0 would produce -inf here — presumably absent from the data; confirm.
covid_agg['lnpopulation'] = np.log(covid_agg['population'])

In [None]:
# Colour the five locations with the largest max_cases; draw all others in light grey.
# Locations without an lnpopulation value are excluded first.
lnpop_cases_ranked = covid_agg.dropna(subset=['lnpopulation']).sort_values(by='max_cases', ascending=False)
ranked_locations = lnpop_cases_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=lnpop_cases_ranked, x='lnpopulation', y='max_cases', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

In [None]:
# Colour the five locations with the largest max_deaths; draw all others in light grey.
# Locations without an lnpopulation value are excluded first.
lnpop_deaths_ranked = covid_agg.dropna(subset=['lnpopulation']).sort_values(by='max_deaths', ascending=False)
ranked_locations = lnpop_deaths_ranked['location'].unique()
highlight_colors = ['red', 'orange', 'green', 'blue', 'purple']

palette = dict(zip(ranked_locations[:5], highlight_colors))
print(palette)  # shows which location received which highlight colour
palette.update({name: 'lightgrey' for name in ranked_locations[5:]})

plt.figure(figsize=(10, 5))
sns.set(style="darkgrid")

ax = sns.scatterplot(data=lnpop_deaths_ranked, x='lnpopulation', y='max_deaths', hue='location', palette=palette)
ax.get_legend().remove()  # one legend entry per location would swamp the figure
plt.xticks(rotation=45)
plt.show()

Comment: taking natural log makes the gap between extremes smaller

# Statistical modeling
After adjusting for population, do GDP per capita and human_development_index affect the maximum number of total cases and deaths?

## Clean-up data
Data cannot contain NA

In [None]:
# Print the per-location table summary; the non-null counts show which columns have missing values.
covid_agg.info()

In [None]:
# Preview only: summarise the table that would remain after dropping every row
# containing at least one NA. Nothing is assigned here, so covid_agg is unchanged.
covid_agg.dropna().info()

In [None]:
# Keep only locations with no missing value in any column.
# The author observed 165 remaining locations; that count depends on the dataset
# version and is not checked by the code.
covid_agg_cleaned = covid_agg.dropna()

In [None]:
# Check relationship among factors: pairwise plots of the columns of the
# cleaned per-location table.

sns.pairplot(covid_agg_cleaned)

In [None]:
# Use VIF to check for multicollinearity

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Y = covid_agg_cleaned['max_cases']  # not needed for VIF; kept so the cell still defines it
X = covid_agg_cleaned[['gdp_per_capita', 'human_development_index', 'lnpopulation']]

# variance_inflation_factor regresses each column on the others WITHOUT adding an
# intercept, so the design matrix must contain a constant column. Without it the
# VIFs are uncentered and come out inflated for any feature whose mean is far from zero.
X_design = add_constant(X)

# Report the VIF of the real features only; the constant's own VIF is not meaningful.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X_design.values, X_design.columns.get_loc(feature))
        for feature in X.columns
    ],
})

print(vif_data)

Comment: 
* gdp_per_capita and human_development_index had high VIF, indicating that these two variables are highly correlated. Only input one factor into the model.
* In this case, gdp_per_capita is affected by number of population. Hence if we want to adjust for population, we should use human_development_index

In [None]:
# The figures didn't show a linear relationship between the variables, but let's try linear regression first

import statsmodels.api as sm

Y = covid_agg_cleaned['max_cases']
# sm.OLS does not add an intercept by itself. Without the constant the fit is forced
# through the origin and the reported R-squared is the uncentered one, which is not
# comparable to the usual R-squared.
X = sm.add_constant(covid_agg_cleaned[['human_development_index', 'lnpopulation']])

result = sm.OLS(Y, X).fit()
result.summary()

In [None]:
# Same model as for max_cases, with max_deaths as the response.
Y = covid_agg_cleaned['max_deaths']
# sm.OLS does not add an intercept by itself. Without the constant the fit is forced
# through the origin and the reported R-squared is the uncentered one.
X = sm.add_constant(covid_agg_cleaned[['human_development_index', 'lnpopulation']])

result = sm.OLS(Y, X).fit()
result.summary()

**Comment:** Adj. R-squared for both cases and deaths are very low, indicating that the model is not good. Possible fix is to add more factors, which we do not have here.

# Conclusion
1. Some countries managed to curb the cases and deaths with an increased stringency index
2. Higher human development index does not translate to lower cases and deaths