In [None]:
import os
import numpy as np
import random

# One shared seed for every random number generator used in this notebook,
# so a fresh "Restart & Run All" reproduces the same samples.
seed = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects subprocesses - TODO confirm.
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed Python's and NumPy's global generators with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import shap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from collections import Counter
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the two raw tables from the Kaggle-style ../input directory.
data_covid  = pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')
data_whr_21 = pd.read_csv('../input/world-happiness-report-2021/world-happiness-report-2021.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() added a stray "None" line
# after each report.
print('COVID-19 Data')
data_covid.info()
print('WHR 2021 Data')
data_whr_21.info()

In [None]:
# Number of rows recorded for each date; used to choose a snapshot date
# that has entries for many countries.
data_covid['date'].value_counts()

In [None]:
# Snapshot date: the most recent date with 149 country entries is 2021-03-17.
last_date = '2021-03-17'
is_last_date = data_covid['date'] == last_date
last_date_df = data_covid[is_last_date]

# Give the happiness report the same key column name as the vaccination
# table so the two can be inner-joined on 'country'.
data_whr_21 = data_whr_21.rename(columns={'Country name': 'country'})
merged_df = last_date_df.merge(data_whr_21, how='inner', on=['country'])

# Columns are picked by position in the merged frame: 0, 3-13, 16 and 20-25.
# NOTE(review): this depends on the exact column order of both CSV files -
# verify if either input file changes.
keep_positions = [0, *range(3, 14), 16, *range(20, 26)]
useful_dl = merged_df.iloc[:, keep_positions]
useful_dl.sample(5)

In [None]:
# Distinct countries that survived the inner join, with their count.
countries_used = set(useful_dl['country'].values)
print('Countries that we will use ({}): {}'.format(len(countries_used), countries_used))

In [None]:
# Distribution of total vaccinations per hundred people across the selected
# countries, with Brazil's value marked by a dashed red line.
total_per_hundred = useful_dl['total_vaccinations_per_hundred']
br_total = total_per_hundred.values[useful_dl['country'] == 'Brazil'][0]

fig, ax = plt.subplots(dpi=150)

# Series.min()/max() skip missing values. The builtin min()/max() compare
# element by element, so a NaN in the column made the reported extremes
# depend on row order (and could print "nan").
ax.set_title('Total vaccinations per hundred. Min is {} and max is {}.'.format(
    total_per_hundred.min(), total_per_hundred.max()))

# NOTE(review): sns.distplot is reported as deprecated in recent seaborn
# releases; it is kept so the rendered figure does not change - confirm
# against the installed version.
sns.distplot(total_per_hundred.values, ax=ax)

# A twin y-axis carries the box plot; the y-limits below push the box
# towards the bottom of the panel so it does not cover the histogram.
ax2 = ax.twinx()
sns.boxplot(x=total_per_hundred.values, ax=ax2)
ax2.axvline(br_total, 0, 1, color='r', linestyle='--')
ax2.set(ylim=(-.5, 10))
plt.show()

## Correlation Matrix

In [None]:
# Pairwise correlations between a positional subset of the columns of
# useful_dl, drawn as an annotated heatmap.
# NOTE(review): the positions refer to column order in useful_dl - verify
# they still point at the intended columns if the selection above changes.
corr_positions = [1, 2, 3, 9, 13, 14, 15, 18]

plt.figure(dpi=150)
corr_matrix = useful_dl.iloc[:, corr_positions].corr()
sns.heatmap(corr_matrix, annot=True, cmap='crest')

# Small tick labels on both axes, with the x labels turned vertical.
plt.tick_params(axis="both", labelsize=8)
plt.xticks(rotation=90)
plt.show()

In [None]:
def _brazil_value(frame, column):
    """Return the value of `column` in the first row whose country is 'Brazil'.

    Raises:
        IndexError: if `frame` has no row with country == 'Brazil'.
    """
    return frame.loc[frame['country'] == 'Brazil', column].iloc[0]


def _min_max_title(frame, column, label):
    """Build a '<label>. Min is ... and max is ....' panel title.

    Uses Series.min()/max(), which skip missing values. The builtin
    min()/max() compare element by element, so a NaN in the column made the
    reported extremes depend on row order.
    """
    return '{}. Min is {} and max is {}.'.format(
        label, frame[column].min(), frame[column].max())


def _plot_distribution(ax, values, title, marker):
    """Draw one panel: histogram/KDE, box plot overlay and a reference line.

    Args:
        ax: matplotlib Axes that receives the title and the distribution.
        values: 1-D array of observations for one indicator.
        title: panel title.
        marker: x position of the dashed red vertical line.

    Returns:
        The twin Axes that holds the box plot and the reference line.
    """
    ax.set_title(title)
    # NOTE(review): sns.distplot is reported as deprecated in recent seaborn
    # releases; it is kept so the rendered figure does not change - confirm
    # against the installed version.
    sns.distplot(values, ax=ax)
    # The box plot lives on a twin y-axis; the y-limits push the box towards
    # the bottom of the panel so it does not cover the histogram.
    box_ax = ax.twinx()
    sns.boxplot(x=values, ax=box_ax)
    box_ax.axvline(marker, 0, 1, color='r', linestyle='--')
    box_ax.set(ylim=(-.5, 10))
    return box_ax


# Brazil's value for every plotted indicator (kept as module-level names so
# later cells can still refer to them).
br_ph    = _brazil_value(useful_dl, 'people_vaccinated_per_hundred')
br_ss    = _brazil_value(useful_dl, 'Social support')
br_gdp   = _brazil_value(useful_dl, 'Logged GDP per capita')
br_fully = _brazil_value(useful_dl, 'people_fully_vaccinated_per_hundred')
br_hle   = _brazil_value(useful_dl, 'Healthy life expectancy')
br_corr  = _brazil_value(useful_dl, 'Perceptions of corruption')

# (column, title, Brazil marker) for each panel, in row-major order of the
# 3 x 2 grid.
panels = [
    ('Social support',
     'Social support.',
     br_ss),
    ('people_vaccinated_per_hundred',
     'People vaccinated per hundred. Brazil vaccinated {}%.'.format(br_ph),
     br_ph),
    ('Logged GDP per capita',
     _min_max_title(useful_dl, 'Logged GDP per capita', 'Logged GDP per capita'),
     br_gdp),
    ('people_fully_vaccinated_per_hundred',
     'People fully vaccinated per hundred. Brazil vaccinated {}%.'.format(br_fully),
     br_fully),
    ('Healthy life expectancy',
     _min_max_title(useful_dl, 'Healthy life expectancy', 'Healthy life expectancy'),
     br_hle),
    ('Perceptions of corruption',
     'Perceptions of corruption.',
     br_corr),
]

fig, axes = plt.subplots(3, 2, figsize=(18, 12))
for panel_ax, (column, title, marker) in zip(axes.flat, panels):
    _plot_distribution(panel_ax, useful_dl[column].values, title, marker)

plt.show()

## My take thus far
* It is interesting to observe that perceptions of corruption negatively correlate with values of daily vaccination and number of people fully vaccinated;
* GDP per capita seems to positively correlate with all vaccination data observed in the correlation matrix;
* Social support and healthy life expectancy are positively correlated with daily vaccinations per million.

# FUTURE WORK (remember, I'm doing this for fun)
* Estimate a projected date by which at least 30% of the Brazilian population will be fully vaccinated;
* Which of the socio-economic factors listed here are the most detrimental to vaccination programs worldwide and in Brazil;
* Don't know