In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

#Country profile data (UN country statistics): file location and load
cntry_file_path ="../input/country-profile-data/country_profile_variables.csv"
cntry_df = pd.read_csv(cntry_file_path)

#COVID vaccination progress data: file location and load
covid_file_path ="../input/covid-world-vaccination-progress/country_vaccinations.csv"
covid_df = pd.read_csv(covid_file_path)


EDA is done using COVID vaccination data and Country Statistics - UN Data. This is my first attempt, and I would appreciate suggestions and feedback to improve my work. 
Thank you, Gabriel Preda, for the initiative and the great foundation work.

View the data, clean it up, and extract the needed information.

In [None]:
#preview the first rows; display() is needed because only the last
#expression of a cell is rendered automatically
display(cntry_df.head())

#check for NaN values: number of missing entries per column
cntry_df.isnull().sum()

Rename some columns that have long names, and create a new DataFrame containing only the columns we are interested in.


In [None]:

#shorter labels for the two longest column names
short_names = {
    'GDP: Gross domestic product (million current US$)': 'GDP (million in US$)',
    'Population in thousands (2017)': 'Population in thousands',
}
cntry_df = cntry_df.rename(columns=short_names)

#columns kept for the analysis
profile_columns = [
    'country',
    'Region',
    'Surface area (km2)',
    'Population in thousands',
    'GDP (million in US$)',
    'GDP per capita (current US$)',
    'Life expectancy at birth (females/males, years)',
    'Population age distribution (0-14 / 60+ years, %)',
    'Health: Total expenditure (% of GDP)',
    'Health: Physicians (per 1000 pop.)',
    'Seats held by women in national parliaments %',
    'Mobile-cellular subscriptions (per 100 inhabitants)',
]

#take an independent copy so later edits do not touch cntry_df
country_profile = cntry_df[profile_columns].copy()
country_profile.info()

In [None]:
#summary statistics (count, mean, std, min, quartiles, max) of the selected columns
country_profile.describe()

As we can observe from the above, many columns have a minimum value of -99. These outliers will distort the results when we analyse the data. Let us fix them.


In [None]:
#check for outliers
qLow = country_profile.quantile(0.01)
qLow


In [None]:
#plot boxplot to see the outliers
plt.boxplot(country_profile['GDP (million in US$)'])
plt.title('GDP (million in US$)')
plt.show()

plt.boxplot(country_profile['Health: Total expenditure (% of GDP)'])
plt.title('Health: Total expenditure (% of GDP)')
plt.show()

In [None]:
#Fix the ouliers
qLow = country_profile['GDP (million in US$)'].quantile(0.01)
country_profile['GDP (million in US$)']= np.where(country_profile['GDP (million in US$)'] == qLow,0,country_profile['GDP (million in US$)'])



qLow = country_profile['Health: Total expenditure (% of GDP)'].quantile(0.01)
qLow = country_profile['Health: Total expenditure (% of GDP)']= np.where(country_profile['Health: Total expenditure (% of GDP)'] == qLow, 0,country_profile['Health: Total expenditure (% of GDP)'])

In [None]:
#boxplot of the GDP column as it currently stands in country_profile
gdp_column = 'GDP (million in US$)'
plt.boxplot(country_profile[gdp_column])
plt.title(gdp_column)
plt.show()


In [None]:
#boxplot of the health expenditure column as it currently stands in country_profile
health_column = 'Health: Total expenditure (% of GDP)'
plt.boxplot(country_profile[health_column])
plt.title(health_column)
plt.show()


In [None]:
#group the covid dataset by country 
g_covid_df = covid_df.groupby(by=['country','iso_code'], sort=False,as_index = False)['total_vaccinations'].max()

#print countries present in country profile data set but not in covid dataset
print([x for x in country_profile.country.unique() if x not in g_covid_df.country.unique()])



In [None]:
#replace country name for USA as it is not same in both dataset
g_covid_df.country=g_covid_df.country.replace({'United States':'United States of America'})

#merge total vaccinations column with country profile data
m_df = country_profile.merge(g_covid_df['total_vaccinations'], left_on=country_profile['country'], right_on=g_covid_df['country']).drop(columns=['key_0'])

m_df.corr()

In [None]:
#plot heatmap to see the correlation between features
plt.subplots(figsize=(9, 9))
sns.heatmap(m_df.corr(), annot=True, square=True)
plt.show()

From the above heat map, we can see that there is a very strong correlation between 'GDP' and 'total vaccinations'. This result is expected, as the countries with high GDP are the ones leading the total vaccination effort.

There is a fairly good correlation between 'population' and 'total vaccinations'. Countries with larger populations are definitely trying hard to administer more vaccines, because they have more people to cover.

Also, we can see that there is a small correlation between 'Health: Total expenditure' and 'total vaccinations', but it is not very strong.


Let us plot the graph to see the correlation between these features

In [None]:
#normalize values before plotting
normalized_GDP=(m_df['GDP (million in US$)']-m_df['GDP (million in US$)'].min())/(m_df['GDP (million in US$)'].max()-m_df['GDP (million in US$)'].min())
normalized_vaccine=m_df['total_vaccinations']-m_df['total_vaccinations'].min()/m_df['total_vaccinations'].max()-m_df['total_vaccinations'].min()


In [None]:

#area chart of total vaccinations against GDP
fig = px.area(
    data_frame=m_df,
    x='GDP (million in US$)',
    y='total_vaccinations',
)
fig.show()

#list the top 20 countries with highest GDP
top20_gdp = m_df.nlargest(n=20, columns="GDP (million in US$)")
top20_gdp

In [None]:
#area chart of total vaccinations against population
fig = px.area(m_df,x='Population in thousands', y='total_vaccinations')
fig.show()

#list the top 20 countries with highest population.
#Stored under its own name so it no longer overwrites the GDP ranking
#held in top20_gdp.
top20_population = m_df.nlargest(20,columns="Population in thousands")
top20_population

In [None]:
#plot the graph to see the relationship between Health expenditure and total vaccinations
fig = px.area(m_df,x='Health: Total expenditure (% of GDP)', y='total_vaccinations')
fig.show()

#list the top 20 countries with highest spend on Health expenses
top20_health_spend = m_df.nlargest(20,columns='Health: Total expenditure (% of GDP)')
top20_health_spend

From the above analysis, we can conclude that countries with high GDP and large populations are leading the COVID vaccination efforts. Health expenditure has little effect on vaccination progress.
