##  Basic Visualizations using Seaborn
#### In this notebook, I wanted to practice doing basic visualizations using Seaborn on the COVID-19 vaccination progress dataset. 

#### Importing packages needed for analysis
#### Also reading the data set into a dataframe.

In [None]:
## Packages needed for the analysis.
import pandas as pd
import numpy as np
import seaborn as sns
# NOTE: `plt` is bound to the top-level matplotlib package here, not to
# pyplot; the pyplot interface is available as `plt2` (next line).
import matplotlib as plt
import matplotlib.pyplot as plt2
# Set the default figure size through matplotlib's rcParams.
plt.rcParams["figure.figsize"] = (20,3)
import datetime
import os

In [None]:
# Location of the country-level vaccination CSV inside the Kaggle input directory.
filename = '/kaggle/input/covid-world-vaccination-progress/country_vaccinations.csv'

In [None]:
# Load the raw CSV and convert the ISO-formatted `date` strings to datetimes
# in one expression so that df1 is never observable in a half-parsed state.
df1 = pd.read_csv(filename).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d')
)

## Line Plots
#### Started out with a basic line plot of vaccinations over time in the United States.

In [None]:
## Basic line plot: cumulative vaccinations over time for the United States.
is_us = df1['country'] == 'United States'
us_df = df1[is_us]
sns.lineplot(x="date", y="total_vaccinations", data=us_df)
plt2.title('US Total Vaccinations over time', fontsize=20)

## Bar Chart
#### Next I wanted to look at the top countries to analyze the total vaccinations per hundred people. Used the groupby command to find the mean of this column for each country. Took the top 10 countries.

In [None]:
# Average `total_vaccinations_per_hundred` per country over all reported
# dates, then keep the ten countries with the highest average.
# `.head(10)` replaces the earlier `[0:9]` slice, which kept only nine rows
# even though the analysis is meant to show the top 10.
df1_countrydata = (
    df1.groupby("country")["total_vaccinations_per_hundred"]
    .mean()
    .to_frame()
    .sort_values(by=['total_vaccinations_per_hundred'], ascending=False)
    .head(10)
    .reset_index()  # bring `country` back as a regular column for plotting
)
sns.barplot(data=df1_countrydata, x="country", y="total_vaccinations_per_hundred", estimator=np.mean)
plt2.title('Countries with Highest Vaccinations per hundred people', fontsize=20)

In [None]:
## Follow the same top countries through time: one line per country
## instead of a single bar each.
country_list = df1_countrydata['country']
in_top_countries = df1['country'].isin(country_list)

df2 = df1[in_top_countries]
sns.set_style("whitegrid")
sns.lineplot(x="date", y="total_vaccinations_per_hundred", hue="country", data=df2)
plt2.title('Countries with Highest Vaccinations per hundred people Over Time', fontsize=20)

## Analyzing increase since vaccinations started
#### I wanted to find the countries with the largest increase in vaccinations per hundred since they started vaccinating.  I added columns holding each country's starting value and its most recent (ending) value, then took the difference between the two columns.

  - Wrote a for loop to get the starting value and ending value for vac_hundred.

In [None]:
# Working frame for the start/end analysis: one row per (country, date) that
# has a per-hundred value, ordered by country and then date.  `start` and
# `end` are pre-filled with the 'nothing' sentinel, which later cells use to
# tell unfilled rows apart from filled ones.
df2 = (
    df1[['date', 'country', 'total_vaccinations_per_hundred']]
    .sort_values(by=['country', 'date'])
    .dropna(subset=['total_vaccinations_per_hundred'])
    .rename(columns={'total_vaccinations_per_hundred': 'vac_hundred'})
    .assign(moving_average=0.0, start='nothing', end='nothing')
    .reset_index(drop=True)
)

In [None]:
# For every country, write its first and last `vac_hundred` values into the
# `start` / `end` columns of that country's LAST row; all other rows keep the
# 'nothing' sentinel so they can be filtered out afterwards.
#
# Writes go through `df2.loc[row, col]` rather than chained indexing
# (`df2['end'][i] = ...`), which triggers SettingWithCopyWarning and does not
# write through when pandas copy-on-write is active.  Iterating per country
# also handles single-row countries in the first or last position, which the
# neighbour-comparison approach mis-handled.

# The sentinel columns must be able to hold both strings and floats.
df2['start'] = df2['start'].astype(object)
df2['end'] = df2['end'].astype(object)

# sort=False keeps the existing row order; rows inside each group stay in
# their original (date-sorted) order.
for _, country_rows in df2.groupby('country', sort=False):
    last_row = country_rows.index[-1]
    df2.loc[last_row, 'start'] = country_rows['vac_hundred'].iloc[0]
    df2.loc[last_row, 'end'] = country_rows['vac_hundred'].iloc[-1]



#### Next I created a new column called vaccination_increase.  Using this metric, I found the 10 countries with the highest vaccination increase since they started vaccinating, and then drew a line plot of these countries' vaccinations over time.

In [None]:
# Keep only the rows where start/end were filled in (one per country).
# `.copy()` makes df3 an independent frame so that adding a column below
# does not raise SettingWithCopyWarning.
df3 = df2[df2.end != 'nothing'].copy()
# start/end are stored in object columns (they share space with the
# 'nothing' sentinel), so cast to float before subtracting.
df3['vaccination_increase'] = df3['end'].astype(float) - df3['start'].astype(float)
df3 = df3.sort_values(by=['vaccination_increase'], ascending=False)

# Ten countries with the largest increase; the earlier `[0:9]` slice
# returned only nine.
country_list = df3['country'].head(10)
top_vac = df2[df2.country.isin(country_list)]
sns.set_style("whitegrid")
sns.lineplot(data = top_vac, x = "date", y = "vac_hundred", hue = "country")
plt2.title('Countries with the Highest Increase in Vaccinations', fontsize=20)

##  Heat maps
#### Next I wanted to try a Heat Map visualization.  I wanted to see the distribution of Vaccination types, Country and vaccinations per hundred people.  Below are the top 15 records.

#### Plotted a heat map of vaccination types against countries, showing which vaccines were being used in which countries.

In [None]:
# Mean `people_vaccinated_per_hundred` (renamed to `vac`) for every
# (country, vaccines) combination, keeping the 15 highest averages.
heatmap_data = (
    df1[['country', 'vaccines', 'people_vaccinated_per_hundred']]
    .dropna(subset=['people_vaccinated_per_hundred'])
    .rename(columns={'people_vaccinated_per_hundred': 'vac'})
    .reset_index(drop=True)
    .groupby(["country", "vaccines"])["vac"]
    .mean()
    .to_frame()
    .sort_values(by=['vac'], ascending=False)
    .iloc[:15]
)

In [None]:
# Reshape to a vaccines x country grid; combinations that do not occur are
# shown as 0 on the heat map.
heatmap_data = (
    heatmap_data.reset_index()
    .pivot_table(index=["vaccines"], columns='country', values='vac')
    .fillna(0)
)

sns.heatmap(heatmap_data)
plt2.title('Heatmap of Vaccine type by country and number of vaccines by hundred', fontsize=20)

## Geo Plots
#### Next wanted to play around with some geographic plots of countries and how many vaccinations they have administered. 
#### Using the GeoPandas/Geoplot packages to plot average daily vaccinations across the globe.  
- Retrieved shape files from https://www.naturalearthdata.com/
- Needed to rename a few countries so they properly merged to our data.
- The shape_df file reads the shape file, and gets the geometries needed to plot.  The geometry column contains the polygon shape information.


In [None]:
## Using GeoPandas to plot average daily vaccinations across the globe.
## Shape files retrieved from https://www.naturalearthdata.com/
import geopandas as gpd
from geopandas import GeoDataFrame
import geoplot as gplt

## First load the shape file into a GeoPandas data frame and order it by
## name so that the positional renames below address a stable row order.

filename2 = '/kaggle/input/shape-files1/ne_50m_admin_0_countries.shp'
shape_df = gpd.read_file(filename2)
shape_df = shape_df.sort_values(by=['NAME']).reset_index(drop=True)

## Rename a few countries so they match the names used in the vaccination
## data.  Rows are addressed by position in the name-sorted frame.
## NOTE(review): these positions presumably depend on this exact shapefile
## version -- confirm if the shape file is ever replaced.
## A single `.iloc[row, col]` write is used instead of the chained
## `shape_df['NAME'].iloc[n] = ...`, which raises SettingWithCopyWarning and
## does not write through when pandas copy-on-write is active.
name_col = shape_df.columns.get_loc('NAME')
renames_by_position = {
    40: 'Cayman Islands',
    68: 'Faeroe Islands',
    144: 'Northern Cyprus',
    227: 'United States',
}
for position, new_name in renames_by_position.items():
    shape_df.iloc[position, name_col] = new_name

## Restrict to Europe and keep only the columns needed for plotting.
shape_df = shape_df[shape_df['CONTINENT'] == 'Europe']
shape_df = shape_df[['NAME', 'geometry', 'CONTINENT']]

## Average daily vaccinations per country, highest first.
geo1 = df1.groupby("country")["daily_vaccinations"].mean()
geo1 = geo1.to_frame()
geo1 = geo1.sort_values(by=['daily_vaccinations'],ascending=False)


## Right merge keeps every shape row, including countries that have no
## vaccination data; those get a daily_vaccinations of 0 below.
df3 = geo1.merge(shape_df, how='right', left_on='country', right_on='NAME')
gdf = GeoDataFrame(df3, crs="EPSG:4326", geometry='geometry')
gdf['daily_vaccinations'] = gdf['daily_vaccinations'].fillna(0)

#### Plotted a quick analysis of countries in Europe and their average Daily Vaccine Counts. 

In [None]:
# Use a wider canvas for the map than for the earlier charts.
plt2.rcParams["figure.figsize"] = (40, 6)
fig, ax1 = plt2.subplots()

# Colour each European country by its average daily vaccinations.
europe_rows = gdf['CONTINENT'] == 'Europe'
gplt.choropleth(
    gdf[europe_rows],
    hue=gdf['daily_vaccinations'],
    legend=True,
    ax=ax1,
)
plt2.title('Heatmap of number of Daily Vaccines in Europe', fontsize=20)

#### Let me know if you have any comments or spot any issues.  Many thanks to Gabriel Preda for posting/uploading this dataset.