In [None]:
#importing libraries

import gc
import os
from pathlib import Path
import random
import sys

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import scipy as sp


import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.templates.default = "plotly_dark"

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

**Downloading the Latest COVID-19 Data From the Johns Hopkins University Repository**

In [None]:
%%time
import requests

# Fetch the three JHU CSSE global time-series CSVs into the working directory.
for filename in ['time_series_covid19_confirmed_global.csv',
                 'time_series_covid19_deaths_global.csv',
                 'time_series_covid19_recovered_global.csv',
                 ]:
    print(f'Downloading {filename}')
    url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    myfile = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an HTML error page as a ".csv".
    myfile.raise_for_status()
    # write_bytes opens, writes and closes the file (the original left the handle open).
    Path(filename).write_bytes(myfile.content)

**Removing unnecessary columns and converting dates into datetime format**

In [None]:
from datetime import datetime

def _convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]


# Load the three downloaded time-series files ...
confirmed_global_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_global_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_global_df = pd.read_csv('time_series_covid19_recovered_global.csv')

# ... then normalise each frame's date headers in place.
_convert_date_str(confirmed_global_df)
_convert_date_str(deaths_global_df)
_convert_date_str(recovered_global_df)

**Filtering out problematic data points**

In [None]:


# Replace the slash-separated header names with underscore versions on all three frames.
confirmed_global_df = confirmed_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
deaths_global_df = deaths_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
recovered_global_df = recovered_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})



In [None]:
# Reshape each wide frame (one column per date) into long form: one row per
# location and date. Each frame is melted over its OWN date columns.
confirmed_global_melt_df = confirmed_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='ConfirmedCases')
deaths_global_melt_df = deaths_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=deaths_global_df.columns[4:], var_name='Date', value_name='Deaths')
# Fixed: this was melting deaths_global_df, so the 'Recovered' column was a copy of 'Deaths'.
# NOTE(review): the recovered file may list locations/coordinates differently from the
# confirmed file; an inner merge on Lat/Long downstream would drop such rows - verify.
recovered_global_melt_df = recovered_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=recovered_global_df.columns[4:], var_name='Date', value_name='Recovered')

**Merging confirmed, deaths and recovered datasets**

In [None]:
# Join the three long tables on location + date (default inner join).
train = (
    confirmed_global_melt_df
    .merge(deaths_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
    .merge(recovered_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
)

**Removing rows with empty values and renaming columns**

In [None]:
# Switch to short lowercase column names.
train = train.rename(columns={'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities', 'Recovered': 'recovered'})
# Combined "country/province" key; missing parts become empty strings.
train['country_province'] = train['country'].fillna('').str.cat(train['province'].fillna(''), sep='/')

**Filtering European Countries**

In [None]:
# Countries treated as "Europe" when filtering the datasets in this notebook.
europe_country_list = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland',
    'Italy', 'Latvia', 'Luxembourg', 'Lithuania', 'Malta', 'Norway', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland',
    'Serbia', 'Ukraine', 'Belarus',
    'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro',
    'North Macedonia',
]

In [None]:
# Keep only rows whose country is in the European list.
train = train.loc[train['country'].isin(europe_country_list)]

**Grouping the data by date by summing the particular rows**

In [None]:
# Collapse provinces into one row per (date, country).
# numeric_only=True restricts the sum to numeric columns, so the text columns
# (province, country_province) are not aggregated; pandas >= 2.0 no longer
# drops such columns silently.
train = train.groupby(['date', 'country']).sum(numeric_only=True)

In [None]:
# Summed coordinates are meaningless; drop them and turn date/country back into columns.
train = train.drop(columns=['Lat', 'Long']).reset_index()


**Importing latest vaccination data**

In [None]:
# Load the country-level vaccination progress CSV from the relative ../input directory.
vaccination = pd.read_csv('../input/covid-world-vaccination-progress/country_vaccinations.csv')

In [None]:
# Discard identifier/source columns that are not used in the analysis.
vaccination = vaccination.drop(columns=['iso_code', 'source_name', 'source_website'])

In [None]:
# Same European country list as used for the case data, restated for this cell.
europe_country_list = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland',
    'Italy', 'Latvia', 'Luxembourg', 'Lithuania', 'Malta', 'Norway', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland',
    'Serbia', 'Ukraine', 'Belarus',
    'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro',
    'North Macedonia',
]

# Restrict the vaccination table to the European countries.
vaccination = vaccination.loc[vaccination['country'].isin(europe_country_list)]


In [None]:
# Europe-wide vaccination totals: one row per date, summed over countries.
vcc_all = vaccination.drop(['vaccines'], axis = 1)
# numeric_only=True keeps the text 'country' column out of the sum
# (pandas >= 2.0 would otherwise try to aggregate it).
vcc_all = vcc_all.groupby(['date']).sum(numeric_only=True)
vcc_all.reset_index(inplace=True)


In [None]:
# Display the per-date, per-country case table.
train

In [None]:
# Display the European vaccination table.
vaccination

In [None]:
# Display the per-date vaccination totals.
vcc_all

In [None]:
# Europe-wide case totals: one row per date, summed over countries.
# numeric_only=True keeps the text 'country' column out of the sum
# (pandas >= 2.0 would otherwise try to aggregate it).
train_all = train.groupby(['date']).sum(numeric_only=True)
train_all.reset_index(inplace=True)
train_all

**Getting separated dates so that visualisation is clean**

In [None]:
def get_date_ticks(df,ngaps=7):
    """Pick evenly spaced values from ``df.date`` to use as x-axis ticks.

    Args:
        df: DataFrame with a ``date`` column.
        ngaps: Desired number of gaps between ticks.

    Returns:
        A list of unique date values, taken every ``n // ngaps`` positions
        (at least every 1), in order of first appearance. Empty if ``df``
        has no dates.
    """
    dates    = df.date.unique()
    n        = len(dates)-1
    # With fewer dates than gaps the step used to be 0, which made range()
    # raise ValueError; clamp it so short series simply tick on every date.
    stepsize = max(1, n // max(1, ngaps))
    return [dates[i] for i in range(0,n+1,stepsize)]



def get_date_range(df):
    """Return "<first date> - <last date>" for ``df.date``, each formatted by ``get_date_str``."""
    first_day = get_date_str(df.date.min())
    last_day = get_date_str(df.date.max())
    return f'{first_day} - {last_day}'

In [None]:
def get_date_str(d0,fmt='%d %b %Y'):
    """Parse ``d0`` with ``pd.to_datetime`` and format it with ``fmt`` (default e.g. "05 Jan 2021")."""
    parsed = pd.to_datetime(d0)
    return parsed.strftime(fmt)

**Function that returns the date when the vaccination in a given country started**

In [None]:
def get_vaccination_started_date(country):
    """Return the smallest ``date`` among ``vaccination`` rows for ``country`` with ``total_vaccinations > 0``.

    Reads the module-level ``vaccination`` frame. The minimum is taken on the
    stored ``date`` values as they are (presumably ISO strings, so the minimum
    is the earliest day - verify). Raises KeyError if ``country`` has no
    qualifying row.
    """
    has_doses = vaccination.total_vaccinations > 0
    earliest_by_country = (
        vaccination.loc[has_doses, ["country", "date"]]
        .groupby(['country'])
        .min()
    )
    return earliest_by_country.loc[country, 'date']


# **QUALITATIVE ANALYSIS OF EFFECT OF VACCINATION ON COVID-19**

**Forming a title for particular graphs**

The title is in the format of -- Confirmed cases from "Country_name" between "Date_Range", Vaccination started - "Vaccination_starting_date"

In [None]:
def get_title(country, date_range, scaling='Confirmed'):
    """Build the plot title "<scaling> Cases from <country>: <date_range>. Vaccination Started <date>."

    The date is the country's vaccination start date, formatted by ``get_date_str``.
    Note: ``get_title`` is defined more than once in this notebook; the most
    recently executed definition is the one in effect.
    """
    status = "Vaccination Started"
    started_on = get_date_str(get_vaccination_started_date(country))
    return f'{scaling} Cases from {country}: {date_range}. {status} {started_on}.'
 

**Function which returns the plot for the country name that is given as parameter**

In [None]:
def plot_country(country = 'Spain'):
    """Scatter-plot the ``confirmed`` series of ``country`` against date, coloured by vaccination phase.

    Points dated on or after the country's vaccination start are labelled
    "PostVaccination Starting", earlier ones "PreVaccination Starting".
    Reads the module-level ``train`` frame and draws on a new matplotlib figure.
    """
    rollout_start = get_vaccination_started_date(country)
    country_data = train.loc[(train.country == country), :]

    # Label every row by whether its date is on/after the rollout start.
    after_rollout = pd.to_datetime(country_data.date) >= rollout_start
    phase_labels = []
    for is_after in after_rollout:
        phase_labels.append("PostVaccination Starting" if is_after else "PreVaccination Starting")

    plt.figure(figsize=(20,6))
    sns.set_palette("RdBu_r",1)

    # Two colours when the series spans both phases, otherwise a single one.
    colours = ['r','b'] if phase_labels[0] != phase_labels[-1] else ['b']
    sns.scatterplot(x=country_data.date, y=country_data.confirmed,
                    hue=phase_labels, style=phase_labels, palette=colours)

    plt.title(get_title(country, get_date_range(country_data)))
    plt.xticks(get_date_ticks(country_data))
    
    

# Draw the plot for the default country ('Spain').
plot_country()

In [None]:
# Subset of European countries that get an individual plot below.
europian = [
    'Austria', 'Belgium', 'Croatia', 'Denmark', 'Finland', 'France', 'Germany',
    'Greece', 'Hungary', 'Ireland',
    'Italy', 'Norway', 'Netherlands', 'Poland', 'Portugal', 'Romania',
    'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Russia', 'Switzerland', 'Serbia',
]

**Plot of every Major European Country**

In [None]:
# One confirmed-cases figure per country in `europian`.
for i in europian:
    plot_country(i)

From the above graphs we can observe that the number of cases does not go down immediately, but after some time the curve starts to flatten a bit. Thus the start of the vaccination process did not have any major immediate impact on the number of cases per day, but with time, as more people got vaccinated, fewer confirmed cases were observed. To examine this hypothesis further, let's observe the relation between the number of cases before and after vaccination of 10% of the total population was completed.


**The given function returns the date on which vaccination of at least 10% of the total population was completed for a given country**

In [None]:
def get_vaccination_tendone(country):
    """Return the smallest ``date`` among ``vaccination`` rows for ``country`` with ``total_vaccinations_per_hundred > 10``.

    Reads the module-level ``vaccination`` frame. NOTE(review): the column is
    presumably doses per 100 people rather than share of people vaccinated -
    confirm against the dataset description. Raises KeyError if ``country``
    has no qualifying row.
    """
    above_ten = vaccination.total_vaccinations_per_hundred > 10
    earliest_by_country = (
        vaccination.loc[above_ten, ["country", "date"]]
        .groupby(['country'])
        .min()
    )
    return earliest_by_country.loc[country, 'date']



**The given function returns the title of a graph in the specified format**

The format is -- Confirmed Cases from "Country_Name" from "Date_Range". Vaccination 10% done on "Vcc10%done_Date".

In [None]:
def get_title(country, date_range, scaling='Confirmed'):
    """Build the plot title "<scaling> Cases from <country>: <date_range>. Vaccination 10% done <date>."

    The date comes from ``get_vaccination_tendone`` and is formatted by
    ``get_date_str``. Note: ``get_title`` is defined more than once in this
    notebook; the most recently executed definition is the one in effect.
    """
    status = "Vaccination 10% done"
    reached_on = get_date_str(get_vaccination_tendone(country))
    return f'{scaling} Cases from {country}: {date_range}. {status} {reached_on}.'

**Function that returns the plot of the country given as a parameter**

In [None]:
def plot_country_ten(country = 'Spain'):
    """Scatter-plot the ``confirmed`` series of ``country`` against date, split at the 10-per-hundred date.

    Points dated on or after the value returned by ``get_vaccination_tendone``
    are labelled "Post 10%", earlier ones "Pre 10%". Reads the module-level
    ``train`` frame and draws on a new matplotlib figure.
    """
    threshold_date = get_vaccination_tendone(country)
    country_data = train.loc[(train.country == country), :]

    # Label every row by whether its date is on/after the threshold date.
    past_threshold = pd.to_datetime(country_data.date) >= threshold_date
    phase_labels = []
    for is_after in past_threshold:
        phase_labels.append("Post 10%" if is_after else "Pre 10%")

    plt.figure(figsize=(20,6))
    sns.set_palette("RdBu_r",1)

    # Two colours when the series spans both phases, otherwise a single one.
    colours = ['r','b'] if phase_labels[0] != phase_labels[-1] else ['b']
    sns.scatterplot(x=country_data.date, y=country_data.confirmed,
                    hue=phase_labels, style=phase_labels, palette=colours)

    plt.title(get_title(country, get_date_range(country_data)))
    plt.xticks(get_date_ticks(country_data))
    


In [None]:
# One pre/post-10% confirmed-cases figure per country in `europian`.
for i in europian:
    plot_country_ten(i)

From the above graphs we can observe that the number of cases starts dwindling gradually as more and more people are vaccinated, and more or less every country has been able to flatten the curve. However, due to increased mobility after the start of the vaccination process, a few countries have not been able to curb the growth in the number of cases. Now we look at the relation between vaccination and the number of deaths recorded.

In [None]:
def get_title(country, date_range, scaling='Fatalities'):
    """Build the plot title "<scaling> Cases from <country>: <date_range>. Vaccination Started <date>."

    Same layout as the confirmed-cases title, but ``scaling`` defaults to
    'Fatalities'. Note: ``get_title`` is defined more than once in this
    notebook; the most recently executed definition is the one in effect.
    """
    status = "Vaccination Started"
    started_on = get_date_str(get_vaccination_started_date(country))
    return f'{scaling} Cases from {country}: {date_range}. {status} {started_on}.'

In [None]:
def plot_country_fatalities(country = 'Spain'):
    """Scatter-plot the ``fatalities`` series of ``country`` against date, coloured by vaccination phase.

    Points dated on or after the country's vaccination start are labelled
    "PostVaccination Starting", earlier ones "PreVaccination Starting".
    Reads the module-level ``train`` frame and draws on a new matplotlib figure.
    """
    rollout_start = get_vaccination_started_date(country)
    country_data = train.loc[(train.country == country), :]

    # Label every row by whether its date is on/after the rollout start.
    after_rollout = pd.to_datetime(country_data.date) >= rollout_start
    phase_labels = []
    for is_after in after_rollout:
        phase_labels.append("PostVaccination Starting" if is_after else "PreVaccination Starting")

    plt.figure(figsize=(20,6))
    sns.set_palette("RdBu_r",1)

    # Two colours when the series spans both phases, otherwise a single one.
    colours = ['r','b'] if phase_labels[0] != phase_labels[-1] else ['b']
    sns.scatterplot(x=country_data.date, y=country_data.fatalities,
                    hue=phase_labels, style=phase_labels, palette=colours)

    plt.title(get_title(country, get_date_range(country_data)))
    plt.xticks(get_date_ticks(country_data))
    
    


In [None]:
# One fatalities figure per country in `europian`.
for i in europian:
    plot_country_fatalities(i)

The time vs. deaths plot is more telling of how important vaccination has been in the fight against COVID-19. After the start of the vaccination process, the number of deaths, and thus the mortality rate, has reduced in a regular manner.

In [None]:
def get_title(country, date_range, scaling='Fatalities'):
    """Build the plot title "<scaling> Cases from <country>: <date_range>. Vaccination 10% done <date>."

    The date comes from ``get_vaccination_tendone`` and is formatted by
    ``get_date_str``; ``scaling`` defaults to 'Fatalities'. Note: ``get_title``
    is defined more than once in this notebook; the most recently executed
    definition is the one in effect.
    """
    status = "Vaccination 10% done"
    reached_on = get_date_str(get_vaccination_tendone(country))
    return f'{scaling} Cases from {country}: {date_range}. {status} {reached_on}.'

In [None]:
def plot_country_ten_fatal(country = 'Spain'):
    """Scatter-plot the ``fatalities`` series of ``country`` against date, split at the 10-per-hundred date.

    Points dated on or after the value returned by ``get_vaccination_tendone``
    are labelled "Post 10%", earlier ones "Pre 10%". Reads the module-level
    ``train`` frame and draws on a new matplotlib figure.
    """
    threshold_date = get_vaccination_tendone(country)
    country_data = train.loc[(train.country == country), :]

    # Label every row by whether its date is on/after the threshold date.
    past_threshold = pd.to_datetime(country_data.date) >= threshold_date
    phase_labels = []
    for is_after in past_threshold:
        phase_labels.append("Post 10%" if is_after else "Pre 10%")

    plt.figure(figsize=(20,6))
    sns.set_palette("RdBu_r",1)

    # Two colours when the series spans both phases, otherwise a single one.
    colours = ['r','b'] if phase_labels[0] != phase_labels[-1] else ['b']
    sns.scatterplot(x=country_data.date, y=country_data.fatalities,
                    hue=phase_labels, style=phase_labels, palette=colours)

    plt.title(get_title(country, get_date_range(country_data)))
    plt.xticks(get_date_ticks(country_data))
    

In [None]:
# One pre/post-10% fatalities figure per country in `europian`.
for i in europian:
    plot_country_ten_fatal(i)

The above graphs, depicting the number of deaths before and after vaccination of 10% of each country's population was completed, support the hypothesis that an increase in vaccination decreases the number of deaths and the mortality rate. Now we can try to use a machine learning model to test our hypothesis.

# **QUANTITATIVE ANALYSIS OF EFFECT OF VACCINATION ON COVID-19**

In [None]:
%%time
import requests

# Fetch the three JHU CSSE global time-series CSVs into the working directory.
for filename in ['time_series_covid19_confirmed_global.csv',
                 'time_series_covid19_deaths_global.csv',
                 'time_series_covid19_recovered_global.csv',
                 ]:
    print(f'Downloading {filename}')
    url = f'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/{filename}'
    # A timeout keeps a stalled connection from hanging the notebook forever.
    myfile = requests.get(url, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an HTML error page as a ".csv".
    myfile.raise_for_status()
    # write_bytes opens, writes and closes the file (the original left the handle open).
    Path(filename).write_bytes(myfile.content)

In [None]:
from datetime import datetime

def _convert_date_str(df):
    try:
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]
    except:
        print('_convert_date_str failed with %y, try %Y')
        df.columns = list(df.columns[:4]) + [datetime.strptime(d, "%m/%d/%Y").date().strftime("%Y-%m-%d") for d in df.columns[4:]]


# Load the three downloaded time-series files ...
confirmed_global_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_global_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_global_df = pd.read_csv('time_series_covid19_recovered_global.csv')

# ... then normalise each frame's date headers in place.
_convert_date_str(confirmed_global_df)
_convert_date_str(deaths_global_df)
_convert_date_str(recovered_global_df)

In [None]:


# Replace the slash-separated header names with underscore versions on all three frames.
confirmed_global_df = confirmed_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
deaths_global_df = deaths_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})
recovered_global_df = recovered_global_df.rename(
    columns={"Province/State": "Province_State", "Country/Region": "Country_Region"})



In [None]:
# Reshape each wide frame (one column per date) into long form: one row per
# location and date. Each frame is melted over its OWN date columns.
confirmed_global_melt_df = confirmed_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=confirmed_global_df.columns[4:], var_name='Date', value_name='ConfirmedCases')
deaths_global_melt_df = deaths_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=deaths_global_df.columns[4:], var_name='Date', value_name='Deaths')
# Fixed: this was melting deaths_global_df, so the 'Recovered' column was a copy of 'Deaths'.
# NOTE(review): the recovered file may list locations/coordinates differently from the
# confirmed file; an inner merge on Lat/Long downstream would drop such rows - verify.
recovered_global_melt_df = recovered_global_df.melt(
    id_vars=['Country_Region', 'Province_State', 'Lat', 'Long'], value_vars=recovered_global_df.columns[4:], var_name='Date', value_name='Recovered')

In [None]:
# Join the three long tables on location + date (default inner join).
train = (
    confirmed_global_melt_df
    .merge(deaths_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
    .merge(recovered_global_melt_df, on=['Country_Region', 'Province_State', 'Lat', 'Long', 'Date'])
)

In [None]:
# Switch to short lowercase column names.
train = train.rename(columns={'Country_Region': 'country', 'Province_State': 'province', 'Id': 'id', 'Date': 'date', 'ConfirmedCases': 'confirmed', 'Deaths': 'fatalities', 'Recovered': 'recovered'})
# Combined "country/province" key; missing parts become empty strings.
train['country_province'] = train['country'].fillna('').str.cat(train['province'].fillna(''), sep='/')

In [None]:
# Countries treated as "Europe" when filtering the datasets in this section.
europe_country_list = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia', 'Denmark',
    'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hungary', 'Ireland',
    'Italy', 'Latvia', 'Luxembourg', 'Lithuania', 'Malta', 'Norway', 'Netherlands',
    'Poland', 'Portugal', 'Romania', 'Slovakia', 'Slovenia',
    'Spain', 'Sweden', 'United Kingdom', 'Iceland', 'Russia', 'Switzerland',
    'Serbia', 'Ukraine', 'Belarus',
    'Albania', 'Bosnia and Herzegovina', 'Kosovo', 'Moldova', 'Montenegro',
    'North Macedonia',
]

In [None]:
# Keep European rows only.
train = train[train['country'].isin(europe_country_list)]

# Collapse provinces into one row per (date, country).
# numeric_only=True restricts the sum to numeric columns, so the text columns
# (province, country_province) are not aggregated; pandas >= 2.0 no longer
# drops such columns silently.
train = train.groupby(['date', 'country']).sum(numeric_only=True)

# Summed coordinates are meaningless; drop them and restore date/country as columns.
train = train.drop(['Lat', 'Long'], axis = 1)
train.reset_index(inplace=True)

In [None]:
# Read the Google Global Mobility Report CSV straight from its URL (needs network access).
df = pd.read_csv('https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv')

In [None]:
# Restrict the mobility report to the European countries and show the resulting size.
df = df.loc[df['country_region'].isin(europe_country_list)]
df.shape

In [None]:
# Drop sub-region / identifier columns; only country, date and the mobility
# metrics are needed. ('sub_region_2' was listed twice in the original list.)
df = df.drop(columns=['sub_region_1', 'sub_region_2', 'country_region_code', 'metro_area',
                      'iso_3166_2_code', 'census_fips_code', 'place_id'])

In [None]:
# Sum the mobility rows per (date, country) and align the key name with `train`.
df = (
    df.groupby(['date', 'country_region'])
    .sum()
    .reset_index()
    .rename(columns={'country_region': 'country'})
)

In [None]:
# Attach the mobility metrics to the case data (inner join on country + date).
train = train.merge(df, on=['country', 'date'])

# Europe-wide daily totals: drop the country label, then sum per date.
train1 = train.drop(columns=['country'])
datewise = train1.groupby(["date"]).sum().reset_index()

# Convert the date strings to real timestamps and show the resulting dtypes.
datewise = datewise.astype({'date': 'datetime64[ns]'})
datewise.dtypes

In [None]:
# Whole days elapsed since the first date in the table.
datewise["Days"] = (datewise.date - datewise.date.min()).dt.days
datewise

In [None]:
# Chronological split: first 85% of rows for fitting, remaining rows for validation.
train_ml = datewise.iloc[:int(len(datewise) * 0.85)]
valid_ml = datewise.iloc[int(len(datewise) * 0.85):]

In [None]:
# Display the fitting portion of the split.
train_ml

In [None]:
# Display the validation portion of the split.
valid_ml

In [None]:
import statsmodels.api as sm
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing

In [None]:
# Fit exponential smoothing on the training fatalities:
# additive trend, multiplicative seasonality with a 14-step period.
es = ExponentialSmoothing(
    np.asarray(train_ml['fatalities']),
    trend='add',
    seasonal='mul',
    seasonal_periods=14,
).fit()

In [None]:
# Copy of the validation rows with the model's forecast for the same number of steps appended.
y_pred = valid_ml.assign(**{"Holt's Winter Model": es.forecast(valid_ml.shape[0])})

In [None]:
# Compare training data, held-out data and the forecast on one figure.
# The x-axis is titled "Date", so plot against the `date` column rather than
# the integer row index the original used.
fig=go.Figure()
fig.add_trace(go.Scatter(x=train_ml["date"], y=train_ml["fatalities"],
                    mode='lines+markers',name="Train Data for Fatalities Cases"))
fig.add_trace(go.Scatter(x=valid_ml["date"], y=valid_ml["fatalities"],
                    mode='lines+markers',name="Further Recorded data for fatalities",))
fig.add_trace(go.Scatter(x=y_pred["date"], y=y_pred["Holt's Winter Model"],
                    mode='lines+markers',name="Prediction of Fatalities Cases",))
fig.update_layout(title="Fatalities Cases Holt's Winter Model Prediction",
                 xaxis_title="Date",yaxis_title="Fatalities",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()

The graph above compares the number of deaths predicted by the Holt-Winters model (which gave an accuracy of 0.75% during forecasting) with the actual number of deaths. We can observe that the actual number of deaths is lower than its predicted counterpart and keeps on decreasing with the passage of time. Hence we can clearly say that vaccination results in fewer deaths due to COVID-19.