# Acknowledgements
The data is provided by the Johns Hopkins University GitHub repository, https://github.com/CSSEGISandData/COVID-19.
The analysis in this notebook is based on the data up to November 28th, 2020.

# Libraries

In [None]:
import math
import numpy as np 
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import itertools

import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('bmh')    # print(plt.style.available)

#from plotly.offline import plot, iplot, init_notebook_mode
#init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Datasets

#### I use three datasets which give the number of confirmed cases, the number of deaths and the number of people recovered. 

In [None]:
# Load the three global time series in one go:
# cdf = confirmed cases, ddf = deaths, rdf = recovered.
cdf, ddf, rdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/covid19-october2020/time_series_covid19_confirmed_global.csv',
        '../input/covid19-october2020/time_series_covid19_deaths_global.csv',
        '../input/covid19-october2020/time_series_covid19_recovered_global.csv',
    )
)

# Preview the confirmed-cases table (ddf.head() / rdf.head() work the same way).
cdf.head(15)

#### Cleaning.

In [None]:
# Cleaning: ignore regions and coordinates, keep one row per country.

def _aggregate_by_country(raw):
    """Collapse a raw time-series table to one row of daily counts per country.

    'Province/State' is dropped explicitly together with the coordinates.
    Older pandas silently discarded that text column during the groupby sum,
    but with numeric_only=False (the default since pandas 2.0) it is no longer
    discarded, and the cells below would then mistake it for a date column.
    errors='ignore' keeps the helper usable on tables that lack one of the
    columns.
    """
    counts = raw.drop(columns=['Province/State', 'Lat', 'Long'], errors='ignore')
    per_country = counts.groupby('Country/Region').sum().reset_index()
    return per_country.rename(columns={'Country/Region': 'Country'})


prev_datasets = [cdf, ddf, rdf]
new_datasets = [_aggregate_by_country(prev) for prev in prev_datasets]
cdf_countries, ddf_countries, rdf_countries = new_datasets

cdf_countries

#### Below I create the fully grouped dataset, that is, each row corresponds to a certain date and country. The rows are arranged first by date, then by country.

In [None]:
# Long-format dataset ordered by date first, then by country.
# Row count = (number of days) * (number of countries).

n = cdf_countries.shape[1] - 1   # number of days considered (all columns but 'Country'), 282 originally
c = cdf_countries.shape[0]       # number of countries considered, 190 originally

dates = cdf_countries.columns[1:].tolist()
countries = cdf_countries['Country'].tolist()

# Each table is (countries x days); reading it column by column (Fortran
# order) yields all countries for day 1, then all countries for day 2, ...
confirmed = cdf_countries.drop(columns='Country').to_numpy().flatten(order='F')
deaths = ddf_countries.drop(columns='Country').to_numpy().flatten(order='F')
recovered = rdf_countries.drop(columns='Country').to_numpy().flatten(order='F')
active = confirmed - deaths - recovered

d = {
    'Date': np.repeat(dates, c),      # every date repeated once per country
    'Country': countries * n,         # the country list cycled once per date
    'Confirmed': confirmed,
    'Deaths': deaths,
    'Recovered': recovered,
    'Active': active,
}
grouped1_df = pd.DataFrame(d).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%m/%d/%y')
)
grouped1_df.head(10)

#### The following dataset is obtained from the one above by keeping every fifth date (used later so that the animation runs at a reasonable rate).

In [None]:
# grouped1_df restricted to every 5th date.
# c = number of countries, n = number of dates. Originally c=190, n=282.
# grouped1_df stores one contiguous run of c rows per date, so each date is
# addressed by the positional range of its run.

chunks = [range(first_row, first_row + c) for first_row in range(0, n * c, c)]
every5thchunk = chunks[::5]
merged = [row for date_rows in every5thchunk for row in date_rows]
grouped1_df_every5thdate = grouped1_df.iloc[merged]
grouped1_df_every5thdate

#### Below I create the second fully grouped dataset. Now the rows are arranged first by countries, then by dates.

In [None]:
# Long-format dataset ordered by country first, then by date.
# Row count = (number of days) * (number of countries).

# The per-country table is (countries x days): 190 x 282 originally.
c, n = cdf_countries.drop(columns='Country').shape

dates = list(cdf_countries.columns[1:])
countries = list(cdf_countries['Country'])

# Row-major flattening keeps every country's full time series contiguous.
confirmed = cdf_countries.drop(columns='Country').to_numpy().ravel()
deaths = ddf_countries.drop(columns='Country').to_numpy().ravel()
recovered = rdf_countries.drop(columns='Country').to_numpy().ravel()
active = confirmed - deaths - recovered

d = {
    'Country': np.repeat(countries, n),   # every country repeated once per date
    'Date': dates * c,                    # the date list cycled once per country
    'Confirmed': confirmed,
    'Deaths': deaths,
    'Recovered': recovered,
    'Active': active,
}
grouped2_df = pd.DataFrame(d)
grouped2_df['Date'] = pd.to_datetime(grouped2_df['Date'], format='%m/%d/%y')
grouped2_df

#### Today, by countries

In [None]:
# Countries, today (the most recent date in the data). Originally: 10/29/20

# Select by date instead of slicing the last 190 rows: the hard-coded 190
# silently picks the wrong rows as soon as the number of countries changes.
latest_date = grouped1_df['Date'].max()
today_df = grouped1_df[grouped1_df['Date'] == latest_date].set_index('Date')

# Deaths per 100 confirmed cases; NaN where a country has no confirmed cases.
today_df['Deaths/100 cases'] = (today_df['Deaths'] / today_df['Confirmed']) * 100
today_df = today_df.round(2)
today_df.tail(20)

#### Today, world

In [None]:
# World, today. Originally: 10/29/20

today = today_df.index[0]   # originally today='10/29/20'
temp_df = today_df.copy()   # per-country rows; the world row is appended last

# Sum only the count columns. Summing the whole frame would concatenate every
# country name and add up the per-country ratios, which is not a world ratio.
world_totals = temp_df[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum()
world_death_rate = (world_totals['Deaths'] / world_totals['Confirmed']) * 100

# The world row is indexed by the same date as the country rows, so the index
# stays a pure DatetimeIndex (no temporary string label is needed).
world_row = pd.DataFrame(
    {
        'Country': 'World',
        'Confirmed': world_totals['Confirmed'],
        'Deaths': world_totals['Deaths'],
        'Recovered': world_totals['Recovered'],
        'Active': world_totals['Active'],
        'Deaths/100 cases': round(world_death_rate, 2),
        '% of deaths': world_death_rate,
    },
    index=pd.DatetimeIndex([today], name=temp_df.index.name),
)
temp_df = pd.concat([temp_df, world_row])

todayW_df = temp_df.drop('Confirmed', axis=1).tail(1)
todayW_df


# Maps

In [None]:
def draw_map(df, column, cs):
    """Show a world choropleth of df[column], one colour per country.

    df     -- frame with a 'Country' column holding country names
    column -- name of the column that drives the colour (also shown on hover)
    cs     -- plotly continuous colour scale (e.g. 'Blues')

    The title includes the date held in the notebook-level variable `today`.
    """
    map_title = f'{column} as of {today.date()}'
    fig = px.choropleth(
        df,
        locations='Country',
        locationmode='country names',
        color=column,
        hover_name='Country',
        hover_data=[column],
        title=map_title,
        color_continuous_scale=cs,
    )
    fig.show()

In [None]:
# World map coloured by the 'Confirmed' column of today_df.
draw_map(today_df, 'Confirmed', 'Blues')

In [None]:
# World map coloured by the 'Active' column of today_df.
draw_map(today_df, 'Active', 'deep')

In [None]:
# World map coloured by the 'Deaths' column of today_df.
draw_map(today_df, 'Deaths', 'Blues')

In [None]:
# World map coloured by deaths per 100 confirmed cases.
draw_map(today_df, 'Deaths/100 cases', 'deep')

In [None]:
def animation_map(df, column):
    """Show an animated world choropleth of df[column] with one frame per date.

    df must contain 'Country' and a datetime 'Date' column; the frames are
    labelled with the date formatted as YYYY-MM-DD.
    """
    frame_labels = df['Date'].dt.strftime('%Y-%m-%d')
    colour_values = df[column]   # np.log could be applied here for a log scale

    fig = px.choropleth(
        df,
        locations='Country',
        locationmode='country names',
        hover_name='Country',
        color=colour_values,
        animation_frame=frame_labels,
        title='Cases over time',
        color_continuous_scale=px.colors.sequential.Blues,
    )
    # Hide the colour bar and keep the frame-to-frame transition short.
    fig.update_layout(coloraxis_showscale=False, transition={'duration': 10})
    fig.show()

In [None]:
# Animate the 'Confirmed' column over the every-5th-date subset.
animation_map(grouped1_df_every5thdate, 'Confirmed')

# Top 15 countries


In [None]:
def plot_hbar(df, column, n=15, hover_data=None):
    """Show a horizontal bar chart of the n countries with the largest df[column].

    df         -- frame with a 'Country' column and the column to rank by
    column     -- name of the numeric column to rank and plot
    n          -- number of countries to show
    hover_data -- optional list of extra columns shown on hover; None (the
                  default) replaces the former mutable `[]` default
    """
    # sort_values puts NaN last, so without dropna the "top n" could consist of
    # rows with a missing value (e.g. a ratio with a zero denominator).
    top_rows = df.dropna(subset=[column]).sort_values(column).tail(n)

    fig = px.bar(top_rows, x=column, y="Country", color='Country',
                 text=column, orientation='h', width=900, hover_data=hover_data,
                 color_discrete_sequence=px.colors.qualitative.Prism)
    fig.update_traces(texttemplate='%{text:.3s}', textposition='outside')
    fig.update_layout(title=column, xaxis_title='', yaxis_title='', yaxis_categoryorder='total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()

In [None]:
# Bar chart of the countries with the highest 'Confirmed' values.
plot_hbar(today_df, 'Confirmed')

In [None]:
# Bar chart of the countries with the highest 'Active' values.
plot_hbar(today_df, 'Active')

In [None]:
# Bar chart of the countries with the highest 'Deaths' values.
plot_hbar(today_df, 'Deaths')

In [None]:
# Bar chart of the countries with the most deaths per 100 confirmed cases.
plot_hbar(today_df, 'Deaths/100 cases')

In [None]:
def plot_line(df, column, n=6):
    """Show the time series of df[column] for the n countries leading on the latest date.

    df     -- long-format frame with 'Date', 'Country' and the plotted column
    column -- name of the numeric column to plot
    n      -- number of top countries to include

    The function works on the frame it is given (the `df` argument used to be
    ignored in favour of a notebook global) and takes the latest date from
    `df` itself, so it does not depend on any other cell.
    """
    # Filter to the latest date first and sort afterwards: applying a mask built
    # on the unsorted frame to an already sorted one makes pandas reindex the
    # mask and emit a warning.
    latest_rows = df[df['Date'] == df['Date'].max()].dropna(subset=[column])
    top_n = latest_rows.sort_values(column).tail(n)['Country'].to_numpy()
    top_df = df[df['Country'].isin(top_n)]

    fig = px.line(top_df, x='Date', y=column, color='Country', height=600,
                  title=column,
                  color_discrete_sequence=px.colors.qualitative.Dark24)
    fig.update_layout(showlegend=True, xaxis_title='', yaxis_title='')
    fig.show()

In [None]:
# 'Confirmed' over time for the leading countries.
plot_line(grouped2_df, 'Confirmed')

In [None]:
# 'Active' over time for the leading countries.
plot_line(grouped2_df, 'Active')

In [None]:
# 'Deaths' over time for the leading countries.
plot_line(grouped2_df, 'Deaths')

In [None]:
def plot_treemap(df, column):
    """Show a treemap with one tile per country, sized by df[column]."""
    fig = px.treemap(
        df,
        path=['Country'],
        values=column,
        title=column,
        height=600,
        color_discrete_sequence=px.colors.qualitative.Prism,
    )
    # Print the country label together with its value inside each tile.
    fig.update_traces(textinfo='label+text+value')
    fig.show()

In [None]:
# Treemap of countries sized by 'Confirmed'.
plot_treemap(today_df, 'Confirmed')

In [None]:
# Treemap of countries sized by 'Active'.
plot_treemap(today_df, 'Active')

In [None]:
# Treemap of countries sized by 'Deaths'.
plot_treemap(today_df, 'Deaths')

In [None]:
# Treemap of countries sized by deaths per 100 confirmed cases.
plot_treemap(today_df, 'Deaths/100 cases')

In [None]:
def plot_bubble(df, column):
    """Show a date-by-country bubble chart; bubble size and colour follow df[column].

    Only rows with a strictly positive value are drawn. Rows are sorted by
    country name in descending order before plotting.
    """
    positive_rows = df.loc[df[column] > 0]
    ordered = positive_rows.sort_values('Country', ascending=False)

    fig = px.scatter(
        ordered,
        x='Date',
        y='Country',
        size=column,
        color=column,
        title=column,
        height=3000,
        color_continuous_scale=px.colors.sequential.ice,
    )
    # dtick=1 forces a tick label for every country; the colour bar is hidden.
    fig.update_layout(
        yaxis=dict(dtick=1),
        xaxis_title='',
        yaxis_title='',
        coloraxis_showscale=False,
    )
    fig.show()

In [None]:
# Bubble chart of 'Confirmed' for every country and date.
plot_bubble(grouped1_df, 'Confirmed')

# US - Machine Learning

### Preparation

In [None]:
# Daily US totals, indexed by date.
US_df = (
    grouped1_df.loc[grouped1_df['Country'] == 'US']
    .groupby(['Date'])[['Confirmed', 'Deaths', 'Recovered', 'Active']]
    .sum()
)
US_df.tail()

In [None]:
# Integer day counter relative to the first date in the index; this is the
# single feature used by the models below.
US_df['Days since 1/22'] = (US_df.index - US_df.index[0]).days
US_df.head()

In [None]:
# Chronological split: the first 90% of the days are used for training,
# the remaining days for validation.
cutpoint = int(len(US_df) * 0.9)
train, valid = US_df.iloc[:cutpoint], US_df.iloc[cutpoint:]

# Validation RMSE of each model, appended in the order the models are fitted.
model_scores = []

### Linear regression

In [None]:
# Ordinary least squares on the day counter.
# The `normalize` argument is gone: it was deprecated in scikit-learn 1.0 and
# removed in 1.2 (passing it raises TypeError), and it does not change the
# predictions of a plain least-squares fit.
lin_reg = LinearRegression()
lin_reg.fit(np.array(train['Days since 1/22']).reshape(-1,1), np.array(train['Confirmed']).reshape(-1,1))
prediction_lr = lin_reg.predict(np.array(valid['Days since 1/22']).reshape(-1,1))
model_scores.append(np.sqrt(mean_squared_error(valid['Confirmed'], prediction_lr)))

# Report the score that was just appended rather than a fixed list position,
# so the message stays correct when the cell is re-run.
print('Square root of MSE for Liner Regression is', model_scores[-1])

In [None]:
def plot_LR():
    """Plot the US confirmed cases together with the fit of the notebook-level `lin_reg`."""
    day_numbers = US_df[['Days since 1/22']].to_numpy()
    fitted = lin_reg.predict(day_numbers)

    plt.figure(figsize=(9, 6))
    plt.plot(US_df['Confirmed'], color='darkblue', label='Confirmed cases')
    plt.plot(US_df.index, fitted, color='red', label='Predicted number of cases')
    plt.title('Covid-19 cases in the US, linear regression', fontsize=20)
    plt.ylabel('Confirmed cases, 10M', fontsize=20)
    plt.xticks(rotation=45, size=15)
    plt.yticks(size=20)
    plt.legend(loc='upper left', prop={'size': 17})


plot_LR()

#### Linear regression definitely does not help

### Polynomial Regression

In [None]:
# Expand the day counter into polynomial terms up to degree 6.
poly = PolynomialFeatures(degree=6)

# Fit the transformer on the training data only and merely apply it to the
# validation data; validation data should never be used for fitting.
train_poly = poly.fit_transform(np.array(train['Days since 1/22']).reshape(-1,1))
valid_poly = poly.transform(np.array(valid['Days since 1/22']).reshape(-1,1))

In [None]:
# Least squares on the polynomial features.
# `LinearRegression(normalize=True)` was removed in scikit-learn 1.2. Scaling
# still matters here (the degree-6 terms span many orders of magnitude), so the
# replacement recommended by scikit-learn's deprecation notice is used: a
# StandardScaler(with_mean=False) step in front of the regression.
# NOTE: the name `lin_reg` is deliberately kept because plot_PR() reads it; it
# therefore replaces the simple linear model fitted earlier.
lin_reg = make_pipeline(StandardScaler(with_mean=False), LinearRegression())
lin_reg.fit(train_poly, train['Confirmed'])
prediction_pr = lin_reg.predict(valid_poly)
model_scores.append(np.sqrt(mean_squared_error(valid['Confirmed'], prediction_pr)))

# Report the score that was just appended rather than a fixed list position.
print('Square root of MSE for Polynoial Regression is', model_scores[-1])

In [None]:
def plot_PR():
    """Plot the US confirmed cases together with the polynomial-regression fit.

    Uses the notebook-level `US_df`, the fitted transformer `poly` and the
    model currently bound to `lin_reg`.
    """
    df = US_df
    # Only apply the already fitted transformer; a plotting helper should not
    # refit shared state as a side effect.
    data = poly.transform(np.array(df['Days since 1/22']).reshape(-1,1))
    pred = lin_reg.predict(data)
    plt.figure(figsize=(9,6))
    plt.plot(df['Confirmed'], label='Confirmed cases', color='darkblue')
    plt.plot(df.index, pred, label='Predicted number of cases', color='red')
    plt.title('Covid-19 cases in the US, polynomial regression', fontsize=20)
    plt.xticks(rotation=45, size=15)
    plt.yticks(size=20)
    plt.ylabel('Confirmed cases, 10M', fontsize=20)
    plt.legend(loc=2, prop={'size': 17})


plot_PR()

### Support Vector Machine

In [None]:
# Support vector regression with a degree-5 polynomial kernel.
svm = SVR(C=1, degree=5, kernel='poly', epsilon=0.05)
# SVR expects a 1-D target; a column vector makes scikit-learn emit a
# DataConversionWarning and flatten it internally.
svm.fit(np.array(train['Days since 1/22']).reshape(-1,1), np.array(train['Confirmed']))
pred_svm = svm.predict(np.array(valid['Days since 1/22']).reshape(-1,1))
model_scores.append(np.sqrt(mean_squared_error(valid['Confirmed'], pred_svm)))

# Report the score that was just appended rather than a fixed list position.
print('Square root of MSE for SVM is', model_scores[-1])

In [None]:
def plot_SVM():
    """Plot the US confirmed cases together with the fit of the notebook-level `svm` model."""
    day_numbers = US_df[['Days since 1/22']].to_numpy()
    fitted = svm.predict(day_numbers)

    plt.figure(figsize=(9, 6))
    plt.plot(US_df['Confirmed'], color='darkblue', label='Confirmed cases')
    plt.plot(US_df.index, fitted, color='red', label='Predicted number of cases')
    plt.title('Covid-19 cases in the US, SVM', fontsize=20)
    plt.ylabel('Confirmed cases, 10M', fontsize=20)
    plt.xticks(rotation=45, size=15)
    plt.yticks(size=20)
    plt.legend(loc='upper left', prop={'size': 17})


plot_SVM()