In [None]:
import pandas as pd
import numpy as np
import math
from IPython.display import HTML

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPRegressor
from fbprophet import Prophet
import scipy.optimize as optim

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn import metrics
from prettytable import PrettyTable
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png')

#!conda install -c plotly plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
from plotly.graph_objs import *

from datetime import datetime

#Supress warnings and default INFO logging
import warnings
warnings.filterwarnings('ignore')

import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

%matplotlib inline

import seaborn as sns
sns.set()
pyo.init_notebook_mode()

# COVID-19 Case Study For Brazil

<font size="3">Brazil is the most populous country in South America, with over **200 million people**, and is also one of the countries worst hit by Coronavirus cases. The aim of this project is to analyze the current situation in Brazil, build a model that can predict daily cases and deaths, and design policies that can be implemented to reduce the growth of cases and deaths.</font>

## Team:  
$\;\;\;\;\;\;$<font size="3"><b>Anjana K</b></font>

$\;\;\;\;\;\;$<font size="3"><b>Manisha R</b></font>

$\;\;\;\;\;\;$<font size="3"><b>Padmaja Bhagwat</b></font>

<p><a href="https://commons.wikimedia.org/wiki/File:COVID-19_Outbreak_Cases_in_Brazil.svg#/media/File:COVID-19_Outbreak_Cases_in_Brazil.svg"><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/COVID-19_Outbreak_Cases_in_Brazil.svg/1200px-COVID-19_Outbreak_Cases_in_Brazil.svg.png" alt="COVID-19 Outbreak Cases in Brazil.svg" width="500" height="500"></a><br><a href="https://commons.wikimedia.org/w/index.php?curid=86423122">Image Source</a></p>

<a id='content'></a>
# Table of Contents

<font size="4">1. [Exploratory Data Analysis - Brazil](#section1)</font> 

$\;\;\;\;\;\;$<font size="3">1.1 [Current situation in Brazil - Cases and Deaths trend](#section1.1)</font>

$\;\;\;\;\;\;$<font size="3">1.2 [Cases v/s Deaths](#section1.2)</font>

$\;\;\;\;\;\;$<font size="3">1.3 [Mortality rate over time](#section1.3)</font>

$\;\;\;\;\;\;$<font size="3">1.4 [Cases and deaths growth over time](#section1.4)</font>

$\;\;\;\;\;\;$<font size="3">1.5 [Region wise case trend](#section1.5)</font>

$\;\;\;\;\;\;$<font size="3">1.6 [Heat map of total cases, deaths and death rate in Brazil](#section1.6)</font>


<font size="4">2. [Modeling and predictions](#section2)</font> 

$\;\;\;\;\;\;$<font size="3">2.1 [Linear regression model](#section2.1)</font>

$\;\;\;\;\;\;$<font size="3">2.2 [Multi linear regression](#section2.2)</font>

$\;\;\;\;\;\;$<font size="3">2.3 [Polynomial regression](#section2.3)</font>

$\;\;\;\;\;\;$<font size="3">2.4 [Multi layer perceptron](#section2.4)</font>

$\;\;\;\;\;\;$<font size="3">2.5 [Logistic growth and Prophet Model for Forecasting](#section2.5)</font>

$\;\;\;\;\;\;$<font size="3">2.6 [Model Comparison](#section2.6)</font>

<font size="4">3. [Exploratory Data Analysis - India, Italy and South Korea](#section3)</font> 

$\;\;\;\;\;\;$<font size="3">3.1 [India](#section3.1)</font>

$\;\;\;\;\;\;$<font size="3">3.2 [Italy](#section3.2)</font>

$\;\;\;\;\;\;$<font size="3">3.3 [South Korea](#section3.3)</font>

<font size="4">4. [Conclusion](#section4)</font> 

<a id='section1'> </a>    
# 1. Exploratory Data Analysis - Brazil

<a id='section1.1'> </a> 
## 1.1 Current situation in Brazil - Cases and Deaths trend

In [None]:
# Load the Brazil COVID-19 CSV; the cells below read its 'date', 'region', 'cases' and 'deaths' columns.
df_brazil = pd.read_csv('../input/corona-virus-brazil/brazil_covid19.csv')

In [None]:
# Nationwide totals per date, ordered by ascending value.
cases = df_brazil['cases'].groupby(df_brazil['date']).sum().sort_values(ascending=True)
deaths = df_brazil['deaths'].groupby(df_brazil['date']).sum().sort_values(ascending=True)

# Running day number (0, 1, 2, ...) as a column vector, one entry per date.
days_since_first_case = np.arange(len(cases)).reshape(-1, 1)
days_since_first_death = np.arange(len(deaths)).reshape(-1, 1)


def _plot_trend(axis, day_numbers, totals, color, singular, plural):
    """Draw one cumulative trend line and apply the shared axis styling.

    Both panels go through this helper so that labels, font sizes and the
    legend box stay identical instead of drifting apart in copy-pasted code.
    """
    axis.plot(day_numbers, totals, color=color, label=f'Number of {plural}')
    axis.set_xlabel(f'Number of days since the first {singular}', size=20)
    axis.set_ylabel(f'Count of {plural} in Brazil', size=20)
    axis.set_title(f'Covid {plural} trend in Brazil', size=25)
    # A single legend call: an earlier plain legend(loc='best') would be replaced anyway.
    axis.legend(loc="best"
                , frameon=True
                , fancybox=True
                , framealpha=0.95
                , shadow=True
                , borderpad=1
                , prop={'size': 15})
    axis.tick_params(axis="x", labelsize=20)
    axis.tick_params(axis="y", labelsize=20)


fig, ax = plt.subplots(1, 2, figsize=(25, 10))
_plot_trend(ax[0], days_since_first_case, cases, 'blue', 'case', 'cases')
_plot_trend(ax[1], days_since_first_death, deaths, 'red', 'death', 'deaths')

plt.show()

**From the above plot we can observe that cases and deaths are increasing exponentially over time**

<a id='section1.2'> </a>
## 1.2 Cases v/s Deaths

In [None]:
# Cumulative cases on x against cumulative deaths on y, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(cases, deaths)
ax.set_xlabel('Cases count', size=20)
ax.set_ylabel('Deaths count', size=20)
ax.set_title('Cases v/s Deaths in Brazil', size=25)
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=15)

plt.show()

**From the above plot, we can see that there is a Linear relationship between number of cases and number of deaths.**

<a id='section1.3'> </a>
## 1.3 Mortality rate over time

In [None]:
# Deaths as a percentage of cases for every date, rounded to two decimals.
mortality_over_time = round((deaths/cases)*100,2)

index = mortality_over_time.index
data = mortality_over_time

# go.Scatter is the supported trace type; go.Line is a deprecated alias.
fig = go.Figure(data=[
    go.Scatter(name='Mortality in %'
               , x=index
               , y=data
               , mode="lines+markers")
])

fig.update_layout(
    title="Mortality rate over the time"
    , title_x=0.5
    , xaxis_title='Months'
    , yaxis_title='Mortality rate (deaths/cases) in Percentage'
)

fig.show()

**We can observe here that the mortality rate had spiked during April to June and stabilized post that.**

<a id='section1.4'></a>
## 1.4 Cases And Deaths Growth Over Time

In [None]:
# Nationwide totals per date, ordered by ascending value (same ordering as the earlier cells).
cumulative_cases = df_brazil.groupby('date')[['cases']].sum().sort_values(by='cases', ascending=True)
cumulative_deaths = df_brazil.groupby('date')[['deaths']].sum().sort_values(by='deaths', ascending=True)

# diff() subtracts the previous row from each row, so every increment is labelled
# with the date on which it was reported (shift(-1) labelled it one row too early).
# The first row has no predecessor and is dropped.
cases_growth_rate = cumulative_cases.diff().dropna()
deaths_growth_rate = cumulative_deaths.diff().dropna()

fig = make_subplots(rows=2, cols=1, vertical_spacing=0.15
                    , subplot_titles=('New cases per day over the time'
                                      , 'New deaths per day over the time'))

# go.Scatter / add_trace replace the deprecated go.Line / append_trace.
fig.add_trace(go.Scatter(name='New cases per day'
                         , x=cases_growth_rate.index
                         , y=cases_growth_rate['cases']
                         , mode="lines+markers")
              , row=1, col=1)

fig.add_trace(go.Scatter(name='New deaths per day'
                         , x=deaths_growth_rate.index
                         , y=deaths_growth_rate['deaths']
                         , mode="lines+markers")
              , row=2, col=1)

fig.update_xaxes(title_text="Months", row=1, col=1)
fig.update_yaxes(title_text="Cases per day", row=1, col=1)

fig.update_xaxes(title_text="Months", row=2, col=1)
fig.update_yaxes(title_text="Deaths per day", row=2, col=1)

# Figure-wide settings only; the x-axis titles are set per subplot above so both read the same.
fig.update_layout(height=1200
                  , legend=dict(font=dict(family="Courier", size=20))
                  , legend_title=dict(font=dict(family="Courier", size=10)))

fig.show()

**The above plots indicate growth rate in deaths and cases over time. It can be observed that the rates have remained uniform throughout the time**

<a id='section1.5'> </a>
## 1.5 Region Wise Case Trend

In [None]:
# Total cases per (region, date), kept as ordinary columns for plotly express.
confirmed_cases = df_brazil.groupby(['region', 'date'], as_index=False).agg({'cases': 'sum'})

fig = px.line(
    confirmed_cases[['date', 'cases', 'region']],
    x='date',
    y='cases',
    color='region',
)
fig.update_layout(
    title='COVID-19 in Brazil: total number of cases over time',
    title_x=0.5,
    xaxis_title='Date',
    yaxis_title='Number of cases',
    legend_title='<b>Regions in Brazil</b>',
    legend=dict(x=0.02, y=0.98),
)
fig.show()

**The Centro-Oeste and Nordeste regions have a higher number of cases than the other regions in Brazil.**

<a id='section1.6'> </a>
## 1.6 Heat map of total cases, deaths and death rate in Brazil 

In [None]:
# Select the two columns with a list: the tuple form ['cases','deaths'] is rejected by current pandas.
# max() keeps, per region, the largest value found in any single row.
# NOTE(review): if the rows are per state, this is the largest state's total rather than the regional sum - confirm intent.
region_wise_cases = df_brazil.groupby('region')[['cases', 'deaths']].max().reset_index()
region_wise_cases['Active'] = region_wise_cases['cases'] - region_wise_cases['deaths']
region_wise_cases["Death Rate (per 100)"] = np.round(100*region_wise_cases["deaths"]/region_wise_cases["cases"],2)

# background_gradient shades each listed column independently (column-wise by default),
# so one call over four columns matches four chained single-column calls.
region_wise_cases.sort_values('cases', ascending=False).fillna(0).style.background_gradient(
    cmap='Blues',
    subset=["cases", "deaths", "Active", "Death Rate (per 100)"],
)


**From the above heat map we can observe that Sudeste region has the highest number of cases, deaths and death rate when compared to other regions.**

**Brazil has a low mortality rate.**

<a id='section1.7'> </a>
## 1.7 Conclusion
**Brazil needs to take measures to reduce the number of cases and deaths as they are increasing exponentially.**

<font size="3">[Click here to scroll to the top](#content)</font> 

<a id='section2'></a>
# 2. Modeling and predictions
In this section we shall be training different models and predicting total number of cases and deaths. We shall also be concluding which models work the best for given data.

## Models 
- Linear regression
- Multi linear regression
- Polynomial regression
- Multi layer perceptron
- Logistic growth curve along with Facebook Prophet model

#### Brazil day wise case and death data from brazil_covid19.csv

In [None]:
# Re-read the Brazil COVID-19 CSV so the modeling section starts from the unmodified file,
# then preview the first rows.
df_brazil = pd.read_csv('../input/corona-virus-brazil/brazil_covid19.csv')
df_brazil.head()

In [None]:
# Nationwide totals per date, ordered by ascending value.
totals_by_date = df_brazil.groupby('date')
cases = totals_by_date['cases'].sum().sort_values()
deaths = totals_by_date['deaths'].sum().sort_values()

In [None]:
# Keep only the dates with a positive total and discard the date labels,
# leaving single-column frames indexed 0..n-1.
cases = cases.loc[cases > 0].reset_index(drop=True).to_frame()
deaths = deaths.loc[deaths > 0].reset_index(drop=True).to_frame()

<a id='section2.1'></a>
## 2.1 Linear regression 
We've done the predictions using a simple Linear regression model. 

### Step 1: Preparing data for modeling
- To model confirmed cases and deaths we have taken cases and deaths since the first case and the first death occurrence respectively
- Converted our data into 1D arrays 
- Split the data for training and testing 
- Created the data for future prediction of cases and deaths (for next 20 days)

In [None]:
# Column vectors for scikit-learn: the running day number (0..n-1) and the matching totals.
n_case_days = len(cases.index)
days_since_first_case = np.arange(n_case_days).reshape(-1, 1)
case_count = cases.to_numpy().reshape(-1, 1)

n_death_days = len(deaths.index)
days_since_first_death = np.arange(n_death_days).reshape(-1, 1)
death_count = deaths.to_numpy().reshape(-1, 1)

In [None]:
X_cases = days_since_first_case
y_cases = case_count
X_train_cases, X_test_cases, y_train_cases, y_test_cases = train_test_split(X_cases
                                                    , y_cases
                                                    , test_size= 0.2
                                                    , shuffle=True
                                                    , random_state = 42)

# Forecast inputs: the `days_in_future` day numbers that follow the last observed day.
# The last observed day is read from the data instead of a hard-coded constant,
# so the forecast window stays attached to the series when the CSV changes.
days_in_future = 20
total_days_since_first_case = int(X_cases.max())
x_test_case_future = np.linspace(total_days_since_first_case + 1
                                 , total_days_since_first_case + days_in_future
                                 , days_in_future).reshape(-1, 1)

### Step 2: Linear regression model for predicting cases 
We have made the **case predictions** using Linear regression on test data as well as **forecasted the predictions for the next 20 days** 

In [None]:
# Case prediction for entire Brazil using linear regression
lr = LinearRegression().fit(X_train_cases, y_train_cases)

# Predictions for the held-out days and for the future day numbers.
y_pred_case = lr.predict(X_test_cases)
y_pred_case_forecast = lr.predict(x_test_case_future)

# Hold-out error metrics.
rmse_test_case = math.sqrt(mean_squared_error(y_test_cases, y_pred_case))
r_score_case = r2_score(y_test_cases, y_pred_case)
print(f'RMSE for test cases = {rmse_test_case:.2f}')
print(f'R-squared for test cases = {r_score_case:.2f}')

In [None]:
X_death = days_since_first_death
y_death = death_count
X_train_deaths, X_test_deaths, y_train_deaths, y_test_deaths = train_test_split(X_death
                                                    , y_death
                                                    , test_size= 0.2
                                                    , shuffle=True
                                                    , random_state = 42)

# Forecast inputs: the `days_in_future` day numbers that follow the last observed day.
# The last observed day is read from the data instead of a hard-coded constant,
# so the forecast window stays attached to the series when the CSV changes.
days_in_future = 20
total_days_since_first_death = int(X_death.max())
x_test_death_future = np.linspace(total_days_since_first_death + 1
                                  , total_days_since_first_death + days_in_future
                                  , days_in_future).reshape(-1, 1)

### Step 3: Linear regression model for predicting deaths 
We have made the **death predictions** using Linear regression on test data as well as **forecasted the predictions for the next 20 days** 

In [None]:
# Death prediction for entire Brazil using linear regression
lr = LinearRegression().fit(X_train_deaths, y_train_deaths)

# Predictions for the held-out days and for the future day numbers.
y_pred_deaths = lr.predict(X_test_deaths)
y_pred_death_forecast = lr.predict(x_test_death_future)

# Hold-out error metrics.
rmse_test_deaths = math.sqrt(mean_squared_error(y_test_deaths, y_pred_deaths))
r_score_deaths = r2_score(y_test_deaths, y_pred_deaths)
print(f'RMSE for test deaths = {rmse_test_deaths:.2f}')
print(f'R-squared for test deaths = {r_score_deaths:.2f}')

**The R-squared value of 0.92 for case prediction and 0.98 for death prediction clearly indicates that there is a linear relationship between the number of days since the first occurrence of a case/death and the total number of cases/deaths respectively**

### Step 4: Plotting - Linear regression model for cases and deaths 
Plotted the case and death predictions on **test data** as well as the **forecasted data for the next 20 days** 

In [None]:
def plot_model(X_test, y_test, y_pred, x_test_future, y_pred_forecast, total_days, start_date, title, axis):
    """Plot held-out values, model predictions and the forecast on one axis.

    Parameters
    ----------
    X_test, y_test : array-like
        Held-out day numbers and their true totals (one value per row).
    y_pred : array-like
        Model predictions for ``X_test``, in the same row order.
    x_test_future, y_pred_forecast : array-like
        Future day numbers and the predictions for them.
    total_days : number
        Day number at which the dashed vertical line is drawn.
    start_date : str
        Text appended to the x-axis label.
    title : str
        Quantity being plotted (e.g. ``'cases'``); used in labels and the title.
    axis : matplotlib Axes (or any object with the same plotting methods)
        Target axis.
    """
    # The test rows arrive shuffled; order them by day so the prediction line is drawn left to right.
    days = np.ravel(X_test)
    order = np.argsort(days, kind='stable')
    days_sorted = days[order]

    axis.scatter(days_sorted, np.ravel(y_test)[order], label=f"True {title}")
    axis.plot(days_sorted, np.ravel(y_pred)[order], label=f"Predicted {title}", color="red")
    axis.plot(x_test_future, y_pred_forecast, '.', label=f"Forcasted {title}", color="green")

    # Labelled so the legend explains the dashed line.
    axis.axvline(total_days, linestyle='--', label="Last observed day")

    # The horizon in the title follows the number of forecast points instead of a fixed "20".
    forecast_days = len(x_test_future)
    axis.set_title(f'Day wise {title} prediction for Brazil (forcasted for next {forecast_days} days)', size=18)
    axis.set_ylabel(f"Total number of {title}", size=15)
    axis.set_xlabel(f"Days since the first {title} {start_date}", size=15)
    axis.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
    
# Two stacked panels: cases on top, deaths below.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 18))
plot_model(
    X_test_cases, y_test_cases, y_pred_case,
    x_test_case_future, y_pred_case_forecast,
    total_days_since_first_case, '(26-02-2020)', 'cases', ax[0],
)
plot_model(
    X_test_deaths, y_test_deaths, y_pred_deaths,
    x_test_death_future, y_pred_death_forecast,
    total_days_since_first_death, '(17-03-2020)', 'deaths', ax[1],
)
plt.show()

<a id='section2.2'></a>
## 2.2 Multi-linear regression

In this section we've tried to **predict region wise cases/deaths** by incorporating population of each region. For this we have performed multi-linear regression.

### Step 1: Preparing data for modeling
- Read the data from brazil_population_2019_changed.csv (This is a clean version of brazil_population_2019.csv)
- Left join this data with the brazil_covid19.csv based on the region
- Calculate days since the first case and first death for each region 
- One hot encode the region 
- Split the data into training and testing

#### Brazil population data from brazil_population_2019_changed.csv

In [None]:
# Re-read the COVID-19 records and load the population table (decoded as latin1),
# then preview the population data.
df_brazil = pd.read_csv('../input/corona-virus-brazil/brazil_covid19.csv')
df_brazil_population = pd.read_csv('../input/brazil-population-2019-changed/brazil_population_2019_changed.csv', encoding='latin1')
df_brazil_population.head()

Getting the **total population of each region and left joining it with the df_brazil** that has day-wise cases and deaths


In [None]:
# Total population per region. Selecting the column before summing avoids aggregating
# the unrelated columns, which on current pandas means concatenating text columns or failing.
df_population_region = df_brazil_population.groupby('region')['population'].sum().reset_index()

# Attach each region's population to every COVID-19 row (pd.merge defaults to an inner join on 'region').
df_brazil_infected_population = pd.merge(df_brazil, df_population_region, on='region')

Get the records where total number of cases and deaths are **greater than zero** 

In [None]:
# Per (region, date): summed counts over the rows with a positive count, plus the region's population.
confirmed_pop = (
    df_brazil_infected_population
    .loc[lambda frame: frame['cases'] > 0]
    .groupby(['region', 'date'])
    .agg({'cases': 'sum', 'population': 'mean'})
)
deaths_pop = (
    df_brazil_infected_population
    .loc[lambda frame: frame['deaths'] > 0]
    .groupby(['region', 'date'])
    .agg({'deaths': 'sum', 'population': 'mean'})
)

### Preprocessing of confirmed cases 

#### Calculating days since the first case occurred for each region

In [None]:
# days since first case for each region
def get_days_since_first_case(df):
    """Return a copy of ``df`` with a 1-based ``days_since_first_case`` row counter.

    The counter follows the row order of ``df`` (1 for the first row, 2 for the
    second, ...). The input frame is left untouched: the function is used with
    ``groupby(...).apply``, where mutating the group that was passed in is unsupported.
    """
    return df.assign(days_since_first_case=np.arange(1, len(df) + 1))

# Number the dates within each region (grouping on the first index level, 'region').
# group_keys=False keeps the original (region, date) index; without it, current pandas
# prepends the group key as an extra 'region' level and reset_index() then fails.
df_confirmed_population = (
    confirmed_pop
    .groupby(level=0, group_keys=False)
    .apply(get_days_since_first_case)
    .reset_index()
)
df_confirmed_population.head()

#### One hot encoding the region

In [None]:
# One hot encode the regions
def get_one_hot(df):
    """Return indicator columns for the values in ``df``, without the 'Sul' column.

    With 'Sul' removed, a row whose remaining indicators are all zero stands for 'Sul'.
    """
    return pd.get_dummies(df).drop(columns='Sul')

# Indicator columns for the region of every confirmed-case row ('Sul' is the dropped baseline).
df_onehot_region = get_one_hot(df_confirmed_population['region'])

In [None]:
# Swap the text 'region' column for its indicator columns.
df_confirmed_cases = pd.concat(
    [df_confirmed_population.drop(columns='region'), df_onehot_region],
    axis=1,
)
df_confirmed_cases.head()

### Preprocessing of deaths

#### Calculating days since the first death occurred for each region

In [None]:
# days since first death for each region
def get_days_since_first_death(df):
    """Return a copy of ``df`` with a 1-based ``days_since_first_death`` row counter.

    The counter follows the row order of ``df`` (1 for the first row, 2 for the
    second, ...). The input frame is left untouched: the function is used with
    ``groupby(...).apply``, where mutating the group that was passed in is unsupported.
    """
    return df.assign(days_since_first_death=np.arange(1, len(df) + 1))

# Number the dates within each region (grouping on the first index level, 'region').
# group_keys=False keeps the original (region, date) index; without it, current pandas
# prepends the group key as an extra 'region' level and reset_index() then fails.
df_death_population = (
    deaths_pop
    .groupby(level=0, group_keys=False)
    .apply(get_days_since_first_death)
    .reset_index()
)
df_death_population.head()

#### One hot encoding the region

In [None]:
# Indicator columns for the region of every death row, then swap them in for the text column.
df_onehot_region = get_one_hot(df_death_population['region'])
df_confirmed_deaths = pd.concat(
    [df_death_population.drop(columns='region'), df_onehot_region],
    axis=1,
)
df_confirmed_deaths.head()

### Step 2: Multi-linear model for case prediction
We have made the **case predictions** using a multi-linear regression model and calculated RMSE and R-squared by performing **cross-validation** on the training data

In [None]:
# Splitting the case data into training and testing 
# Target: total cases. Predictors: every remaining column except the date label.
y_cases = df_confirmed_cases['cases']
X_cases = df_confirmed_cases.drop(columns=['date', 'cases'])

(X_train_cases, X_test_cases,
 y_train_cases, y_test_cases) = train_test_split(
    X_cases, y_cases, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Multilinear regression with cross-validation for cases
# LinearRegression no longer accepts `normalize`; for plain least squares the option
# only rescaled the inputs internally and did not change the predictions.
linear_model = LinearRegression()
linear_model.fit(X_train_cases, y_train_cases)
y_pred_cases = linear_model.predict(X_test_cases)

# RMSE: mean of the 10-fold cross-validated MSE on the training split.
# R2: computed on the held-out test split.
mse_score = cross_validate(linear_model,X_train_cases,y_train_cases,scoring='neg_mean_squared_error',cv=10)['test_score'].mean()
rmse = math.sqrt(-mse_score)
r_score = r2_score(y_test_cases, y_pred_cases)

print('The best RMSE for predicted number of cases is: {:.2f}'.format(rmse))
print('The best R2 score for predicted number of cases is: {:.2f}'.format(r_score))

### Step 3: Plotting - Multi-linear regression model for region wise case prediction

In [None]:
# Plot region wise predictions

def region_wise_plot(X_test, y_test, title, region, axis, model=None):
    """Plot true values against model predictions for one region.

    Parameters
    ----------
    X_test : DataFrame
        Test features; must hold ``days_since_first_<title>`` and the region
        indicator columns ('Centro-Oeste', 'Nordeste', 'Norte', 'Sudeste').
    y_test : Series or single-column DataFrame
        True totals, indexed like ``X_test``.
    title : str
        ``'case'`` or ``'death'``; selects the day column and is used in the labels.
    region : str
        Region to plot. 'Sul' has no indicator column and is selected as the
        rows whose four indicators are all zero.
    axis : matplotlib Axes
        Target axis.
    model : fitted estimator, optional
        Object with ``predict``. Defaults to the notebook-level ``linear_model``,
        looked up at call time.
    """
    if model is None:
        model = linear_model

    if region == 'Sul':
        condition = (X_test['Centro-Oeste']==0) & (X_test['Nordeste']==0) & (X_test['Norte']==0) & (X_test['Sudeste']==0)
    else:
        condition = X_test[region] == 1

    region_features = X_test[condition]
    days = region_features[f'days_since_first_{title}'].to_numpy()
    y_true = np.ravel(y_test[condition])
    y_region_pred = np.ravel(model.predict(region_features))

    # The test rows arrive shuffled; order them by day so the prediction line is drawn left to right.
    order = np.argsort(days, kind='stable')
    days_sorted = days[order]

    axis.scatter(days_sorted, y_true[order], label="True data")
    axis.plot(days_sorted, y_region_pred[order], label="Predicted data", color="red")

    axis.set_title(f'Day wise {title} prediction for {region}', size=18)
    # Labels follow `title`, so the death plots are no longer labelled as cases.
    axis.set_ylabel(f"Total number of {title}s", size=15)
    axis.set_xlabel(f"Days since the first {title}", size=15)
    axis.legend(loc="best")

fig, ax = plt.subplots(3, 2, figsize=(15, 20))

# One panel per region, filled row by row; the sixth panel stays empty.
for region_name, region_axis in zip(['Centro-Oeste', 'Nordeste', 'Norte', 'Sudeste', 'Sul'], ax.flat):
    region_wise_plot(X_test_cases, y_test_cases, title='case', region=region_name, axis=region_axis)

plt.show()

**From the above plot we can clearly see that there is a non-linear relationship between our predictors and the total number of cases, therefore in the next section we've created polynomial features for _days_since_first case_ and thus we can see the improvement in predictions**

### Step 4: Multi-linear model for death prediction
We have made the **death predictions** using multi-linear regression model and calculated RMSE and R-squared by performing **cross-validation** on the training data

In [None]:
# Target: total deaths. Predictors: every remaining column except the date label.
y_death = df_confirmed_deaths['deaths']
X_death = df_confirmed_deaths.drop(columns=['date', 'deaths'])

(X_train_deaths, X_test_deaths,
 y_train_deaths, y_test_deaths) = train_test_split(
    X_death, y_death, test_size=0.2, shuffle=True, random_state=42)

In [None]:
# Multilinear regression with cross-validation for deaths
# LinearRegression no longer accepts `normalize`; for plain least squares the option
# only rescaled the inputs internally and did not change the predictions.
linear_model = LinearRegression()
linear_model.fit(X_train_deaths, y_train_deaths)
y_pred_deaths = linear_model.predict(X_test_deaths)

# RMSE: mean of the 10-fold cross-validated MSE on the training split.
# R2: computed on the held-out test split.
mse_score = cross_validate(linear_model,X_train_deaths,y_train_deaths,scoring='neg_mean_squared_error',cv=10)['test_score'].mean()
rmse = math.sqrt(-mse_score)
r_score = r2_score(y_test_deaths, y_pred_deaths)

print('The best RMSE for predicted number of deaths is: {:.2f}'.format(rmse))
print('The best R2 score for predicted number of deaths is: {:.2f}'.format(r_score))

### Step 5: Plotting - Multi-linear regression model for region wise death prediction

In [None]:
# Plot region wise deaths 
fig, ax = plt.subplots(3, 2, figsize=(15, 20))

# One panel per region, filled row by row; the sixth panel stays empty.
for region_name, region_axis in zip(['Centro-Oeste', 'Nordeste', 'Norte', 'Sudeste', 'Sul'], ax.flat):
    region_wise_plot(X_test_deaths, y_test_deaths, title='death', region=region_name, axis=region_axis)

plt.show()

**Similar to case predictions, from the above plot we can clearly see that there is a non-linear relationship between our predictors and the total number of deaths, therefore in the next section we've created polynomial features for _days_since_first death_**

<a id='section2.3'></a>
## 2.3 Polynomial regression

In [None]:
# Splitting the case data into training and testing 
# Same split as the multi-linear section (same seed), rebuilt so this section runs from clean inputs.
y_cases = df_confirmed_cases['cases']
X_cases = df_confirmed_cases.drop(columns=['date', 'cases'])

(X_train_cases, X_test_cases,
 y_train_cases, y_test_cases) = train_test_split(
    X_cases, y_cases, test_size=0.2, shuffle=True, random_state=42)

### Step 1: Creating polynomial features for "days_since_first_case"
Population and one hot encoded features are further appended to this polynomial feature

In [None]:
# Creating polynomial features for days_since_first_case and concatenating them with the other features.
# include_bias=False leaves out the constant column (a column, not a row), because
# LinearRegression fits the intercept itself.
poly_transformer = PolynomialFeatures(3, include_bias=False)

# Fit on the training split only and reuse the fitted transformer for the test split.
# The resulting frames have integer column labels 0, 1, 2 (day, day^2, day^3);
# the plotting cell reads column 0 as the day number.
poly_feature_train = pd.DataFrame(poly_transformer.fit_transform(X_train_cases[['days_since_first_case']]))
poly_feature_test = pd.DataFrame(poly_transformer.transform(X_test_cases[['days_since_first_case']]))

# Indexes are reset to 0..n-1 so the polynomial columns line up row by row with the other features.
X_train = X_train_cases.reset_index(drop=True).drop(columns='days_since_first_case')
X_train = pd.concat([poly_feature_train, X_train], axis=1)
y_train = y_train_cases.reset_index(drop=True).to_frame()

X_test = X_test_cases.reset_index(drop=True).drop(columns='days_since_first_case')
X_test = pd.concat([poly_feature_test, X_test], axis=1)
y_test = y_test_cases.reset_index(drop=True).to_frame()

In [None]:
# Preview the training features: polynomial day columns first, then the remaining predictors.
X_train.head()

### Step 2: Modeling polynomial regression for predicting cases 

In [None]:
# LinearRegression no longer accepts `normalize`; for plain least squares the option
# only rescaled the inputs internally and did not change the predictions.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_cases = linear_model.predict(X_test)

# Both metrics are computed on the held-out test split.
rmse = math.sqrt(mean_squared_error(y_test, y_pred_cases))
r_score = r2_score(y_test, y_pred_cases)

print('The best RMSE is: {:.2f}'.format(rmse))
print('The best R2 score is: {:.2f}'.format(r_score))

**Here we can see that the R-squared value has improved from 0.81 in the case of the linear model to 0.87 after calculating the polynomial features for days_since_first_case. The plot below also demonstrates a better fit**

### Step 3: Plotting - Polynomial regression model for region wise case prediction

In [None]:
def region_wise_poly_plot(X_test, y_test, region, x, axis, model=None):
    """Plot true vs. predicted totals for one region against days since the first case/death.

    Parameters
    ----------
    X_test : DataFrame
        Test features. Column ``0`` holds the days since the first case/death,
        and the one-hot region columns 'Centro-Oeste', 'Nordeste', 'Norte' and
        'Sudeste' must be present.
    y_test : DataFrame or Series
        True totals, row-aligned with ``X_test``.
    region : str
        Region to plot. 'Sul' has no column of its own here; it is selected as
        the rows where all four region columns are 0.
    x : str
        Quantity name used in the title and axis labels (e.g. 'case', 'death').
    axis : matplotlib Axes
        Axes to draw on.
    model : fitted estimator, optional
        Object with a ``predict`` method. Defaults to the notebook-level
        ``linear_model`` so existing calls keep working.
    """
    if model is None:
        # Backward-compatible default: use whichever model the notebook fitted last.
        model = linear_model

    if region == 'Sul':
        condition = (
            (X_test['Centro-Oeste'] == 0)
            & (X_test['Nordeste'] == 0)
            & (X_test['Norte'] == 0)
            & (X_test['Sudeste'] == 0)
        )
    else:
        condition = X_test[region] == 1

    X_region = X_test[condition]

    # Skip the drawing (instead of crashing in predict/zip) when the region has no test rows.
    if len(X_region) > 0:
        # Column 0 gives days since the first case/death; flatten everything to 1-D.
        days = np.ravel(X_region[[0]].values)
        y_true = np.ravel(y_test[condition].values)
        y_pred = np.ravel(model.predict(X_region))

        # Sort by day, breaking ties by value (lexsort uses the last key as the primary one).
        order_true = np.lexsort((y_true, days))
        axis.scatter(days[order_true], y_true[order_true], label="True data")

        order_pred = np.lexsort((y_pred, days))
        axis.plot(days[order_pred], y_pred[order_pred], label="Predicted data", color="red")

        axis.legend(loc="best")

    axis.set_title(f'Day wise {x} prediction for {region}', size=18)
    axis.set_ylabel(f'Total number of {x}', size=15)
    axis.set_xlabel(f'Days since the first {x}', size=15)

# One panel per region on a 3x2 grid, filled row by row; the sixth panel is left unused.
fig, ax = plt.subplots(3, 2, figsize=(15, 20))
for region_name, panel in zip(['Centro-Oeste', 'Nordeste', 'Norte', 'Sudeste', 'Sul'], ax.flat):
    region_wise_poly_plot(X_test, y_test, region=region_name, x='case', axis=panel)

plt.show()

In [None]:
# Target is the death count; every column except the date and the target is a predictor.
y_death = df_confirmed_deaths['deaths']
X_death = df_confirmed_deaths.drop(columns=['date', 'deaths'])

# Shuffled 80/20 split with a fixed seed.
X_train_deaths, X_test_deaths, y_train_deaths, y_test_deaths = train_test_split(
    X_death, y_death, test_size=0.2, shuffle=True, random_state=42
)

### Step 4: Creating polynomial features for "days_since_first_death"
Population and one hot encoded features are further appended to this polynomial feature

In [None]:
# Create degree-5 polynomial features for days_since_first_death and concatenate them with the other features.
# include_bias=False leaves out the constant column, since LinearRegression fits the intercept by default.
poly_transformer = PolynomialFeatures(5, include_bias=False)
# Fit the transformer once on the training split and reuse it for the test split.
poly_feature_train = pd.DataFrame(poly_transformer.fit_transform(X_train_deaths[['days_since_first_death']]))
poly_feature_test = pd.DataFrame(poly_transformer.transform(X_test_deaths[['days_since_first_death']]))

# The shuffled splits keep their original index labels; reset them so the
# column-wise concat below lines rows up with the freshly indexed polynomial frames.
X_train = X_train_deaths.reset_index(drop=True).drop(columns=['days_since_first_death'])
X_train = pd.concat([poly_feature_train, X_train], axis=1)
y_train = y_train_deaths.reset_index().drop(['index'],axis=1)

X_test = X_test_deaths.reset_index(drop=True).drop(columns=['days_since_first_death'])
X_test = pd.concat([poly_feature_test, X_test], axis=1)
y_test = y_test_deaths.reset_index().drop(['index'],axis=1)

In [None]:
# Fit ordinary least squares on the death features and score it on the held-out split.
# NOTE(review): the `normalize` argument is deprecated/removed in newer scikit-learn releases — confirm the installed version before upgrading.
linear_model = LinearRegression(normalize=True)
linear_model.fit(X_train, y_train)
# Store the death predictions under their own name so the case predictions
# held in y_pred_cases are not silently overwritten.
y_pred_deaths = linear_model.predict(X_test)

rmse = math.sqrt(mean_squared_error(y_test, y_pred_deaths))
r_score = r2_score(y_test, y_pred_deaths)

print('The best RMSE is: {:.2f}'.format(rmse))
print('The best R2 score is: {:.2f}'.format(r_score))

We notice that the **R-squared** values obtained from **multi-linear regression and polynomial regression are almost the same** (0.79 for multi-linear and 0.80 for polynomial regression), indicating that the **relationship between our predictors and the total number of deaths is closer to linear** than the relationship for total cases.

### Step 5: Plotting - Polynomial regression model for region wise death prediction

In [None]:
# One panel per region on a 3x2 grid, filled row by row; the sixth panel is left unused.
fig, ax = plt.subplots(3, 2, figsize=(15, 20))
for region_name, panel in zip(['Centro-Oeste', 'Nordeste', 'Norte', 'Sudeste', 'Sul'], ax.flat):
    region_wise_poly_plot(X_test, y_test, region=region_name, x='death', axis=panel)

plt.show()

## 2.3 Multi Layer Perceptron 
In this section we have built an MLP model, since it can approximate a wide variety of functions. Here we can see a significant improvement in model performance for both case and death predictions.

### Step 1: Preparing data for modeling
- To model confirmed cases and deaths we have taken cases and deaths since the first case and first death occurrence respectively
- Split the data for training and testing 
- Created the data for future prediction of cases and deaths (for next 20 days)

### Step 2: Build MLP model for case prediction
- The model has 2 hidden layers with 200 nodes each 
- We are using the relu activation function and the lbfgs solver to minimize the loss (lbfgs converges to a local minimum, which is not guaranteed to be the global one)

In [None]:
# Inputs are the day counts since the first case; targets are the case counts.
X_cases = days_since_first_case
y_cases = case_count
X_train_cases, X_test_cases, y_train_cases, y_test_cases = train_test_split(X_cases
                                                    , y_cases
                                                    , test_size= 0.2
                                                    , shuffle=True
                                                    , random_state = 42)
# Forecast horizon (days) and the number of days already observed.
# NOTE(review): 232 is hard-coded; confirm it still matches the loaded data.
days_in_future = 20
total_days_since_first_case = 232
# One point per future day; the point count follows days_in_future instead of a second literal 20.
x_test_case_future = np.linspace(total_days_since_first_case+1, total_days_since_first_case+days_in_future, days_in_future).reshape(-1, 1)

In [None]:
# Two hidden layers of 200 units each, relu activation, L-BFGS solver.
# random_state fixes the weight initialisation so reruns give the same metrics;
# `shuffle` is dropped because it only applies to the sgd/adam solvers.
mlp = MLPRegressor(hidden_layer_sizes=(200,200),activation='relu',solver='lbfgs',max_iter=1000, random_state=42)
mlp.fit(X=X_train_cases.reshape(-1,1),y=y_train_cases.ravel())
# Reshape the test inputs the same way as the training inputs (no-op if already 2-D).
y_predict_cases = mlp.predict(X_test_cases.reshape(-1,1))
y_pred_case_forecast = mlp.predict(x_test_case_future)

In [None]:
# Score the MLP case predictions on the held-out test split.
rmse = math.sqrt(mean_squared_error(y_test_cases, y_predict_cases))
r_score = r2_score(y_test_cases, y_predict_cases)

print('The best RMSE for predicted number of cases is: {:.2f}'.format(rmse))
print('The best R2 score for predicted number of cases is: {:.2f}'.format(r_score))

### Step 3: Build MLP model for death prediction

In [None]:
# Inputs are the day counts since the first death; targets are the death counts.
X_death = days_since_first_death
y_death = death_count
X_train_deaths, X_test_deaths, y_train_deaths, y_test_deaths = train_test_split(X_death
                                                    , y_death
                                                    , test_size= 0.2
                                                    , shuffle=True
                                                    , random_state = 42)
# Forecast horizon (days) and the number of days already observed.
# NOTE(review): 212 is hard-coded; confirm it still matches the loaded data.
days_in_future = 20
total_days_since_first_death = 212
# One point per future day; the point count follows days_in_future instead of a second literal 20.
x_test_death_future = np.linspace(total_days_since_first_death+1, total_days_since_first_death+days_in_future, days_in_future).reshape(-1, 1)

In [None]:
# Two hidden layers of 200 units each, relu activation, L-BFGS solver.
# random_state fixes the weight initialisation so reruns give the same metrics;
# `shuffle` is dropped because it only applies to the sgd/adam solvers.
mlp = MLPRegressor(hidden_layer_sizes=(200,200),activation='relu',solver='lbfgs',max_iter=1000, random_state=42)
mlp.fit(X=X_train_deaths,y=y_train_deaths.ravel())
y_predict_deaths = mlp.predict(X_test_deaths)
y_pred_death_forecast = mlp.predict(x_test_death_future)

In [None]:
# Score the MLP death predictions on the held-out test split.
rmse = math.sqrt(mean_squared_error(y_test_deaths, y_predict_deaths))
r_score = r2_score(y_test_deaths, y_predict_deaths)

print('The best RMSE for predicted number of deaths is: {:.2f}'.format(rmse))
print('The best R2 score for predicted number of deaths is: {:.2f}'.format(r_score))

Here we can see that both case and death prediction using MLP has a **perfect R-squared** value of **1.00**

### Step 4: Plotting - MLP model for cases and deaths 
Plotted the case and death predictions on **test data** as well as **forecasted data for the next 20 days** 

In [None]:
# Two stacked panels: cases on top, deaths below.
# plot_model is not defined in this section of the notebook; it must already exist in the kernel.
fig, ax = plt.subplots(2,1, figsize=(15,18))
plot_model(X_test_cases,y_test_cases, y_predict_cases, x_test_case_future, y_pred_case_forecast, total_days_since_first_case,'(26-02-2020)', 'cases', ax[0])
plot_model(X_test_deaths,y_test_deaths, y_predict_deaths, x_test_death_future, y_pred_death_forecast, total_days_since_first_death, '(17-03-2020)', 'deaths', ax[1])
plt.show()

<a id='section2.5'></a>
## 2.5 Logistic growth and Prophet Model for Forecasting

From the previous analysis, coronavirus outbreak seems to be an exponential growth, but there **cannot be an exponential forever** as the outbreak would stabilize slowly when most of the people are infected by it. 
This kind of growth is similar to a **logistic growth**. 

Therefore, we shall be modifying the Logistic regression generally used for classification task to perform the regression task by modifying the formula as follows:

Logistic regression formula (for classification): **y(t) = 1 / (1 + np.exp(-b * t))**

Modified Logistic growth formula (for regression): **y(t) = c / (1 + a * np.exp(-b * t))**

The **Logistic growth formula** is used to calculate the **day of fastest growth** and the cases/deaths on the fastest growing day.
In the following sections, we will be explaining how each one of these will play a role in building a better forecasting model.

### Step 1: Preparing data for modeling

* Reading the raw data : <br>
    - Brazil cases and deaths data(brazil_covid19.csv) <br>
    - Brazil population data (brazil_population_2019_changed.csv)<br>
* Adding total population column to cases and deaths dataframe from population dataframe.
* Aggregating cases and deaths by date.

In [None]:
# Load the raw Brazil case/death records and the population table.
df_brazil = pd.read_csv('../input/corona-virus-brazil/brazil_covid19.csv')
df_population = pd.read_csv('../input/brazil-population-2019-changed/brazil_population_2019_changed.csv',encoding='latin1')

# The summed population is a single scalar broadcast onto every row,
# so the per-date 'mean' below simply returns that same total.
df_brazil['population'] = df_population['population'].sum()
# One row per date: cases/deaths are summed over all rows sharing that date.
confirmed_pop = df_brazil.groupby(['date']).agg({'cases':'sum','population':'mean'}).reset_index()
deaths_pop = df_brazil.groupby(['date']).agg({'deaths':'sum','population':'mean'}).reset_index()

In [None]:
# Keep only dates with at least one case and renumber the rows 0..n-1;
# that row number becomes the model time step.
data_cases = confirmed_pop[confirmed_pop['cases'] > 0].reset_index(drop=True)
data_cases['Timestep'] = data_cases.index
# Every row carries the same population total, so the first value is the one to use.
population_brazil = data_cases['population'].iloc[0]

In [None]:
# Keep only dates with at least one death and renumber the rows 0..n-1;
# that row number becomes the model time step.
data_deaths = deaths_pop[deaths_pop['deaths'] > 0].reset_index(drop=True)
data_deaths['Timestep'] = data_deaths.index
# Every row carries the same population total, so the first value is the one to use.
population_brazil = data_deaths['population'].iloc[0]

### Step 2: Modeling

Prediction is performed by fitting a **prophet model** to the data. Prophet requires us to provide a **maximum capacity** reachable by the forecast. One way of defining this would be to assign population as the maximum capacity, but a more optimal way to determine maximum capacity would be to compute it based on the growth of infection on any given day. This can be easily computed by fitting our data on a given day to a logistic growth function.


### Logistic Growth Function

Logistic function gives us the number of cases/deaths at any given time t.

**y(t) = c/(1 + a * np.exp(-b * t))**

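Here,  
- **y(t)** is the number of cases/deaths at any given time t
- **a** is a positive constant that fixes the starting value, since y(0) = c / (1 + a)
- **c** is the maximum capacity of growth,
- **b** is the growth rate, a value greater than 0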

In [None]:
def func_logistic(t, a, b, c):
    """Evaluate the logistic growth curve c / (1 + a * exp(-b * t)).

    ``t`` may be a scalar or a NumPy array; ``a``, ``b`` and ``c`` are the
    curve coefficients (``c`` is the value the curve approaches for large ``t``
    when ``b`` is positive).
    """
    decay = np.exp(-b * t)
    denominator = 1 + a * decay
    return c / denominator

### Fitting Logistic Growth Function For Cases and Deaths

- In order to get the coefficients for the Logistic function, **Nonlinear Least Squares estimation** is used.

- We determine the fastest growth day, i.e., the day on which the growth was maximum, and the number of cases/deaths on that day by using the Logistic function with the estimated coefficients.

Now to compute maximum capacity: 

- If the cases/deaths are increasing exponentially, then we are taking the maximum capacity as the number of cases/deaths that is observed **10 days from the fastest growth day.**
- If the number of cases/deaths has stabilized, then we are taking the maximum capacity as the number of cases/deaths that is observed **10 days from the current date.**

In [None]:
# Keep only the modelling columns (Timestep and the case counts).
cases_df = data_cases.drop(['population','date'],axis=1)
column = 'cases'
# Initialise the coefficients randomly, but from a seeded generator so that a
# kernel restart reproduces the same fit.
p0 = np.random.RandomState(42).exponential(size=3)
# Lower bound 0 on all coefficients; separate upper bounds for a, b and c
# (c, the capacity, cannot exceed the population).
bounds = (0, [100000., 14., population_brazil])
# Convert to NumPy arrays and use SciPy's curve_fit to find the best
# nonlinear least-squares coefficients. Note the fit uses Timestep + 1 as its time axis.
x_cases = np.array(cases_df['Timestep']) + 1
y_cases = np.array(cases_df[column])
(a_cases,b_cases,c_cases),cov_cases = optim.curve_fit(func_logistic, x_cases, y_cases, bounds=bounds, p0=p0, maxfev=1000000)

In [None]:
# The time step at which the growth is fastest
t_fastest_cases = np.log(a_cases) / b_cases
i_fastest_cases = func_logistic(t_fastest_cases, a_cases, b_cases, c_cases)
res_df_cases = data_cases[['Timestep','date', column]].copy()
res_df_cases['fastest_grow_day'] = t_fastest_cases
res_df_cases['fastest_grow_value'] = i_fastest_cases
res_df_cases['growth_stabilized'] = res_df_cases['Timestep']>=t_fastest_cases
res_df_cases['res_func_logistic'] = func_logistic(x_cases, a_cases, b_cases, c_cases)
res_df_cases['cap'] = res_df_cases.apply(lambda row:func_logistic(t_fastest_cases+10,a_cases,b_cases,c_cases) if(row['Timestep']<t_fastest_cases) else func_logistic(row['Timestep']+10,a_cases,b_cases,c_cases),axis=1)

### Plotting - Maximum Capacity For Cases

In [None]:
# plot the logistic function
plt.figure(figsize=(10,6))
plt.plot(res_df_cases['Timestep'], res_df_cases['cap'])
plt.title('Maximum capacity values for each day for cases', size=20)
plt.xlabel('Days since the first case was observed in Brazil', size=15)
plt.ylabel('Maximum capacity', size=15)
plt.show()

In [None]:
# Keep only the modelling columns (Timestep and the death counts).
deaths_df = data_deaths.drop(['population','date'],axis=1)
column = 'deaths'
# Initialise the coefficients randomly, but from a seeded generator so that a
# kernel restart reproduces the same fit.
p0 = np.random.RandomState(42).exponential(size=3)
# Lower bound 0 on all coefficients; separate upper bounds for a, b and c
# (c, the capacity, cannot exceed the population).
bounds = (0, [100000., 14., population_brazil])
# Convert to NumPy arrays and use SciPy's curve_fit to find the best
# nonlinear least-squares coefficients. Note the fit uses Timestep + 1 as its time axis.
x_deaths = np.array(deaths_df['Timestep']) + 1
y_deaths = np.array(deaths_df[column])
(a_deaths, b_deaths, c_deaths),cov_deaths = optim.curve_fit(func_logistic, x_deaths, y_deaths, bounds=bounds, p0=p0, maxfev=1000000)

In [None]:
# The time step at which the growth is fastest
t_fastest_deaths = np.log(a_deaths) / b_deaths
i_fastest_deaths = func_logistic(t_fastest_deaths, a_deaths, b_deaths, c_deaths)
res_df_deaths = data_deaths[['Timestep','date', column]].copy()
res_df_deaths['fastest_grow_day'] = t_fastest_deaths
res_df_deaths['fastest_grow_value'] = i_fastest_deaths
res_df_deaths['growth_stabilized'] = res_df_deaths['Timestep']>=t_fastest_deaths
res_df_deaths['res_func_logistic'] = func_logistic(x_deaths, a_deaths, b_deaths, c_deaths)
res_df_deaths['cap'] = res_df_deaths.apply(lambda row:func_logistic(t_fastest_deaths+10,a_deaths, b_deaths, c_deaths) if(row['Timestep']<t_fastest_deaths) else func_logistic(row['Timestep']+10,a_deaths, b_deaths, c_deaths),axis=1)

### Plotting - Maximum Capacity For Deaths

In [None]:
# plot the logistic function
plt.figure(figsize=(10,6))
plt.plot(res_df_deaths['Timestep'], res_df_deaths['cap'])
plt.title('Maximum capacity values for each day for deaths', size=20)
plt.xlabel('Days since the first death was observed in Brazil', size=15)
plt.ylabel('Maximum capacity', size=15)
plt.show()

### Step 3: Fitting Prophet and Prediction

We use the data prepared along with the respective maximum capacities for cases and deaths to fit separate prophet models to cases and deaths and are used to make future forecasts. Since the cases/deaths count can't go below zero, we provide a floor value of zero to Prophet.

### Cases


In [None]:
# Prophet expects the columns 'ds' (date) and 'y' (value); logistic growth
# additionally needs 'cap', and 'floor' pins the lower saturation level to zero.
cases_to_fit = (
    res_df_cases[['date', 'cases', 'cap']]
    .rename(columns={'date': 'ds', 'cases': 'y'})
    .assign(floor=0)
)

# Chronological 80/20 split (no shuffling) so the test set is the most recent period.
cases_train, cases_test = train_test_split(cases_to_fit, test_size=0.2, shuffle=False, random_state=42)
model_cases = Prophet(growth='logistic', interval_width=0.95)
model_cases.fit(cases_train)


 **Creating dataframe for future forecast**

In [None]:

# Build the frame Prophet needs for the forecast window: dates, floor and capacity.
future_data_cases = pd.DataFrame()
future_data_cases['ds'] = pd.date_range(start="2020-10-15",end="2020-11-05")
future_data_cases['floor'] = 0
# Continue the time step where the observed series ends instead of using a
# hard-coded offset. Observed rows use Timestep 0..n-1, so the first forecast
# row is step n.
# NOTE(review): this assumes the forecast window starts on the day right after the last observed date.
future_timestep_cases = future_data_cases.index + len(res_df_cases)
# Same capacity rule as for the observed, post-peak rows: fitted curve 10 steps ahead.
future_data_cases['cap'] = func_logistic(future_timestep_cases + 10, a_cases, b_cases, c_cases)

In [None]:

# Score the Prophet forecast against the chronological hold-out (cases_test).
forecast_cases_test = model_cases.predict(cases_test)
rmse = np.sqrt(mean_squared_error(cases_test['y'],forecast_cases_test['yhat']))
r2 = r2_score(cases_test['y'],forecast_cases_test['yhat'])
print('Root Mean Squared error for cases predicted is {:.2f}'.format(rmse))
print('R2 score for cases predicted is {:.2f}'.format(r2))

In [None]:
# Forecast the future window and stack it below the test-period forecast.
forecast_cases_future = model_cases.predict(future_data_cases)
# pd.concat replaces DataFrame.append, which is no longer available in pandas 2.x.
all_forecasts_cases = pd.concat([forecast_cases_test, forecast_cases_future])

**From the above metrics, we see that the prophet model is performing well in forecasting the number of cases**

### Deaths

In [None]:

# Prophet expects the columns 'ds' (date) and 'y' (value); logistic growth
# additionally needs 'cap', and 'floor' pins the lower saturation level to zero.
deaths_to_fit = (
    res_df_deaths[['date', 'deaths', 'cap']]
    .rename(columns={'date': 'ds', 'deaths': 'y'})
    .assign(floor=0)
)
# Chronological 80/20 split (no shuffling) so the test set is the most recent period.
deaths_train, deaths_test = train_test_split(deaths_to_fit, test_size=0.2, shuffle=False, random_state=42)
model_deaths = Prophet(growth='logistic', interval_width=0.95)
model_deaths.fit(deaths_train)

**Creating future data for future forecast of deaths**

In [None]:
# Build the frame Prophet needs for the forecast window: dates, floor and capacity.
future_data_deaths = pd.DataFrame()
future_data_deaths['ds'] = pd.date_range(start="2020-10-15",end="2020-11-05")
future_data_deaths['floor'] = 0
# Continue the time step where the observed death series ends. The previous
# hard-coded offset (231) was copied from the cases cell and does not follow
# from the death series, which has its own first date and its own length.
# Observed rows use Timestep 0..n-1, so the first forecast row is step n.
# NOTE(review): this assumes the forecast window starts on the day right after the last observed date.
future_timestep_deaths = future_data_deaths.index + len(res_df_deaths)
# Same capacity rule as for the observed, post-peak rows: fitted curve 10 steps ahead.
future_data_deaths['cap'] = func_logistic(future_timestep_deaths + 10, a_deaths, b_deaths, c_deaths)

In [None]:
# Score the Prophet forecast against the chronological hold-out (deaths_test).
forecast_deaths_test = model_deaths.predict(deaths_test)
rmse = np.sqrt(mean_squared_error(deaths_test['y'],forecast_deaths_test['yhat']))
r2 = r2_score(deaths_test['y'],forecast_deaths_test['yhat'])
print('Root Mean Squared error for deaths predicted is {:.2f}'.format(rmse))
print('R2 score for deaths predicted is {:.2f}'.format(r2))

In [None]:
# Forecast the future window and stack it below the test-period forecast.
forecast_deaths_future = model_deaths.predict(future_data_deaths)
# pd.concat replaces DataFrame.append, which is no longer available in pandas 2.x.
all_forecasts_deaths = pd.concat([forecast_deaths_test, forecast_deaths_future])


**From the above metrics, we see that the prophet model is performing well in forecasting the number of deaths**

### Step 4: Plotting Prophet Predictions and Forecasts for next 20 Days

### Cases

In [None]:
# Plot the combined test + future forecast and mark where training and test data end.
model_cases.plot(all_forecasts_cases,xlabel='Date',ylabel='Cases',figsize=(12,6));
# Take the marker dates from the data so the legend cannot drift from the actual split.
last_train_date = pd.to_datetime(cases_train['ds'].iloc[-1])
last_test_date = pd.to_datetime(cases_test['ds'].iloc[-1])
plt.axvline(x=last_train_date,linestyle='-.',label=f'{last_train_date:%Y-%m-%d}: Last observation date in training data')
plt.axvline(x=last_test_date,linestyle='--',label=f'{last_test_date:%Y-%m-%d}: Last observation date in test data')
plt.title('Prophet model for Cases',size=20)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

In the above plot, 

- **Blue line** represents forecast done by the model
- **Black Dotted lines** represent the minimum and maximum capacity.
- **Cases have been forecasted for 20 days starting from 2020-10-14. This can be seen in the last vertical segment of the above graph**.<br>


From the above plot, we can clearly see a **logistic growth in number of cases over the months.**

### Deaths

In [None]:
# Plot the combined test + future forecast and mark where training and test data end.
model_deaths.plot(all_forecasts_deaths,xlabel='Date',ylabel='Deaths',figsize=(12,6));
# Take the marker dates from the data: the death series has its own split
# point, so the date strings previously copied from the cases plot are not
# guaranteed to apply here.
last_train_date = pd.to_datetime(deaths_train['ds'].iloc[-1])
last_test_date = pd.to_datetime(deaths_test['ds'].iloc[-1])
plt.axvline(x=last_train_date,linestyle='-.',label=f'{last_train_date:%Y-%m-%d}: Last observation date in training data')
plt.axvline(x=last_test_date,linestyle='--',label=f'{last_test_date:%Y-%m-%d}: Last observation date in test data')
plt.title('Prophet model for Deaths',size=20)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

In the above plot, 

- **Blue line** represents forecast done by the model
- **Black Dotted lines** represent the minimum and maximum capacity.
- **Deaths have been forecasted for 20 days starting from 2020-10-14. This can be seen in the last vertical segment of the above graph**.<br>


From the above plot, we can clearly see a **logistic growth in number of deaths over the months.**

<a id='section2.6'></a>
## 2.6 Model Comparison:

In [None]:
# Summary table of case-prediction metrics per model.
# NOTE(review): the numbers below are typed in by hand, not read from the variables computed above — re-check them after any rerun.
cases = PrettyTable()
cases.field_names = ["Model","RMSE", "R2 Score"]
cases.add_row(["Linear Regression", 524980.36,0.92])
cases.add_row(["Multi Linear Regression", 197317.77,0.81])
cases.add_row(["Polynomial Regression", 159563.04,0.87])
cases.add_row(["MLP", 19625.79,1.00])
cases.add_row(["Logistic Growth & Prophet", 23966.72,0.91])

In [None]:
# Summary table of death-prediction metrics per model.
# NOTE(review): the numbers below are typed in by hand, not read from the variables computed above — re-check them after any rerun.
deaths = PrettyTable()
deaths.field_names = ["Model","RMSE", "R2 Score"]
deaths.add_row(["Linear Regression", 6696.82,0.98])
deaths.add_row(["Multi Linear Regression", 7789.30,0.79])
deaths.add_row(["Polynomial Regression", 7416.37,0.80])
deaths.add_row(["MLP", 2723.93,1.00])
deaths.add_row(["Logistic Growth & Prophet", 3592.70,0.83])

Summary of RMSE scores and R2 scores for different models

In [None]:
# Print both comparison tables as plain text, cases first.
print("CASES")
print(cases)
print()
print("DEATHS")
print(deaths)

Currently, Brazil is witnessing an exponential growth of COVID-19 infection, therefore Linear models (Linear , Multi-linear, and polynomial regression) seem to perform well, but based on the observations made from other countries, this growth gets saturated after a point. 

We notice that MLP is giving extremely accurate predictions for the given dataset, but it fails to capture the generic trend associated with this kind of infection growth.

This kind of growth can be modeled well using logistic growth functions. Therefore, logistic growth modeling with Prophet provides more realistic predictions that would help in understanding the future trend and in designing policies in order to curb the spread of infection in Brazil.

<a id='section3'></a>
# 3. Exploratory Data Analysis - India, Italy and South Korea

**In this section we shall be exploring the trends in the following 3 countries in order to come up with policies to contain the spread of COVID-19 in Brazil**

<a id='section3.1'></a>
## 3.1 India

In [None]:
# Load the India records and parse the day/month/two-digit-year date strings
# in one vectorised call instead of a per-row strptime.
india_covid_19 = pd.read_csv('../input/covid19-in-india/covid_19_india.csv')
india_covid_19['Date'] = pd.to_datetime(india_covid_19['Date'], format='%d/%m/%y')
# Restrict the analysis to data up to the end of September 2020.
india_covid_19 = india_covid_19[india_covid_19['Date']<='2020-09-30']

### Raw data 

In [None]:
# Preview the first rows of the filtered India data.
india_covid_19.head()

### Trends for Deaths, Recovery and Cases across months

In [None]:
# Group by the two-digit month string and sum Cured/Deaths/Confirmed over every row in that month.
# NOTE(review): if these columns are cumulative daily counts, summing them over the days of a month overstates the totals — confirm against the dataset description.
cumulative_forAllStates = india_covid_19.groupby(india_covid_19['Date'].dt.strftime('%m')).agg(Total_Cured=('Cured', 'sum'), 
                                         Total_deaths=('Deaths', 'sum'),
                                         Total_confirmed=('Confirmed','sum')
                                         ).reset_index()

### Recovered Cases Trend

In [None]:
# Bars show the monthly totals; the black line overlays the same values as a trend.
plt.figure(figsize=(15,6))
plt.bar(cumulative_forAllStates['Date'], cumulative_forAllStates['Total_Cured'],color='green',label='Actual Values',alpha=0.5)
plt.plot(cumulative_forAllStates['Date'],cumulative_forAllStates['Total_Cured'],color='black',label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total cured in 1e7s', size=20)
# Lockdown markers are hand-placed positions on the month axis (bar positions 0..8).
plt.axvline(x=2.5,linestyle ='-.',label='Start of Lockdown')
plt.axvline(x=4.2,label='End of Lockdown')
# NOTE(review): the tick labels assume all nine months Jan-Sep are present in the grouped data — confirm.
plt.xticks([0,1,2,3,4,5,6,7,8], ['Jan','Feb','Mar','Apr','May','June','July','Aug','Sep'], rotation='horizontal', size=15)
plt.yticks(size=15)
plt.title('Trend of total cured over time', size=25)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

**We can observe that the trend is exponential in nature.**

### Death Trend

In [None]:
# Bars show the monthly totals; the black line overlays the same values as a trend.
plt.figure(figsize=(15,6))
plt.bar(cumulative_forAllStates['Date'], cumulative_forAllStates['Total_deaths'],color='maroon',label='Actual Values',alpha=0.5)
plt.plot(cumulative_forAllStates['Date'],cumulative_forAllStates['Total_deaths'],color='black',label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total deaths', size=20)
# Lockdown markers are hand-placed positions on the month axis (bar positions 0..8).
plt.axvline(x=2.5,linestyle ='-.',label='Start of Lockdown')
plt.axvline(x=4.2,label='End of Lockdown')
# NOTE(review): the tick labels assume all nine months Jan-Sep are present in the grouped data — confirm.
plt.xticks([0,1,2,3,4,5,6,7,8], ['Jan','Feb','Mar','Apr','May','June','July','Aug','Sep'], rotation='horizontal', size=15)
plt.yticks(size=15)
plt.title('Trend of total death over time', size=25)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

**The trend here is exponential as well. However, the values on the y-axis are absolute values. Imposing the lockdown well in advance seems to have contained the number of deaths.**

### Active Cases Trend

In [None]:
# Bars show the monthly totals; the black line overlays the same values as a trend.
plt.figure(figsize=(15,6))
plt.bar(cumulative_forAllStates['Date'], cumulative_forAllStates['Total_confirmed'],color='orange',label='Actual Values',alpha=0.5)
plt.plot(cumulative_forAllStates['Date'],cumulative_forAllStates['Total_confirmed'],color='black',label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total cases in 1e8s', size=20)
# Lockdown markers are hand-placed positions on the month axis (bar positions 0..8).
plt.axvline(x=2.5,linestyle ='-.',label='Start of Lockdown')
plt.axvline(x=4.2,label='End of Lockdown')
plt.xticks([0,1,2,3,4,5,6,7,8], ['Jan','Feb','Mar','Apr','May','June','July','Aug','Sep'], rotation='horizontal', size=15)
# Match the y tick size used by the cured and death trend plots.
plt.yticks(size=15)
plt.title('Trend of total cases over time', size=25)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

**Cases growth has also followed an exponential curve. But lockdown again has helped prevent sudden spikes.**

### Growth Factor

We calculate the values of rate of growth for active cases and deaths in pre lockdown, post lockdown and lockdown periods to examine how lockdown has benefitted.

In [None]:
# One row per date: sum Cured/Deaths/Confirmed over all rows sharing that date.
india_date_grouped = india_covid_19.groupby(india_covid_19['Date']).agg(Total_Cured=('Cured', 'sum'), 
                                         Total_deaths=('Deaths', 'sum'),
                                         Total_confirmed=('Confirmed','sum')
                                         ).reset_index()

 We have referred to the following link to get the lockdown dates:<br>
 https://en.wikipedia.org/wiki/COVID-19_pandemic_lockdown_in_India <br>

In [None]:
#Growth factor before lockdown
data_before_lockdown = india_date_grouped[india_date_grouped['Date']<='2020-03-25']
data_during_lockdown =  india_date_grouped[(india_date_grouped['Date']>'2020-03-25') & (india_date_grouped['Date']<='2020-05-31')]
data_after_lockdown = india_date_grouped[india_date_grouped['Date']>'2020-05-31']

In [None]:
def growth_factor(df,column):
    """Return the average ratio between consecutive values of ``df[column]``.

    Each ratio is ``value[i] / value[i-1]``. Pairs whose previous value is not
    strictly positive are skipped, because their ratio is undefined (0/0) or
    infinite (x/0) and would otherwise turn the whole average into NaN/inf.

    Parameters
    ----------
    df : DataFrame
        Rows in chronological order.
    column : str
        Name of the numeric column to analyse.

    Returns
    -------
    float
        Mean of the valid consecutive ratios, or NaN when there is no valid
        pair (fewer than two rows, or every previous value is zero).
    """
    values = np.asarray(df[column], dtype=float)
    previous, current = values[:-1], values[1:]
    usable = previous > 0
    if not usable.any():
        return float('nan')
    return float(np.mean(current[usable] / previous[usable]))

**Computing average growth rate for pre-lockdown, lockdown and post-lockdown**

In [None]:
# Average day-over-day growth ratio of confirmed cases in each lockdown phase.
# Insertion order (before, during, after) is relied on when the values are plotted.
cases_growth_factor = {}
cases_growth_factor['before'] = growth_factor(data_before_lockdown,'Total_confirmed')
cases_growth_factor['during'] = growth_factor(data_during_lockdown,'Total_confirmed')
cases_growth_factor['after'] = growth_factor(data_after_lockdown,'Total_confirmed')
print('Average growth rate for cases before lockdown {:.2f}'.format(cases_growth_factor['before']))
print('Average growth rate for cases during lockdown {:.2f}'.format(cases_growth_factor['during']))
print('Average growth rate for cases after lockdown {:.2f}'.format(cases_growth_factor['after']))


In [None]:
# One x position per lockdown phase; the values follow the dict's insertion
# order (before, during, after).
x = [1,2,3]
my_xticks = ['Before','During','After']
plt.figure(figsize=(15,6))
plt.xticks(x, my_xticks, size=15)
plt.plot(x,list(cases_growth_factor.values()),label='Growth Factor Values')
plt.xlabel('Phases of Lockdown', size=20)
plt.ylabel('Growth Factor', size=20)
plt.title('Growth Factor for Number of Cases', size=25)
plt.legend(loc = "best"
           , frameon = True
           , fancybox = True
           , framealpha = 0.95
           , shadow = True
           , borderpad = 1
           , prop={'size': 15})
plt.show()

**We can observe that growth factor has decreased during lockdown. This means that without imposition of lockdown the spread of virus would have been really fast and uncontrollable.**

In [None]:
# Average day-over-day growth ratio of deaths in each lockdown phase.
deaths_growth_factor = {}
deaths_growth_factor['before'] = growth_factor(data_before_lockdown,'Total_deaths')
deaths_growth_factor['during'] = growth_factor(data_during_lockdown,'Total_deaths')
deaths_growth_factor['after'] = growth_factor(data_after_lockdown,'Total_deaths')
print('Average growth rate for deaths before lockdown {:.2f}'.format(deaths_growth_factor['before']))
print('Average growth rate for deaths during lockdown {:.2f}'.format(deaths_growth_factor['during']))
print('Average growth rate for deaths after lockdown {:.2f}'.format(deaths_growth_factor['after']))

**The growth factor for deaths clearly indicate that lockdown imposition was on time in India bringing deaths under control in time.**

### Plotting actual values of deaths, active cases and cured over the above three periods

### Before Lockdown

In [None]:
#Plotting data before lockdown
temp = data_before_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='Before Lockdown - Cases over time', color_discrete_sequence = ['green', 'maroon', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### During Lockdown

In [None]:
#During lockdown
temp = data_during_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='During Lockdown - Cases over time', color_discrete_sequence = ['green', 'brown', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### Post Lockdown

In [None]:
#After lockdown
temp = data_after_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='Post Lockdown - Cases over time', color_discrete_sequence = ['green', 'maroon', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### Overall Infection Rate Till Date for India

In [None]:
# "Infection Rate" here is the row-over-row change in the Total_confirmed column.
india_date_grouped["Infection Rate"] = india_date_grouped["Total_confirmed"].diff()
# Keep only rows with a strictly positive change; the first row (whose diff is NaN) is dropped too.
india_date_grouped = india_date_grouped.loc[lambda frame: frame["Infection Rate"] > 0]

In [None]:
# Line plot of the India "Infection Rate" column with vertical markers at the lockdown boundaries.
lockdown_start_date = "2020-03-25"
lockdown_end_date = "2020-05-31"
fig = px.line(india_date_grouped, x="Date", y="Infection Rate", title="Overall infection rate in India")

# Each marker is a red vertical line from 0 up to the series maximum, annotated at its top end.
peak_infection_rate = india_date_grouped["Infection Rate"].max()
for marker_date, marker_text in (
    (lockdown_start_date, 'starting date of the lockdown'),
    (lockdown_end_date, "lockdown end date"),
):
    fig.add_shape(dict(type="line", x0=marker_date, y0=0,
                       x1=marker_date, y1=peak_infection_rate,
                       line=dict(color="red", width=2)))
    fig.add_annotation(dict(x=marker_date, y=peak_infection_rate, text=marker_text))

fig.update_layout(title_x=0.5)
fig.show()

**As observed from the above line plot, during the lockdown the Infection Rate had at worst a linear increase. Without the lockdown it would have been an exponential increase.**

<a id='section3.2'></a>
## 3.2 Italy

In [None]:
# Load the Italy COVID-19 dataset (region-level, judging by the file name) from the input directory.
df_italy = pd.read_csv('../input/covid19-in-italy/covid19_italy_region.csv')

### Raw Data 

In [None]:
# Preview the first rows of the raw Italy data.
df_italy.head()

In [None]:
def convertDate(date):
    """Parse an ISO 8601 formatted string into a ``datetime`` object.

    Parameters
    ----------
    date : str
        Date or date-time text accepted by ``datetime.fromisoformat``.

    Returns
    -------
    datetime
        The parsed value.
    """
    return datetime.fromisoformat(date)
    

In [None]:
# Convert the raw 'Date' strings to datetimes, then keep rows up to the cut-off timestamp.
df_italy['Date'] = df_italy['Date'].apply(convertDate)
# NOTE(review): the bound is midnight of 2020-08-31, so rows stamped later on that day are excluded - confirm intended.
df_italy = df_italy.loc[df_italy['Date'] <= '2020-08-31 00:00:00']

### Trends of Cases, Deaths and Recovery by month

In [None]:
# Sum the Recovered / Deaths / TotalPositiveCases columns per month.  The grouping key is the
# two-digit month string taken from 'Date', and reset_index() turns it back into a 'Date' column.
data_by_month = (
    df_italy
    .groupby(df_italy['Date'].dt.strftime('%m'))
    .agg(
        Total_Cured=('Recovered', 'sum'),
        Total_deaths=('Deaths', 'sum'),
        Total_confirmed=('TotalPositiveCases', 'sum'),
    )
    .reset_index()
)

### Recovery trend

In [None]:
# Monthly recovered totals for Italy: translucent bars plus a line through the same points.
plt.figure(figsize=(15, 8))
plt.bar(data_by_month['Date'], data_by_month['Total_Cured'], color='green', alpha=0.5, label='Actual Values')
plt.plot(data_by_month['Date'], data_by_month['Total_Cured'], color='black', label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total cured', size=20)
# Lockdown markers are positioned by hand on the month axis (tick 0 is relabelled 'Feb' below).
plt.axvline(x=0.8, linestyle='-.', label='Start of Lockdown')
plt.axvline(x=2.8, label='End of Lockdown')
plt.xticks(list(range(7)), ['Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug'],
           rotation='horizontal', size=15)
plt.title('Trend of total cured over time', size=25)
plt.legend(loc="best", frameon=True, fancybox=True, framealpha=0.95,
           shadow=True, borderpad=1, prop={'size': 15})
plt.show()

**The curves for Italy are in general steeper than those for India.**

### Death Trend

In [None]:
# Monthly death totals for Italy: translucent bars plus a line through the same points.
plt.figure(figsize=(15, 8))
plt.bar(data_by_month['Date'], data_by_month['Total_deaths'], color='maroon', alpha=0.5, label='Actual Values')
plt.plot(data_by_month['Date'], data_by_month['Total_deaths'], color='black', label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total deaths', size=20)
# Lockdown markers are positioned by hand on the month axis (tick 0 is relabelled 'Feb' below).
plt.axvline(x=0.8, linestyle='-.', label='Start of Lockdown')
plt.axvline(x=2.8, label='End of Lockdown')
plt.xticks(list(range(7)), ['Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug'],
           rotation='horizontal', size=15)
plt.title('Trend of total death over time', size=25)
plt.legend(loc="best", frameon=True, fancybox=True, framealpha=0.95,
           shadow=True, borderpad=1, prop={'size': 15})
plt.show()

**Since lockdown was imposed at a later stage in Italy, death numbers had already started spiking.**

### Active Cases Trend 

In [None]:
# Monthly confirmed-case totals for Italy: translucent bars plus a line through the same points.
plt.figure(figsize=(15, 8))
plt.bar(data_by_month['Date'], data_by_month['Total_confirmed'], color='orange', alpha=0.5, label='Actual Values')
plt.plot(data_by_month['Date'], data_by_month['Total_confirmed'], color='black', label='Trend')
plt.xlabel('Month', size=20)
plt.ylabel('Total cases', size=20)
# Lockdown markers are positioned by hand on the month axis (tick 0 is relabelled 'Feb' below).
plt.axvline(x=0.8, linestyle='-.', label='Start of Lockdown')
plt.axvline(x=2.8, label='End of Lockdown')
plt.xticks(list(range(7)), ['Feb', 'Mar', 'Apr', 'May', 'June', 'July', 'Aug'],
           rotation='horizontal', size=15)
plt.title('Trend of total cases over time', size=25)
plt.legend(loc="best", frameon=True, fancybox=True, framealpha=0.95,
           shadow=True, borderpad=1, prop={'size': 15})
plt.show()

**As can be observed, since measures were not imposed on time, the rates were already increasing exponentially before lockdown.** 

### Growth Factor

We compute growth factor to help understand if lockdown has helped.

In [None]:
# Collapse the data to one row per calendar day by summing the three count columns.
# After this cell 'Date' holds 'YYYY-MM-DD' strings (the strftime grouping key), not datetimes.
# NOTE(review): df_italy is rebound to the aggregate, so the `.dt` access below presumably
# fails if this cell is executed a second time without reloading the data.
df_italy = (
    df_italy
    .groupby(df_italy['Date'].dt.strftime('%Y-%m-%d'))
    .agg(
        Total_Cured=('Recovered', 'sum'),
        Total_deaths=('Deaths', 'sum'),
        Total_confirmed=('TotalPositiveCases', 'sum'),
    )
    .reset_index()
)

In [None]:
# Split the Italy series into three windows around the lockdown:
#   before : Date <= 2020-03-09
#   during : 2020-03-09 < Date <= 2020-05-04
#   after  : Date > 2020-05-04
data_before_lockdown = df_italy[df_italy['Date'].le('2020-03-09')]
data_during_lockdown = df_italy[df_italy['Date'].gt('2020-03-09') & df_italy['Date'].le('2020-05-04')]
data_after_lockdown = df_italy[df_italy['Date'].gt('2020-05-04')]

In [None]:
def growth_factor(df, column):
    """Return the average ratio between consecutive values of ``df[column]``.

    For rows ``0..n-1`` the ratios ``value[i] / value[i-1]`` are computed for
    ``i = 1..n-1`` and their arithmetic mean is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are already in the order the ratios should follow.
    column : str
        Name of the numeric column to analyse.

    Returns
    -------
    float
        Mean consecutive ratio. ``nan`` when the frame has fewer than two rows
        (no ratio can be formed) or when any ratio is undefined (``0 / 0``);
        a division of a non-zero value by zero contributes ``inf``.
    """
    values = df[column].to_numpy(dtype=float)
    # With fewer than two observations there is nothing to compare; report "undefined"
    # instead of failing on a division by the empty ratio count.
    if values.size < 2:
        return np.nan
    # Zero denominators intentionally yield inf/nan rather than raising, so an
    # all-zero prefix (e.g. no deaths yet) shows up as nan in the result.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratios = values[1:] / values[:-1]
    return ratios.mean()

**Computing average growth rate for pre-lockdown, lockdown and post-lockdown**

In [None]:
# Average day-over-day growth of confirmed cases in each lockdown window.
cases_growth_factor = {
    'before': growth_factor(data_before_lockdown, 'Total_confirmed'),
    'during': growth_factor(data_during_lockdown, 'Total_confirmed'),
    'after': growth_factor(data_after_lockdown, 'Total_confirmed'),
}
print('Average growth rate for cases before lockdown {:.2f}'.format(cases_growth_factor['before']))
print('Average growth rate for cases during lockdown {:.2f}'.format(cases_growth_factor['during']))
print('Average growth rate for cases after lockdown {:.2f}'.format(cases_growth_factor['after']))


In [None]:
# Average day-over-day growth of deaths in each lockdown window.
deaths_growth_factor = {
    'before': growth_factor(data_before_lockdown, 'Total_deaths'),
    'during': growth_factor(data_during_lockdown, 'Total_deaths'),
    'after': growth_factor(data_after_lockdown, 'Total_deaths'),
}
print('Average growth rate for deaths before lockdown {:.2f}'.format(deaths_growth_factor['before']))
print('Average growth rate for deaths during lockdown {:.2f}'.format(deaths_growth_factor['during']))
print('Average growth rate for deaths after lockdown {:.2f}'.format(deaths_growth_factor['after']))


* **Cases**: The average growth rate for India before lockdown is 1.17, whereas Italy shows a higher growth rate (1.31) prior to lockdown.


* **Deaths**: For deaths, the growth rate in India is "nan", meaning that the lockdown was imposed even before the first death occurred, whereas in Italy the growth rate of deaths was also quite high (1.35) prior to lockdown.


* In both the countries, we can see **decreased cases/death rate during lockdown** (as is evident from the  growth rate values). 

### Plotting actual values of deaths, active cases and cured over the above three periods

### Pre lockdown

In [None]:
#Plotting data before lockdown
temp = data_before_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='Cases over time', color_discrete_sequence = ['green', 'maroon', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### Lockdown period

In [None]:
#During lockdown
temp = data_during_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='Cases over time', color_discrete_sequence = ['green', 'maroon', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### Post lockdown

In [None]:
#After lockdown
temp = data_after_lockdown[['Date', 'Total_confirmed', 'Total_deaths', 'Total_Cured']]
temp.columns = ['Date', 'Active', 'Deaths', 'Cured']
temp = temp.melt(id_vars="Date", value_vars=['Cured', 'Deaths', 'Active'],
                 var_name='Case', value_name='Count')
temp.head()
fig_2 = px.bar(temp, x="Date", y="Count", color='Case', height=500, 
               title='Cases over time', color_discrete_sequence = ['green', 'maroon', 'orange'])
fig_2.update_layout(title_x = 0.5)
fig_2.show()

### Overall Infection Rate 

In [None]:
# "Infection Rate" here is the row-over-row change in the Total_confirmed column.
df_italy["Infection Rate"] = df_italy["Total_confirmed"].diff()
# Keep only rows with a strictly positive change; the first row (whose diff is NaN) is dropped too.
df_italy = df_italy.loc[lambda frame: frame["Infection Rate"] > 0]

In [None]:
# Line plot of the Italy "Infection Rate" column with vertical markers at the lockdown boundaries.
lockdown_start_date = "2020-03-09"
lockdown_end_date = "2020-05-04"
fig = px.line(df_italy, x="Date", y="Infection Rate", title="Overall infection rate over time")

# Each marker is a red vertical line from 0 up to the series maximum, annotated at its top end.
peak_infection_rate = df_italy["Infection Rate"].max()
for marker_date, marker_text in (
    (lockdown_start_date, 'starting date of the lockdown'),
    (lockdown_end_date, "lockdown end date"),
):
    fig.add_shape(dict(type="line", x0=marker_date, y0=0,
                       x1=marker_date, y1=peak_infection_rate,
                       line=dict(color="red", width=2)))
    fig.add_annotation(dict(x=marker_date, y=peak_infection_rate, text=marker_text))

fig.update_layout(title_x=0.5)
fig.show()

From the above plot it is evident that the infection rate was already at a peak before the lockdown was imposed. However, the lockdown measures have clearly brought the rate down significantly.

<a id='section3.3'></a>
## 3.3 South Korea

In [None]:
# Load the two South Korea tables used in this section: df_policy is only cleaned and
# previewed below, df_time supplies the dated confirmed counts for the timeline plot.
df_policy = pd.read_csv('../input/coronavirusdataset/Policy.csv')
df_time = pd.read_csv('../input/coronavirusdataset/Time.csv')

In [None]:
# Drop, in place, every policy row that has a missing value in any column.
# NOTE(review): rows with any empty field are discarded as well - confirm that is intended.
df_policy.dropna(inplace=True)

## Raw data

In [None]:
# Preview the first policy rows that remain after dropping incomplete records.
df_policy.head()

In [None]:
# Parse the 'date' column (formatted as YYYY-MM-DD) into datetimes with one vectorized call.
# Unlike applying datetime.strptime row by row, this is safe to re-run after the column
# has already been converted.
df_time['date'] = pd.to_datetime(df_time['date'], format="%Y-%m-%d")

In [None]:
# Preview the time-series table after the 'date' column has been parsed.
df_time.head()

### Plot - Cases trend and timeline of policy implementation

In [None]:
# Confirmed cases over time for South Korea, overlaid with the start/end dates of each policy.
plt.figure(figsize=(15, 6))

# (date, legend label, colour) for every vertical marker; a policy's start and end share a colour.
policy_markers = [
    ('2020-01-03', 'Level 1 Alert Start time', 'blue'),
    ('2020-01-19', 'Level 1 Alert End time', 'blue'),
    ('2020-01-20', 'Level 2 Alert Start time', 'yellow'),
    ('2020-01-27', 'Level 2 Alert End time', 'yellow'),
    ('2020-01-28', 'Level 3 Alert Start time', 'brown'),
    ('2020-02-22', 'Level 3 Alert End time', 'brown'),
    ('2020-02-29', 'Social Distancing Campaign 1 Start time', 'red'),
    ('2020-03-21', 'Social Distancing Campaign 1 End time', 'red'),
    ('2020-03-22', 'Social Distancing Campaign 2 Start time', 'grey'),
    ('2020-04-19', 'Social Distancing Campaign 2 End time', 'grey'),
]
for marker_day, marker_label, marker_colour in policy_markers:
    plt.axvline(x=datetime.strptime(marker_day, "%Y-%m-%d"), label=marker_label, color=marker_colour)

plt.xlabel('Date', size=20)
plt.ylabel('Number of cases', size=20)
plt.xticks(size=15)
plt.yticks(size=15)
plt.title("Covid cases trend with policy timelines", size=25)

# The legend is built before the case curve is drawn, so it lists only the policy markers.
plt.legend(loc="best", frameon=True, fancybox=True, framealpha=0.95,
           shadow=True, borderpad=1, prop={'size': 12})

plt.plot(df_time.date, df_time.confirmed, color='black')
plt.show()

We can notice that the following policies have had a **huge positive impact** in containing the spread of COVID-19 in South Korea:

1. Level 1 Alert - Blue zone
2. Level 2 Alert - Yellow zone
3. Level 3 Alert - Orange zone
4. Social Distancing campaign 1
5. Social Distancing campaign 2



**From the above plot, we can see that the rate of growth of cases reduced after the *Social Distancing Campaign* was implemented in South Korea.**

### Infection spread due to close contact

In [None]:
# Load the patient table; its 'infected_by' column is used below to count transmissions per infector.
df_patient_info = pd.read_csv('../input/coronavirusdataset/PatientInfo.csv')

In [None]:
# Keep only patients for whom an infector is recorded.
df_patient_info = df_patient_info.loc[df_patient_info['infected_by'].notna()]
# One row per infector id; each column holds the number of non-null entries among the
# patients attributed to that infector.
df_patient_by_infected = (
    df_patient_info[['contact_number', 'infected_by', 'patient_id']]
    .groupby(df_patient_info['infected_by'])
    .count()
)

**Aggregating the total number of infected patients due to close contact.**

In [None]:
# Preview the per-infector counts.
df_patient_by_infected.head()

In [None]:
# Average of the per-infector 'infected_by' counts, i.e. the mean number of recorded
# patients attributed to each infector id.
# NOTE(review): presumably only ids that appear as someone's infector are in this table, so
# patients who infected nobody do not lower the mean - confirm this is the intended definition.
mean_infected_number = df_patient_by_infected['infected_by'].mean()

print('Average transmission rate is {:.2f}'.format(mean_infected_number))

We can notice that, on average, one person is spreading the infection to 3 more people. This rate of spread is very low when compared to other countries. 
Refer to the link mentioned below, to see the rate of spread in other countries.

https://www.who.int/docs/default-source/coronaviruse/situation-reports/20200805-covid-19-sitrep-198.pdf?sfvrsn=f99d1754_2

<a id='section4'></a>
# 4. Conclusion

In this notebook we have carried out extensive analysis and have come up with models that can accurately predict and forecast the daily cases and deaths in Brazil.

We have also performed EDA on 3 other countries - India, Italy, and South Korea - in order to better understand the general trend of COVID-19 and model accordingly.