# COVID-19 - EDA and Predictions

This study presents an exploratory data analysis of COVID-19 around the world. It then focuses on the data from Brazil, my home country, and finishes with a prediction of deaths using linear and polynomial regression.

## 1 - Importing the libraries and Loading the file

In [None]:
#Data Analysis and Visualization 
import numpy as np 
import matplotlib.pyplot as plt 
plt.rcParams.update({'font.size': 16})
import pandas as pd 
import seaborn as sns
import folium
import geopandas as gpd
import branca.colormap as cm
import plotly.graph_objects as go
import plotly.express as px
pd.set_option('display.max_rows', None)
from plotly.subplots import make_subplots
import calendar

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


In [None]:
# Load the per-observation COVID-19 records and preview the first rows
covid = pd.read_csv('../input/novel-corona-virus-2019-dataset/covid_19_data.csv')
covid.head()

In [None]:
# (rows, columns) of the raw dataset
covid.shape

Converting the count columns to integers and creating a column with the active COVID-19 cases:

In [None]:
# Cast the three count columns to integers, then derive the active cases
# as confirmed minus deaths minus recoveries.
count_cols = ["Confirmed", "Deaths", "Recovered"]
covid[count_cols] = covid[count_cols].astype(int)
covid['Active'] = covid['Confirmed'].sub(covid['Deaths']).sub(covid['Recovered'])
covid.head()

World cases as of the date of this notebook:

In [None]:
# Work on a copy so that filtering for the world totals leaves `covid` itself untouched
covid_world = covid.copy()

Getting the active cases for the date:

In [None]:
# Keep only the rows of the most recent observation date and total them.
# ObservationDate is stored as text at this point, and the largest *string* is not
# necessarily the latest day (text comparison goes character by character, so the
# month is compared before the year). Parse the dates first and compare real dates.
obs_dates = pd.to_datetime(covid_world['ObservationDate'])
covid_world = covid_world[obs_dates == obs_dates.max()].reset_index()
world_cases = covid_world.groupby(["ObservationDate"])[["Confirmed", "Deaths", "Recovered", "Active"]].sum().reset_index()
world_cases

In [None]:
# Donut chart of how the world's confirmed cases split into recovered / deaths / active.
labels = ["Recovered", "Deaths", "Active"]
values = world_cases.loc[0, labels]
# Take the date and the total from the data instead of hard-coding them, so the
# title stays correct when the dataset is refreshed.
report_date = world_cases.loc[0, "ObservationDate"]
total_confirmed = world_cases.loc[0, "Confirmed"]
fig = px.pie(world_cases, values=values, names=labels,
             color_discrete_sequence=['#636EFA', '#FECB52', '#EF553B'], hole=0.5)
fig.update_layout(title=f'Total cases on {report_date} : {total_confirmed}')
fig.show()

## 2 - Creating a new dataframe only for data from Brazil

In [None]:
# Select the rows whose country is Brazil and preview them
is_brazil = covid['Country/Region'].eq('Brazil')
df_brazil = covid.loc[is_brazil]
df_brazil.head()

In [None]:
# (rows, columns) of the Brazil-only subset
df_brazil.shape

In [None]:
# Save the Brazil subset as its own file and reload it as an independent dataframe.
# index=False keeps the old row index out of the file; without it the reload gains a
# meaningless 'Unnamed: 0' column.
df_brazil.to_csv('covid_19_data_brazil.csv', index=False)
df_brazil = pd.read_csv('covid_19_data_brazil.csv')

In [None]:

# Totals for Brazil on the most recent observation date.
# Compare parsed dates rather than raw text: the largest date *string* is not
# necessarily the latest day once the data spans more than one calendar year.
brazil_dates = pd.to_datetime(df_brazil['ObservationDate'])
df_brazil2 = df_brazil[brazil_dates == brazil_dates.max()].reset_index()
df_brazil_sum = df_brazil2.groupby(["ObservationDate"])[["Confirmed", "Deaths", "Recovered", "Active"]].sum().reset_index()
df_brazil_sum

In [None]:
# Donut chart of how Brazil's confirmed cases split into recovered / deaths / active.
labels = ["Recovered", "Deaths", "Active"]
values = df_brazil_sum.loc[0, labels]
# Read the date and total from the data so the title cannot drift from what is plotted.
report_date = df_brazil_sum.loc[0, "ObservationDate"]
total_confirmed = df_brazil_sum.loc[0, "Confirmed"]
fig = px.pie(df_brazil_sum, values=values, names=labels,
             color_discrete_sequence=['#636EFA', '#FECB52', '#EF553B'], hole=0.5)
fig.update_layout(title=f'Total cases for Brazil on {report_date} : {total_confirmed}')
fig.show()

## 3 - EDA

#### Cleaning the dataframe of unnecessary data and NaN values

In [None]:
# Column types of the Brazil dataframe
df_brazil.dtypes

In [None]:
# Number of rows that are exact duplicates of an earlier row
df_brazil.duplicated().sum()

In [None]:
# Count of missing values in each column
df_brazil.isnull().sum()

In [None]:
# Distinct Province/State labels (a NaN entry shows up here if any value is missing)
df_brazil['Province/State'].unique()

In [None]:
# Rows already labelled 'Unknown' before any missing value is filled in
df_brazil[df_brazil['Province/State']=='Unknown']

In [None]:
# Replace missing Province/State values with the label 'Unknown'
df_brazil["Province/State"]= df_brazil["Province/State"].fillna('Unknown')

In [None]:
# Distinct labels again, now that missing values have been filled
df_brazil['Province/State'].unique()

In [None]:
# Re-check the missing-value counts after the fill
df_brazil.isnull().sum()

It is necessary to convert ObservationDate to datetime (it is an object). 

In [None]:
# Parse the ObservationDate column into real datetimes, then confirm the new dtype
df_brazil['ObservationDate'] = pd.to_datetime(df_brazil['ObservationDate'])
df_brazil.dtypes

Gathering datetime information by 'month'

In [None]:
# Add the month number and its abbreviated name (calendar.month_abbr) as new columns
obs_index = pd.DatetimeIndex(df_brazil['ObservationDate'])
df_brazil['month'] = obs_index.month
df_brazil['month_name'] = df_brazil['month'].map(lambda month_no: calendar.month_abbr[month_no])
df_brazil.head()

Ordering the information by month:

In [None]:
# Sort rows by month number; the monthly line plot is drawn with sort=False, so this
# row order decides the order in which the month names appear on its x axis
df_brazil_sorted = df_brazil.sort_values("month", ascending=True)

In [None]:
# Monthly view of confirmed cases in Brazil.
# NOTE(review): every month has many rows (one per state and day, presumably), so
# seaborn aggregates them per month (mean with an error band by default) — confirm
# that this is the quantity the chart is meant to show.
sns.set_style("whitegrid")
sns.set_palette("Set1")
plt.figure(figsize=(25, 12))
ax = sns.lineplot(
    data=df_brazil_sorted,
    x='month_name',
    y='Confirmed',
    color='red',
    sort=False,
    markers=True,
)
ax.set_title('Evolution of COVID-19 Cases in Brazil by Month', fontsize=24)
ax.set_xlabel('Month', fontsize=20)
ax.set_ylabel('Confirmed Cases', fontsize=20);

### Evolution over time

In [None]:
# Country-wide totals per observation date (summing over all Brazil rows of that day)
total_cols = ['Confirmed', 'Deaths', 'Recovered', 'Active']
df_sum = df_brazil_sorted.groupby('ObservationDate')[total_cols].sum().reset_index()

In [None]:
# Earliest dates of the daily totals
df_sum.head()

In [None]:
# Latest dates of the daily totals
df_sum.tail()

In [None]:
sns.set_palette("Set2")
sns.set_style("whitegrid")

plt.figure(figsize=(22, 12))
# 'Active' is derived as Confirmed - Deaths - Recovered, so Confirmed is exactly the sum
# of the three components. Stacking Confirmed together with them would count every case
# twice; stack only the components, and the top edge of the stack is the confirmed total.
plt.stackplot(df_sum['ObservationDate'],
              [df_sum['Deaths'], df_sum['Recovered'], df_sum['Active']],
              labels=['Deaths', 'Recovered', 'Active'])
plt.legend(loc='upper left', fontsize=16)
plt.title('Evolution of COVID-19 Cases in Brazil by Observation Date', fontsize=24)
plt.xlabel('Observation Date', fontsize=20)
plt.ylabel('Total Number', fontsize=20);


In [None]:
# One line per metric over time, each with its own colour
sns.set_style("whitegrid")
plt.figure(figsize=(24, 12))
for column, legend_label, line_colour in (
    ('Confirmed', "Confirmed Cases", 'blue'),
    ('Deaths', "Deaths", 'red'),
    ('Recovered', "Recovered", 'c'),
    ('Active', "Active", 'yellow'),
):
    plt.plot(df_sum['ObservationDate'], df_sum[column],
             label=legend_label, linewidth=4, color=line_colour)
plt.title('Evolution of COVID-19 Cases in Brazil by Observation Date', fontsize=24)
plt.xlabel('Observation Date', fontsize=20)
plt.ylabel('Total Number', fontsize=20);
plt.legend(loc='upper left', fontsize=20);

## 4 -  Prediction Model

In [None]:
# Load the deaths time series (later cells index it by date columns such as '1/22/20')
deaths_df = pd.read_csv('../input/novel-corona-virus-2019-dataset/time_series_covid_19_deaths.csv')

In [None]:
# (rows, columns) of the deaths time series
deaths_df.shape

In [None]:
# First rows of the deaths time series
deaths_df.head()

In [None]:
# Last rows of the deaths time series
deaths_df.tail()

In [None]:
# Filtering the data for Brazil.
# The filtered frame has to be assigned back: without the assignment the filter is only
# displayed, and every later step (daily sums, model fit, forecast plot titled
# "... in Brazil") would silently run on the totals of all countries.
deaths_df = deaths_df[deaths_df['Country/Region'] == 'Brazil']
deaths_df

In [None]:
# keys() on a DataFrame returns its column labels
columns = deaths_df.keys()
columns

In [None]:
# Keep the columns from the fifth one through the last (label slices are inclusive).
# NOTE(review): presumably the first four columns are location metadata and the rest
# are dates — confirm against the head() output.
first_kept, last_kept = columns[4], columns[-1]
deaths_df = deaths_df.loc[:, first_kept:last_kept]
deaths_df.head()

In [None]:
# Remaining column labels after the slice
deaths_df.keys()

In [None]:
# Number of remaining columns
len(deaths_df.keys())

In [None]:
# Total deaths on 22 January 2020, the first day in the dataset
deaths_df['1/22/20'].sum()

In [None]:
# Total deaths on the last day in the dataset.
# Select the last column by position rather than by a hard-coded date, so the cell keeps
# reporting the latest day when the dataset is refreshed.
deaths_df.iloc[:, -1].sum()

Summing up the number of deaths per day

In [None]:
# One total per date column, in column order
dates = deaths_df.keys()
y = [deaths_df[day].sum() for day in dates]

In [None]:
# Number of daily totals collected (one per date column)
len(y)

We have 299 dates so far.

#### Preparing the dataset for the machine learning model:

Transforming y from a vector to a matrix:

In [None]:
# reshape(-1, 1) turns the list of daily totals into a single-column 2-D array
y = np.array(y).reshape(-1,1)
y.shape

In [None]:
# Feature: the day index 0, 1, ..., len(dates)-1 as a single-column 2-D array
X = np.arange(len(dates)).reshape(-1,1)
X.shape

Now, creating a forecast matrix to predict deaths for the next 15 days, which in this case means the rest of November.


In [None]:
# Day indices for the observed period plus 15 extra days, as a single-column 2-D array
fcast_mat = np.arange(len(dates) + 15).reshape(-1,1)
fcast_mat.shape

Before building the forecast matrix we had 299 data points; now there are 314, since we included 15 more days to predict.

#### Train and Test data

In [None]:
# Hold out the last 15% of the days as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, shuffle = False)
# shuffle=False because this is a time series: the rows must stay in chronological
# order so that the model is tested on days that come after the training period

### Linear Regression

In [None]:
# Fit a straight line to the training days, record its training score,
# and predict the held-out days
regressor = LinearRegression().fit(X_train, y_train)
score = regressor.score(X_train, y_train)
predictions_lin = regressor.predict(X_test)

In [None]:
# Evaluate the fitted model on the held-out days.
# Each value is printed explicitly: a notebook cell only displays its last bare
# expression, so listing the values without print silently discards all but one.
mae = mean_absolute_error(y_test, predictions_lin)
print('MAE (test):', mae)
print('Score (test):', regressor.score(X_test, y_test))
print('Intercept:', regressor.intercept_)
print('Number of coefficient rows:', len(regressor.coef_))

Comparing predictions with linear regression and test data:

In [None]:
# Linear-regression predictions against the held-out data
# (x axis is the position within the test set)
ax = plt.figure(figsize=(14, 6)).gca()
ax.plot(predictions_lin, linestyle='dashed', linewidth=3,
        label='Predictions with Linear Regression')
ax.plot(y_test, linewidth=3, label='Test Data')
ax.legend(loc='upper left', fontsize=14);

In [None]:
# Error metrics for the linear model on the test set.
# RMSE is the square root of the mean *squared* error; taking the square root of the
# mean absolute error does not give a meaningful quantity.
print('MAE:', mean_absolute_error(y_test, predictions_lin))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictions_lin)))

### Polynomial Regression

In [None]:
# Expand the day index into polynomial terms up to degree 4.
# The transformer is fitted on the training data only and then reused on the test data.
poly = PolynomialFeatures(degree=4)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [None]:
# Fit a linear model on the polynomial features, record its training score,
# and predict the held-out days. One fit is sufficient: creating and fitting the
# same model a second time only repeats the work.
regressor = LinearRegression()
regressor.fit(X_train_poly, y_train)
score = regressor.score(X_train_poly, y_train)
poly_test_pred = regressor.predict(X_test_poly)

Comparing the values of y_test and poly_test_pred, we can see that some values were predicted well and others not so much. To get an overview of the accuracy, let's plot both series (test data and predictions) to visualize the differences:

In [None]:
# Polynomial-regression predictions against the held-out data
# (x axis is the position within the test set)
ax = plt.figure(figsize=(14, 6)).gca()
ax.plot(poly_test_pred, linestyle='dashed', linewidth=3,
        label='Predictions with Polynomial Regression')
ax.plot(y_test, linewidth=3, label='Test Data')
ax.legend(loc='upper left', fontsize=14);

In [None]:
# Error metrics for the polynomial model on the test set.
# RMSE is the square root of the mean *squared* error; taking the square root of the
# mean absolute error does not give a meaningful quantity.
print('MAE:', mean_absolute_error(y_test, poly_test_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, poly_test_pred)))

### Forecast Plot

In [None]:
# Apply the already-fitted polynomial transformer to every day index in fcast_mat.
# NOTE(review): despite its name, X_train_all covers the observed days plus the 15
# forecast days, not just the training split.
X_train_all = poly.transform(fcast_mat)
X_train_all.shape

In [None]:
# Predict for every day index in the forecast matrix; one prediction per row
poly_pred_all = regressor.predict(X_train_all)
len(poly_pred_all)

In [None]:
# Observed totals (red) against the model's curve over the observed + forecast days (blue).
# NOTE(review): the title assumes `y` holds Brazil-only totals — confirm that deaths_df
# was actually restricted to Brazil before the daily sums were computed.
plt.figure(figsize=(14, 6))
# Observed data exists only for the first len(y) day indices; slicing by len(y) keeps
# x and y aligned whatever the forecast length is, instead of relying on a literal -15.
plt.plot(fcast_mat[:len(y)], y, color='red', linewidth=3)
plt.plot(fcast_mat, poly_pred_all, color='blue', linestyle='dashed', linewidth=3)
plt.title('Deaths of COVID-19 in Brazil', fontsize=20)
# Day 0 is 1/22/2020, the first date in the dataset — not the date of Brazil's first case.
plt.xlabel('Days since 1/22/2020 (first date in the dataset)', fontsize=16)
plt.ylabel('Number of deaths', fontsize=16)
plt.legend(['Death cases', 'Predictions'], fontsize=14);

## Models comparison

In [None]:
# Both models' test-set predictions drawn over the actual test data
ax = plt.figure(figsize=(14, 6)).gca()
ax.plot(poly_test_pred, linestyle='dashed', linewidth=3,
        label='Predictions with Polynomial Regression', color='c')
ax.plot(predictions_lin, linestyle='dashed', linewidth=3,
        label='Predictions with Linear Regression', color='red')
ax.plot(y_test, linewidth=3, label='Test Data', color='blue')
ax.legend(loc='upper left', fontsize=14);