In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Load the raw case table. The path is relative, so the CSV must sit in the
# notebook's working directory.
csv_path = 'covid_19_data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
6717,6718,03/18/2020,,Guernsey,2020-03-17T18:33:03,0.0,0.0,0.0
6718,6719,03/18/2020,,Jersey,2020-03-17T18:33:03,0.0,0.0,0.0
6719,6720,03/18/2020,,Puerto Rico,2020-03-17T16:13:14,0.0,0.0,0.0
6720,6721,03/18/2020,,Republic of the Congo,2020-03-17T21:33:03,0.0,0.0,0.0


In [3]:
# Count missing values per column (isna() flags each cell, sum() totals the flags
# column by column).
df.isna().sum()

SNo                   0
ObservationDate       0
Province/State     2769
Country/Region        0
Last Update           0
Confirmed             0
Deaths                0
Recovered             0
dtype: int64

In [4]:
# (number of rows, number of columns) of the raw table.
df.shape

(6722, 8)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): SNo looks like a running row number, so its statistics are
# presumably not meaningful — confirm against the data source.
df.describe()

Unnamed: 0,SNo,Confirmed,Deaths,Recovered
count,6722.0,6722.0,6722.0,6722.0
mean,3361.5,601.195924,19.855846,226.341267
std,1940.618587,4896.33214,204.486922,2556.035202
min,1.0,0.0,0.0,0.0
25%,1681.25,2.0,0.0,0.0
50%,3361.5,13.0,0.0,0.0
75%,5041.75,108.0,1.0,11.0
max,6722.0,67800.0,3122.0,56927.0


In [6]:
# Column dtypes and non-null counts. ObservationDate is still a plain object
# (string) column at this point; it is converted to datetime in a later cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6722 entries, 0 to 6721
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SNo              6722 non-null   int64  
 1   ObservationDate  6722 non-null   object 
 2   Province/State   3953 non-null   object 
 3   Country/Region   6722 non-null   object 
 4   Last Update      6722 non-null   object 
 5   Confirmed        6722 non-null   float64
 6   Deaths           6722 non-null   float64
 7   Recovered        6722 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 420.3+ KB


In [7]:
# ObservationDate arrives as month/day/year text (e.g. 01/22/2020). Parse it with
# an explicit format so the column becomes a real datetime before date arithmetic.
parsed_dates = pd.to_datetime(df['ObservationDate'], format='%m/%d/%Y')
df['ObservationDate'] = parsed_dates

In [8]:
# Collapse the per-location rows into one row per observation date by summing the
# three count columns, then move the date from the index back into a column.
count_cols = ['Confirmed', 'Deaths', 'Recovered']
df_group = df.groupby('ObservationDate')[count_cols].sum().reset_index()
df_group

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered
0,2020-01-22,555.0,17.0,28.0
1,2020-01-23,653.0,18.0,30.0
2,2020-01-24,941.0,26.0,36.0
3,2020-01-25,1438.0,42.0,39.0
4,2020-01-26,2118.0,56.0,52.0
5,2020-01-27,2927.0,82.0,61.0
6,2020-01-28,5578.0,131.0,107.0
7,2020-01-29,6165.0,133.0,126.0
8,2020-01-30,8235.0,171.0,143.0
9,2020-01-31,9925.0,213.0,222.0


In [9]:
# 1-based day counter used as the regression feature: the earliest observation
# date is day 1, the next calendar day is day 2, and so on.
first_date = df_group['ObservationDate'].min()
days_since_first = (df_group['ObservationDate'] - first_date).dt.days
df_group['DaysPassed'] = days_since_first + 1

In [10]:
# Re-display the daily totals, now including the DaysPassed column added above.
df_group

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered,DaysPassed
0,2020-01-22,555.0,17.0,28.0,1
1,2020-01-23,653.0,18.0,30.0,2
2,2020-01-24,941.0,26.0,36.0,3
3,2020-01-25,1438.0,42.0,39.0,4
4,2020-01-26,2118.0,56.0,52.0,5
5,2020-01-27,2927.0,82.0,61.0,6
6,2020-01-28,5578.0,131.0,107.0,7
7,2020-01-29,6165.0,133.0,126.0,8
8,2020-01-30,8235.0,171.0,143.0,9
9,2020-01-31,9925.0,213.0,222.0,10


In [11]:
# Infected = Confirmed minus the cases that have already ended in death or recovery.
closed_cases = df_group['Deaths'] + df_group['Recovered']
df_group['Infected'] = df_group['Confirmed'] - closed_cases
df_group

Unnamed: 0,ObservationDate,Confirmed,Deaths,Recovered,DaysPassed,Infected
0,2020-01-22,555.0,17.0,28.0,1,510.0
1,2020-01-23,653.0,18.0,30.0,2,605.0
2,2020-01-24,941.0,26.0,36.0,3,879.0
3,2020-01-25,1438.0,42.0,39.0,4,1357.0
4,2020-01-26,2118.0,56.0,52.0,5,2010.0
5,2020-01-27,2927.0,82.0,61.0,6,2784.0
6,2020-01-28,5578.0,131.0,107.0,7,5340.0
7,2020-01-29,6165.0,133.0,126.0,8,5906.0
8,2020-01-30,8235.0,171.0,143.0,9,7921.0
9,2020-01-31,9925.0,213.0,222.0,10,9490.0


In [28]:
from sklearn.linear_model import LinearRegression

In [14]:
# Feature matrix: the day counter, kept two-dimensional as a one-column DataFrame.
X = df_group.loc[:, ['DaysPassed']]

# One target Series per daily total that gets its own regression.
y_confirmed, y_deaths, y_recovered = (
    df_group[col] for col in ('Confirmed', 'Deaths', 'Recovered')
)

In [15]:
# Fit one independent straight line (daily total ~ DaysPassed) per series.
# fit() returns the estimator itself, so creation and fitting are combined.
lr_confirmed = LinearRegression().fit(X, y_confirmed)
lr_deaths = LinearRegression().fit(X, y_deaths)
lr_recovered = LinearRegression().fit(X, y_recovered)

# Keep the fitted recovered-cases estimator as the cell's displayed value.
lr_recovered


In [17]:
# R^2 of the confirmed-cases model, scored on the same X / y_confirmed it was
# fitted on. This is an in-sample measure, not an estimate of forecast accuracy.
lr_confirmed.score(X,y_confirmed)

0.9226983983959586

In [18]:
# R^2 of the deaths model, scored on the same X / y_deaths it was fitted on.
# This is an in-sample measure, not an estimate of forecast accuracy.
lr_deaths.score(X,y_deaths)

0.8756757582417307

In [19]:

# R^2 of the recovered model, scored on the same X / y_recovered it was fitted on.
# This is an in-sample measure, not an estimate of forecast accuracy.
lr_recovered.score(X,y_recovered)

0.905195522090644

In [20]:
# Number of days past the last observed day to forecast.
FORECAST_HORIZON = 7

# Largest day counter present in the aggregated data.
last_day = df_group['DaysPassed'].max()

# Day numbers last_day+1 .. last_day+FORECAST_HORIZON (arange's stop is exclusive).
future_days = np.arange(last_day + 1, last_day + 1 + FORECAST_HORIZON)

# The models are fitted on a DataFrame whose only column is 'DaysPassed'.
# Predicting from a bare ndarray makes scikit-learn warn that X has no valid
# feature names. A one-column DataFrame with the same column name yields the
# same predictions without the warning.
X_future = pd.DataFrame({'DaysPassed': future_days})

In [21]:
# Extrapolate the confirmed-cases line to the future day numbers in X_future.
lr_pred_confirmed = lr_confirmed.predict(X_future)



In [23]:
# Extrapolate the deaths line to the future day numbers in X_future.
lr_pred_deaths = lr_deaths.predict(X_future)



In [24]:
# Extrapolate the recovered line to the future day numbers in X_future.
lr_pred_recovered = lr_recovered.predict(X_future)



In [25]:
# Raw (unrounded) confirmed-case predictions, one per future day.
lr_pred_confirmed

array([158934.3302005 , 161970.03366174, 165005.73712298, 168041.44058422,
       171077.14404546, 174112.8475067 , 177148.55096794])

In [26]:
# Raw (unrounded) death predictions, one per future day.
lr_pred_deaths

array([5773.20739348, 5891.53880391, 6009.87021433, 6128.20162475,
       6246.53303517, 6364.8644456 , 6483.19585602])

In [27]:
# Raw (unrounded) recovered predictions, one per future day.
lr_pred_recovered

array([72904.39724311, 74497.91487339, 76091.43250367, 77684.95013396,
       79278.46776424, 80871.98539452, 82465.5030248 ])

In [29]:
# Output column name -> raw linear-regression predictions for that series.
lr_forecasts = {
    'LR_Confirmed': lr_pred_confirmed,
    'LR_Deaths': lr_pred_deaths,
    'LR_Recovered': lr_pred_recovered,
}

# One row per future day. Predictions are rounded to whole numbers and stored as
# integers, since they represent counts of people.
forecast_df = pd.DataFrame({
    'Day': future_days,
    **{label: preds.round().astype(int) for label, preds in lr_forecasts.items()},
})

In [30]:
# Show the rounded forecast table (Day is the DaysPassed value being predicted).
forecast_df

Unnamed: 0,Day,LR_Confirmed,LR_Deaths,LR_Recovered
0,58,158934,5773,72904
1,59,161970,5892,74498
2,60,165006,6010,76091
3,61,168041,6128,77685
4,62,171077,6247,79278
5,63,174113,6365,80872
6,64,177149,6483,82466
