In [21]:
import datetime
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor 

Load Covid-19 data

In [22]:
df = pd.read_csv("data/covid_19_data.csv")
df['Date'] = pd.to_datetime(df['ObservationDate'])
df.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered', 'Date'],
      dtype='object')

Data Cleaning - Keep only required columns

In [23]:
cols = ['Date', 'Country/Region', 'Province/State', 'Confirmed', 'Recovered', 'Deaths']
df = df[cols]
df = df.reset_index(drop=True)

Replace NA with 0 or by others

In [24]:
df['Province/State'] = df['Province/State'].fillna('Others')
df['Confirmed'] = df['Confirmed'].fillna(0)
df = df.sort_values(['Date', 'Country/Region','Province/State'])

Aggregate confirmed cases by Date/Country/Region


In [25]:
df = df.groupby(['Date','Country/Region','Province/State']).agg({'Confirmed':'sum'}).reset_index()
df['Province/State'] = 'all'

In [26]:
df['Country/Region'].value_counts()

US                     15430
Russia                 14066
Mainland China          9263
Japan                   8504
India                   5967
                       ...  
Republic of Ireland        1
Channel Islands            1
St. Martin                 1
 Azerbaijan                1
North Ireland              1
Name: Country/Region, Length: 226, dtype: int64

Load world data

In [27]:
world_df = pd.read_csv("data/countries of the world.csv")
world_df['Country'] = world_df['Country'].str.strip()
world_df.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,0,2306,16307,700.0,360,32,1213,22,8765,1,466,2034,38.0,24.0,38.0
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,232.0,188.0,579.0
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,4,-39,31,6000.0,700,781,322,25,9653,1,1714,461,101.0,6.0,298.0
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,0,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,


Attatch world data to Covid-19 data

In [28]:
df = pd.merge(df, world_df, how ='left', left_on=['Country/Region'], right_on=['Country'])
df['Date'] = pd.to_datetime(df['Date'])

In [29]:
df.loc[df['Country'].isnull()]['Country/Region'].value_counts().head()

US                 15430
Mainland China      9263
UK                  3318
South Korea          299
North Macedonia      264
Name: Country/Region, dtype: int64

In [31]:
df.loc[df['Country'].isnull(), 'Country'] = 'Undefined'
df.loc[df['Region'].isnull(), 'Region'] = 'Others'
df['Country'].value_counts()

Undefined         33743
Russia            14066
Japan              8504
India              5967
Colombia           5873
                  ...  
Gambia, The           4
Bahamas, The          3
Cayman Islands        3
East Timor            1
Cape Verde            1
Name: Country, Length: 186, dtype: int64

In [33]:
confirmed_original = df['Confirmed']
minmax_transformer = MinMaxScaler(feature_range=(0,1)).fit(np.asarray([0, 2E5]).reshape(-1,1))
confirmed_transformed = pd.Series(minmax_transformer.transform(confirmed_original.values.reshape(-1,1)).reshape(-1))
df['confirmed_transformed'] = confirmed_transformed

encoded_country_region = LabelEncoder().fit(df['Country/Region'])
df['encoded_country_region'] = encoded_country_region.transform(df['Country/Region'])
encoded_region = LabelEncoder().fit(df['Region'])
df['encoded_region'] = encoded_region.transform(df['Region'])

Split data into train and test set.

In [34]:
train_data = df.loc[df['Confirmed'] > 50].copy()
print({train_data.shape})
train, test = train_test_split(train_data, test_size=0.2, shuffle=True, random_state=200000)

{(138384, 27)}


Fit data into gradient boost model

In [35]:
   
model = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=5, min_child_weight=0.001)

confirmed1 = ['Confirmed']
confirmed2 = ['confirmed_transformed']

model.fit(X=train[confirmed1+confirmed2], y=train['Confirmed'], 
               eval_set=(test[confirmed1+confirmed2], test['Confirmed']),
               early_stopping_rounds=100, verbose=10)

Training until validation scores don't improve for 100 rounds
[10]	valid_0's l1: 14816.1
[20]	valid_0's l1: 5511.61
[30]	valid_0's l1: 2570.76
[40]	valid_0's l1: 1747.65
[50]	valid_0's l1: 1525.82
[60]	valid_0's l1: 1461.69
[70]	valid_0's l1: 1444.42
[80]	valid_0's l1: 1439.81
[90]	valid_0's l1: 1437.89
[100]	valid_0's l1: 1436.79
[110]	valid_0's l1: 1435.99
[120]	valid_0's l1: 1435.49
[130]	valid_0's l1: 1435.05
[140]	valid_0's l1: 1434.83
[150]	valid_0's l1: 1434.67
[160]	valid_0's l1: 1434.57
[170]	valid_0's l1: 1434.5
[180]	valid_0's l1: 1434.45
[190]	valid_0's l1: 1434.43
[200]	valid_0's l1: 1434.41
Did not meet early stopping. Best iteration is:
[200]	valid_0's l1: 1434.41


LGBMRegressor(metric='mae', min_child_samples=5, n_estimators=200)

Predict confirmed cases for tomorrow

In [36]:
us_data = df.loc[(df['Confirmed'] > 0) & (df['Country/Region']=='US')]
today_data = us_data.iloc[-1]

confimed = today_data[confirmed1].values
transformed = today_data[confirmed2].values
confirmed_today = today_data['Confirmed']
tomorrow_confirmed = model.predict(np.hstack([confimed, transformed]).reshape(1,-1))[0]
print(f'Total Confimred cases tomorrow in US - {tomorrow_confirmed}')

Total Confimred cases tomorrow in US - 98632.5598635329
