Required imports

In [0]:
import numpy as np 
import pandas as pd
import datetime  

Reading COVID-19 data

In [3]:
data_frame = pd.read_csv("sample_data/covid_19_data.csv")
data_frame['Date'] = pd.to_datetime(data_frame['ObservationDate'])
data_frame.columns

Index(['SNo', 'ObservationDate', 'Province/State', 'Country/Region',
       'Last Update', 'Confirmed', 'Deaths', 'Recovered', 'Date'],
      dtype='object')

Selecting only requuired columns

In [0]:
columns = ['Province/State', 'Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']
data_frame = data_frame[columns]
data_frame = data_frame.reset_index(drop=True)

Filling default values where data was blank

In [0]:
data_frame['Province/State'] = data_frame['Province/State'].fillna('Others')
data_frame['Confirmed'] = data_frame['Confirmed'].fillna(0)
data_frame = data_frame.sort_values(['Country/Region','Province/State','Date'])

Aggregate conforemd cases Country/Region/Date wise

In [0]:
data_frame = data_frame.groupby(['Country/Region','Province/State','Date']).agg({'Confirmed':'sum'}).reset_index()
data_frame['Province/State'] = 'all'

In [7]:
data_frame['Country/Region'].value_counts()

Mainland China     1761
US                 1388
Australia           287
Canada              208
France               91
                   ... 
Zambia                1
('St. Martin',)       1
Kyrgyzstan            1
St. Martin            1
Gambia, The           1
Name: Country/Region, Length: 179, dtype: int64

Reading World data

In [8]:
world_data = pd.read_csv("sample_data/countries of the world.csv")
world_data['Country'] = world_data['Country'].str.strip()
world_data

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,000,2306,16307,700.0,360,32,1213,022,8765,1,466,2034,038,024,038
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,0232,0188,0579
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,004,-039,31,6000.0,700,781,322,025,9653,1,1714,461,0101,06,0298
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,000,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,West Bank,NEAR EAST,2460492,5860,4199,000,298,1962,800.0,,1452,169,1897,6413,3,3167,392,009,028,063
223,Western Sahara,NORTHERN AFRICA,273008,266000,10,042,,,,,,002,0,9998,1,,,,,04
224,Yemen,NEAR EAST,21456188,527970,406,036,0,615,800.0,502,372,278,024,9698,1,4289,83,0135,0472,0393
225,Zambia,SUB-SAHARAN AFRICA,11502010,752614,153,000,0,8829,800.0,806,82,708,003,929,2,41,1993,022,029,0489


Combine world and COVID-19 data

In [0]:
data_frame = pd.merge(data_frame, world_data, how='left', left_on=['Country/Region'], right_on=['Country'])
data_frame['Date'] = pd.to_datetime(data_frame['Date'])

In [10]:
data_frame.loc[data_frame['Country'].isnull()]['Country/Region'].value_counts()

Mainland China                    1761
US                                1388
UK                                  68
South Korea                         57
Others                              41
North Macedonia                     22
Bosnia and Herzegovina              14
Holy See                             9
Ivory Coast                          9
Congo (Kinshasa)                     8
occupied Palestinian territory       7
Saint Barthelemy                     7
Antigua and Barbuda                  6
Trinidad and Tobago                  5
Palestine                            5
Eswatini                             5
Kosovo                               4
Congo (Brazzaville)                  4
Vatican City                         4
Central African Republic             4
Republic of the Congo                3
The Bahamas                          3
Montenegro                           2
Curacao                              2
The Gambia                           2
Channel Islands          

In [0]:
data_frame.loc[data_frame['Region'].isnull(), 'Region'] = 'Others'
data_frame.loc[data_frame['Country'].isnull(), 'Country'] = 'Undefined'

In [12]:
data_frame['Country'].value_counts()

Undefined      3446
Australia       287
Canada          208
France           91
Taiwan           57
               ... 
Gambia, The       1
Mauritius         1
Djibouti          1
Zambia            1
Kyrgyzstan        1
Name: Country, Length: 149, dtype: int64

In [0]:
from sklearn.preprocessing import MinMaxScaler
original_confirmed = data_frame['Confirmed']
transformer = MinMaxScaler(feature_range=(0,1)).fit(np.asarray([0, 2E5]).reshape(-1,1))
tranformed_confirmed = pd.Series(transformer.transform(original_confirmed.values.reshape(-1,1)).reshape(-1))
data_frame['Confirmed_transformed'] = tranformed_confirmed

Encoding text data

In [0]:
from sklearn.preprocessing import LabelEncoder

encoded_country = LabelEncoder().fit(data_frame['Country/Region'])
data_frame['encoded_country'] = encoded_country.transform(data_frame['Country/Region'])
encoded_region = LabelEncoder().fit(data_frame['Region'])
data_frame['encoded_region'] = encoded_region.transform(data_frame['Region'])

Spitting into train and test data

In [15]:
from sklearn.model_selection import train_test_split
train_data = data_frame.loc[data_frame['Confirmed'] > 50].copy()
print(f'Train Data shape {train_data.shape}')
train, valid = train_test_split(train_data, test_size=0.2, shuffle=True, random_state=200000)

Train Data shape (2271, 27)


Fitting data into LGBM model

In [16]:
from lightgbm import LGBMRegressor    
model = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=5, min_child_weight=0.001)
#model_lgbm = LGBMRegressor(n_estimators=200, random_state=1000, min_child_samples=5, min_child_weight=0.000001)
#model_lgbm = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=5, min_child_weight=0.000001)
#model_lgbm = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=10, min_child_weight=0.000001)

confirmed_1 = ['Confirmed']
confirmed_2 = ['Confirmed_transformed']

model.fit(X=train[confirmed_1+confirmed_2], y=train['Confirmed'], 
               eval_set=(valid[confirmed_1+confirmed_2], valid['Confirmed']),
               early_stopping_rounds=100, verbose=10)

Training until validation scores don't improve for 100 rounds.
[10]	valid_0's l1: 1085.48
[20]	valid_0's l1: 372.012
[30]	valid_0's l1: 148.928
[40]	valid_0's l1: 93.9282
[50]	valid_0's l1: 79.9089
[60]	valid_0's l1: 75.2717
[70]	valid_0's l1: 73.7761
[80]	valid_0's l1: 74.0701
[90]	valid_0's l1: 74.1816
[100]	valid_0's l1: 74.226
[110]	valid_0's l1: 74.1989
[120]	valid_0's l1: 74.181
[130]	valid_0's l1: 74.1523
[140]	valid_0's l1: 74.1145
[150]	valid_0's l1: 74.102
[160]	valid_0's l1: 74.0241
[170]	valid_0's l1: 73.9338
Early stopping, best iteration is:
[70]	valid_0's l1: 73.7761


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              metric='mae', min_child_samples=5, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=200, n_jobs=-1, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              silent=True, subsample=1.0, subsample_for_bin=200000,
              subsample_freq=0)

Predicting Data

In [17]:
italy_data = data_frame.loc[(data_frame['Confirmed'] > 0) & (data_frame['Country/Region']=='Italy')]
last_day_data = italy_data.iloc[-1]

# print(nextDay.count)
# print(nextDay['Confirmed'])
# print(history0['Date'])

confimed = last_day_data[confirmed_1].values
transformed = last_day_data[confirmed_2].values
confirmed_yesterday = last_day_data['Confirmed']
next_day_confirmed = model.predict(np.hstack([confimed, transformed]).reshape(1,-1))[0]
print(f'Total Confimred Tommorrow- {next_day_confirmed}')

Total Confimred Tommorrow- 39080.11740453535
