In [0]:
import numpy as np 
import pandas as pd
import datetime  

In [0]:
data_frame = pd.read_csv("sample_data/covid_19_data.csv")
data_frame['Date'] = pd.to_datetime(data_frame['ObservationDate'])
data_frame['Outbreak'] = 'COVID_2019'
data_frame.columns

In [0]:
columns = ['Outbreak', 'Province/State', 'Country/Region', 'Date', 'Confirmed', 'Deaths', 'Recovered']
data_frame = data_frame[columns]
data_frame = data_frame.reset_index(drop=True)

In [0]:
data_frame['Province/State'] = data_frame['Province/State'].fillna('Others')
data_frame['Confirmed'] = data_frame['Confirmed'].fillna(0)
data_frame = data_frame.sort_values(['Country/Region','Province/State','Date'])

In [0]:
data_frame = data_frame.groupby(['Outbreak','Country/Region','Province/State','Date']).agg({'Confirmed':'sum'}).reset_index()
data_frame['Province/State'] = 'all'

In [0]:
data_frame['Country/Region'].value_counts()

In [0]:
world_data = pd.read_csv("sample_data/countries of the world.csv")
world_data['Country'] = world_data['Country'].str.strip()
world_data

In [0]:
data_frame = pd.merge(data_frame, world_data, how='left', left_on=['Country/Region'], right_on=['Country'])
data_frame['Date'] = pd.to_datetime(data_frame['Date'])

In [0]:
data_frame.loc[data_frame['Country'].isnull()]['Country/Region'].value_counts()

In [0]:
data_frame.loc[data_frame['Region'].isnull(), 'Region'] = 'Others'
data_frame.loc[data_frame['Country'].isnull(), 'Country'] = 'Undefined'

In [0]:
data_frame['Country'].value_counts()

In [0]:
from sklearn.preprocessing import MinMaxScaler
original_confirmed = data_frame['Confirmed']
transformer = MinMaxScaler(feature_range=(0,1)).fit(np.asarray([0, 2E5]).reshape(-1,1))
tranformed_confirmed = pd.Series(transformer.transform(original_confirmed.values.reshape(-1,1)).reshape(-1))
data_frame['Confirmed_transformed'] = tranformed_confirmed

In [0]:
from sklearn.preprocessing import LabelEncoder

encoded_country = LabelEncoder().fit(data_frame['Country/Region'])
data_frame['encoded_country'] = encoded_country.transform(data_frame['Country/Region'])

encoded_outbreak = LabelEncoder().fit(data_frame['Outbreak'])
data_frame['encoded_outbreak'] = encoded_outbreak.transform(data_frame['Outbreak'])


encoded_region = LabelEncoder().fit(data_frame['Region'])
data_frame['encoded_region'] = encoded_region.transform(data_frame['Region'])

In [63]:
from sklearn.model_selection import train_test_split
train_data = data_frame.loc[data_frame['Confirmed'] > 50].copy()
print(f'Train Data shape {train_data.shape}')
train, valid = train_test_split(train_data, test_size=0.2, shuffle=True, random_state=200000)

Train Data shape (2271, 72)


In [0]:
from lightgbm import LGBMRegressor    
model = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=5, min_child_weight=0.001)
#model_lgbm = LGBMRegressor(n_estimators=200, random_state=1234, min_child_samples=5, min_child_weight=0.000001)
#model_lgbm = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=5, min_child_weight=0.000001)
#model_lgbm = LGBMRegressor(n_estimators=200, metric='mae', min_child_samples=10, min_child_weight=0.000001)
#print(f'Fitting on data with shape {train['Confirmed', 'Confirmed_transformed'].shape} with validation of shape {valid[X_mask_cat+X_mask_lags].shape}')

confirmed_1 = ['Confirmed']
confirmed_2 = ['Confirmed_transformed']

model.fit(X=train[confirmed_1+confirmed_2], y=train['Confirmed'], 
               eval_set=(valid[confirmed_1+confirmed_2], valid['Confirmed']),
               early_stopping_rounds=100, verbose=10)

In [65]:
italy_data = data_frame.loc[(data_frame['Outbreak']=='COVID_2019') & (data_frame['Confirmed'] > 0) & (data_frame['Country/Region']=='Italy')]
# nextDay = valid.loc[(df['Outbreak']=='COVID_2019') & (df['Confirmed'] > 0) & (df['Country/Region']=='Italy')]
# print(nextDay['Date'])
last_day_data = italy_data.iloc[-1]
#nextDay1 = nextDay.iloc[-1]
# print(nextDay.count)
# print(nextDay['Confirmed'])
# print(history0['Date'])
confimed = last_day_data[confirmed_1].values
transformed = last_day_data[confirmed_2].values
y = last_day_data['Confirmed']
print(y)
next_day_confirmed = model.predict(np.hstack([confimed, transformed]).reshape(1,-1))[0]
print(next_day_confirmed)

35713.0
39080.11740453535
