In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

warnings.filterwarnings("ignore")

In [None]:
# Load the three competition files (train, test, sample submission) in that order.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/covid19-global-forecasting-week-4/train.csv",
        "../input/covid19-global-forecasting-week-4/test.csv",
        "../input/covid19-global-forecasting-week-4/submission.csv",
    )
)

In [None]:
# Convert the Date columns to datetimes; later cells compare them against
# date strings and use the .dt accessor.
train = train.assign(Date=pd.to_datetime(train['Date']))
test = test.assign(Date=pd.to_datetime(test['Date']))

In [None]:
# Keep training rows dated up to 2020-04-14 and tag every row with the
# split it came from.
train = train.loc[train['Date'] <= '2020-04-14'].assign(part='train')
test = test.assign(part='test')

In [None]:
# Location key = province string immediately followed by country string.
# astype(str) turns a missing province into the literal text 'nan'.
train = train.assign(
    Location=lambda frame: frame['Province_State'].astype(str) + frame['Country_Region'].astype(str)
)
test = test.assign(
    Location=lambda frame: frame['Province_State'].astype(str) + frame['Country_Region'].astype(str)
)

In [None]:
# Attach the already-known targets from train to the matching test rows
# (same Location and Date); test rows without a match get NaN targets.
test = pd.merge(
    test,
    train.loc[:, ['ConfirmedCases', 'Fatalities', 'Location', 'Date']],
    how='left',
    on=['Location', 'Date'],
)

In [None]:
# Cut train back to 2020-04-01; later dates now come from the test frame,
# which received their targets in the merge above.
train = train.loc[train['Date'] <= '2020-04-01']

In [None]:
# Stack train on top of test, then order the rows by country and date.
data = (
    pd.concat([train, test], axis=0)
    .sort_values(by=['Country_Region', 'Date'])
)

In [None]:
# Long format: one row per (ForecastId, Date, Location, part) and target type.
# 'variable' names the target (ConfirmedCases / Fatalities), 'Target' holds its value.
data = pd.melt(
    data,
    id_vars=['ForecastId', 'Date', 'Location', 'part'],
    value_vars=['ConfirmedCases', 'Fatalities'],
    value_name='Target',
).sort_values(by=['Location', 'Date'])

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# 'Day' is the month and day-of-month fused into one integer (MMDD, e.g. 414
# for 14 April); 'Month' is the plain calendar month.
data = data.assign(
    Day=(data['Date'].dt.month * 100 + data['Date'].dt.day).astype('int64'),
    Month=data['Date'].dt.month,
)

In [None]:
# lag_k = Target from k rows earlier within the same (Location, variable) series.
data = data.assign(**{
    'lag_' + str(k): data.groupby(['Location', 'variable'])['Target'].shift(k)
    for k in (1, 2, 3, 4)
})

In [None]:
# First differences between consecutive lags, plus their plain average.
# NaN in any lag propagates into the average (no NaN skipping).
data = data.assign(
    Diff1=data['lag_1'].sub(data['lag_2']),
    Diff2=data['lag_2'].sub(data['lag_3']),
    Diff3=data['lag_3'].sub(data['lag_4']),
)
data = data.assign(
    Diffavg=data['Diff1'].add(data['Diff2']).add(data['Diff3']).div(3)
)

In [None]:
# Percentage growth of each difference relative to the older lag it started from.
# A zero denominator yields inf (or NaN when the difference is zero as well).
data = data.assign(
    Inc1=data['Diff1'].div(data['lag_2']).mul(100),
    Inc2=data['Diff2'].div(data['lag_3']).mul(100),
    Inc3=data['Diff3'].div(data['lag_4']).mul(100),
)
data = data.assign(
    Incavg=data['Inc1'].add(data['Inc2']).add(data['Inc3']).div(3)
)

In [None]:
# Keep rows after 2020-02-19 and discard the intermediate difference columns
# (only their average and the growth ratios are used from here on).
data = (
    data.loc[data['Date'] > '2020-02-19']
    .drop(columns=['Diff1', 'Diff2', 'Diff3'])
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept around so the integer codes can be
# mapped back to the original labels later if needed.
encoderloc, encodervar = LabelEncoder(), LabelEncoder()

# Replace the Location and variable strings with integer codes.
data = data.assign(
    Location=encoderloc.fit_transform(data['Location']),
    variable=encodervar.fit_transform(data['variable']),
)

data.head()

In [None]:
# Columns fed to the model, in the order the model sees them.
features = [
    'Day',
    'Location',
    'variable',
    'lag_1',
    'lag_2',
    'Diffavg',
    'Inc1',
    'Inc2',
    'Incavg',
    'Month',
]

In [None]:
# Training rows: everything dated up to 2020-04-14.
x_train = data.loc[data['Date'] <= '2020-04-14']
y_train = x_train['Target']

# Validation rows: 2020-04-02 .. 2020-04-14. This window is a subset of
# x_train, so the score logged during training is an in-sample monitor,
# not a held-out estimate.
x_val = data.loc[(data['Date'] >= '2020-04-02') & (data['Date'] <= '2020-04-14')]
y_val = x_val['Target']

# Explicit copy: later cells write predictions and refreshed lag features
# into test_, which must not be a slice tied to `data`.
test_ = data.loc[data['part'] == 'test'].copy()

In [None]:
from xgboost import DMatrix,train,plot_importance,XGBRegressor

In [None]:
# Booster settings: squared-error regression, all cores, fixed seed.
params = dict(
    objective='reg:squarederror',
    n_jobs=-1,
    seed=236,
)


In [None]:
from tqdm import tqdm

In [None]:
def feature(test_,ctr):
    """Refresh the lag-derived feature columns of ``test_`` in place.

    Parameters
    ----------
    test_ : DataFrame
        Must contain ``Location``, ``variable`` and ``Target``. Any
        ``lag_k`` column that this call does not refresh must already exist.
    ctr : int
        1, 2 or 3 refreshes only the first ``ctr`` lag columns; any other
        value refreshes all four.

    Returns
    -------
    DataFrame
        The same ``test_`` object with ``lag_*``, ``Diffavg``,
        ``Inc1``..``Inc3`` and ``Incavg`` updated. The temporary
        ``Diff1``..``Diff3`` columns are removed before returning.
    """
    n_lags = {1: 1, 2: 2, 3: 3}.get(ctr, 4)

    # lag_k = Target shifted by k rows inside each (Location, variable) series.
    for k in range(1, n_lags + 1):
        test_['lag_' + str(k)] = test_.groupby(['Location', 'variable'])['Target'].shift(k)

    # Differences between consecutive lags and their plain average.
    for k in (1, 2, 3):
        test_['Diff' + str(k)] = test_['lag_' + str(k)] - test_['lag_' + str(k + 1)]
    test_['Diffavg'] = (test_['Diff1'] + test_['Diff2'] + test_['Diff3']) / 3

    # Percentage growth of each difference relative to the older lag.
    for k in (1, 2, 3):
        test_['Inc' + str(k)] = (test_['Diff' + str(k)] / test_['lag_' + str(k + 1)]) * 100
    test_['Incavg'] = (test_['Inc1'] + test_['Inc2'] + test_['Inc3']) / 3

    # The raw differences are scratch columns only.
    test_.drop(columns=['Diff1', 'Diff2', 'Diff3'], inplace=True)

    return test_

In [None]:
# Days that still need a forecast: test dates after 2020-04-14, the last date
# with known targets. Selecting by date instead of slicing off the first 13
# unique values avoids depending on row order and on the length of the
# overlap between the train and test windows. Sorted so that forecasts are
# produced chronologically (Day is an MMDD integer).
days = sorted(test_.loc[test_['Date'] > '2020-04-14', 'Day'].unique().tolist())

In [None]:
# The growth-ratio features divide by earlier lags, so a zero denominator
# produces +/-inf. XGBoost treats NaN as "missing", and newer releases reject
# inf input outright, so map inf to NaN for every matrix handed to the model
# (train, validation and each forecast day) to keep them consistent.
#
# `xgb.train` / `xgb.DMatrix` are used through the module alias because the
# bare name `train` is bound first to the training DataFrame and later to
# xgboost's train function, which makes this cell depend on execution order.
train_set = xgb.DMatrix(x_train[features].replace([np.inf, -np.inf], np.nan), y_train)
val_set = xgb.DMatrix(x_val[features].replace([np.inf, -np.inf], np.nan), y_val)
model = xgb.train(
    params,
    train_set,
    num_boost_round=100,
    evals=[(val_set, 'validation')],
    verbose_eval=50,
)

# Recursive forecast: predict one day, write the predictions into Target, then
# refresh the lag features so the next day is predicted from them.
# ctr counts completed steps; feature() uses it to decide how many lag
# columns need recomputing.
ctr = 1
for j in days:
    day_mask = test_['Day'] == j
    day_features = test_.loc[day_mask, features].replace([np.inf, -np.inf], np.nan)
    test_.loc[day_mask, 'Target'] = model.predict(xgb.DMatrix(day_features))
    test_ = feature(test_, ctr)
    ctr += 1

In [None]:
# Only the id, the target type and the predicted value go into the submission.
sub = test_.loc[:, ['ForecastId', 'variable', 'Target']]

In [None]:
# Back to wide format: one row per ForecastId, one column per target type.
sub = sub.pivot(
    index='ForecastId',
    columns='variable',
    values='Target',
).reset_index()

In [None]:
# ForecastId must be an integer in the submission file.
sub = sub.astype({'ForecastId': int})

# The pivoted columns are the encoded 'variable' codes; rename them
# positionally to the submission headers.
sub.columns = ['ForecastId', 'ConfirmedCases', 'Fatalities']

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv(path_or_buf="submission.csv", index=False)