In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the input directory,
# then print them one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Let pandas show up to 500 columns when a frame is displayed.
pd.options.display.max_columns = 500

# Suppress every warning from this point on.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [None]:
# Load the train and test CSVs, parsing the 'Date' column as datetimes in both.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in (
        '/kaggle/input/covid19-global-forecasting-week-2/train.csv',
        '/kaggle/input/covid19-global-forecasting-week-2/test.csv',
    )
)

In [None]:
# Shorten the two location column names; the same mapping is applied to both frames.
short_names = {'Country_Region': 'Country', 'Province_State': 'State'}
train = train.rename(columns=short_names)
test = test.rename(columns=short_names)

In [None]:
# Replace missing State values with the literal placeholder 'Empty' so that
# equality filters on State can match those rows (NaN never compares equal).
for frame in (train, test):
    frame['State'] = frame['State'].fillna('Empty')

In [None]:
def _add_date_features(frame):
    """Add calendar features derived from ``frame['Date']`` as new columns.

    The columns ``day``, ``month``, ``dayofweek``, ``dayofyear``, ``quarter``
    and ``weekofyear`` are written onto ``frame`` in place, in that order.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a datetime-typed ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object that was passed in.
    """
    dates = frame['Date'].dt
    frame['day'] = dates.day
    frame['month'] = dates.month
    frame['dayofweek'] = dates.dayofweek
    frame['dayofyear'] = dates.dayofyear
    frame['quarter'] = dates.quarter
    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in 2.0;
    # ``isocalendar().week`` is its replacement. It returns the nullable UInt32
    # dtype, so cast back to a plain int64 column like the other features.
    # NOTE(review): the int64 cast assumes Date has no missing values - confirm.
    frame['weekofyear'] = dates.isocalendar().week.astype('int64')
    return frame


# Build the identical feature set on both frames from one definition instead
# of two copy-pasted blocks that could drift apart.
for _frame in (train, test):
    _add_date_features(_frame)

In [None]:
from xgboost.sklearn import XGBRegressor

In [None]:
# Regressor configured with n_estimators=1000; the count is named so it is
# easy to find and tune.
N_ESTIMATORS = 1000
model = XGBRegressor(n_estimators=N_ESTIMATORS)

In [None]:
# Columns that identify or label a row; whatever remains after dropping them
# is passed to the model as features.
TRAIN_NON_FEATURES = ['Id', 'ConfirmedCases', 'Fatalities', 'State', 'Country', 'Date']
TEST_NON_FEATURES = ['ForecastId', 'State', 'Country', 'Date']

# get a list of countries to predict for each country separately
countries = train.Country.unique().tolist()

# One small prediction frame per (country, state). They are concatenated once
# after the loop; growing results_df with pd.concat on every iteration would
# re-copy all earlier rows each time.
region_results = []

for c in countries:
    # Filter by country once, then split by state inside that subset.
    country_train = train.loc[train.Country == c, :]
    country_test = test.loc[test.Country == c, :]
    states = country_train.State.unique().tolist()
    for state in states:
        region_train = country_train.loc[country_train.State == state, :]
        region_test = country_test.loc[country_test.State == state, :]

        # Nothing to forecast for a region that has no rows in the test set.
        if region_test.empty:
            continue

        y1 = region_train[['ConfirmedCases']]
        y2 = region_train[['Fatalities']]

        X_train = region_train.drop(TRAIN_NON_FEATURES, axis=1)
        X_test = region_test.drop(TEST_NON_FEATURES, axis=1)

        # fit() returns the estimator itself, so the single shared `model` is
        # refit for each target and must predict before the next fit replaces
        # what it learned. Counts cannot be negative, but a regressor may
        # output values slightly below zero, hence the clip.
        prediction_confirmed = model.fit(X_train, y1).predict(X_test).clip(min=0)
        prediction_fatalities = model.fit(X_train, y2).predict(X_test).clip(min=0)

        # assign() returns a new frame, so no column is ever set on a slice of
        # `test` (the chained-assignment pattern pandas warns about).
        region_results.append(
            region_test[['ForecastId']].assign(
                ConfirmedCases=prediction_confirmed,
                Fatalities=prediction_fatalities,
            )
        )

# create the results dataframe; fall back to an empty frame with the expected
# columns when no region produced predictions (pd.concat rejects an empty list).
if region_results:
    results_df = pd.concat(region_results, axis=0)
else:
    results_df = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})

In [None]:
# Work on a copy so that later modifications of `sub` leave results_df untouched.
sub = results_df.copy()

In [None]:
# The two targets are whole-number counts. Round to the nearest integer before
# casting: a bare float -> int cast truncates toward zero, so a prediction of
# 4.9999 would be written out as 4.
sub.ConfirmedCases = sub.ConfirmedCases.round().astype(int)
sub.Fatalities = sub.Fatalities.round().astype(int)
# ForecastId is an identifier copied from the test set, so it is cast as-is.
sub.ForecastId = sub.ForecastId.astype(int)

In [None]:
# Display summary statistics of the submission frame; as the last expression
# in the cell, the notebook renders the result.
sub.describe()

In [None]:
# Write the submission to 'submission.csv' in the working directory;
# index=False leaves the DataFrame index out of the file.
sub.to_csv('submission.csv', index=False)