In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file found under the Kaggle input directory,
# printing each one as a full path.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training CSV into `train`, then preview its first and last rows.
train = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-3/train.csv'
)
print(train.head(), train.tail(), sep='\n')

In [None]:
# Read the test CSV into `test`, then preview its first and last rows.
test = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-3/test.csv'
)
print(test.head(), test.tail(), sep='\n')

In [None]:
# (rows, columns) of the train frame, then of the test frame
print(train.shape, test.shape, sep='\n')

In [None]:
# Summarise both frames with DataFrame.info().
# info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" line after each report.
train.info()
test.info()

# **Basic EDA**

In [None]:
# Number of confirmed cases in different countries
import seaborn as sns

# Green colour map used to shade the styled table below.
cm = sns.light_palette("green", as_cmap=True)

# Take the rows for a single date and add up every row of a country so that
# each country ends up with one ConfirmedCases / Fatalities total.
snapshot = train.loc[train['Date'] == '2020-04-03']
country_wise_count = (
    snapshot
    # Several columns must be selected with a list; a bare tuple of keys
    # after groupby() is not accepted by current pandas.
    .groupby('Country_Region')[['ConfirmedCases', 'Fatalities']]
    .sum()
    .reset_index()
    .sort_values(by='ConfirmedCases', ascending=False)
    # drop=True discards the pre-sort index instead of materialising it as
    # an 'index' column that then has to be dropped again.
    .reset_index(drop=True)
)

# Last expression of the cell: render the table with a background gradient.
country_wise_count.style.background_gradient(cmap=cm)

In [None]:
# Graph showing the global confirmed cases and cases in China
%matplotlib inline
import matplotlib.pyplot as plt
sns.set(style="darkgrid")
fig, axs = plt.subplots(ncols=2, figsize=(12,4))
temp=country_wise_count.head(15)
temp.plot(x="Country_Region", y=["ConfirmedCases", "Fatalities"], kind="bar",ax=axs[0])
ax=axs[0].set_title('People infected globally').set_fontsize('15')
temp=train.loc[train['Country_Region'] == 'China']
temp=temp.loc[temp['Date'] == '2020-04-03']
temp=temp.sort_values(by='ConfirmedCases', ascending=False).head(20)
temp.plot(x="Province_State", y=["ConfirmedCases", "Fatalities"], kind="bar",ax=axs[1])
ax=axs[1].set_title('Province wise infection in China').set_fontsize('15')

In [None]:
# Timeline of the spread of the disease Globally and in China
count_columns = ["ConfirmedCases", "Fatalities"]
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

# One (rows, title) pair per panel; both panels are drawn the same way.
panels = [
    (train, 'Spread of the disease Globally'),
    (train.loc[train['Country_Region'] == 'China'], 'Spread of the disease in China'),
]

for axis, (frame, title) in zip(axs, panels):
    # Sum both counters per date. Several columns must be selected with a
    # list; a bare tuple of keys after groupby() is not accepted by current
    # pandas.
    datewise_count = frame.groupby('Date')[count_columns].sum().reset_index()
    datewise_count.plot(x="Date", y=count_columns, ax=axis)
    axis.set_title(title, fontsize=15)

# **Data Processing**

In [None]:
# Extracting the number of days from the date and replacing the Null values in the Province Column

# One origin for BOTH frames. The modelling cell fits on train['days'] and
# predicts on test['days'], so the two columns must count from the same date;
# with different origins the same calendar day would get different values.
DAY_ZERO = pd.Timestamp('2020-01-22')


def _add_days_and_fill_state(frame, column_names, drop_columns):
    """Rename columns, derive ``days`` since DAY_ZERO and fill missing states.

    Parameters
    ----------
    frame : pd.DataFrame
        Raw frame whose date / country / province columns are named by the
        keys of ``column_names``.
    column_names : dict
        Mapping from raw column names to the short names used downstream. It
        must produce the columns ``date``, ``country`` and ``state``.
    drop_columns : list of str
        Columns (named after the rename) to remove once ``days`` exists.

    Returns
    -------
    pd.DataFrame
        A new frame with an integer ``days`` column appended, missing ``state``
        values replaced by the row's ``country``, and ``drop_columns`` removed.
    """
    out = frame.rename(columns=column_names)
    out['date'] = pd.to_datetime(out['date'], format='%Y-%m-%d')
    # .dt.days reads the whole-day count from the timedelta itself. Slicing
    # the first two characters of its string form silently truncates any
    # offset of 100 days or more (e.g. "100 days" -> "10").
    out['days'] = (out['date'] - DAY_ZERO).dt.days
    # Plain assignment instead of fillna(inplace=True) through attribute
    # access, which is a chained in-place update of the parent frame.
    out['state'] = out['state'].fillna(out['country'])
    return out.drop(columns=drop_columns)


# Train data
train = _add_days_and_fill_state(
    train,
    {"Date": "date", "Country_Region": "country", "Province_State": 'state',
     "ConfirmedCases": "confirm", "Fatalities": "deaths"},
    drop_columns=['Id', 'date'],
)
print(train.head())

# Test data
test = _add_days_and_fill_state(
    test,
    {"Country_Region": "country", 'Province_State': 'state', 'Date': 'date'},
    drop_columns=['date'],
)
print(test.head())

In [None]:
# Label encoding the categorical columns
from sklearn.preprocessing import LabelEncoder

# The modelling cell pairs train and test rows by comparing their encoded
# 'country' / 'state' values, so a given label must receive the same integer
# in both frames. Each column therefore gets ONE encoder, fitted on the labels
# of both frames together, instead of an independent fit_transform per frame
# (which only agrees when both frames contain exactly the same label set).
for column in ('state', 'country'):
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = label_encoder.transform(train[column])
    test[column] = label_encoder.transform(test[column])

In [None]:
# Building the model and making prediction
from xgboost import XGBRegressor

# Number of boosting rounds used for every per-region model.
N_ESTIMATORS = 1000
RESULT_COLUMNS = ['ForecastId', 'ConfirmedCases', 'Fatalities']


def _fit_and_predict(features, target, future_features):
    """Fit a fresh XGBRegressor on (features, target) and predict future_features.

    Parameters
    ----------
    features : pd.DataFrame
        Training feature rows for one region.
    target : pd.Series
        Value to learn, aligned row-for-row with ``features``.
    future_features : pd.DataFrame
        Rows to predict for, with the same columns as ``features``.

    Returns
    -------
    The array returned by ``XGBRegressor.predict`` for ``future_features``.
    """
    model = XGBRegressor(n_estimators=N_ESTIMATORS)
    model.fit(features, target)
    return model.predict(future_features)


country_list = train.country.unique()

# Per-region result frames are collected here and concatenated once at the
# end; growing a DataFrame with pd.concat inside the loop re-copies all
# previous rows on every iteration.
forecasts = []

for i in country_list:
    states = train.loc[train.country == i, 'state'].unique()
    for j in states:
        country_wise = train.loc[(train.country == i) & (train.state == j)]
        country_wise_test = test.loc[(test.country == i) & (test.state == j)]

        if country_wise_test.empty:
            # No rows to forecast for this region: skip fitting two models
            # whose predictions would be discarded.
            continue

        X_country_wise = country_wise.drop(columns=['confirm', 'deaths'])
        # Non-inplace drop: country_wise_test is a slice of `test`, and an
        # in-place drop on a slice triggers SettingWithCopyWarning.
        X_country_wise_test = country_wise_test.drop(columns='ForecastId')

        # One model per target, both trained on the same features.
        y_confirm_pred = _fit_and_predict(
            X_country_wise, country_wise['confirm'], X_country_wise_test)
        y_death_pred = _fit_and_predict(
            X_country_wise, country_wise['deaths'], X_country_wise_test)

        forecasts.append(pd.DataFrame({
            'ForecastId': country_wise_test['ForecastId'],
            'ConfirmedCases': y_confirm_pred,
            'Fatalities': y_death_pred,
        }))

if forecasts:
    final_df = pd.concat(forecasts, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty result frame.
    final_df = pd.DataFrame(columns=RESULT_COLUMNS)

# Pin column dtypes explicitly so the written CSV does not depend on the
# dtype of the model output.
final_df = final_df.astype(
    {'ForecastId': 'int', 'ConfirmedCases': 'float64', 'Fatalities': 'float64'})
final_df

In [None]:
# Write the forecasts to the working directory, leaving the DataFrame index
# out of the file.
final_df.to_csv(path_or_buf='submission.csv', index=False)