In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Flatten the directory walk into a single stream of full paths, then print each one.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Background**

The White House Office of Science and Technology Policy (OSTP) pulled together a coalition of research groups and companies (including Kaggle) to prepare the COVID-19 Open Research Dataset (CORD-19) in an attempt to address key open scientific questions on COVID-19. Those questions are drawn from the National Academies of Sciences, Engineering, and Medicine (NASEM) and the World Health Organization (WHO).

# The Challenge

Kaggle is launching companion COVID-19 forecasting challenges to help answer a subset of the NASEM/WHO questions. While the challenge involves forecasting confirmed cases and fatalities between April 15 and May 14 by region, the primary goal isn't only to produce accurate forecasts. It’s also to identify factors that appear to impact the transmission rate of COVID-19.

You are encouraged to pull in, curate and share data sources that might be helpful. If you find variables that look like they impact the transmission rate, please share your finding in a notebook.

As the data becomes available, we will update the leaderboard with live results based on data made available from the Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE).

We have received support and guidance from health and policy organizations in launching these challenges. We're hopeful the Kaggle community can make valuable contributions to developing a better understanding of factors that impact the transmission of COVID-19.

# Objective

In this challenge, you will be predicting the cumulative number of confirmed COVID19 cases in various locations across the world, as well as the number of resulting fatalities, for future dates.

In [None]:
# Load the week-4 competition splits; 'Date' is parsed to datetime while reading.
train = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-4/train.csv',
    parse_dates=['Date'],
)
test = pd.read_csv(
    '/kaggle/input/covid19-global-forecasting-week-4/test.csv',
    parse_dates=['Date'],
)
# `g` is a second reference to the same DataFrame object (no copy is made).
g = train

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Preview the last rows of the test frame.
test.tail()

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Summarise the training frame with DataFrame.info().
train.info()

In [None]:
# Same summary for the test frame.
test.info()

In [None]:
# Per column: does the training frame contain at least one missing value?
train.isnull().any()

In [None]:
# Per column: does the test frame contain at least one missing value?
test.isnull().any()

In [None]:
# Visualise where values are missing in the training frame (missingno matrix plot).
import missingno as msno
msno.matrix(train)

In [None]:
# Same missing-value matrix for the test frame.
msno.matrix(test)

In [None]:
# Count how many 'Province_State' entries are null (True) versus present (False).
train['Province_State'].isnull().value_counts()

In [None]:
# Drop the columns that are not used downstream.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: executing it a second time no longer raises a
# KeyError for columns that were already removed. `columns=` alone selects
# the axis, so the redundant axis=1 argument is omitted.
train = train.drop(columns=['Province_State', 'Id'], errors='ignore')
test = test.drop(columns=['Province_State', 'ForecastId'], errors='ignore')

In [None]:
# Confirm the training frame's columns after the drop.
train.head()

In [None]:
# Confirm the test frame's columns after the drop.
test.head()

The **'Province_State'** column was dropped from the training dataset because more than 50% of its values are null. Some countries, however, do have values in **'Province_State'** and therefore appear in several rows per date. To include those countries, we group by date and country and sum the counts, combining the repeated rows into a single row per country for better day-to-day analysis.

In [None]:
# Per-country daily totals used for the plots: one row per (Date, Country_Region),
# with both counters summed over that country's rows.
train_data_by_country = (
    train
    .groupby(['Date', 'Country_Region'], as_index=False)[['ConfirmedCases', 'Fatalities']]
    .sum()
)

In [None]:
# Display the aggregated per-country frame.
train_data_by_country

Grouping the data this way lets us display the cases and fatalities of every country worldwide on a daily basis.


In [None]:
# Re-draw the missing-value matrix for the training frame after the column drop.
msno.matrix(train)

In [None]:
# Re-draw the missing-value matrix for the test frame after the column drop.
msno.matrix(test)

In [None]:
# One line chart of confirmed cases over time for each selected country.
import plotly.express as px
import plotly

plotly.offline.init_notebook_mode(connected=True)

topc = [
    "US", "China", "Spain", 'France', 'United Kingdom', 'Italy',
    'Brazil', 'Belgium', 'Germany', 'Iran', 'Canada',
]

for country in topc:
    # Keep only the rows that belong to the current country.
    is_country = train_data_by_country['Country_Region'] == country
    country_df = train_data_by_country.loc[is_country]
    chart_title = " ".join(('Daily Analysis of Confirmed Cases in', country))
    fig = px.line(country_df, x="Date", y="ConfirmedCases", title=chart_title)
    fig.show()

While most of the countries show an exponential trend, **China** shows a constant trend of confirmed cases from around the beginning of March.

In [None]:
# One line chart of fatalities over time for each country in `topc`.
# color_discrete_map is only consulted when a `color` column is passed, so the
# original {'Fatalities': 'Red'} mapping never coloured the line.
# color_discrete_sequence sets the colour of the single trace directly.
for i in topc:
    df = train_data_by_country[train_data_by_country['Country_Region'] == i]
    fig = px.line(
        df,
        x='Date',
        y='Fatalities',
        title='Daily Analysis of Fatalities for' + " " + i,
        color_discrete_sequence=['red'],
    )
    fig.show()

While most of the countries show an exponential trend in fatalities, **China** shows a constant level of fatalities from around mid-March to mid-April; then, after a sharp increase, it becomes constant again at the beginning of May.

Similarly, we can calculate the trend of confirmed cases and fatalities for any country.

In [None]:
# Move the parsed 'Date' column into the row index of the training frame.
train = train.set_index('Date')

In [None]:
#visualizing Confirmed Cases per month around the world
import matplotlib.pyplot as plt
import seaborn as sns
# Switch on seaborn's default styling for the matplotlib figures that follow.
sns.set()
# Title and y-label are set through pyplot before the plot call below.
plt.title('Confirmed Cases on Monthly Basis around the world')
plt.ylabel('Confirmed Cases')
# resample('M') buckets the Date-indexed series by month; .plot is called on
# the resampler itself, with no aggregation applied first.
# NOTE(review): `train` is not aggregated across regions here, so the values
# drawn look like per-row figures rather than worldwide totals — confirm intent.
train.ConfirmedCases.resample('M').plot(figsize=(10,10))

* During January, the cases grow at a slow rate, mainly in countries around **China**. Cases are below 10000.

* During February, there is a gradual increase in the number of Confirmed Cases, reaching more than 50000 by the end of the month. The count stays roughly constant from the middle of the month until its end.

* During March, the number of Confirmed Cases stays at the previous month's level (constant around 60000) until mid-March, but by the end of the month it rises from 60000 to more than 100000.

* During April and May, the cases grow exponentially up to the latest date, reaching more than 300000.



In [None]:
#visualising fatalities per month around the world
# Title and y-label are set through pyplot before the plot call below.
plt.title('Fatalities on Monthly Basis around the world')
plt.ylabel('Fatalities')
# resample('M') buckets the Date-indexed series by month; .plot is called on
# the resampler itself, with no aggregation applied first.
# NOTE(review): `train` is not aggregated across regions here, so the values
# drawn look like per-row figures rather than worldwide totals — confirm intent.
train.Fatalities.resample('M').plot(figsize=(10,10))

* During January, fatalities start to grow at a very slow rate, ranging around 100-300.

* During February, there is a gradual increase in the number of fatalities, reaching around 2500 by the end of the month.

* During March, fatalities stay roughly constant until mid-month (around 2800-3000) but show a sharp increase at the end of the month (reaching more than 10000).

* During April and May, fatalities grow exponentially, making it a global calamity, ranging around 10000-35000 up to the latest date.


In [None]:
# Plot the numeric columns of `train` against its date index on a new figure.
# ConfirmedCases and Fatalities are the quantities predicted later in the notebook.
ax = train.plot(figsize=(10, 10))
ax.set_title('Ratio of Confirmed Cases to Fatalities around the world for every month')

Fatalities occurred at a very slow pace from January to mid-March compared with Confirmed Cases, but from mid-March onward the number of fatalities has been increasing gradually.

In [None]:
# Country name of every row of the aggregated frame, in row order.
s = list(train_data_by_country['Country_Region'])

The dataset has no ISO3 or ISO2 country codes, which are needed to plot countries on a choropleth. Instead of pulling them in from another dataset, we can use the **'country_converter'** library to look up the relevant details for any country.

More details at https://pypi.org/project/country-converter/

In [None]:
pip install country_converter

In [None]:
#generating iso3 names of the countries
import country_converter as cc

# `s` repeats every country once per date, so converting the full list would
# look up the same names over and over. Convert each distinct name once and map
# the result back, so `iso_alpha` still lines up with `s` element for element.
unique_names = list(dict.fromkeys(s))  # distinct names, first-seen order
unique_codes = cc.convert(names=unique_names, to='ISO3')
if not isinstance(unique_codes, list):
    # convert() hands back a bare value instead of a list for a single name.
    unique_codes = [unique_codes]
iso_by_name = dict(zip(unique_names, unique_codes))
iso_alpha = [iso_by_name[name] for name in s]

In [None]:
# Attach the ISO3 codes as an 'iso_codes' column of the per-country frame.
train_data_by_country = train_data_by_country.assign(iso_codes=iso_alpha)

In [None]:
# Display the per-country frame again, now including 'iso_codes'.
train_data_by_country

In [None]:
# Animated world map of confirmed cases, one frame per date.
# Dates are cast to strings before being used as animation frame labels.
train_date = train_data_by_country['Date'].astype(str)
confirmed_map_kwargs = dict(
    locations='iso_codes',
    color='ConfirmedCases',
    hover_name='Country_Region',
    hover_data=['ConfirmedCases'],
    animation_frame=train_date,
    color_continuous_scale=px.colors.sequential.Purpor,
    title='Confirmed Cases around the world on daily basis',
)
fig_1 = px.choropleth(train_data_by_country, **confirmed_map_kwargs)
fig_1.show()

In [None]:
# Animated world map of fatalities, one frame per date.
# Dates are cast to strings before being used as animation frame labels.
train_date = train_data_by_country['Date'].astype(str)
fatalities_map_kwargs = dict(
    locations='iso_codes',
    color='Fatalities',
    hover_name='Country_Region',
    hover_data=['Fatalities'],
    animation_frame=train_date,
    color_continuous_scale=px.colors.sequential.PuRd,
    title='Fatalities around the world on daily basis',
)
fig_2 = px.choropleth(train_data_by_country, **fatalities_map_kwargs)
fig_2.show()

In [None]:
# Copy the index values into a regular 'Date' column.
train = train.assign(Date=train.index)

In [None]:
# Replace the current index with the default 0..n-1 integer index.
# reset_index(drop=True) builds that index directly, so this cell no longer
# depends on the `g` alias captured many cells earlier; drop=True discards the
# old index values, exactly as overwriting the index did.
train = train.reset_index(drop=True)

In [None]:
# Derive calendar features (day, month, year) from the 'Date' column of train.
train = train.assign(
    Day=train['Date'].dt.day,
    Month=train['Date'].dt.month,
    Year=train['Date'].dt.year,
)

In [None]:
# Remove the 'Year' feature from train.
train = train.drop(columns='Year')

In [None]:
# Derive the same calendar features (day, month, year) from the 'Date' column of test.
test = test.assign(
    Day=test['Date'].dt.day,
    Month=test['Date'].dt.month,
    Year=test['Date'].dt.year,
)

In [None]:
# Remove the 'Year' feature from test.
test = test.drop(columns=['Year'])

In [None]:
# Preview test with the new Day/Month columns.
test.head()

In [None]:
# Preview train with the new Day/Month columns.
train.head()

In [None]:
# The raw 'Date' column is removed from both frames.
train = train.drop(columns=['Date'])
test = test.drop(columns=['Date'])

In [None]:
# Cast both target columns to 64-bit integers.
# astype performs the conversion in one vectorised step instead of calling
# Python's int() on every element; like int(), it truncates toward zero.
train['ConfirmedCases'] = train['ConfirmedCases'].astype('int64')
train['Fatalities'] = train['Fatalities'].astype('int64')

In [None]:
# Pull the two target columns out of `train`; pop() returns the column and
# removes it from the frame in a single step.
cases = train.pop('ConfirmedCases')
fatalities = train.pop('Fatalities')

In [None]:
# Encode country names as integer labels; the encoder is fitted on train only
# and then applied to both frames.
from sklearn.preprocessing import LabelEncoder,StandardScaler
lb = LabelEncoder().fit(train['Country_Region'])
train['Country_Region'] = lb.transform(train['Country_Region'])
test['Country_Region'] = lb.transform(test['Country_Region'])

In [None]:
# Standardise the features: statistics are learned from train, then the same
# transformation is applied to train and test.
scaler = StandardScaler().fit(train.values)
x_train, x_test = (scaler.transform(frame.values) for frame in (train, test))

In [None]:
#importing the regressors that are combined in the voting ensemble
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor

In [None]:
# Base regressors for the ensemble, paired with the labels passed to VotingRegressor.
lr = LinearRegression()
dt = DecisionTreeRegressor(random_state=0)
lg = LGBMRegressor()
classifier = [
    ('Linear Regression', lr),
    ('DecisionTreeRegressor', dt),
    ('GradientBoosting', lg),
]

In [None]:
# Fit the voting ensemble with confirmed cases as the target, then display it.
vc = VotingRegressor(estimators=classifier).fit(x_train, cases)
vc

In [None]:
# Predict confirmed cases for the scaled test features with the fitted ensemble,
# then display the raw (unrounded) predictions.
cases_pred = vc.predict(x_test)
cases_pred

In [None]:
# Round the predictions to whole counts.
# The regressor's output is unconstrained, so clip at zero first: a cumulative
# case count can never be negative.
cases_pred = np.around(np.clip(cases_pred, 0, None), decimals=0)
cases_pred

In [None]:
# Build a fresh voting ensemble and fit it with fatalities as the target, then display it.
vc = VotingRegressor(estimators=classifier).fit(x_train, fatalities)
vc

In [None]:
# Predict fatalities for the scaled test features with the fitted ensemble,
# then display the raw (unrounded) predictions.
fatalities_pred = vc.predict(x_test)
fatalities_pred

In [None]:
# Round the predictions to whole counts.
# The regressor's output is unconstrained, so clip at zero first: a cumulative
# fatality count can never be negative.
fatalities_pred = np.around(np.clip(fatalities_pred, 0, None), decimals=0)
fatalities_pred

In [None]:
# Start from the sample submission file and fill both target columns with the predictions.
submission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-4/submission.csv').assign(
    ConfirmedCases=cases_pred,
    Fatalities=fatalities_pred,
)

In [None]:
# Preview the first ten rows of the submission frame.
submission.head(10)

In [None]:
# Write the submission to the current directory, without the row index.
submission.to_csv("submission.csv" , index = False)

**If you like this notebook do upvote it.**

Do provide your valuable feedback.

Do checkout my other notebooks at https://www.kaggle.com/tmchls