# Project 3 - 140001742

I confirm that this is my own work, except where clearly indicated.

# Data Import

In [None]:
import pandas as pd
import numpy as np

Import all of the data sets that will be used to build the models.

In [None]:
# Load the competition training table; row 0 of the file supplies the column names.
data_main = pd.read_csv(
    "../input/corona/train (1).csv",
    header=0,
)

In [None]:
# First ensure the Province_State column is consistently filled: missing values
# become '' so the (Country_Region, Province_State) grouping key below is always defined.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection; under pandas Copy-on-Write that
# chained in-place call only changes a temporary and leaves data_main untouched.
data_main['Province_State'] = data_main['Province_State'].fillna('')
# Now change the date into datetime for ease of analysis.
data_main['Date'] = pd.to_datetime(data_main['Date'])
# Add a new column DOY stating which day of the year each row falls on; this is
# kept because we do not want any data after March 31st (the 91st day of the year).
data_main['DOY'] = data_main['Date'].dt.dayofyear
# The submission must be given in terms of cumulative cases and deaths, so each
# series is replaced by its running maximum within a location, which makes both
# columns non-decreasing per (Country_Region, Province_State) group.
data_main[['ConfirmedCases', 'Fatalities']] = (
    data_main
    .groupby(['Country_Region', 'Province_State'])[['ConfirmedCases', 'Fatalities']]
    .transform('cummax')
)

In [None]:
# Load the per-country / per-region information table; row 0 holds the column names.
data_info = pd.read_csv(
    "../input/corona/covid19countryinfo.csv",
    header=0,
)

In [None]:
# Replace missing region names with '' so they can be matched against the ''
# used for country-level rows of Province_State when the tables are merged.
# Assigning the result back (rather than fillna(inplace=True) on the column
# selection) keeps this working under pandas Copy-on-Write, where the chained
# in-place call would not modify data_info.
data_info['region'] = data_info['region'].fillna('')

In [None]:
# Load the lockdown table and parse its Date column so it can be joined
# against the datetime Date column of the main table.
data_lockdown = pd.read_csv(
    "../input/corona/countryLockdowndates.csv",
    header=0,
)
data_lockdown = data_lockdown.assign(Date=pd.to_datetime(data_lockdown['Date']))

I need to condense all of this information into one data table. 
First I will merge data_main with data_info: data_main is the data set from the Kaggle competition and so is necessary, whilst data_info contains a wide range of information about each country and region.

In [None]:
# Left join: every row of data_main is kept and the country/region attributes
# are attached where (Country_Region, Province_State) equals (country, region).
data_new1 = data_main.merge(
    data_info,
    how='left',
    left_on=['Country_Region', 'Province_State'],
    right_on=['country', 'region'],
)

To check that this merge was done correctly I will compare the dimensions of each of the datasets. If the merge was implemented correctly then the new dataset should have the same number of rows as data_main (20580) and the number of columns should be equal to data_main columns + data_info columns = 67.

Next I want to also include the Lockdown information from data_lockdown, so I will merge this with data_new1.

In [None]:
# Left join of the lockdown table onto the merged data. Date is part of the
# join key, so lockdown columns are only filled on rows whose Date equals a
# Date listed in data_lockdown for that country/province.
# In the result, the column Type denotes the type of lockdown.
data_new2 = data_new1.merge(
    data_lockdown,
    how='left',
    left_on=['Country_Region', 'Province_State', 'Date'],
    right_on=['Country/Region', 'Province', 'Date'],
)

Again the merge can be checked by simply viewing the dimensions of the new dataset: data_new2 contains the same number of rows as data_new1, but has 4 new columns: Country/Region, Province, Type and Reference.

# Preparing the Data

Now that I have merged the different datasets to create data_new2 I will clean this data so that it is ready for model creation.

There are a lot of superfluous columns in this dataset (due to the merging process) and so I will drop those columns to reduce the size of the dataset.

In [None]:
# Print the column names, dtypes and non-null counts of the merged frame so the
# superfluous columns can be identified.
data_new2.info()

Columns to drop:



*   region
*   country
*   alpha3code
*   alpha2code
*   active1
*   active2
*   active3
*   newcases1
*   newcases2
*   newcases3
*   newdeaths1
*   newdeaths2
*   newdeaths3
*   critical1
*   critical2
*   critical3
*   Country/Region
*   Province
*   Reference

In [None]:
# Columns that are not wanted after the merges: duplicated join keys, country
# codes, the numbered active/newcases/newdeaths/critical columns and the
# lockdown reference.
coldrop = [
    'region', 'country', 'alpha3code', 'alpha2code',
    'active1', 'active2', 'active3',
    'newcases1', 'newcases2', 'newcases3',
    'newdeaths1', 'newdeaths2', 'newdeaths3',
    'critical1', 'critical2', 'critical3',
    'Country/Region', 'Province', 'Reference',
]
data_new2 = data_new2.drop(columns=coldrop)

This has reduced the dataset to 52 columns, 49 of which are possible variables for the model.

# Data Cleaning

## NAs

Looking at the latest printout from data_new2.info() it is clear that there are many null cells in the dataset, otherwise the value in the non-null column would be 20580. I do not want NAs in the dataset so I will replace them with a 0 instead.



In [None]:
# Replace every missing value in the merged frame with 0.
# NOTE(review): this also puts 0 into the date-like columns that are parsed
# with pd.to_datetime afterwards - confirm that is the intended placeholder.
data_new2 = data_new2.fillna(0)

## DateTime

First I want to transform all of the variables that describe a date to the datetime format as this will ease the modelling process. I have already changed the Date column but there are others from the datasets which were merged.

Those variables are:


*   quarantine
*   schools
*   publicplace
*   gathering
*   nonessential
*   firstcase

In [None]:
# Parse each date-like column to datetime and then replace it by its day of
# the year, for ease of modelling. The columns are independent of one
# another, so each is converted in a single pass.
for date_col in ('quarantine', 'schools', 'publicplace', 'gathering', 'nonessential', 'firstcase'):
    data_new2[date_col] = pd.to_datetime(data_new2[date_col])
    data_new2[date_col] = data_new2[date_col].dt.dayofyear


## Creating Location Variable

As the location is a combination of two columns, 'Country_Region' and 'Province_State', I will combine these two to create a single variable called 'Location'.

In [None]:
# Build a single "Country:Province" key for each row.
data_new2['Location'] = (
    data_new2['Country_Region'].astype(str)
    + ":"
    + data_new2['Province_State'].astype(str)
)

# The two source columns are no longer needed once Location exists.
data_new2.drop(['Country_Region', 'Province_State'], axis=1, inplace=True)

## Converting objects to float64

In [None]:
# Columns to force to a numeric dtype. Any value that cannot be parsed as a
# number becomes NaN (errors='coerce').
# NOTE(review): 'Location' holds "Country:Province" strings, so every value in
# it is coerced to NaN here; 'Type' presumably holds text as well - confirm
# whether these two columns should be encoded instead of coerced.
objcols = [
    'pop', 'tests', 'testpop', 'density', 'medianage', 'urbanpop',
    'gatheringlimit', 'hospibed', 'smokers',
    'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio',
    'lung', 'femalelung', 'malelung',
    'gdp2019', 'healthexp', 'healthperpop', 'fertility',
    'avgtemp', 'avghumidity',
    'totalcases', 'active30', 'active31', 'deaths',
    'newdeaths30', 'newdeaths31', 'recovered',
    'critical30', 'critical31', 'casediv1m', 'deathdiv1m',
    'Type', 'Location', 'newcases30', 'newcases31',
]
for column in objcols:
    data_new2[column] = pd.to_numeric(data_new2[column], errors='coerce')


## NAs again

In [None]:
# The numeric coercion can introduce new NaN values; replace them with 0 as well.
data_new2 = data_new2.fillna(0)

# Exploring the Variables

For the models we are making we have two target variables: 'ConfirmedCases' and 'Fatalities'.
Which variables will be used as features for the model will now have to be decided.
I will create a correlation matrix to see if any of the variables stand out as being related to either of the target variables.

In [None]:
# Pairwise correlations between the numeric variables. The numeric (and
# boolean) columns are selected explicitly because data_new2 still contains
# the datetime 'Date' column: pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises instead. Selecting first
# gives the same matrix on older and newer pandas versions.
corr_matrix = data_new2.select_dtypes(include=['number', 'bool']).corr()
# Correlation of every variable with ConfirmedCases, strongest positive first.
corr_matrix['ConfirmedCases'].sort_values(ascending=False)

In [None]:
# Correlation of every variable with Fatalities, strongest positive first.
corr_matrix.loc[:, 'Fatalities'].sort_values(ascending=False)

In [None]:
# Save the correlation matrix to disk for inspection outside the notebook.
# NOTE(review): index=False omits the row labels, so the saved file only has
# the variable names as its header row - confirm this is intended.
corr_matrix.to_csv('corrr.csv', index=False)

Looking at these correlation matrices, the variables I will begin my modelling process with are:

ConfirmedCases:

*   newdeaths31
*   newdeaths30
*   tests
*   quarantine
*   firstcase
*   deathdiv1m



Fatalities:

*   newdeaths31
*   newdeaths30
*   tests
*   deathdiv1m
*   quarantine
*   nonessential


Both of these will also contain 'DOY', 'Id' and 'Location'.

For my modelling process I will tackle each of the target variables individually.

In [None]:
# Feature and target column names for the two prediction problems.
# NOTE(review): 'Id' is included as a predictor - confirm that a row
# identifier is meant to be used as a model input.
CCfeatures = [
    'Location', 'DOY', 'Id',
    'newdeaths31', 'newdeaths30', 'tests',
    'quarantine', 'firstcase', 'deathdiv1m',
]
CCtarget = ['ConfirmedCases']
Ffeatures = [
    'Location', 'DOY', 'Id',
    'newdeaths31', 'newdeaths30', 'tests',
    'deathdiv1m', 'quarantine', 'nonessential',
]
Ftarget = ['Fatalities']

# Targets are kept as single-column DataFrames (list selection), features as
# multi-column DataFrames.
xCC = data_new2[CCfeatures]
yCC = data_new2[CCtarget]
xF = data_new2[Ffeatures]
yF = data_new2[Ftarget]

## Splitting into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Both problems are split with the
# same test fraction and the same seed.
split_options = {'test_size': 0.2, 'random_state': 140001742}
xCC_train, xCC_test, yCC_train, yCC_test = train_test_split(xCC, yCC, **split_options)
xF_train, xF_test, yF_train, yF_test = train_test_split(xF, yF, **split_options)

# Training Models

## ConfirmedCases

### Model 1 - Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Model 1 for ConfirmedCases: linear regression fitted on the training split.
lin_reg = LinearRegression()
lin_reg.fit(xCC_train, yCC_train)

# Now to check the validity of the model on the held-out split.
lin_predictions = lin_reg.predict(xCC_test)

# mean_squared_error is declared as (y_true, y_pred): pass the observed values
# first, as the other model cells do.
lin_mse = mean_squared_error(yCC_test, lin_predictions)
lin_rmse = np.sqrt(lin_mse)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % lin_mse, end="\n")
print("RMSE: %d" % lin_rmse)

### Model 2 - Random Forest Regressor Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Model 2 for ConfirmedCases: random forest regressor. The forest is seeded
# with the same value used for the train/validation split and the random
# forest classifier so that the reported RMSE is reproducible between runs.
rfr_reg = RandomForestRegressor(random_state=140001742)
rfr_reg.fit(xCC_train, yCC_train)

# Check validity on the held-out split.
rfr_predictions = rfr_reg.predict(xCC_test)

rfr_mse = mean_squared_error(yCC_test, rfr_predictions)
rfr_rmse = np.sqrt(rfr_mse)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % rfr_mse, end="\n")
print("RMSE: %d" % rfr_rmse)

### Model 3 - Decision Tree Regressor Model

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Model 3 for ConfirmedCases: decision tree regressor. Seeded with the same
# value used for the train/validation split so that the fitted tree, its
# RMSE and the predictions made from it are reproducible between runs.
tree_reg = DecisionTreeRegressor(random_state=140001742)
tree_reg.fit(xCC_train, yCC_train)

# Now to check the validity of the model on the held-out split.
dtr_predictions = tree_reg.predict(xCC_test)

dtr_mse = mean_squared_error(yCC_test, dtr_predictions)
dtr_rmse = np.sqrt(dtr_mse)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % dtr_mse, end="\n")
print("RMSE: %d" % dtr_rmse)

### Model 4 - Random Forest Classifier

In [None]:
import math as math

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Model 4 for ConfirmedCases: a seeded random forest classifier; fit() returns
# the estimator itself, so construction and fitting are chained.
ranforclas = RandomForestClassifier(random_state=140001742).fit(xCC_train, yCC_train)

# Score the predictions for the held-out split against the observed values.
ranforclas_predictions = ranforclas.predict(xCC_test)
rfc_mse = mean_squared_error(yCC_test, ranforclas_predictions)
rfc_rmse = math.sqrt(rfc_mse)

# %d prints the scores truncated to whole numbers.
print("MSE: %d" % rfc_mse)
print("RMSE: %d" % rfc_rmse)

Out of these four models the third (Decision Tree Regressor Model) has the lowest RMSE score and so provides the best predictions from the four models available. This will be the model used to predict the ConfirmedCases variable for the test data.

## Fatalities 

### Model 1 - Linear Regression

In [None]:
# Model 1 for Fatalities: linear regression fitted on the training split.
lin_reg1 = LinearRegression()
lin_reg1.fit(xF_train, yF_train)

# Check validity on the held-out split.
lin_predictions1 = lin_reg1.predict(xF_test)

# mean_squared_error is declared as (y_true, y_pred): pass the observed values
# first, as the other model cells do.
lin_mse1 = mean_squared_error(yF_test, lin_predictions1)
lin_rmse1 = np.sqrt(lin_mse1)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % lin_mse1, end="\n")
print("RMSE: %d" % lin_rmse1)

### Model 2 - Random Forest Regressor Model

In [None]:
# Model 2 for Fatalities: random forest regressor. Seeded with the same value
# used for the train/validation split and the random forest classifier so
# that its RMSE and the predictions made from it are reproducible between runs.
rfr_reg1 = RandomForestRegressor(random_state=140001742)
rfr_reg1.fit(xF_train, yF_train)

# Check validity on the held-out split.
rfr_predictions1 = rfr_reg1.predict(xF_test)

rfr_mse1 = mean_squared_error(yF_test, rfr_predictions1)
rfr_rmse1 = np.sqrt(rfr_mse1)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % rfr_mse1, end="\n")
print("RMSE: %d" % rfr_rmse1)

### Model 3 - Decision Tree Regressor Model

In [None]:
# Model 3 for Fatalities: decision tree regressor, seeded with the same value
# used for the train/validation split so the reported RMSE is reproducible.
tree_reg1 = DecisionTreeRegressor(random_state=140001742)
tree_reg1.fit(xF_train, yF_train)

# Check validity on the held-out split.
dtr_predictions1 = tree_reg1.predict(xF_test)

dtr_mse1 = mean_squared_error(yF_test, dtr_predictions1)
dtr_rmse1 = np.sqrt(dtr_mse1)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % dtr_mse1, end="\n")
print("RMSE: %d" % dtr_rmse1)

### Model 4 - Random Forest Classifier

In [None]:
# Model 4 for Fatalities: seeded random forest classifier.
ranforclas1 = RandomForestClassifier(random_state=140001742)
ranforclas1.fit(xF_train, yF_train)

# Check validity on the held-out split.
ranforclas_predictions1 = ranforclas1.predict(xF_test)

rfc_mse1 = mean_squared_error(yF_test, ranforclas_predictions1)
# The RMSE must be taken from this model's own MSE (rfc_mse1). Using the
# random forest regressor's rfr_mse1 here would report model 2's score under
# model 4's name.
rfc_rmse1 = math.sqrt(rfc_mse1)
# %d prints the scores truncated to whole numbers.
print("MSE: %d" % rfc_mse1, end="\n")
print("RMSE: %d" % rfc_rmse1)

Of these four models, two (models 2 and 4) are reported with an RMSE of 25, the lowest of the four, which suggests that they provide a similar standard of predictions. (Before relying on this tie, check that the model 4 RMSE is computed from its own MSE, `rfc_mse1`.) The model chosen for predicting Fatalities is model 2, the Random Forest Regressor, because it ran substantially faster than model 4; as the two provide similar predictions, it is more efficient to select the faster model.

# Test Data

## Prepare the test Data

In [None]:
# Load the competition test table; row 0 of the file supplies the column names.
data_test = pd.read_csv(
    "../input/corona/test (1).csv",
    header=0,
)

In [None]:
# Same preparation as for the training table.
# Missing Province_State values become '' so the merge keys are always
# defined. The result is assigned back instead of calling
# fillna(inplace=True) on the column selection, which under pandas
# Copy-on-Write only changes a temporary and leaves data_test untouched.
data_test['Province_State'] = data_test['Province_State'].fillna('')
# Parse Date to datetime and derive the day of the year from it.
data_test['Date'] = pd.to_datetime(data_test['Date'])
data_test['DOY'] = data_test['Date'].dt.dayofyear


Merge with the other data sets.

In [None]:
# Left join: every test row is kept and the country/region attributes are
# attached where (Country_Region, Province_State) equals (country, region).
data_test1 = data_test.merge(
    data_info,
    how='left',
    left_on=['Country_Region', 'Province_State'],
    right_on=['country', 'region'],
)


In [None]:
# Make sure Date is a datetime column before it is used as a join key, then
# print the column names, dtypes and non-null counts of the merged test frame.
data_test1 = data_test1.assign(Date=pd.to_datetime(data_test1['Date']))
data_test1.info()

In [None]:
# Make sure the lockdown Date column is datetime, then left-join the lockdown
# table onto the test data. Date is part of the join key, so lockdown columns
# are only filled on rows whose Date equals a Date listed in data_lockdown for
# that country/province.
data_lockdown = data_lockdown.assign(Date=pd.to_datetime(data_lockdown['Date']))
data_test2 = data_test1.merge(
    data_lockdown,
    how='left',
    left_on=['Country_Region', 'Province_State', 'Date'],
    right_on=['Country/Region', 'Province', 'Date'],
)

In [None]:
# Replace every missing value in the merged test frame with 0.
data_test2 = data_test2.fillna(0)

In [None]:
# Parse the three date-like columns to datetime and replace each by its day
# of the year. The columns are independent of one another, so each is
# converted in a single pass.
for date_col in ('quarantine', 'nonessential', 'firstcase'):
    data_test2[date_col] = pd.to_datetime(data_test2[date_col])
    data_test2[date_col] = data_test2[date_col].dt.dayofyear

In [None]:
# Build a single "Country:Province" key for each test row.
data_test2['Location'] = (
    data_test2['Country_Region'].astype(str)
    + ":"
    + data_test2['Province_State'].astype(str)
)

In [None]:
# Columns to force to a numeric dtype. Any value that cannot be parsed as a
# number becomes NaN (errors='coerce').
objcols = [
    'pop', 'tests', 'testpop', 'density', 'medianage', 'urbanpop',
    'gatheringlimit', 'hospibed', 'smokers',
    'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio',
    'lung', 'femalelung', 'malelung',
    'gdp2019', 'healthexp', 'healthperpop', 'fertility',
    'avgtemp', 'avghumidity',
    'totalcases', 'active30', 'active31', 'deaths',
    'newdeaths30', 'newdeaths31', 'recovered',
    'critical30', 'critical31', 'casediv1m', 'deathdiv1m',
    'Type', 'Location', 'newcases30', 'newcases31',
]
for column in objcols:
    # Convert the TEST frame's own values. Reading from data_new2 (the
    # training frame) here would overwrite the test features with training
    # rows aligned by index position.
    data_test2[column] = pd.to_numeric(data_test2[column], errors='coerce')

In [None]:
# The numeric coercion can introduce new NaN values; replace them with 0 as well.
data_test2 = data_test2.fillna(0)

In [None]:
# Feature lists for the test table, which carries 'ForecastId' where the
# training table carries 'Id'.
CCfeatures = ['Location', 'DOY', 'ForecastId', 'newdeaths31', 'newdeaths30', 'tests', 'quarantine', 'firstcase', 'deathdiv1m']
Ffeatures = ['Location', 'DOY', 'ForecastId', 'newdeaths31', 'newdeaths30', 'tests', 'deathdiv1m', 'quarantine', 'nonessential']
# The models were fitted on frames whose identifier column is named 'Id'.
# scikit-learn versions that validate feature names compare the column names
# seen at fit time with those passed to predict(), so the identifier column is
# renamed to 'Id' here; column order and values are unchanged.
XCC = data_test2.loc[:, CCfeatures].rename(columns={'ForecastId': 'Id'})
XF = data_test2.loc[:, Ffeatures].rename(columns={'ForecastId': 'Id'})

In [None]:
# Predict the test rows with the models selected above: the decision tree
# regressor for ConfirmedCases and the random forest regressor for Fatalities.
final_predictions_CC = tree_reg.predict(XCC)
final_predictions_F = rfr_reg1.predict(XF)

In [None]:
# Assemble the submission table: one row per ForecastId with the two predicted
# columns added in submission order, then show it.
My_Preds = data_test[['ForecastId']].copy()
My_Preds = My_Preds.assign(
    ConfirmedCases=final_predictions_CC,
    Fatalities=final_predictions_F,
)

print(My_Preds)

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so only ForecastId, ConfirmedCases and Fatalities are written.
My_Preds.to_csv('submission.csv', index=False)