In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, f1_score, mean_absolute_error, mean_squared_error

In [2]:
df = pd.read_csv("Data/covid19-global-forecasting-week-4/train.csv", parse_dates=['Date'])
df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0


In [3]:
df.isna().sum()

Id                    0
Province_State    16560
Country_Region        0
Date                  0
ConfirmedCases        0
Fatalities            0
dtype: int64

In [4]:
len(df.ConfirmedCases)

28796

In [5]:
df.Province_State.fillna('missing', inplace=True)

In [6]:
df.isna().sum()

Id                0
Province_State    0
Country_Region    0
Date              0
ConfirmedCases    0
Fatalities        0
dtype: int64

In [7]:
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        print(label)

Province_State
Country_Region


In [8]:
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype('category').cat.as_ordered()

In [9]:
df['diseaseYear'] = df.Date.dt.year
df['diseaseMonth'] = df.Date.dt.month
df['diseaseDate'] = df.Date.dt.day

In [10]:
df['diseaseDayofWeek'] = df.Date.dt.dayofweek
df['diseaseDayofYear'] = df.Date.dt.dayofyear

In [12]:
df.drop('Date', axis=1, inplace=True)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28796 entries, 0 to 28795
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Id                28796 non-null  int64   
 1   Province_State    28796 non-null  category
 2   Country_Region    28796 non-null  category
 3   ConfirmedCases    28796 non-null  float64 
 4   Fatalities        28796 non-null  float64 
 5   diseaseYear       28796 non-null  int64   
 6   diseaseMonth      28796 non-null  int64   
 7   diseaseDate       28796 non-null  int64   
 8   diseaseDayofWeek  28796 non-null  int64   
 9   diseaseDayofYear  28796 non-null  int64   
dtypes: category(2), float64(2), int64(6)
memory usage: 1.9 MB


In [14]:
for label, content in df.items():
    if pd.api.types.is_categorical_dtype(content):
        df[label] = pd.Categorical(content).codes+1

In [15]:
df.isnull().sum()

Id                  0
Province_State      0
Country_Region      0
ConfirmedCases      0
Fatalities          0
diseaseYear         0
diseaseMonth        0
diseaseDate         0
diseaseDayofWeek    0
diseaseDayofYear    0
dtype: int64

In [16]:
df.Fatalities.value_counts()

0.0        19186
1.0         1831
2.0          994
3.0          740
6.0          555
           ...  
10023.0        1
6803.0         1
21067.0        1
1772.0         1
1100.0         1
Name: Fatalities, Length: 998, dtype: int64

In [17]:
X = df.drop(['Fatalities', 'ConfirmedCases'], axis=1 )
X.shape

(28796, 8)

In [18]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28796 entries, 0 to 28795
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Id                28796 non-null  int64
 1   Province_State    28796 non-null  int16
 2   Country_Region    28796 non-null  int16
 3   diseaseYear       28796 non-null  int64
 4   diseaseMonth      28796 non-null  int64
 5   diseaseDate       28796 non-null  int64
 6   diseaseDayofWeek  28796 non-null  int64
 7   diseaseDayofYear  28796 non-null  int64
dtypes: int16(2), int64(6)
memory usage: 1.4 MB


In [19]:
y= df[['Fatalities', 'ConfirmedCases']]

In [20]:
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28796 entries, 0 to 28795
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Fatalities      28796 non-null  float64
 1   ConfirmedCases  28796 non-null  float64
dtypes: float64(2)
memory usage: 450.1 KB


In [21]:
y.shape

(28796, 2)

In [22]:
2

2

In [24]:
y['Fatalities'] = y['Fatalities'].astype(int)
y['ConfirmedCases'] = y['ConfirmedCases'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['Fatalities'] = y['Fatalities'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['ConfirmedCases'] = y['ConfirmedCases'].astype(int)


In [25]:
for label, content in y.items():
    if pd.isnull(content).sum():
        y[label] = content.fillna(content.median())

In [26]:
from sklearn.tree import DecisionTreeRegressor
model_rfr = RandomForestRegressor()
model_dtr = DecisionTreeRegressor()

In [27]:
model_dtr.fit(X,y)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [61]:
model_rfr.fit(X,y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [47]:
df_test = pd.read_csv("Data/covid19-global-forecasting-week-4/test.csv", parse_dates=['Date'])
df_test.head()

Unnamed: 0,ForecastId,Province_State,Country_Region,Date
0,1,,Afghanistan,2020-04-02
1,2,,Afghanistan,2020-04-03
2,3,,Afghanistan,2020-04-04
3,4,,Afghanistan,2020-04-05
4,5,,Afghanistan,2020-04-06


In [48]:
df_test['Province_State'].fillna('missing', inplace=True)

In [49]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13459 entries, 0 to 13458
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ForecastId      13459 non-null  int64         
 1   Province_State  13459 non-null  object        
 2   Country_Region  13459 non-null  object        
 3   Date            13459 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 420.7+ KB


In [50]:
df_test['diseaseYear'] = df_test.Date.dt.year
df_test['diseaseMonth'] = df_test.Date.dt.month
df_test['diseaseDate'] = df_test.Date.dt.day
df_test['diseaseDayofWeek'] = df_test.Date.dt.dayofweek
df_test['diseaseDayofYear'] = df_test.Date.dt.dayofyear

In [51]:
df_test.drop('Date', axis=1, inplace=True)

In [52]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13459 entries, 0 to 13458
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ForecastId        13459 non-null  int64 
 1   Province_State    13459 non-null  object
 2   Country_Region    13459 non-null  object
 3   diseaseYear       13459 non-null  int64 
 4   diseaseMonth      13459 non-null  int64 
 5   diseaseDate       13459 non-null  int64 
 6   diseaseDayofWeek  13459 non-null  int64 
 7   diseaseDayofYear  13459 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 841.3+ KB


In [53]:
for label, content in df_test.items():
    if pd.api.types.is_string_dtype(content):
        df_test[label] = content.astype('category').cat.as_ordered()

In [54]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13459 entries, 0 to 13458
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   ForecastId        13459 non-null  int64   
 1   Province_State    13459 non-null  category
 2   Country_Region    13459 non-null  category
 3   diseaseYear       13459 non-null  int64   
 4   diseaseMonth      13459 non-null  int64   
 5   diseaseDate       13459 non-null  int64   
 6   diseaseDayofWeek  13459 non-null  int64   
 7   diseaseDayofYear  13459 non-null  int64   
dtypes: category(2), int64(6)
memory usage: 696.1 KB


In [55]:
for label, content in df_test.items():
    if pd.api.types.is_categorical_dtype(content):
        df_test[label] = pd.Categorical(content).codes+1

In [56]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13459 entries, 0 to 13458
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   ForecastId        13459 non-null  int64
 1   Province_State    13459 non-null  int16
 2   Country_Region    13459 non-null  int16
 3   diseaseYear       13459 non-null  int64
 4   diseaseMonth      13459 non-null  int64
 5   diseaseDate       13459 non-null  int64
 6   diseaseDayofWeek  13459 non-null  int64
 7   diseaseDayofYear  13459 non-null  int64
dtypes: int16(2), int64(6)
memory usage: 683.6 KB


In [58]:
preds = model_dtr.predict(df_test)
preds

array([[   6.,  273.],
       [   6.,  281.],
       [   6.,  281.],
       ...,
       [  98., 3868.],
       [  98., 3868.],
       [  98., 3868.]])

In [62]:
preds_rfr = model_rfr.predict(df_test)
preds_rfr

array([[   7.26,  242.54],
       [   8.89,  278.2 ],
       [  10.32,  298.74],
       ...,
       [ 379.07, 6345.04],
       [ 383.16, 6396.59],
       [ 382.8 , 6395.32]])

In [63]:
result_rfr = pd.DataFrame(preds_rfr)
result_rfr.head()

Unnamed: 0,0,1
0,7.26,242.54
1,8.89,278.2
2,10.32,298.74
3,9.15,341.68
4,10.48,366.96


In [59]:
final_result = pd.DataFrame(preds)
final_result.head()

Unnamed: 0,0,1
0,6.0,273.0
1,6.0,281.0
2,6.0,281.0
3,7.0,349.0
4,7.0,349.0


In [60]:
final_result.columns = ['Fatalities', 'ConfirmedCases']
final_result.head()

Unnamed: 0,Fatalities,ConfirmedCases
0,6.0,273.0
1,6.0,281.0
2,6.0,281.0
3,7.0,349.0
4,7.0,349.0


In [64]:
result_rfr.columns =  ['Fatalities', 'ConfirmedCases']
result_rfr.head()

Unnamed: 0,Fatalities,ConfirmedCases
0,7.26,242.54
1,8.89,278.2
2,10.32,298.74
3,9.15,341.68
4,10.48,366.96


In [65]:
model_dtr.score(X, y)



1.0

In [66]:
model_rfr.score(X,y)



0.9997500054562182

In [67]:
submission = pd.read_csv('Data/covid19-global-forecasting-week-4/submission.csv')
submission.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,1,1
1,2,1,1
2,3,1,1
3,4,1,1
4,5,1,1


In [70]:
submission['ConfirmedCases'] = final_result['ConfirmedCases'].astype(int)
submission['Fatalities'] = final_result['Fatalities'].astype(int)
submission.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,273,6
1,2,281,6
2,3,281,6
3,4,349,7
4,5,349,7


In [71]:
submission.to_csv('Data/covid19-global-forecasting-week-4/Final_Subm.csv', index=False)

In [72]:
final_result_sub = pd.read_csv("Data/covid19-global-forecasting-week-4/Final_Subm.csv")
final_result_sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,273,6
1,2,281,6
2,3,281,6
3,4,349,7
4,5,349,7
