# Covid-19 data exploration, basic prediction
Uses the data from the Kaggle COVID-19 Global Forecasting (Week 5) competition.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

In [None]:
# Both competition files live in the same read-only input folder.
DATA_DIR = "/kaggle/input/covid19-global-forecasting-week-5"

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [None]:
# Remove the two sub-national location columns from both frames.
UNUSED_COLUMNS = ['County', 'Province_State']

train = train.drop(columns=UNUSED_COLUMNS)
test = test.drop(columns=UNUSED_COLUMNS)

In [None]:
# Add integer "Month" and "Day" (day of month) feature columns to the training
# frame, derived from its Date column.
#
# The vectorized .dt accessor replaces a per-row Python loop that formatted
# every timestamp with strftime and parsed it back with int(); it also no
# longer depends on the frame having a default 0..n-1 index.
train_dates = pd.to_datetime(train["Date"])

# Cast to int64 so the column dtype matches what a list of Python ints produced.
# Positions 6 and 7 place the new columns directly after the existing first six.
train.insert(6, "Month", train_dates.dt.month.astype("int64"), allow_duplicates=False)
train.insert(7, "Day", train_dates.dt.day.astype("int64"), allow_duplicates=False)

Add month and day numbers to test set

In [None]:
# Add integer "Month" and "Day" (day of month) feature columns to the test
# frame, derived from its Date column.
#
# The vectorized .dt accessor replaces a per-row Python loop that formatted
# every timestamp with strftime and parsed it back with int(); it also no
# longer depends on the frame having a default 0..n-1 index.
test_dates = pd.to_datetime(test["Date"])

# Cast to int64 so the column dtype matches what a list of Python ints produced.
# Positions 6 and 7 append the new columns after the existing six.
test.insert(6, "Month", test_dates.dt.month.astype("int64"), allow_duplicates=False)
test.insert(7, "Day", test_dates.dt.day.astype("int64"), allow_duplicates=False)

The hyphenated date strings in the Date column cannot be fed to the model as they are, so convert each date to an integer of the form YYYYMMDD (i.e. drop the hyphens).

In [None]:
# Rewrite Date in both frames as an integer of the form YYYYMMDD.
for frame in (train, test):
    dates = pd.to_datetime(frame['Date'], errors='coerce')
    frame['Date'] = dates.dt.strftime("%Y%m%d").astype(int)

Check what objects we have in training set

In [None]:
# List the training columns that still have the generic object dtype.
train.select_dtypes(include='object').columns

Transform non-numerical labels (as long as they are hashable and comparable) into numerical labels. Here this is needed for Country_Region and Target.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the two text columns of the training frame with integer codes.
# A single encoder is re-fitted per column, so after this cell `label` holds
# the mapping learned from Target (the last column processed).
label = LabelEncoder()
for column in ("Country_Region", "Target"):
    train[column] = label.fit_transform(train[column])

Also the test set

In [None]:
# List the test columns that still have the generic object dtype.
test.select_dtypes(include='object').columns

In [None]:
# Encode the text columns of the test frame as integers.
#
# Target: `label` was last fitted on train.Target, so transform() reuses that
# exact label -> integer mapping instead of learning a new one from the test
# frame. transform() raises ValueError if the test frame contains a Target
# value that was not present in training.
test.Target = label.transform(test.Target)

# NOTE(review): the mapping learned from train.Country_Region was overwritten
# when `label` was re-fitted on Target, so the country codes have to be rebuilt
# from the test frame here. They agree with the training codes only if both
# files contain exactly the same set of countries -- TODO keep a dedicated
# country encoder fitted on train and call .transform() with it instead.
# A fresh encoder is used so that `label` stays fitted on Target.
test.Country_Region = LabelEncoder().fit_transform(test.Country_Region)
test.head()

In [None]:
# Display the first rows of the training frame.
train.head()

Prepare training set for prediction, split for model training 

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the value to predict and the row Id.
features = train.drop(columns=['TargetValue', 'Id'])
target = train['TargetValue']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

## Random forest regression

Set up a pipeline that standardizes the features (removing the mean and scaling to unit variance) and then applies the regressor. Fit the model on the training split and predict on the held-out split.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Build a two-step pipeline (feature standardization followed by a random
# forest regressor), fit it on the training split and predict the held-out split.
#
# random_state pins the forest's bootstrap sampling and feature selection so
# that the score and the predictions are reproducible across kernel restarts;
# n_jobs=-1 trains the trees on all available cores without changing results.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforestregressor: ', RandomForestRegressor(random_state=0, n_jobs=-1)),
])
pipe.fit(x_train, y_train)

# Despite its name, this holds the predictions for the held-out rows (x_test).
predict_train = pipe.predict(x_test)

In [None]:
# Evaluate the fitted pipeline on the held-out split using the final
# estimator's default score (R^2 for scikit-learn regressors) and display it.
score = pipe.score(X=x_test, y=y_test)
score

Now use the model for prediction on the actual test set 

In [None]:
# Keep the ForecastId values aside for the output file, then remove the column
# so that only feature columns remain in the test frame.
forecast_ids = test['ForecastId']
test = test.drop(columns='ForecastId')
test.head()

In [None]:
# Run the fitted pipeline on the prepared test frame.
predict_for_submission = pipe.predict(X=test)

In [None]:
# Pair every preserved ForecastId with its predicted TargetValue.
submission_columns = {
    'ForecastId': forecast_ids,
    'TargetValue': predict_for_submission,
}
submission_output = pd.DataFrame(submission_columns)
submission_output.head()

In [None]:
# Write the predictions to a CSV file in the working directory, omitting the index column.
submission_output.to_csv("trial_output.csv",index=False)