In [None]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
import warnings


import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

In [None]:
# Read the training set, the test set and the sample submission of the
# covid19-local-us-ca-forecasting-week-1 dataset from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_train.csv")
test = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_test.csv")
submission = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv")

In [None]:
train.tail()

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

# Visualisation & EDA 

In [None]:
train.info()

In [None]:
# Hex colours for the charts; this treemap only uses `rec` and `dth`.
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801'
# Long format: one row per (Date, variable), so the treemap can total each measure.
temp = train[['Date','Fatalities', 'ConfirmedCases']].melt(
    id_vars="Date",
    value_vars=['Fatalities', 'ConfirmedCases'],
)
fig = px.treemap(
    temp,
    path=["variable"],
    values="value",
    height=225,
    color_discrete_sequence=[rec, dth],
)
fig.data[0].textinfo = 'label+text+value'
fig.show()

In [None]:
# Sum confirmed cases per (country, province, date), take each province's
# maximum over time, then add the province maxima up per country.
province_daily = train.groupby(['Country/Region','Province/State','Date'])['ConfirmedCases'].sum()
province_peak = province_daily.groupby(['Country/Region','Province/State']).max().sort_values()
df = province_peak.groupby(['Country/Region']).sum().to_frame()
df

In [None]:
# Confirmed cases summed per (country, date), ordered by the summed value.
df_by_date = (
    train.groupby(['Country/Region','Date'])['ConfirmedCases']
    .sum()
    .sort_values()
    .reset_index()
)
df_by_date

In [None]:
# Bar chart of the rows dated 2020-03-02 or later, largest value first.
cases_since_march = df_by_date[df_by_date['Date'] >= '2020-03-02']
cases_since_march = cases_since_march.sort_values('ConfirmedCases', ascending=False)
fig = px.bar(
    cases_since_march,
    x='Date',
    y='ConfirmedCases',
    color="ConfirmedCases",
    color_continuous_scale=px.colors.sequential.BuGn,
)
fig.update_layout(title_text='Confirmed COVID-19 cases per day in CA-US')
fig.show()

In [None]:
# Fatalities summed per (country, date), ordered by the summed value.
df_by_date_F = (
    train.groupby(['Country/Region','Date'])['Fatalities']
    .sum()
    .sort_values()
    .reset_index()
)
df_by_date_F

In [None]:
# Bar chart of the rows dated 2020-03-02 or later, largest value first.
deaths_since_march = df_by_date_F[df_by_date_F['Date'] >= '2020-03-02']
deaths_since_march = deaths_since_march.sort_values('Fatalities', ascending=False)
fig = px.bar(
    deaths_since_march,
    x='Date',
    y='Fatalities',
    color="Fatalities",
    color_continuous_scale=px.colors.sequential.BuGn,
)
fig.update_layout(title_text='Fatalities COVID-19 per day in CA-US')
fig.show()

In [None]:
# Independent copies of the loaded frames, taken before the Random Forest
# section rewrites the Date column of `train` and `test`.
train1 = train.copy() #for Lasso
test1 = test.copy()
submission1 = submission.copy()

In [None]:
# Second set of independent copies of the loaded frames.
train2 = train.copy() #for Linear Regression
test2 = test.copy()
submission2 = submission.copy()

In [None]:
# Third set of copies; the ARIMA section later re-indexes these by Date.
train3 = train.copy() #for building the ARIMA models
test3 = test.copy()

In [None]:
# Lists for storing the RMSE values
# (presumably rmsec = confirmed cases, rmsef = fatalities — confirm)
rmsec=[]
rmsef=[]

# RANDOM FOREST MODEL

Preparing the data

In [None]:
# Encode the date as a YYYYMMDD integer so it can be used as a numeric feature
# during training.
# astype(str) first keeps the cell re-runnable: on a second run the column
# already holds integers and the replace is a no-op instead of an AttributeError.
train["Date"] = train["Date"].astype(str).str.replace("-", "", regex=False).astype(int)
train.head()

In [None]:
# Same YYYYMMDD integer encoding as applied to the training dates.
# astype(str) first keeps the cell re-runnable once the column already holds integers.
test["Date"] = test["Date"].astype(str).str.replace("-", "", regex=False).astype(int)
test.head()

Préparer l'entraînement

In [None]:
# Same three feature columns for both splits; one single-column target frame per quantity.
feature_cols = ['Lat', 'Long', 'Date']
x = train[feature_cols]
x_test = test[feature_cols]
y1 = train[['ConfirmedCases']]
y2 = train[['Fatalities']]

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# ConfirmedCases and Fatalities are counts, so this is a regression problem:
# a classifier would treat every distinct count as an unrelated class label.
rf = RandomForestRegressor(max_depth=10, random_state=0)

In [None]:
# Fit on confirmed cases and predict the test rows.
# The target is a single-column frame; flatten it to 1-D explicitly, since
# scikit-learn estimators warn when handed a column-vector target.
rf.fit(x, y1.values.ravel())
pred1 = pd.DataFrame(rf.predict(x_test), columns=["ConfirmedCases_prediction"])

In [None]:
#from math import sqrt
#from sklearn.metrics import mean_squared_error
#RMSE_RF = sqrt(mean_squared_error(y_true = y_test, y_pred = pred1))
#RMSE_RF
#rmsec.append(RMSE_RF)

In [None]:
# Refit the same estimator on fatalities (this replaces the confirmed-cases
# fit) and predict the test rows. The single-column target is flattened to 1-D.
rf.fit(x, y2.values.ravel())
pred2 = pd.DataFrame(rf.predict(x_test), columns=["Death_prediction"])

RMSE_RF = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
RMSE_RF
rmsef.append(RMSE_RF)

In [None]:
pred2.head()

In [None]:
sub = submission[["ForecastId"]]
sub.head()

In [None]:
submit = pd.concat([pred1,pred2,sub],axis=1)
submit.head()

In [None]:
# Clean: rename the prediction columns to the submission names, put ForecastId
# first and cast the predictions to integers.
# Renaming by name (not by position) keeps the cell re-runnable: a second run
# finds nothing to rename instead of relabelling the already reordered columns.
# Doing the cast inside the chain avoids assigning columns on a sliced frame.
submit = (
    submit
    .rename(columns={"ConfirmedCases_prediction": "ConfirmedCases",
                     "Death_prediction": "Fatalities"})
    [['ForecastId', 'ConfirmedCases', 'Fatalities']]
    .astype({"ConfirmedCases": int, "Fatalities": int})
)
submit

In [None]:
# Random Forest confirmed-case predictions against the test dates.
rf_cases_fig, rf_cases_ax = plt.subplots(figsize=(16,6))
rf_cases_ax.plot(test1.Date, submit["ConfirmedCases"])
rf_cases_ax.set_title('Confirmed Cases Prediction using RF')
plt.show()

In [None]:
# Random Forest fatality predictions against the test dates.
rf_deaths_fig, rf_deaths_ax = plt.subplots(figsize=(16,6))
rf_deaths_ax.plot(test1.Date, submit["Fatalities"])
rf_deaths_ax.set_title('Fatalities Prediction using RF')
plt.show()

In [None]:
submit.describe()

# Lasso

In [None]:
train1.head()

In [None]:
# Parse the dates and keep only month-day ('%m-%d') as text.
# The vectorised .dt accessor replaces the row-by-row apply/lambda.
train1['Date'] = pd.to_datetime(train1['Date']).dt.strftime('%m-%d')
train1

Time series of confirmedCases and Fatalities

In [None]:
# Parse the dates and keep only month-day ('%m-%d') as text.
# The vectorised .dt accessor replaces the row-by-row apply/lambda.
test1['Date'] = pd.to_datetime(test1['Date']).dt.strftime('%m-%d')
test1

In [None]:
# Confirmed cases over the training dates.
hor = train1['Date']
ver = train1['ConfirmedCases']
cases_fig, cases_ax = plt.subplots(figsize=(20,10))
cases_ax.plot(hor, ver)
cases_ax.set_title('Time Series Confirmed Cases')
plt.show()

In [None]:
# Fatalities over the training dates.
hor = train1['Date']
ver = train1['Fatalities']
deaths_fig, deaths_ax = plt.subplots(figsize=(20,10))
deaths_ax.plot(hor, ver)
deaths_ax.set_title('Time Series Fatalities')
plt.show()

Préparation des données pour Lasso

In [None]:
# Keep only the rows that already report at least one confirmed case
# (the leading rows of the training data are all zero).
has_cases = train1['ConfirmedCases'] > 0
train2 = train1[has_cases]
train2.head()

In [None]:
# Shift ForecastId by a fixed offset so the test rows line up with the training Id axis.
# NOTE(review): the offset 50 is hard-coded — confirm it matches the Id range of the training file.
X_test1=test1[['ForecastId']]+50 #matching the test data Id in line to training ID's

In [None]:
X1=train2[['Id']]
y_con=train2[['ConfirmedCases']]
y_fat=train2[['Fatalities']]

In [None]:
from sklearn.preprocessing import PolynomialFeatures
poly=PolynomialFeatures(7) #Polynomial Feature with degree 7
# Fit the transformer on the training inputs only and merely apply it to the
# test inputs. Plain arrays are passed because the two frames name their single
# column differently ('Id' vs 'ForecastId'), which a fitted transformer may reject.
X=poly.fit_transform(X1.values)
X_test2=poly.transform(X_test1.values)

In [None]:
X_test2

Regression Lasso

In [None]:
from sklearn.linear_model import Lasso
model_con=Lasso()
model_con.fit(X, y_con)

In [None]:
y_pred_con=model_con.predict(X_test2)

In [None]:
model_fat=Lasso()
model_fat.fit(X, y_fat)

In [None]:
y_pred_fat=model_fat.predict(X_test2)
y_pred_fat

In [None]:
y_pred_con1=y_pred_con.ravel()
y_pred_fat1=y_pred_fat.ravel()
y_pred_fat1

In [None]:
# The first 13 test predictions overlap with dates that have training labels,
# so they are replaced by the observed labels.
# NOTE(review): the constants 13 and 2 presumably assume train2 has 15 rows — confirm.
y_con_t = train2['ConfirmedCases'].iloc[2:].to_numpy()  # observed labels for the overlap
# Slice inside the expression instead of reassigning y_pred_con1, so re-running
# this cell does not drop another 13 predictions each time.
y_pred_con_final = np.round(np.append(y_con_t, y_pred_con1[13:]))
y_pred_con_final

In [None]:
# The first 13 test predictions overlap with dates that have training labels,
# so they are replaced by the observed labels.
# NOTE(review): the constants 13 and 2 presumably assume train2 has 15 rows — confirm.
y_fat_t = train2['Fatalities'].iloc[2:].to_numpy()  # observed labels for the overlap
# Slice inside the expression instead of reassigning y_pred_fat1, so re-running
# this cell does not drop another 13 predictions each time.
y_pred_fat_final = np.round(np.append(y_fat_t, y_pred_fat1[13:]))
y_pred_fat_final

In [None]:
data={'ForecastId':submission.ForecastId,'ConfirmedCases':y_pred_con_final, 'Fatalities':y_pred_fat_final}
result=pd.DataFrame(data, index=submission.index)

In [None]:
result

In [None]:
# Final confirmed-case series (training labels + predictions) over the test dates.
hor = test1.Date
ver = y_pred_con_final
lasso_cases_fig, lasso_cases_ax = plt.subplots(figsize=(20,10))
lasso_cases_ax.plot(hor, ver)
lasso_cases_ax.set_title('Confirmed Cases Prediction')
plt.show()

In [None]:
# Final fatality series (training labels + predictions) over the test dates.
hor = test1.Date
ver = y_pred_fat_final
lasso_deaths_fig, lasso_deaths_ax = plt.subplots(figsize=(20,10))
lasso_deaths_ax.plot(hor, ver)
lasso_deaths_ax.set_title('Fatalities Prediction')
plt.show()

In [None]:
X

rmse_lasso = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
rmse_lasso
rmsec.append(rmse_lasso)

RMSE_lasso = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
RMSE_lasso
rmsef.append(RMSE_lasso)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the same polynomial features, one model per target.
model_con1 = LinearRegression().fit(X, y_con)
model_fat1 = LinearRegression().fit(X, y_fat)
y_pred_coné = model_con1.predict(X_test2)
y_pred_faté = model_fat1.predict(X_test2)
y_pred_faté

In [None]:
y_pred_con2=y_pred_coné.ravel()
y_pred_fat2=y_pred_faté.ravel()
y_pred_fat2

In [None]:
# The first 13 test predictions overlap with dates that have training labels,
# so they are replaced by the observed labels.
# NOTE(review): the constants 13 and 2 presumably assume train2 has 15 rows — confirm.
y_con_t = train2['ConfirmedCases'].iloc[2:].to_numpy()  # observed labels for the overlap
# Slice inside the expression instead of reassigning y_pred_con2, so re-running
# this cell does not drop another 13 predictions each time.
y_pred_con_final = np.round(np.append(y_con_t, y_pred_con2[13:]))
y_pred_con_final

In [None]:
# The first 13 test predictions overlap with dates that have training labels,
# so they are replaced by the observed labels.
# NOTE(review): the constants 13 and 2 presumably assume train2 has 15 rows — confirm.
y_fat_t = train2['Fatalities'].iloc[2:].to_numpy()  # observed labels for the overlap
# Slice inside the expression instead of reassigning y_pred_fat2, so re-running
# this cell does not drop another 13 predictions each time.
y_pred_fat_final = np.round(np.append(y_fat_t, y_pred_fat2[13:]))
y_pred_fat_final

In [None]:
data={'ForecastId':submission.ForecastId,'ConfirmedCases':y_pred_con_final, 'Fatalities':y_pred_fat_final}
result=pd.DataFrame(data, index=submission.index)
result

In [None]:
# Final confirmed-case series from the linear regression over the test dates.
hor = test1.Date
ver = y_pred_con_final
lr_cases_fig, lr_cases_ax = plt.subplots(figsize=(20,10))
lr_cases_ax.plot(hor, ver)
lr_cases_ax.set_title('Confirmed Cases Prediction using LR')
plt.show()

In [None]:
# Final fatality series from the linear regression over the test dates.
hor = test1.Date
ver = y_pred_fat_final
lr_deaths_fig, lr_deaths_ax = plt.subplots(figsize=(20,10))
lr_deaths_ax.plot(hor, ver)
lr_deaths_ax.set_title('Fatalities Prediction using LR')
plt.show()

# ARIMA

In [None]:
df=train2.groupby(['Date','Country/Region']).agg('sum').reset_index()
df.head()

In [None]:
ts=df[['Date','ConfirmedCases']]
ts

In [None]:
ts=ts.set_index('Date')
ts

Rolling Mean (Moving Average)

In [None]:
# Cast to integers; the original call discarded its result, so the cast never took effect.
ts = ts.astype('int64')
ts_pos = ts.loc[(ts['ConfirmedCases']>=10)]  # rows with at least 10 confirmed cases
a = len(ts_pos)  # number of such rows
# Keep the last `a` rows (presumably the same rows as ts_pos when the counts
# only grow — confirm). Indexing from len(ts) - a also yields an empty frame
# when a == 0, where [-0:] would silently keep everything. .copy() makes ts an
# independent frame so columns can be added to it later.
ts = ts.iloc[len(ts) - a:].copy()
ts1 = ts.rolling(2).mean().dropna()  # 2-row rolling mean, used further down
ts1

In [None]:
# 2-row rolling mean of the confirmed cases. The source column is named
# explicitly so the cell can be re-run once ts already has this extra column.
ts['Rolling Close Average'] = ts['ConfirmedCases'].rolling(2).mean() #Rolling Mean
ts

visualisation de RM

In [None]:
# Confirmed cases together with the 4-row rolling mean and rolling standard deviation.
sns.set(palette = 'Set1',style='darkgrid')
plt.figure(figsize=(16,6))
rolling_4 = ts.rolling(window=4,center=False)
plt.plot(rolling_4.mean(),label='Rolling Mean')
plt.plot(ts['ConfirmedCases'],label='ConfirmedCases')
plt.plot(rolling_4.std(),label='Rolling std')
plt.legend()
plt.title('Cases distribution in US with rolling mean and standard')
plt.xticks([])

In [None]:
def stationarity(ts):
    """
    Run ``adfuller`` (autolag='AIC') on ``ts`` and print a labelled summary.

    The printed Series holds the first four entries of the result (test
    statistic, p-value, lags used, number of observations) followed by one
    'Critical Value (<level>)' entry per item of the fifth result element.
    Nothing is returned.
    """
    print('Results of Dickey-Fuller Test:')
    adf_output = adfuller(ts, autolag='AIC')
    labels = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']
    results = pd.Series(adf_output[:4], index=labels)
    # Element 4 maps each significance level to its critical value.
    for level, critical_value in adf_output[4].items():
        results['Critical Value (%s)' % level] = critical_value
    print(results)

In [None]:
tsUs=ts1['ConfirmedCases'].values
stationarity(tsUs)

Autocorrelation

In [None]:
plot_acf(ts1,lags=12,title="ACF")
plot_pacf(ts1,lags=6,title="PACF")

Construction du modèle

In [None]:
train = train3.set_index(['Date'])
test = test3.set_index(['Date'])

In [None]:
def create_features(df,label=None):
    """
    Creates time series features from the (date) index of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds dates (strings or datetimes). It is copied,
        never modified.
    label : optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        Columns 'hour', 'dayofweek', 'quarter', 'month', 'year', 'dayofyear',
        'dayofmonth' and 'weekofyear', indexed like ``df``. Index values that
        cannot be parsed as dates give missing values.
    """
    df = df.copy()
    df['Date'] = df.index
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    when = df['Date'].dt
    df['hour'] = when.hour
    df['dayofweek'] = when.dayofweek
    df['quarter'] = when.quarter
    df['month'] = when.month
    df['year'] = when.year
    df['dayofyear'] = when.dayofyear
    df['dayofmonth'] = when.day

    # Series.dt.weekofyear no longer exists in current pandas; isocalendar().week
    # is its replacement. Fall back to the old accessor on pandas versions that
    # predate isocalendar().
    if hasattr(when, 'isocalendar'):
        week = when.isocalendar().week
        # isocalendar() returns a nullable integer dtype; convert to the dtype
        # the old accessor produced (int64, or float64 when dates are missing).
        week = week.astype('float64') if week.isna().any() else week.astype('int64')
    else:
        week = when.weekofyear
    df['weekofyear'] = week

    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]

    return X

In [None]:
from sklearn.preprocessing import LabelEncoder

# Calendar features for both splits, joined column-wise onto the original frames.
train_features = pd.DataFrame(create_features(train))
test_features = pd.DataFrame(create_features(test))
features_and_target_train = pd.concat([train, train_features], axis=1)
features_and_target_test = pd.concat([test, test_features], axis=1)

le = LabelEncoder()  # shared encoder, refitted for every column it is applied to (e.g. Country/Region)

def FunLabelEncoder(df):
    """
    Replace every object-dtype column of ``df`` with integer codes.

    Each such column is converted to strings, the module-level ``le`` is
    fitted on it and the column is overwritten with the encoded values.
    ``df`` is modified in place and also returned.
    """
    object_columns = [c for c in df.columns if df.dtypes[c] == object]
    for c in object_columns:
        as_text = df[c].astype(str)
        df[c] = le.fit(as_text).transform(as_text)
    return df

# Only the training frame is encoded here.
features_and_target_train = FunLabelEncoder(features_and_target_train)

In [None]:
features_and_target_train

In [None]:
x_train= features_and_target_train[['Country/Region','month', 'dayofyear', 'dayofmonth' , 'weekofyear']]
y1 = features_and_target_train[['ConfirmedCases']]
y2 =features_and_target_train[['Fatalities']]
x_test = features_and_target_test[['Country/Region', 'month', 'dayofyear', 'dayofmonth' , 'weekofyear']]

In [None]:
x_train

In [None]:
tsUs

In [None]:
pip install pmdarima

In [None]:
#Checking for the best model
from pmdarima import auto_arima
stepwise_fit = auto_arima(tsUs, trace=True,
suppress_warnings=True)

In [None]:
def mape(y1, y_pred):
    """
    Mean absolute percentage error between ``y1`` (reference values) and
    ``y_pred`` (predictions), returned as a percentage.
    """
    actual = np.array(y1)
    forecast = np.array(y_pred)
    relative_error = np.abs((actual - forecast) / actual)
    return relative_error.mean() * 100

def split(ts):
    """
    Cut ``ts`` into a leading 85% part and a trailing 15% part
    (the data set is small, hence the large training share).

    Returns a ``(train, test)`` tuple of slices of ``ts``.
    """
    cut = int(len(ts) * 0.85)
    return ts[:cut], ts[cut:]


#ARIMA modelling for ts
def arima(ts,test):
    """
    Pick an ARIMA order by AIC, refit it on ``ts`` and forecast ``len(test)`` steps.

    Every (p, d, q) combination with each term in 0..5 is fitted. Combinations
    whose fit raises are skipped, and the order with the lowest AIC wins. The
    winning model is refitted and its prediction is plotted. The forecast is
    then plotted against ``test``, and the model summary and the MAPE of the
    forecast are printed.

    Parameters
    ----------
    ts : array-like
        Training part of the series.
    test : array-like
        Hold-out part of the series, used for the forecast length, the
        comparison plot and the MAPE.

    Returns
    -------
    The forecast for the ``len(test)`` steps that follow ``ts``.

    Raises
    ------
    RuntimeError
        If none of the candidate orders could be fitted.
    """
    best_aic = float('inf')  # no arbitrary cap: any finite AIC can win
    param = None
    last_error = None

    #Determining the best parameters
    for var in itertools.product(range(0, 6), repeat=3):
        try:
            result = ARIMA(ts, order=var).fit()
        except Exception as exc:
            # Many orders cannot be fitted on a short series; skip them, but do
            # not swallow KeyboardInterrupt the way a bare ``except`` would.
            last_error = exc
            continue
        if result.aic <= best_aic:
            best_aic = result.aic
            param = var

    if param is None:
        raise RuntimeError('No ARIMA order could be fitted to the series') from last_error

    #Modeling
    # NOTE(review): plot_predict and the indexable return value of forecast()
    # presumably belong to the legacy ARIMA class this notebook imports —
    # confirm the installed statsmodels still provides them.
    model = ARIMA(ts, order=param)
    result = model.fit()
    result.plot_predict(start=int(len(ts) * 0.7), end=int(len(ts) * 1.2))
    pred=result.forecast(steps=len(test))[0]
    #Plotting results
    plt.subplots()  # new figure for the comparison plot
    plt.plot(pred,c='green', label= 'predictions')
    plt.plot(test, c='red',label='real values')
    plt.legend()
    plt.title('True vs predicted values')
    #Printing the error metrics
    print(result.summary())

    print('\nMean absolute percentage error: %f'%mape(test,pred))
    return (pred)

In [None]:
train,test=split(tsUs)
pred=arima(train,test)

In [None]:
#Fatalities
tsf=df[['Date','Fatalities']]
tsf=tsf.set_index('Date')
# Cast to integers; the original call discarded its result, so the cast never took effect.
tsf = tsf.astype('int64')
tsf_fat = tsf.loc[(tsf['Fatalities']>0)]  # rows with at least one fatality
# NOTE(review): the row count below uses the threshold >= 10 while tsf_fat
# uses > 0 — confirm which threshold is intended.
b = len(tsf.loc[(tsf['Fatalities']>=10)])
# Keep the last `b` rows. Indexing from len(tsf) - b also yields an empty frame
# when b == 0, where [-0:] would silently keep everything. .copy() makes tsf an
# independent frame so columns can be added to it later.
tsf = tsf.iloc[len(tsf) - b:].copy()
tsf1 = tsf.rolling(2).mean().dropna()  # 2-row rolling mean, used further down
tsf1

In [None]:
# 2-row rolling mean of the fatalities. The source column is named explicitly
# so the cell can be re-run once tsf already has this extra column.
tsf['Rolling Close Average'] = tsf['Fatalities'].rolling(2).mean() #Rolling Mean
tsf

In [None]:
tsfUs=tsf1['Fatalities'].values
stationarity(tsfUs)

In [None]:
train,test=split(tsfUs)
pred=arima(train,test)