# Importing Necessary Packages

In [None]:
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import mean_squared_error
import plotly.express as px
from pandas.plotting import autocorrelation_plot, lag_plot
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.ar_model import AR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import (LinearRegression, Ridge, Lasso,LogisticRegression)
import warnings
warnings.simplefilter('ignore')
from math import sqrt

# Importing DataSet

In [None]:
# Load the three competition files: the training history, the rows to
# predict, and the sample submission (its ForecastId column is used later).
train=pd.read_csv("/kaggle/input/covid19-global-forecasting-week-2/train.csv")
test=pd.read_csv("/kaggle/input/covid19-global-forecasting-week-2/test.csv")
submission=pd.read_csv("/kaggle/input/covid19-global-forecasting-week-2/submission.csv")

In [None]:
train.sample(6)

In [None]:
train.columns

In [None]:
train.shape

In [None]:
train.info()

# Visualisation & Analysis Data

In [None]:
# Hex colour palette; only `rec` and `dth` are referenced by the plotting code.
cnf, dth, rec, act = '#393e46', '#ff2e63', '#21bf73', '#fe9801' 
# Reshape to long form (one row per Date/variable pair) so the treemap can
# size one tile per variable.
temp = train[['Date','Fatalities', 'ConfirmedCases']]
temp = temp.melt(id_vars="Date", value_vars=['Fatalities', 'ConfirmedCases'])
# NOTE(review): the tile size is the sum of `value` over every row and date.
# If the columns hold cumulative counts this overstates the totals - confirm.
fig = px.treemap(temp, path=["variable"], values="value", height=225, 
                 color_discrete_sequence=[rec, dth])
fig.data[0].textinfo = 'label+text+value'
fig.show()

In [None]:
#Confirmed cases by Country
# fillna('NA') keeps rows whose Province_State is missing (groupby drops NaN
# keys by default). The chain then: sums per (country, province, date),
# takes the maximum over dates for each province, and finally sums those
# province maxima per country, sorted from largest to smallest.
cc = train.fillna('NA').groupby(['Country_Region','Province_State','Date'])['ConfirmedCases'].sum() \
                          .groupby(['Country_Region','Province_State']).max().sort_values() \
                          .groupby(['Country_Region']).sum().sort_values(ascending = False)

In [None]:
top10cc = pd.DataFrame(cc).head(10)
top10cc

In [None]:
# Fatalities by country
# fillna('NA') keeps rows whose Province_State is missing (groupby drops NaN
# keys by default). The chain then: sums per (country, province, date),
# takes the maximum over dates for each province, and finally sums those
# province maxima per country, sorted from largest to smallest.
fc = train.fillna('NA').groupby(['Country_Region','Province_State','Date'])['Fatalities'].sum() \
                          .groupby(['Country_Region','Province_State']).max().sort_values() \
                          .groupby(['Country_Region']).sum().sort_values(ascending = False)

In [None]:
top10fc = pd.DataFrame(fc).head(10)
top10fc

## Confirmed COVID-19 Cases by Country

In [None]:
fig = px.bar(top10cc, x=top10cc.index, y='ConfirmedCases', labels={'x':'Country'},
             color="ConfirmedCases", color_continuous_scale=px.colors.sequential.Plotly3)
fig.update_layout(title_text='Confirmed COVID-19 cases by country')
fig.show()

## Confirmed COVID-19 Cases by Country (Map)

In [None]:
def plot_map(df, col, pal):
    """Show a world choropleth of column ``col``.

    Parameters
    ----------
    df : DataFrame with a ``Country_Region`` column and the column ``col``.
    col : name of the numeric column used for colour, title and hover data.
    pal : name of the continuous colour scale passed to plotly.

    Only rows where ``col`` is strictly positive are drawn.
    """
    positive_rows = df[df[col] > 0]
    choropleth_options = dict(
        locations="Country_Region",
        locationmode='country names',
        color=col,
        hover_name="Country_Region",
        title=col,
        hover_data=[col],
        color_continuous_scale=pal,
    )
    px.choropleth(positive_rows, **choropleth_options).show()

In [None]:
plot_map(cc.reset_index(),'ConfirmedCases', 'matter')

### Note: The charts above show that the United States has the most confirmed cases, followed by Italy. Although the virus started in China, it has had a bigger impact on the United States and Italy than on China.

## Deaths COVID-19 Cases by Country

In [None]:
fig = px.bar(top10fc, x=top10fc.index, y='Fatalities', labels={'x':'Country'},
             color="Fatalities", color_continuous_scale=px.colors.sequential.Turbo)
fig.update_layout(title_text='Deaths COVID-19 Cases by Country')
fig.show()

## Deaths COVID-19 Cases by Country (Map)

In [None]:
plot_map(fc.reset_index(),'Fatalities', 'PuRd')

### Note: The charts above show that Italy has the most deaths, followed by Spain. Although the virus started in China, in terms of deaths it has had a greater impact on Italy than on China.

## Confirmed COVID-19 cases per day in US

In [None]:
# Sum of ConfirmedCases for every (country, date) pair.
df_by_date = pd.DataFrame(train.fillna('NA').groupby(['Country_Region','Date'])['ConfirmedCases'].sum()
                          .sort_values().reset_index())

# US rows from 2020-03-01 onward. Date is still a string at this point, so
# the >= comparison is lexicographic (presumably ISO 'YYYY-MM-DD' - confirm).
# NOTE(review): the title says "per day", but the bars show the summed
# ConfirmedCases column, which looks cumulative - confirm.
fig = px.bar(df_by_date.loc[(df_by_date['Country_Region'] == 'US') &(df_by_date.Date >= '2020-03-01')]
             .sort_values('ConfirmedCases',ascending = False), 
             x='Date', y='ConfirmedCases', color="ConfirmedCases", color_continuous_scale=px.colors.sequential.Rainbow)
fig.update_layout(title_text='Confirmed COVID-19 cases per day in US')
fig.show()


### Note: This graph shows that the virus spread across the United States within a short period of time, putting the country in first place for confirmed cases.

## Confirmed COVID-19 cases per day in Italy

In [None]:
# Sum of ConfirmedCases for every (country, date) pair.
df_by_date = pd.DataFrame(train.fillna('NA').groupby(['Country_Region','Date'])['ConfirmedCases'].sum()
                          .sort_values().reset_index())

# Italy rows from 2020-03-01 onward; Date is still a string here, so the
# >= comparison is lexicographic.
fig = px.bar(df_by_date.loc[(df_by_date['Country_Region'] == 'Italy') &(df_by_date.Date >= '2020-03-01')]
             .sort_values('ConfirmedCases',ascending = False), 
             x='Date', y='ConfirmedCases', color="ConfirmedCases", color_continuous_scale=px.colors.sequential.Plasma)
fig.update_layout(title_text='Confirmed COVID-19 cases per day in Italy')
fig.show()

### Note: In Italy the virus spread over a longer period and therefore affected more people. In the United States the spread happened over a much shorter time, which is why the impact there was so large.

## Confirmed COVID-19 cases per day in China

In [None]:
# Sum of ConfirmedCases for every (country, date) pair.
df_by_date = pd.DataFrame(train.fillna('NA').groupby(['Country_Region','Date'])['ConfirmedCases'].sum()
                          .sort_values().reset_index())

# China rows from 2020-01-01 onward; Date is still a string here, so the
# >= comparison is lexicographic.
fig = px.bar(df_by_date.loc[(df_by_date['Country_Region'] == 'China') &(df_by_date.Date >= '2020-01-01')]
             .sort_values('ConfirmedCases',ascending = False), 
             x='Date', y='ConfirmedCases', color="ConfirmedCases", color_continuous_scale=px.colors.sequential.Aggrnyl)
fig.update_layout(title_text='Confirmed COVID-19 cases per day in China')
fig.show()


### Note: The graph above shows that the virus started in China and continued to spread there; from this region it then spread to all parts of the world. In China, the number of confirmed cases keeps increasing day by day.

# Advanced Visualisations 

In [None]:
def plot_treemap(col,dtfr):
    """Show a treemap with one tile per country, sized by column ``col``.

    Parameters
    ----------
    col : name of the value column; also used as the figure title.
    dtfr : Series or DataFrame whose index holds ``Country_Region``; the
        index is reset so it becomes a regular column.
    """
    flat = dtfr.reset_index()
    treemap = px.treemap(
        flat,
        path=["Country_Region"],
        values=col,
        height=700,
        title=col,
        color_discrete_sequence=px.colors.qualitative.Dark2,
    )
    treemap.data[0].textinfo = 'label+text+value'
    treemap.show()

In [None]:
plot_treemap('ConfirmedCases',cc)

In [None]:
plot_treemap('Fatalities',fc)

## Top 20 

In [None]:
def plot_hbar(df, col, n, hover_data=None):
    """Show a horizontal bar chart of the ``n`` largest values of ``col``.

    Parameters
    ----------
    df : DataFrame with a ``Country_Region`` column and the column ``col``.
    col : name of the numeric column to rank and plot.
    n : number of rows (largest values of ``col``) to keep.
    hover_data : optional list of extra columns shown on hover; defaults to
        an empty list.
    """
    # A fresh list per call: a literal [] default would be shared between calls.
    if hover_data is None:
        hover_data = []
    fig = px.bar(df.sort_values(col).tail(n),
                 x=col, y="Country_Region", color=col,
                 text=col, orientation='h', width=700, hover_data=hover_data,
                 color_discrete_sequence = px.colors.qualitative.Dark2)
    fig.update_layout(title=col, xaxis_title="", yaxis_title="",
                      yaxis_categoryorder = 'total ascending',
                      uniformtext_minsize=8, uniformtext_mode='hide')
    fig.show()

In [None]:
plot_hbar(cc.reset_index(), 'ConfirmedCases', 20)

In [None]:
plot_hbar(fc.reset_index(), 'Fatalities', 20)

## Cases over time

In [None]:
# Worldwide sums per date. The columns are selected with a list: selecting
# several groupby columns with a bare tuple (['A', 'B'] without the inner
# brackets) is deprecated and rejected by recent pandas versions.
temp = train.groupby('Date')[['ConfirmedCases', 'Fatalities']].sum().reset_index()
# Long form: one row per (Date, Case) pair so plotly can colour by case type.
temp = temp.melt(id_vars="Date", value_vars=['ConfirmedCases','Fatalities'],
                 var_name='Case', value_name='Count')

fig = px.area(temp, x="Date", y="Count", color='Case', height=600, width=700,
             title='Cases over time', color_discrete_sequence = [rec, dth])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()

# Time evolution

In [None]:
# Per-(date, country) totals. numeric_only=True keeps the sum away from the
# string column Province_State, which cannot be meaningfully added up.
df = train.groupby(['Date', 'Country_Region']).sum(numeric_only=True).reset_index()
df.head(5)

In [None]:
df.columns

In [None]:
def pltCountry_cases(ConfirmedCases,*argv):
    """Plot the confirmed-case curve of each named country on one shared axis.

    Parameters
    ----------
    ConfirmedCases : unused; kept so existing calls keep working. The plotted
        column is always ``'ConfirmedCases'``.
    *argv : country names matched against ``df['Country_Region']``.

    Reads the module-level DataFrame ``df``.
    """
    _, ax = plt.subplots(figsize=(16, 5))
    for name in argv:
        country = df.loc[df['Country_Region'] == name]
        # Label each line directly so the legend cannot drift out of order.
        plt.plot(country['Date'], country['ConfirmedCases'], linewidth=3, label=name)
    # Axis decoration does not depend on the country, so apply it once.
    plt.xticks(rotation=40)
    ax.set(title='Evolution of the number of confirmed cases' )
    if argv:
        plt.legend()

In [None]:
def pltCountry_fatalities(Fatalities,*argv):
    """Plot the fatality curve of each named country on one shared axis.

    Parameters
    ----------
    Fatalities : unused; kept so existing calls keep working. The plotted
        column is always ``'Fatalities'``.
    *argv : country names matched against ``df['Country_Region']``.

    Reads the module-level DataFrame ``df``.
    """
    _, ax = plt.subplots(figsize=(16, 5))
    for name in argv:
        country = df.loc[df['Country_Region'] == name]
        # Label each line directly so the legend cannot drift out of order.
        plt.plot(country['Date'], country['Fatalities'], linewidth=3, label=name)
    # Axis decoration does not depend on the country, so apply it once.
    plt.xticks(rotation=40)
    ax.set(title='Evolution of the number of fatalities' )
    if argv:
        plt.legend()

In [None]:
pltCountry_cases('ConfirmedCases','China')
pltCountry_fatalities('Fatalities','China')

In [None]:
# Compare several countries. The first argument is only a label: each helper
# plots a fixed column regardless of its value.
pltCountry_cases('ConfirmedCases', 'US','Italy','Spain','China')
pltCountry_fatalities('Fatalities','Italy','Spain','US','France')

# Create Time Series for US

In [None]:
def roll(country,case):
    """Return the 4-row rolling mean of ``case`` for one country.

    Parameters
    ----------
    country : value matched against ``df['Country_Region']``.
    case : name of the column to smooth (e.g. ``'ConfirmedCases'``).

    Returns
    -------
    DataFrame indexed by ``Date`` with the single column ``case``. Only the
    last ``k`` rows are used, where ``k`` is the number of rows whose value
    is at least 10; the first three rows of the rolling window are dropped
    because their mean is undefined.

    Reads the module-level DataFrame ``df``.
    """
    ts = df.loc[df['Country_Region'] == country, ['Date', case]].set_index('Date')
    n_keep = int((ts[case] >= 10).sum())
    # Slice from an explicit start: ts[-0:] would return every row instead of
    # none when no value reaches the threshold.
    ts = ts.iloc[len(ts) - n_keep:]
    return ts.rolling(window=4, center=False).mean().dropna()

In [None]:
tsc=roll('US','ConfirmedCases')
tsf=roll('US','Fatalities')

In [None]:
# create a time series Confirmed Cases
sc = pd.Series(tsc.unstack().values, index=tsc.index)

In [None]:
# create a time series Fatalities
sf = pd.Series(tsf.unstack().values, index=tsf.index)

In [None]:
fig = plt.figure(figsize=(19, 7))
plt.plot(sc, linewidth=3)
plt.title('Time Series', fontsize=22, fontweight="bold")
plt.xlabel('Date', fontsize=18)
plt.ylabel('ConfirmedCases', fontsize=18)

In [None]:
fig = plt.figure(figsize=(19, 7))
plt.plot(sf, linewidth=3)
plt.title('Time Series', fontsize=22, fontweight="bold")
plt.xlabel('Date', fontsize=18)
plt.ylabel('Fatalities', fontsize=18)

# Explore Time Series Data

In [None]:
sc.hist()

In [None]:
sf.hist()

## Lag Scatter Plots

Les séries temporelles supposent une relation entre une observation et l'observation précédente. Les observations précédentes d'une série chronologique sont appelées des retards (lags) : l'observation au pas de temps précédent correspond à lag = 1, celle d'il y a deux pas de temps à lag = 2, et ainsi de suite. Un type de graphique utile pour explorer la relation entre chaque observation et un décalage de cette observation est le nuage de points (scatter plot). Pandas a une fonction intégrée exactement pour cela, appelée lag plot. Elle trace l'observation au temps t sur l'axe des x et l'observation décalée d'un pas (t+1) sur l'axe des y.

Si les points se regroupent le long d'une diagonale allant du bas à gauche vers le haut à droite du graphique, cela suggère une corrélation positive. Si les points se regroupent le long d'une diagonale allant du haut à gauche vers le bas à droite, cela suggère une corrélation négative. Les deux relations sont utiles car elles peuvent être modélisées. Des points plus resserrés le long de la diagonale suggèrent une relation plus forte, et une plus grande dispersion autour de cette ligne suggère une relation plus faible. Un amas de points au milieu ou une dispersion sur tout le graphique suggère une relation faible ou inexistante.

In [None]:
# create a scatter plot for confirmed cases
lag_plot(sc)

In [None]:
# create a scatter plot for fatalities
lag_plot(sf)

## Autocorrelation Plots

Nous pouvons quantifier la force et le type de relation entre les observations et leurs retards. Dans les statistiques, cela s'appelle la corrélation, et lorsqu'elle est calculée par rapport aux valeurs de retard dans les séries temporelles, elle est appelée autocorrélation (auto-corrélation). Une valeur de corrélation calculée entre deux groupes de nombres, comme les observations et leurs valeurs de décalage = 1, donne un nombre compris entre -1 et 1. Le signe de ce nombre indique une corrélation négative ou positive respectivement. Une valeur proche de zéro suggère une faible corrélation, tandis qu'une valeur plus proche de -1 ou 1 indique une forte corrélation.

Des valeurs de corrélation, appelées coefficients de corrélation, peuvent être calculées pour chaque observation et différentes valeurs de retard. Une fois ces valeurs calculées, un graphique peut être créé pour aider à mieux comprendre comment cette relation change avec le décalage. Ce type de tracé est appelé autocorrelation plot.

In [None]:
# create an autocorrelation plot for confirmed cases
autocorrelation_plot(sc)

In [None]:
# create an autocorrelation plot for fatalities
autocorrelation_plot(sf)

In [None]:
# autocorrelation plot of time series as a line plot for confirmed cases
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(sc)

In [None]:
# autocorrelation plot of time series as a line plot for fatalities
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(sf)

# Checking For Stationarity

#### Pour vérifier si les données sont stationnaires ou non, nous exécutons un test statistique de stationnarité (test de Dickey-Fuller augmenté) à l'aide du code suivant :

In [None]:
from statsmodels.tsa.stattools import adfuller
def ad_test(dataset):
    """Run the Augmented Dickey-Fuller test on ``dataset`` and print the result.

    Prints the test statistic, the p-value, the number of lags used, the
    number of observations used in the regression and the critical values.
    The lag count is chosen with ``autolag='AIC'``. Nothing is returned.
    """
    adf_stat, p_value, used_lags, n_obs, critical_values = adfuller(dataset, autolag='AIC')[:5]
    print("1. ADF : ", adf_stat)
    print("2. P-Value : ", p_value)
    print("3. Num Of Lags : ", used_lags)
    print("4. Num Of Observations Used For ADF Regression:", n_obs)
    print("5. Critical Values :")
    for level, threshold in critical_values.items():
        print("\t", level, ": ", threshold)
ad_test(sc)

### Remarque : p > 0.05 donc les données sont non stationnaires

In [None]:
ad_test(sf)

### Remarque : p > 0.05 donc les données sont non stationnaires

## Converting series to stationary

In [None]:
# First-order differencing; the leading element is dropped because it has
# no predecessor and is therefore NaN.
sc_diff = sc.diff(periods=1).iloc[1:]
sf_diff = sf.diff(periods=1).iloc[1:]

# Data Cleaning 

In [None]:
# Format date: strip the dashes from the date strings and store the result
# as an integer. Not idempotent: running the cell a second time fails
# because the integers produced here have no .replace() method.
train["Date"] = train["Date"].apply(lambda x: x.replace("-",""))
train["Date"]  = train["Date"].astype(int)
train.head()

In [None]:
# Drop the Province_State column, then any row that still contains a NaN.
# NOTE(review): without Province_State, rows belonging to different provinces
# of the same country and date can no longer be told apart - confirm that
# this is intended before modelling on these rows.
train = train.drop(['Province_State'],axis=1)
train = train.dropna()
train.isnull().sum()

In [None]:
# Cleaning test data: same date conversion as for the training frame
# (dashes removed, then cast to int). Not idempotent for the same reason.
test["Date"] = test["Date"].apply(lambda x: x.replace("-",""))
test["Date"]  = test["Date"].astype(int)

In [None]:
train.isnull().sum()

# Prepare Training

In [None]:
# Restrict both frames to the US rows; the only feature is the integer date.
train2 = train.loc[train['Country_Region'] == 'US']
test2 = test.loc[test['Country_Region'] == 'US']
x = train2[['Date']]
y1 = train2[['ConfirmedCases']]
y2 = train2[['Fatalities']]
x_test = test2[['Date']]
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training dates only and reuse it for the competition
# test dates, so both are mapped with the same mean and variance. Fitting a
# second scaler on the test dates would put them on a different scale.
date_scaler = StandardScaler().fit(x)
X = date_scaler.transform(x)
# Scaled competition test dates get their own name: X_test below is the
# hold-out part of the training data and must not overwrite them.
X_submission = date_scaler.transform(x_test)
# Random 75/25 hold-out split; the same seed gives the same row partition
# for both targets.
X_train, X_test, y_train, y_test = train_test_split(X, y1, test_size=0.25, random_state=324)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.25, random_state=324)

In [None]:
def split(ts):
    """Cut ``ts`` in two without shuffling.

    The first ``int(len(ts) * 0.85)`` elements form the training part and
    the remaining elements the test part. Returns ``(train_part, test_part)``.
    """
    cut = int(len(ts) * 0.85)
    return (ts[:cut], ts[cut:])
tscc=tsc['ConfirmedCases'].values
tsfc=tsf['Fatalities'].values
traincc,testcc=split(tscc)
trainfc,testfc=split(tsfc)

In [None]:
# Lists collecting one RMSE per model, appended in the order the models are
# run (confirmed cases and fatalities respectively).
rmsecc=[]
rmsefc=[]

# AR Model

In [None]:
# train autoregression for Confirmed cases
# NOTE(review): `AR` comes from statsmodels.tsa.ar_model; newer statsmodels
# releases replace it with AutoReg - confirm the installed version.
predictions = []
model = AR(traincc)
model_fit = model.fit()
# Lag order of the fitted model and its parameters; the walk-forward code
# treats coef[0] as the intercept and coef[1:] as the lag weights.
window = model_fit.k_ar
coef = model_fit.params

In [None]:
# Walk-forward evaluation: predict each test step from the last `window`
# observed values, then reveal the true value before the next step.
history = list(traincc[len(traincc) - window:])
predictions = []
for actual in testcc:
    recent = history[len(history) - window:]
    # coef[0] is the intercept; coef[1] weights the most recent value,
    # coef[2] the one before it, and so on.
    yhat = coef[0]
    for k, value in enumerate(reversed(recent), start=1):
        yhat += coef[k] * value
    predictions.append(yhat)
    history.append(actual)
RMSE_arcc = sqrt(mean_squared_error(y_true = testcc, y_pred = predictions))

In [None]:
print('Test RMSE: %.3f' % RMSE_arcc)
rmsecc.append(RMSE_arcc)

In [None]:
# plot
plt.plot(testcc)
plt.plot(predictions, color='red')

In [None]:
test_period = 3
for i in range(test_period):
    print('predicted=%f, expected=%f' % (predictions[i], testcc[i]))

RMSE_arcc = np.sqrt(mean_squared_error(testcc[:test_period], predictions[:test_period]))

In [None]:
# train autoregression for Fatalities
# NOTE(review): `AR` comes from statsmodels.tsa.ar_model; newer statsmodels
# releases replace it with AutoReg - confirm the installed version.
predictions = []
model = AR(trainfc)
model_fit = model.fit()
# Lag order of the fitted model and its parameters; the walk-forward code
# treats coef[0] as the intercept and coef[1:] as the lag weights.
window = model_fit.k_ar
coef = model_fit.params

In [None]:
# Walk-forward evaluation: predict each test step from the last `window`
# observed values, then reveal the true value before the next step.
history = list(trainfc[len(trainfc) - window:])
predictions = []
for actual in testfc:
    recent = history[len(history) - window:]
    # coef[0] is the intercept; coef[1] weights the most recent value,
    # coef[2] the one before it, and so on.
    yhat = coef[0]
    for k, value in enumerate(reversed(recent), start=1):
        yhat += coef[k] * value
    predictions.append(yhat)
    history.append(actual)
RMSE_arfc = np.sqrt(mean_squared_error(testfc, predictions))

In [None]:
print('Test RMSE: %.3f' % RMSE_arfc)
rmsefc.append(RMSE_arfc)

In [None]:
# plot
plt.plot(testfc)
plt.plot(predictions, color='red')

In [None]:
# Show the first few fatality predictions next to the fatality test values.
# `predictions` holds the fatalities forecast here, so it must be compared
# with testfc rather than with the confirmed-cases series.
test_period = 3
for yhat, expected in zip(predictions[:test_period], testfc[:test_period]):
    print('predicted=%f, expected=%f' % (yhat, expected))

# RMSE restricted to those first steps; this value is not added to rmsefc.
RMSE_arfc = np.sqrt(mean_squared_error(testfc[:test_period], predictions[:test_period]))

# Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(max_depth=200, random_state=0)

In [None]:
# Random forest on confirmed cases: fit on the training split, predict the
# hold-out split, and record the hold-out RMSE.
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
RMSE_RF = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
rfcc = pd.DataFrame(y_pred)
rfcc.columns = ["Prediction_Confirmed"]
print(RMSE_RF)
rmsecc.append(RMSE_RF)

In [None]:
rfcc

In [None]:
# Same estimator refitted on fatalities; RMSE_RF is overwritten with the
# fatalities score before being stored.
rf.fit(X_train2, y_train2)
y_pred = rf.predict(X_test2)
RMSE_RF = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
rffc = pd.DataFrame(y_pred)
rffc.columns = ["Prediction_Fatalities"]
print(RMSE_RF)
rmsefc.append(RMSE_RF)

In [None]:
rffc

In [None]:
# NOTE(review): rfcc/rffc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([rfcc,rffc,sub],axis=1)
submit.head()

# XGBoost

In [None]:
# XGBoost on confirmed cases: fit on the training split, predict the
# hold-out split, and record the hold-out RMSE.
from xgboost import XGBRegressor
xg = XGBRegressor(n_estimators=100)
xg.fit(X_train, y_train)
y_pred = xg.predict(X_test)
RMSE_xg = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
xgcc = pd.DataFrame(y_pred)
xgcc.columns = ["Prediction_Confirmed"]
print(RMSE_xg)
rmsecc.append(RMSE_xg)

In [None]:
xgcc

In [None]:
# Same estimator refitted on fatalities; RMSE_xg is overwritten with the
# fatalities score before being stored.
xg.fit(X_train2, y_train2)
y_pred = xg.predict(X_test2)
RMSE_xg = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
xgfc = pd.DataFrame(y_pred)
xgfc.columns = ["Prediction_Fatalities"]
print(RMSE_xg)
rmsefc.append(RMSE_xg)

In [None]:
xgfc

In [None]:
# NOTE(review): xgcc/xgfc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([xgcc,xgfc,sub],axis=1)
submit.head()

# Decision Tree Regressor

In [None]:
# Decision tree on confirmed cases: fit on the training split, predict the
# hold-out split, and record the hold-out RMSE.
dtr = DecisionTreeRegressor(max_depth=20)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)
RMSE_dtr = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
dtrcc = pd.DataFrame(y_pred)
dtrcc.columns = ["Prediction_Confirmed"]
print(RMSE_dtr)
rmsecc.append(RMSE_dtr)
dtrcc

In [None]:
# Same estimator refitted on fatalities; RMSE_dtr is overwritten with the
# fatalities score before being stored.
dtr.fit(X_train2, y_train2)
y_pred = dtr.predict(X_test2)
RMSE_dtr = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
dtrfc = pd.DataFrame(y_pred)
dtrfc.columns = ["Prediction_Fatalities"]
print(RMSE_dtr)
rmsefc.append(RMSE_dtr)
dtrfc

In [None]:
# NOTE(review): dtrcc/dtrfc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([dtrcc,dtrfc,sub],axis=1)
submit.head()

# AdaBoost

In [None]:
# AdaBoost over depth-30 decision trees on confirmed cases: fit on the
# training split, predict the hold-out split, record the hold-out RMSE.
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=30), learning_rate=0.5, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
RMSE_ada = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
adacc = pd.DataFrame(y_pred)
adacc.columns = ["Prediction_Confirmed"]
print(RMSE_ada)
rmsecc.append(RMSE_ada)
adacc

In [None]:
# Same estimator refitted on fatalities; RMSE_ada is overwritten with the
# fatalities score before being stored.
ada.fit(X_train2, y_train2)
y_pred = ada.predict(X_test2)
RMSE_ada = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
adafc = pd.DataFrame(y_pred)
adafc.columns = ["Prediction_Fatalities"]
print(RMSE_ada)
rmsefc.append(RMSE_ada)
adafc

In [None]:
# NOTE(review): adacc/adafc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([adacc,adafc,sub],axis=1)
submit.head()

# Lasso

In [None]:
# Lasso on confirmed cases: fit on the training split, predict the hold-out
# split, and record the hold-out RMSE.
# NOTE(review): the `normalize` argument has been deprecated and later
# removed in newer scikit-learn releases - confirm the installed version.
ls = Lasso(alpha=0.1, normalize=True)
ls.fit(X_train, y_train)
y_pred = ls.predict(X_test)
RMSE_ls = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
lscc = pd.DataFrame(y_pred)
lscc.columns = ["Prediction_Confirmed"]
print(RMSE_ls)
rmsecc.append(RMSE_ls)
lscc

In [None]:
# Same estimator refitted on fatalities; RMSE_ls is overwritten with the
# fatalities score before being stored.
ls.fit(X_train2, y_train2)
y_pred = ls.predict(X_test2)
RMSE_ls = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
lsfc = pd.DataFrame(y_pred)
lsfc.columns = ["Prediction_Fatalities"]
print(RMSE_ls)
rmsefc.append(RMSE_ls)
lsfc

In [None]:
# NOTE(review): lscc/lsfc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([lscc,lsfc,sub],axis=1)
submit.head()

# Linear Regression

In [None]:
# Ordinary least squares on confirmed cases: fit on the training split,
# predict the hold-out split, and record the hold-out RMSE.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
RMSE_lr = sqrt(mean_squared_error(y_true = y_test, y_pred = y_pred))
lrcc = pd.DataFrame(y_pred)
lrcc.columns = ["Prediction_Confirmed"]
print(RMSE_lr)
rmsecc.append(RMSE_lr)
lrcc

In [None]:
# Same estimator refitted on fatalities; RMSE_lr is overwritten with the
# fatalities score before being stored.
lr.fit(X_train2, y_train2)
y_pred = lr.predict(X_test2)
RMSE_lr = sqrt(mean_squared_error(y_true = y_test2, y_pred = y_pred))
lrfc = pd.DataFrame(y_pred)
lrfc.columns = ["Prediction_Fatalities"]
print(RMSE_lr)
rmsefc.append(RMSE_lr)
lrfc

In [None]:
# NOTE(review): lrcc/lrfc hold predictions for the hold-out rows of the
# training data, not for the competition test rows. Concatenating them
# column-wise with ForecastId pairs unrelated rows and pads the shorter
# frames with NaN - confirm before using this as a submission.
sub = submission[["ForecastId"]]
submit = pd.concat([lrcc,lrfc,sub],axis=1)
submit.head()

# ARIMA Model

In [None]:
pip install pmdarima

In [None]:
# Checking for the best model: automatic ARIMA order search on the
# confirmed-cases training series; trace=True prints each candidate tried.
from pmdarima import auto_arima
stepwise_fit = auto_arima(traincc, trace=True,
suppress_warnings=True)

In [None]:
# Checking for the best model: same search on the fatalities training
# series. This rebinds `stepwise_fit`, so the confirmed-cases result is no
# longer reachable under that name afterwards.
from pmdarima import auto_arima
stepwise_fit = auto_arima(trainfc, trace=True,
suppress_warnings=True)

In [None]:
def mape(y2, y_pred):
    """Return the root-mean-squared error between ``y2`` and ``y_pred``.

    NOTE: despite its name this is not a mean absolute percentage error;
    it is the square root of scikit-learn's ``mean_squared_error``.
    """
    return sqrt(mean_squared_error(y_true = y2, y_pred = y_pred))

#Arima modeling for ts
def arima(ts,test):
    """Pick an ARIMA order by AIC, forecast ``len(test)`` steps and plot them.

    Every (p, d, q) combination with each value in 0..5 is fitted on ``ts``;
    the fit with the lowest AIC is kept (on ties the later order wins).
    The function plots the in-sample/out-of-sample prediction, plots the
    forecast against ``test``, prints the model summary and the RMSE.

    Parameters
    ----------
    ts : training series passed to ``ARIMA``.
    test : held-out values; only its length and values for plotting and
        scoring are used.

    Returns
    -------
    The forecast for the ``len(test)`` steps following ``ts``.

    Raises
    ------
    ValueError
        If none of the candidate orders can be fitted.
    """
    best_aic = float('inf')
    best_fit = None

    #Determining the best parameters
    for order in itertools.product(range(0, 6), repeat=3):
        try:
            candidate = ARIMA(ts, order=order).fit()
        except Exception:
            # Many orders cannot be estimated on a short series; skip them.
            # Unlike a bare except, this lets KeyboardInterrupt through.
            continue
        if candidate.aic <= best_aic:
            best_aic = candidate.aic
            best_fit = candidate

    if best_fit is None:
        raise ValueError('no ARIMA order in the search grid could be fitted')

    #Modeling: reuse the best fit instead of estimating the same order again
    result = best_fit
    result.plot_predict(start=int(len(ts) * 0.7), end=int(len(ts) * 1.2))
    pred = result.forecast(steps=len(test))[0]
    #Plotting results
    f, ax = plt.subplots()
    plt.plot(pred, c='green', label='predictions')
    plt.plot(test, c='red', label='real values')
    plt.legend()
    plt.title('True vs predicted values')
    #Printing the error metrics
    print(result.summary())

    # The helper returns a root-mean-squared error, so label it as such.
    print('\nRoot mean squared error: %f' % mape(test, pred))
    return pred

In [None]:
pred=arima(traincc,testcc)
RMSE_arimacc=sqrt(mean_squared_error(y_true = testcc, y_pred = pred))
rmsecc.append(RMSE_arimacc)

In [None]:
pred=arima(trainfc,testfc)
RMSE_arimafc=sqrt(mean_squared_error(y_true = testfc, y_pred = pred))
rmsefc.append(RMSE_arimafc)

# Comparative table of the different models

In [None]:
rmsecc

In [None]:
rmsefc

In [None]:
# Comparison table for confirmed cases. The label list must follow the order
# in which the scores were appended to rmsecc (AR, random forest, XGBoost,
# decision tree, AdaBoost, Lasso, linear regression, ARIMA). Re-running any
# model cell appends another score, and the differing lengths then make
# this constructor fail.
tablecc=pd.DataFrame({'RMSE':rmsecc,'Algorithmes':['AR Model','RandomForestRegressor','XGBRegressor','DecisionTreeRegressor',
                                                   'AdaBoostRegressor','Lasso','LinearRegression','ARIMA']})
tablecc

# Note: ARIMA is the algorithm with the highest performance for the prediction of confirmed cases 

In [None]:
# Comparison table for fatalities. The label list must follow the order in
# which the scores were appended to rmsefc (AR, random forest, XGBoost,
# decision tree, AdaBoost, Lasso, linear regression, ARIMA). Re-running any
# model cell appends another score, and the differing lengths then make
# this constructor fail.
tablefc=pd.DataFrame({'RMSE':rmsefc,'Algorithmes':['AR Model','RandomForestRegressor','XGBRegressor','DecisionTreeRegressor',
                                                   'AdaBoostRegressor','Lasso','LinearRegression','ARIMA']})
tablefc

# Note: XGBoost is the highest performing algorithm for deaths prediction