# Table of Contents
- [Introduction](#introduction)
- [The Data](#the-data)
- [Analysis](#analysis)


In [None]:
#utilities
import numpy as np
import pandas as pd
from datetime import datetime

#modeling
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from pandas.plotting import autocorrelation_plot

#visuals
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Lift pandas' display truncation for both columns and rows (None = no limit).
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Introduction<a class="anchor" id="introduction"></a>

## The Data<a class="anchor" id="the-data"></a>

The data comes from:
* COVID-19 Data Repository by the Center for Systems Science and Engineering (CSSE) at Johns Hopkins University (https://github.com/CSSEGISandData/COVID-19)
* Worldometer (https://www.worldometers.info/)

Conveniently organized for use on Kaggle in the following datasets:<br>
https://www.kaggle.com/antgoldbloom/covid19-data-from-john-hopkins-university <br>
https://www.kaggle.com/okwirjulius/covid19-cases-in-africa <br>
https://www.kaggle.com/headsortails/covid19-us-county-jhu-data-demographics

In [None]:
# Worldometer population statistics
worldpop = pd.read_csv(
    '../input/covid19-cases-in-africa/World_population(2020).csv'
)

# JHU data at US county level: county statistics first, then the COVID records
us_county = pd.read_csv(
    '../input/covid19-us-county-jhu-data-demographics/us_county.csv'
)
us_covid = pd.read_csv(
    '../input/covid19-us-county-jhu-data-demographics/covid_us_county.csv'
)

In [None]:
# Johns Hopkins University (JHU) data at global level - until 12/29/20
# Three "CONVENIENT" tables: metadata, confirmed cases, and deaths.
jhu_global_meta = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_global_metadata.csv'
)
jhu_global_confirmed = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_global_confirmed_cases.csv'
)
jhu_global_deaths = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/CONVENIENT_global_deaths.csv'
)


In [None]:
# Raw JHU global deaths table, exactly as downloaded from the repository
jhu_raw_deaths = pd.read_csv(
    '/kaggle/input/covid19-data-from-john-hopkins-university/RAW_global_deaths.csv'
)

In [None]:
# Preview the first five rows of the global confirmed-cases table
jhu_global_confirmed.head(n=5)

In [None]:
# Check the last updated date: show only the first four and last four columns
# of the raw deaths table for its leading rows.
edge_columns = [0, 1, 2, 3, -4, -3, -2, -1]
jhu_raw_deaths.iloc[:, edge_columns].head()

In [None]:
# Look for missing values, ignoring the province and coordinate columns:
# list every row that has an NA in any of the remaining columns.
non_geo_columns = jhu_raw_deaths.drop(columns=['Province/State', 'Lat', 'Long'])
has_missing = non_geo_columns.isna().any(axis=1)
jhu_raw_deaths[has_missing]

## Analysis<a class="anchor" id="analysis"></a>

### Autoregression Model

In [None]:
k = 14  # prediction horizon: number of future periods to forecast

# Build the US series indexed by date. Row 0 is skipped (it is treated as a
# second header row); the remaining 'Country/Region' entries are parsed as
# m/d/yy dates and become the index.
dates = pd.to_datetime(jhu_global_confirmed['Country/Region'][1:], format="%m/%d/%y")
tempdata = pd.DataFrame(jhu_global_confirmed['US'][1:])
tempdata.index = dates
tempdata.index.freq = 'D'  # declare a daily frequency on the date index
tempdata = tempdata.rename_axis('Date')

# fit an autoregression on lags 1, 10 and 30
model = AutoReg(tempdata, lags=[1, 10, 30])
model_fit = model.fit()

# Forecast exactly k steps past the sample. `end` is inclusive, so the last
# forecast step is n_obs + k - 1 (n_obs + k would produce k + 1 points).
n_obs = len(tempdata)
yhat = model_fit.predict(start=n_obs, end=n_obs + k - 1)

# Prepend the last recorded data point so the forecast line joins the history.
# pd.concat is used because Series.append was removed in pandas 2.0.
last_observed = pd.Series(tempdata.tail(1).squeeze(), index=[tempdata.index[-1]])
pyhat = pd.concat([last_observed, yhat])
ax = tempdata.plot(figsize=(15, 10))
pyhat.plot(marker='o', color='C1', ax=ax);
model_fit.summary()

In [None]:
# Diagnostic plots for the autoregression fit currently held in `model_fit`
diagnostics_size = (15, 10)
model_fit.plot_diagnostics(figsize=diagnostics_size);

The residuals are approximately normally distributed. However, they do not exhibit homoskedasticity, suggesting that some trend is not captured by the model.

### Autoregressive Integrated Moving Average (ARIMA)

In [None]:
# ARIMA
k = 14  # prediction horizon: number of future periods to forecast

# Build the US series indexed by date. Row 0 is skipped (it is treated as a
# second header row); the remaining 'Country/Region' entries are parsed as
# m/d/yy dates and become the index.
dates = pd.to_datetime(jhu_global_confirmed['Country/Region'][1:], format="%m/%d/%y")
tempdata = pd.DataFrame(jhu_global_confirmed['US'][1:])
tempdata.index = dates
tempdata.index.freq = 'D'  # declare a daily frequency on the date index
tempdata = tempdata.rename_axis('Date')

# fit an ARIMA(p=3, d=2, q=1) model
model = ARIMA(tempdata, order=(3, 2, 1))
model_fit = model.fit()

# Forecast exactly k steps past the sample. `end` is inclusive, so the last
# forecast step is n_obs + k - 1 (n_obs + k would produce k + 1 points).
# No `typ='levels'` here: that keyword belongs to the legacy arima_model API
# and is not a recognised argument of this ARIMA's predict().
n_obs = len(tempdata)
yhat = model_fit.predict(start=n_obs, end=n_obs + k - 1)



In [None]:
# Plot the history together with the forecast. The last recorded data point is
# prepended to the forecast so the two lines join without a gap.
# pd.concat is used because Series.append was removed in pandas 2.0.
last_observed = pd.Series(tempdata.tail(1).squeeze(), index=[tempdata.index[-1]])
pyhat = pd.concat([last_observed, yhat])
ax = tempdata.plot(figsize=(15, 10))
pyhat.plot(marker='o', color='C1', ax=ax);
model_fit.summary()

In [None]:
# Tabulate the k-step out-of-sample forecast (mean, standard error and
# confidence bounds) instead of echoing the bare prediction-results object.
# `end` is inclusive, so k steps run from n_obs through n_obs + k - 1.
n_obs = len(tempdata)
model_fit.get_prediction(start=n_obs, end=n_obs + k - 1).summary_frame()

In [None]:
# Diagnostic plots for the ARIMA fit currently held in `model_fit`
diagnostics_size = (15, 10)
model_fit.plot_diagnostics(figsize=diagnostics_size);

In [None]:
# Interactive line chart of the US series: move the 'Date' index back into a
# column so plotly can use it as the x axis.
us_by_date = tempdata.reset_index()
fig = px.line(us_by_date, x='Date', y='US')
fig.show()

In [None]:
# Autocorrelation plot of the US series
autocorrelation_plot(series=tempdata)