In [0]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from itertools import combinations
import statsmodels.api as sm

In [0]:
# Source file on the Databricks file store and the reader format to use.
file_location = "/FileStore/tables/COVID_19-991e0.csv"
file_type = "csv"

# Reader options (CSV-specific; other formats ignore them).
# Schema inference is off, so every column is loaded as a string.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Build the reader step by step, then load the file into a Spark DataFrame.
csv_reader = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
)
df = csv_reader.load(file_location)

display(df)

In [0]:
# Convert the Spark frame to pandas and keep only the five most populous
# Western European countries, with the columns needed for the analysis.
selected_countries = ['France', 'Germany', 'Italy', 'Spain', 'United_Kingdom']

covid = (
    df.toPandas()
    .loc[lambda frame: frame['countriesAndTerritories'].isin(selected_countries),
         ['dateRep', 'cases', 'deaths', 'countriesAndTerritories', 'popData2018']]
    .rename(columns={'dateRep': 'date',
                     'countriesAndTerritories': 'country',
                     'popData2018': 'population'})
    .reset_index(drop=True)
    # The CSV is read without schema inference, so the counts arrive as strings.
    .astype({"cases": int, "deaths": int, "population": int})
)

# Get rid of negative values by taking the absolute value.
# NOTE(review): negative counts are presumably retrospective corrections in the
# source data; confirm that flipping their sign is the intended treatment.
covid['cases'] = covid['cases'].abs()
covid['deaths'] = covid['deaths'].abs()

# Convert the date strings (day/month/year) to datetime.
covid['date'] = pd.to_datetime(covid['date'], format='%d/%m/%Y')

# Keep observations after 2020-02-15. The explicit copy makes the column
# assignments below write to an independent frame instead of a filtered slice
# (avoids SettingWithCopyWarning).
covid = covid.loc[covid['date'] > '2020-02-15'].copy()

# Daily counts normalised per 100,000 inhabitants.
covid['cases_per_100k'] = covid['cases'] / covid['population'] * 100000
covid['deaths_per_100k'] = covid['deaths'] / covid['population'] * 100000

covid.head()

In [0]:
# Every unordered pair of the countries present in the data, each stored as a
# two-element list [first, second]. The pairs are built directly from
# `combinations` rather than by joining names into a string and splitting them
# again, so country names containing a comma cannot be broken apart.
countries = list(covid['country'].unique())
country_pairs = [list(pair) for pair in combinations(countries, 2)]

In [0]:
# One figure per country pair: daily cases per 100k for both countries.
for first, second in country_pairs:
    fig, ax = plt.subplots(figsize=(20, 10))

    # Each line carries its own label, so the legend does not depend on the
    # order in which artists were added to the axes.
    for name, colour in ((first, 'r'), (second, 'g')):
        sns.lineplot(x='date',
                     y='cases_per_100k',
                     color=colour, linestyle='-', marker='o',
                     label=name,
                     data=covid[covid['country'] == name],
                     ax=ax)

    ax.set_title(f'COVID-19 Cases per 100k, {first} vs {second}')
    ax.legend()

    plt.xticks(rotation=45)
    plt.show()
    # Release the figure so repeated runs of the loop do not accumulate open figures.
    plt.close(fig)
    

In [0]:
# Single chart comparing daily cases per 100k across all selected countries,
# one coloured line per country.
fig, ax = plt.subplots(figsize=(20, 10))

chart = sns.lineplot(data=covid,
                     x='date',
                     y='cases_per_100k',
                     hue='country',
                     linestyle='-',
                     marker='o',
                     ax=ax)
chart.set_title('COVID-19 Cases per 100k')

plt.xticks(rotation=45)
plt.show()

In [0]:
# Candidate values (0 or 1) for each of the non-seasonal orders p, d and q.
p = d = q = range(2)

# All (p, d, q) combinations to try.
pdq = [combo for combo in itertools.product(p, d, q)]

# The same combinations extended with a seasonal period of 12.
# NOTE(review): the series is daily, so a period of 12 may not match any real
# cycle in the data - confirm the intended seasonality.
seasonal_pdq = [(*combo, 12) for combo in pdq]

In [0]:
# Daily case counts for Italy, indexed by date. Sorting by the index guarantees
# chronological order for the time-series model regardless of the row order in
# the source file (reversing the rows is only correct for newest-first input).
y = (
    covid.loc[covid['country'] == 'Italy', ['date', 'cases']]
    .set_index('date')
    .sort_index()
)

In [0]:
# Grid search over all (order, seasonal_order) combinations, keeping the
# SARIMAX specification with the lowest AIC.
# Start from infinity so that the best candidate is always recorded, whatever
# the scale of the AIC values (a fixed threshold could leave p / p_s unset).
aic = float('inf')
best_order = None
best_seasonal_order = None

for param, param_seasonal in itertools.product(pdq, seasonal_pdq):
    try:
        candidate = sm.tsa.statespace.SARIMAX(y,
                                              order=param,
                                              seasonal_order=param_seasonal,
                                              enforce_stationarity=False,
                                              enforce_invertibility=False).fit(disp=False)
    except Exception:
        # Some combinations cannot be estimated; skip them (best effort).
        # Catching Exception rather than everything keeps Ctrl-C working.
        continue

    # A NaN AIC compares False here and is therefore ignored.
    if candidate.aic < aic:
        aic = candidate.aic
        best_order = param
        best_seasonal_order = param_seasonal

if best_order is None:
    raise RuntimeError('No SARIMAX specification in the grid could be fitted')

# Names read by the following cells.
p = best_order
p_s = best_seasonal_order

print(aic, p, p_s)

In [0]:
# Refit the model using the order / seasonal order selected by the grid search.
sarimax_options = dict(
    order=p,
    seasonal_order=p_s,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = sm.tsa.statespace.SARIMAX(y, **sarimax_options)
results = mod.fit()

In [0]:
# Non-dynamic predictions from 2020-02-16 onwards, with their confidence
# interval, drawn on top of the observed series.
pred = results.get_prediction(start=pd.Timestamp('2020-02-16'), dynamic=False)
pred_ci = pred.conf_int()

ax = y.plot(label='Observed')
pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7, figsize=(20, 10))

# Shade the band between the first and second columns of the interval frame.
lower_bound = pred_ci.iloc[:, 0]
upper_bound = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, lower_bound, upper_bound, color='k', alpha=.2)

ax.set(xlabel='Date', ylabel='COVID-19 cases')
ax.legend()
plt.show()

In [0]:
# Forecast the next 30 steps beyond the end of the observed series and plot it
# together with the observations.
pred_uc = results.get_forecast(steps=30)
pred_ci = pred_uc.conf_int()
ax = y.plot(label='Observed', figsize=(20, 10))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')

# Draw the forecast confidence interval (it was computed but never shown).
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('COVID-19 cases')
plt.legend()
plt.show()

In [0]:
# One figure per country pair: daily deaths per 100k for both countries.
for first, second in country_pairs:
    fig, ax = plt.subplots(figsize=(20, 10))

    # Each line carries its own label, so the legend does not depend on the
    # order in which artists were added to the axes.
    for name, colour in ((first, 'r'), (second, 'g')):
        sns.lineplot(x='date',
                     y='deaths_per_100k',
                     color=colour, linestyle='-', marker='o',
                     label=name,
                     data=covid[covid['country'] == name],
                     ax=ax)

    ax.set_title(f'COVID-19 Deaths per 100k, {first} vs {second}')
    ax.legend()

    plt.xticks(rotation=45)
    plt.show()
    # Release the figure so repeated runs of the loop do not accumulate open figures.
    plt.close(fig)

In [0]:
# Single chart comparing daily deaths per 100k across all selected countries,
# one coloured line per country.
fig, ax = plt.subplots(figsize=(20, 10))

chart = sns.lineplot(data=covid,
                     x='date',
                     y='deaths_per_100k',
                     hue='country',
                     linestyle='-',
                     marker='o',
                     ax=ax)
chart.set_title('COVID-19 Deaths per 100k')

plt.xticks(rotation=45)
plt.show()

In [0]:
# Reset the candidate values (0 or 1) for the non-seasonal orders p, d and q;
# `p` is rebound here to the range of candidates.
p = d = q = range(2)

# All (p, d, q) combinations to try.
pdq = [combo for combo in itertools.product(p, d, q)]

# The same combinations extended with a seasonal period of 12.
# NOTE(review): the series is daily, so a period of 12 may not match any real
# cycle in the data - confirm the intended seasonality.
seasonal_pdq = [(*combo, 12) for combo in pdq]

In [0]:
# Daily death counts for Italy, indexed by date. Sorting by the index guarantees
# chronological order for the time-series model regardless of the row order in
# the source file (reversing the rows is only correct for newest-first input).
y = (
    covid.loc[covid['country'] == 'Italy', ['date', 'deaths']]
    .set_index('date')
    .sort_index()
)

In [0]:
# Grid search over all (order, seasonal_order) combinations, keeping the
# SARIMAX specification with the lowest AIC.
# Start from infinity so that the best candidate is always recorded; with a
# fixed threshold, p / p_s could silently keep values from an earlier search.
aic = float('inf')
best_order = None
best_seasonal_order = None

for param, param_seasonal in itertools.product(pdq, seasonal_pdq):
    try:
        candidate = sm.tsa.statespace.SARIMAX(y,
                                              order=param,
                                              seasonal_order=param_seasonal,
                                              enforce_stationarity=False,
                                              enforce_invertibility=False).fit(disp=False)
    except Exception:
        # Some combinations cannot be estimated; skip them (best effort).
        # Catching Exception rather than everything keeps Ctrl-C working.
        continue

    # A NaN AIC compares False here and is therefore ignored.
    if candidate.aic < aic:
        aic = candidate.aic
        best_order = param
        best_seasonal_order = param_seasonal

if best_order is None:
    raise RuntimeError('No SARIMAX specification in the grid could be fitted')

# Names read by the following cells.
p = best_order
p_s = best_seasonal_order

print(aic, p, p_s)

In [0]:
# Refit the model using the order / seasonal order selected by the grid search.
sarimax_options = dict(
    order=p,
    seasonal_order=p_s,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = sm.tsa.statespace.SARIMAX(y, **sarimax_options)
results = mod.fit()

In [0]:
# Non-dynamic predictions from 2020-02-16 onwards, with their confidence
# interval, drawn on top of the observed series.
pred = results.get_prediction(start=pd.Timestamp('2020-02-16'), dynamic=False)
pred_ci = pred.conf_int()

ax = y.plot(label='Observed')
pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7, figsize=(20, 10))

# Shade the band between the first and second columns of the interval frame.
lower_bound = pred_ci.iloc[:, 0]
upper_bound = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, lower_bound, upper_bound, color='k', alpha=.2)

ax.set(xlabel='Date', ylabel='COVID-19 Deaths')
ax.legend()
plt.show()

In [0]:
# Forecast the next 30 steps beyond the end of the observed series and plot it
# together with the observations.
pred_uc = results.get_forecast(steps=30)
pred_ci = pred_uc.conf_int()
ax = y.plot(label='Observed', figsize=(20, 10))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')

# Draw the forecast confidence interval (it was computed but never shown).
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)

ax.set_xlabel('Date')
# This chart shows the deaths series, so the axis is labelled accordingly.
ax.set_ylabel('COVID-19 Deaths')
plt.legend()
plt.show()