In [None]:
# Number of lag columns built from the series, and the size of the sliding window used when forecasting.
CONTEXT = 21

In [None]:
import warnings

warnings.filterwarnings("ignore")
from zipfile import ZipFile # For reading the data from within the zip file

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as ps
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib.style import use
from sklearn.ensemble import (AdaBoostRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (BayesianRidge, ElasticNet, HuberRegressor,
                                  Lars, Lasso, LinearRegression, Ridge)
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.utils import shuffle
from xgboost import XGBRegressor

use("ggplot")

%matplotlib inline


In [None]:
# Read the CSV directly from inside the zip archive; only the parsed frame `raw` is kept.
with ZipFile("data/archive.zip", mode="r") as archive, archive.open(
	"England_Regions_COVID_Dataset.csv"
) as csv_stream:
	raw = pd.read_csv(csv_stream)

## Data Preprocessing

In [None]:
def get_cases(region, frame=None) -> pd.Series:
	"""Return the new-case counts for one region, ordered by date.

	Parameters
	----------
	region : str
		Value matched exactly against the ``areaName`` column.
	frame : pd.DataFrame, optional
		Table with ``date``, ``areaName`` and ``newCasesByPublishDate``
		columns. Defaults to the module-level ``raw`` frame.

	Returns
	-------
	pd.Series
		``newCasesByPublishDate`` for the matching rows, sorted by ``date``,
		with missing counts replaced by 0 and a fresh 0..n-1 index so that
		series of different regions line up row-by-row in one frame.
	"""
	source = raw if frame is None else frame

	# Plain pandas filter: no SQL string is built from `region`, and the
	# table is not copied into a database on every call.
	rows = source.loc[source["areaName"] == region, ["date", "newCasesByPublishDate"]]
	ordered = rows.sort_values("date", kind="stable")

	return ordered["newCasesByPublishDate"].fillna(0).reset_index(drop=True)

In [None]:
# Column label used in `data` -> region name passed to get_cases().
region_by_column = {
	"London": "London",
	"South West": "South West",
	"North West": "North West",
	"East England": "East of England",
	"East Midlands": "East Midlands",
}

data = pd.DataFrame()
for column_label, area_name in region_by_column.items():
	# Column-by-column assignment: the first series fixes the index, later ones are aligned to it.
	data[column_label] = get_cases(area_name)

# Replace any remaining NaN with the mean of its column, keeping labels and index.
imp = SimpleImputer(strategy="mean")
data = pd.DataFrame(imp.fit_transform(data), columns=data.columns, index=data.index)

In [None]:
# One line per region column, drawn against the frame's row index.
fig, ax = plt.subplots(figsize=(24, 10))

for column in data:
	ax.plot(data[column], label=column)

ax.set_title("New cases by publish date per region")
ax.set_xlabel("Row index (records ordered by date)")
ax.set_ylabel("New cases")
ax.legend()
plt.show()

In [None]:
# Build the supervised table from the London series without mutating a slice of `data`:
# column k holds the series shifted down by k rows (k = 0 is the unshifted series).
lagged = pd.concat(
	[data["London"].shift(lag) for lag in range(CONTEXT + 1)],
	axis=1,
	keys=range(CONTEXT + 1),
).dropna()  # the first CONTEXT rows contain NaN from shifting and are dropped

# Target is the column with the largest shift; features are columns 0..CONTEXT-1.
# NOTE(review): this makes the target the *oldest* value in each row and the features the
# more recent ones. Confirm this direction is intended before changing the layout, since
# forecast() reads the first feature column as the raw series.
y = lagged[CONTEXT]
X = lagged.drop(columns=CONTEXT).values

# Models

In [None]:
# Seeded so the row order (and everything fitted downstream) is identical on every Restart & Run All.
# NOTE(review): X and y are overwritten here, so their rows are no longer in date order after this cell.
X, y = shuffle(X, y, random_state=0)

In [None]:
# Fixed seed keeps the train/validation split reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

In [None]:
def predict(model, y, x):
    """Score a fitted model on ``(x, y)`` and return its predictions ordered for plotting.

    Prints ``"<ClassName> MAE: <value>"`` for the model, then returns an array
    of shape ``(3, len(y))`` whose rows are:

    0. the positions ``0..len(y)-1``,
    1. the true values, and
    2. the predicted values,

    with rows 1 and 2 both sorted by ascending predicted value.
    """
    # Predict once; the same array feeds the sort and the MAE.
    y_pred = np.asarray(model.predict(x))
    y_true = np.asarray(y)

    # Stable sort keeps the input order among equal predictions.
    order = np.argsort(y_pred, kind="stable")
    predictions = np.array([np.arange(len(y_true)), y_true[order], y_pred[order]])

    tp = type(model).__name__
    print(f"{tp} MAE: {mean_absolute_error(y, y_pred)}")
    return predictions

In [None]:
def get_predictions(model, X_train, X_valid, y_train, y_valid, fit_params: dict = {}, **params):
	"""Fit one model and plot its validation predictions against the targets.

	Parameters
	----------
	model : callable
		Estimator class (or factory); called as ``model(**params)``.
	X_train, y_train
		Data passed to ``fit``.
	X_valid, y_valid
		Data passed to ``predict()`` for scoring and plotting.
	fit_params : dict
		Extra keyword arguments for ``fit``. Only unpacked, never mutated.
	**params
		Keyword arguments for the estimator constructor.

	Returns
	-------
	The fitted estimator instance.
	"""
	# Build the estimator once; its class name labels the plot.
	estimator = model(**params)
	tp = type(estimator).__name__

	estimator.fit(X_train, y_train, **fit_params)

	# Rows of `predictions`: positions, true values, predicted values (see predict()).
	predictions = predict(estimator, y_valid, X_valid)

	plt.figure(figsize=(24, 4))

	plt.title(f"{tp}: Predictions to Targets")
	plt.scatter(predictions[0], predictions[1], label="True Values", marker="v")
	plt.scatter(predictions[0], predictions[2], label="Predicted Values", marker="x")
	plt.legend()
	plt.show()

	return estimator

## Model Selection: Loss

The following cells fit each candidate model, print its mean absolute error on the validation data, and plot its predictions against the true validation labels (points are ordered by predicted value).

In [None]:
# Each entry: (model class, keyword arguments for fit(), keyword arguments for the constructor).
candidates = [
	(AdaBoostRegressor, {}, {}),
	(BayesianRidge, {}, {}),
	(CatBoostRegressor, {"verbose": False}, {}),
	(DecisionTreeRegressor, {}, {}),
	(ElasticNet, {}, {}),
	(ExtraTreeRegressor, {}, {}),
	(GradientBoostingRegressor, {}, {}),
	(HuberRegressor, {}, {}),
	(KNeighborsRegressor, {}, {}),
	(Lars, {}, {}),
	(Lasso, {}, {}),
	(LGBMRegressor, {}, {}),
	(LinearRegression, {}, {}),
	(MLPRegressor, {}, {}),
	(RandomForestRegressor, {}, {}),
	(Ridge, {}, {}),
	(SVR, {}, {"kernel": "poly"}),
	(XGBRegressor, {}, {}),
]

# Same train/validation data for every candidate; the loop also avoids echoing the last model's repr.
for model_class, fit_kwargs, init_kwargs in candidates:
	get_predictions(model_class, X_train, X_valid, y_train, y_valid, fit_kwargs, **init_kwargs)

# Forecasting

In [None]:
def forecast(length: int, X, y, model: type = HuberRegressor, init_params:dict = {}, fit_params: dict = {}, history=None):
	"""Fit a model on ``(X, y)`` and roll it forward ``length`` steps.

	Parameters
	----------
	length : int
		Number of values to forecast.
	X, y
		Training features and targets passed to ``fit``.
	model : type
		Estimator class; instantiated as ``model(**init_params)``.
	init_params, fit_params : dict
		Keyword arguments for the constructor and for ``fit``. Only unpacked,
		never mutated.
	history : array-like, optional
		Observed series in chronological order; its last ``CONTEXT`` values
		seed the sliding window. Defaults to the first column of ``X``, which
		is only meaningful if the rows of ``X`` are still in chronological
		order.

	Returns
	-------
	list of float
		The ``length`` forecast values, in order.

	Notes
	-----
	The window is fed to ``predict`` oldest value first, and each prediction
	is appended as the newest value.
	NOTE(review): confirm this matches the column order of the ``X`` the
	model is fitted on.
	"""
	estimator = model(**init_params)
	estimator.fit(X, y, **fit_params)

	if history is None:
		history = np.asarray(X)[:, 0]
	window = [float(value) for value in np.ravel(history)[-CONTEXT:]]

	prediction = []
	for _ in range(length):
		# predict() returns a length-1 array; keep only the scalar so `window` stays a flat
		# list of floats (appending the array makes a ragged input that NumPy rejects).
		step = float(np.ravel(estimator.predict([window]))[0])
		prediction.append(step)
		window = window[1:] + [step]

	return prediction

# X was shuffled earlier in the notebook, so seed the window from the date-ordered London series
# instead of X's first column.
forecast(10, X, y, history=data["London"])