In [None]:
import numpy as np 
import pandas as pd 

### Prepare pytest for unit testing

In [None]:
!pip install ipytest

In [None]:
import pytest
import ipytest

# Apply ipytest's default notebook configuration before any test cell runs.
ipytest.autoconfig()

In [None]:
# Load the per-country daily report.
full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')

# Keep only the United Kingdom rows and the two columns used below.
# Rows and columns are selected in a single .loc call and copied explicitly:
# a later cell assigns a new column to `only_death`, and doing that on the
# result of chained selections triggers pandas' SettingWithCopyWarning.
is_uk = full_grouped['Country/Region'] == 'United Kingdom'
only_death = full_grouped.loc[is_uk, ['Date', 'New deaths']].copy()

### Check that all dates and New deaths in dataset are valid

In [None]:
%%run_pytest[clean]

def test_valid_dates():
    """Every 'Date' entry of `only_death` must parse to a non-null timestamp."""
    parsed_dates = pd.to_datetime(only_death['Date'])
    assert not parsed_dates.isnull().any()
    
def test_valid_new_deaths():
    """'New deaths' must contain neither missing values nor negative counts."""
    new_deaths = only_death['New deaths']
    assert not new_deaths.isnull().any()
    assert not (new_deaths < 0).any()

## Linear regression

In [None]:
# Random 80/20 split; the fixed random_state keeps the sampled rows the same
# on every run.
trainDataset = only_death.sample(random_state=0, frac=0.8)
# Every row that was not drawn for training goes to the test set.
testDataset = only_death.drop(index=trainDataset.index)

In [None]:
import datetime

DATE_FORMAT = '%Y-%m-%d'
ONE_DAY = datetime.timedelta(days=1)

# Parse the training dates; the earliest of them becomes day 0 of the time axis.
train_dates = pd.Series(
    [datetime.datetime.strptime(text, DATE_FORMAT) for text in trainDataset['Date']])
start_date = train_dates.min()


def _days_since_start(moments):
    """Return a Series with the (fractional) number of days from `start_date` to each moment."""
    return pd.Series([(moment - start_date) / ONE_DAY for moment in moments])


# Model input is the day number, the target is the daily death count.
trainInput = _days_since_start(train_dates)
trainTarget = trainDataset['New deaths']

# Test dates are measured from the same origin as the training dates.
testInput = _days_since_start(
    datetime.datetime.strptime(text, DATE_FORMAT) for text in testDataset['Date'])
testTarget = testDataset['New deaths']

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# One Dense unit with a bias on a single input feature: y = w1 * x + w0,
# i.e. ordinary linear regression on the day number.
model = keras.Sequential()
model.add(keras.layers.Dense(units=1, use_bias=True, input_shape=(1,)))

In [None]:
# Adam hyper-parameters, gathered in one place so they are easy to tune.
adam_settings = dict(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.99,
    epsilon=1e-05,
    amsgrad=False,
    name='Adam',
)
optimizer = keras.optimizers.Adam(**adam_settings)

# Model compiling settings: optimise MSE, additionally track MAE and MSE.
model.compile(optimizer=optimizer, loss='mse', metrics=['mae', 'mse'])

In [None]:
n_idle_epochs = 100
n_epochs = 200

# Stop once val_loss has not improved by at least min_delta for
# `n_idle_epochs` consecutive epochs.
earlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.01,
    patience=n_idle_epochs,
)

# 10% of the training rows are held out by Keras as the validation split.
history = model.fit(
    x=trainInput,
    y=trainTarget,
    batch_size=10,
    epochs=n_epochs,
    validation_split=0.1,
    callbacks=[earlyStopping],
    verbose=1,
)


In [None]:
import matplotlib.pyplot as plt

# `history.history` maps every tracked metric name to its per-epoch values.
print('keys:', history.history.keys())

# Per-epoch mean absolute error on the training and validation splits.
mae = np.asarray(history.history['mae'])
val_mae = np.asarray(history.history['val_mae'])

# One row per epoch, one column per curve.
num_values = len(mae)
values = np.column_stack((mae, val_mae)).astype(float)
steps = pd.RangeIndex(start=0, stop=num_values)
data = pd.DataFrame(values, steps, columns=["training-mae", "val-mae"])

# Plotting. The curves are MAE (the optimised loss is MSE), so the title and
# the y-axis are labelled as MAE rather than as "loss".
plt.figure(figsize=(20, 10))
plt.plot(data['training-mae'], label='train')
plt.plot(data['val-mae'], label='validation')
plt.title('Training and validation MAE', fontsize=18)
plt.ylabel('Mean absolute error', fontsize=18)
plt.xlabel('Epoch', fontsize=18)
plt.legend(prop={'size': 18})
plt.show()


In [None]:
# Mean absolute error of the linear model on the held-out rows.
predictions = model.predict(testInput).flatten()
metric = keras.metrics.MeanAbsoluteError()
# Keras metrics expect (y_true, y_pred) in this order. MAE is symmetric, so
# the value is unaffected, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
metric.update_state(testTarget, predictions)
metric.result().numpy()

In [None]:
model.summary()
# Take the model's only layer by position. The auto-generated name 'dense' is
# only valid for the first Dense layer created in a fresh kernel; re-running
# the model cell yields 'dense_1', 'dense_2', ... and get_layer('dense') fails.
layer = model.layers[0]
kernel, bias = layer.get_weights()
# For Dense(1) on a single input feature the kernel is (1, 1) and the bias is
# (1,). Index down to scalars before converting: float() on an array with
# ndim > 0 is deprecated in NumPy.
w1 = float(kernel[0, 0])  # slope
w0 = float(bias[0])       # intercept

In [None]:
# Real daily deaths (green) against the fitted line w1 * day + w0 (blue).
plot_dates = pd.to_datetime(only_death['Date'])
plt.plot(plot_dates, only_death['New deaths'], 'g', label="real")

# Day number of every row, counted from the first training date, pushed
# through the learned slope and intercept; stored for the later comparisons.
days_since_start = (plot_dates - start_date) / datetime.timedelta(days=1)
only_death['Linear'] = days_since_start * w1 + w0

plt.plot(plot_dates, only_death['Linear'], 'b', marker='.', label="linear")
plt.show()


## Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from keras.layers import Input, Dense
from keras.optimizers import Adam

# Fit one single-Dense-layer model per polynomial degree on the whole UK
# series. Day numbers and death counts are both divided by their maximum so
# that high powers of x stay numerically small.
trX, trY = pd.to_datetime(only_death['Date']), only_death['New deaths']
trX = (trX - trX.min()) / datetime.timedelta(days=1)
trX, trY = trX / trX.max(), trY / trY.max()

trs, models = [], []
ns = [3, 5, 10, 15]
for n in ns:
    # PolynomialFeatures(n) expands x into [1, x, x^2, ..., x^n]: n + 1 columns.
    poly = PolynomialFeatures(n)
    trX_expanded = poly.fit_transform(np.expand_dims(trX, axis=1))

    # The input shape must be a tuple; `(n+1)` without a trailing comma is
    # just the integer n + 1.
    inp = Input(shape=(n + 1,))
    out = Dense(1)(inp)
    model = keras.Model(inputs=inp, outputs=out)
    # `learning_rate` is the supported keyword (also used for the linear
    # model's optimizer); `lr` is a deprecated alias.
    model.compile(optimizer=Adam(learning_rate=1e-3), loss="mean_squared_error")

    model.fit(trX_expanded, trY, epochs=500)
    # Keep the fitted model and its expanded inputs for the plots below.
    models.append(model)
    trs.append(trX_expanded)

In [None]:
# Real series, every polynomial fit and the linear fit on one figure.
plt.figure(figsize=(20, 10))
dates = pd.to_datetime(only_death['Date'])
plt.plot(dates, only_death['New deaths'], 'g', label="real")

colors = ['violet', 'blue', 'yellow', 'orange']
# The polynomial models were trained on targets divided by this maximum.
death_scale = only_death['New deaths'].max()

polynomial_predictions = []
for m, n, trX_expanded, c in zip(models, ns, trs, colors):
    rescaled = m.predict(trX_expanded) * death_scale
    polynomial_predictions.append(rescaled)
    plt.plot(dates, rescaled, c, label=f'polynomial {n}')

plt.plot(dates, only_death['Linear'], 'cyan', marker='.', label="linear")
plt.xlabel('Date', fontsize=18)
plt.ylabel('New deaths', fontsize=18)
plt.legend(prop={'size': 18})
plt.show()


## Long short-term memory (LSTM)

In [None]:
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Same seed for NumPy and TensorFlow so the cells below are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Default size of every figure created from here on.
rcParams['figure.figsize'] = (16, 10)

In [None]:
def get_cases_by_date(data, country):
    """Return the 'New deaths' column of one country as a one-column DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the columns 'Country/Region' and 'New deaths'.
    country : str
        Value to look for in 'Country/Region'.

    Returns
    -------
    pandas.DataFrame or None
        The rows of `country` restricted to the 'New deaths' column, or
        None when the country does not occur in `data`.
    """
    # A boolean mask answers the membership question in one pass. Unlike
    # np.unique, it does not sort the column, so missing values mixed with
    # strings do not raise.
    country_rows = data['Country/Region'] == country
    if not country_rows.any():
        return None
    return data.loc[country_rows, ['New deaths']]

In [None]:
# Rebind `dates` to the raw 'Date' column of the UK frame; the cells below
# convert it with pd.to_datetime / np.array as needed.
dates = only_death['Date']


In [None]:
# Daily UK deaths over time.
date_axis = pd.to_datetime(dates)
plt.plot(date_axis, only_death['New deaths'], label='Deaths by date')
plt.legend();


### Data processing

In [None]:
# Flat arrays of the daily death counts and of their dates.
res = np.array(only_death['New deaths']).ravel()
dates = np.array(dates).ravel()

# Single-column frame 'dead', indexed by date.
df = pd.DataFrame({'dead': res}, index=dates, columns=['dead'])
print(df.head())

In [None]:
# Chronological split: the first 80% of the rows train, the remainder tests.
n_rows = len(df)
train_size = int(n_rows * 0.8)
test_size = n_rows - train_size
train = df.iloc[:train_size]
test = df.iloc[train_size:]

In [None]:
def create_dataset(X, y, time_steps=1):
    """Build sliding-window samples for sequence models.

    Window number k holds rows k .. k + time_steps - 1 of `X`; its target is
    the value of `y` at row k + time_steps. There are len(X) - time_steps
    windows (none when that number is not positive).

    Returns a pair of NumPy arrays: the stacked windows and their targets.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values
               for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
time_steps = 1

# reshape to [samples, time_steps, n_features]
# Show what one input window looks like, as a raw array and as a frame.
first_window = train.iloc[:time_steps]
print(first_window.values)
print(first_window)

# Sliding windows and next-day targets for both splits.
X_train, y_train = create_dataset(train, train.dead, time_steps)
X_test, y_test = create_dataset(test, test.dead, time_steps)

print(train)
print(X_train.shape, y_train.shape)

In [None]:
# LSTM with 128 units over (time_steps, n_features) windows, followed by a
# single linear output unit for the next-day death count.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
model = keras.Sequential([
    keras.layers.LSTM(128, input_shape=(n_timesteps, n_features)),
    keras.layers.Dense(1),
])
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='mean_squared_error',
)


In [None]:
# shuffle=False keeps the windows in chronological order during training;
# 10% of the training windows are held out by Keras for validation.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=1,
    epochs=100,
    shuffle=False,
    validation_split=0.1,
    verbose=1,
)

### Training and validation loss

In [None]:
# Training and validation loss curve of the LSTM, one point per epoch.
for curve_key, curve_label in (('loss', 'train'), ('val_loss', 'validation')):
    plt.plot(history.history[curve_key], label=curve_label)
plt.title('Training and validation loss', fontsize=18)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend();

In [None]:
# Predictions of the most recently fitted `model` for every test window.
y_pred = model.predict(X_test)

## Test predicted values

In [None]:
%%run_pytest[clean]

def test_predict():
    """A death count cannot be negative, so no prediction may be below zero."""
    assert not (y_pred < 0).any()

In [None]:
# create_dataset() uses the first `time_steps` rows of each split only as
# input, so the targets are the LAST len(y) rows of the split they come from:
# y_train ends right before row `train_size`, y_test ends at the last row of
# the series. Slicing the dates from the end keeps each value on its own date.
train_dates = pd.to_datetime(dates[train_size - len(y_train):train_size])
test_dates = pd.to_datetime(dates[len(dates) - len(y_test):])

plt.plot(train_dates, y_train, 'g', label="previous")
plt.plot(test_dates, y_test, marker='.', label="true")
plt.plot(test_dates, y_pred, 'r', label="prediction")
plt.title('Prediction on the background of previous values')
plt.ylabel('New deaths')
plt.xlabel('Date')
plt.legend()
plt.show()

In [None]:
# The targets in y_test are the last len(y_test) rows of the series
# (create_dataset() consumes the leading rows of the test split as input), so
# every curve is cut to that same trailing window. Slicing from len(y_train)
# instead would compare the predictions with values from earlier dates.
test_start = len(dates) - len(y_test)
test_dates = pd.to_datetime(dates[test_start:])
death_scale = only_death['New deaths'].max()

plt.plot(test_dates, y_test, 'magenta', marker='.', label="true")
plt.plot(test_dates, y_pred, 'r', label="prediction")

# Polynomial models were trained on targets divided by `death_scale`.
for m, n, trX_expanded, c in zip(models, ns, trs, colors):
    plt.plot(
        test_dates,
        m.predict(trX_expanded)[test_start:] * death_scale,
        c,
        linestyle=':',
        label=f'polynomial {n}')

plt.plot(
    test_dates,
    only_death['Linear'].iloc[test_start:].to_numpy(),
    'cyan',
    linestyle=':',
    label='linear'
)

plt.title('Prediction against the true values')
plt.ylabel('New deaths')
plt.xlabel('Date')
plt.xticks(rotation=40)
plt.legend()
plt.show()


## Models MAE comparison

In [None]:
def _fresh_mae(y_true, y_pred):
    """Return a new MeanAbsoluteError metric updated with one model's predictions.

    Keras metrics are stateful: every update_state() call is averaged together
    with all previous ones. A new metric object per model keeps each reported
    MAE independent of the models evaluated before it.
    """
    mae_metric = keras.metrics.MeanAbsoluteError()
    mae_metric.update_state(y_true, np.asarray(y_pred).flatten())
    return mae_metric


# y_test holds the last len(y_test) rows of the series, so the full-series
# predictions of the baseline models are cut to that same trailing window.
test_start = len(only_death) - len(y_test)

metric = _fresh_mae(y_test, y_pred)
print(F'LSTM MAE: {float(metric.result()):.2f}')

metric = _fresh_mae(y_test, only_death['Linear'].iloc[test_start:])
print(F'Linear regression MAE: {float(metric.result()):.2f}')

for m, n, trX_expanded, pred in zip(models, ns, trs, polynomial_predictions):
    metric = _fresh_mae(y_test, np.array(pred).flatten()[test_start:])
    print('Polynomial {} regression MAE {:.2f}'
          .format(n, float(metric.result())))


## Relation between steps count and prediction accuracy

In [None]:
# Rebuild the chronological 80/20 split used for the step-count experiment.
total_rows = len(df)
train_size = int(total_rows * 0.8)
test_size = total_rows - train_size
train, test = df.iloc[:train_size], df.iloc[train_size:total_rows]

In [None]:
time_steps = [2, 3, 4, 5, 10]

# reshape to [samples, time_steps, n_features]
# One (X, y) pair per window length, for the train and the test split.
train_list, test_list = [], []
for t in time_steps:
    train_list.append(create_dataset(train, train.dead, t))
    test_list.append(create_dataset(test, test.dead, t))

# Leave the arrays of the final window length bound to the usual names.
X_train, y_train = train_list[-1]
X_test, y_test = test_list[-1]

In [None]:
predictions = []

# A test set built with t steps has len(test) - t targets, all ending at the
# last row of the series. The shortest of them is the window every model can
# be scored on; with the ascending step list it is the one stored in `y_test`.
eval_len = min(len(pair[1]) for pair in test_list)

for (X_train, y_train), (X_eval, _y_eval) in zip(train_list, test_list):
    model = keras.Sequential()
    model.add(keras.layers.LSTM(128, input_shape=(
        X_train.shape[1], X_train.shape[2])))
    model.add(keras.layers.Dense(1))
    model.compile(loss='mean_squared_error',
                  optimizer=keras.optimizers.Adam(0.001))

    model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=1,
        validation_split=0.1,
        verbose=1,
        shuffle=False
    )

    # Predict on windows built with THIS model's step count. Feeding every
    # model the leftover `X_test` (windows of the last step count) gives it
    # inputs of a different length than it was trained on.
    y_pred = model.predict(X_eval)
    # Keep only the trailing common window so all predictions have the same
    # length and refer to the same dates.
    y_pred = y_pred[len(y_pred) - eval_len:]
    predictions.append(y_pred)
    

In [None]:
# y_test holds the last len(y_test) rows of the series (the leading rows of
# the test split are consumed as input windows), so the matching dates are
# taken from the end. Slicing from len(y_train) would shift the curves to
# earlier dates.
test_dates = pd.to_datetime(dates[len(dates) - len(y_test):])

plt.plot(test_dates, y_test, 'magenta', marker='.', label="true")

for p, c, s in zip(predictions, colors + ['black'], time_steps):
    plt.plot(test_dates, p, c, linestyle=':', label=f'steps {s}')

plt.title('Predictions comparison')
plt.ylabel('New deaths')
plt.xlabel('Date')
plt.xticks(rotation=40)
plt.legend()
plt.show()

In [None]:
for p, step in zip(predictions, time_steps):
    # Keras metrics accumulate over update_state() calls. A new metric per
    # step count reports that model's own MAE instead of a running average
    # over every model evaluated so far (including those of earlier cells).
    metric = keras.metrics.MeanAbsoluteError()
    metric.update_state(y_test, np.array(p).flatten())
    print('Step {} LSTM MAE {:.2f}'
          .format(step, float(metric.result())))