In [1]:
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [193]:
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
usa_df = pd.read_csv(url)

In [31]:
usa_df.tail()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20,4/15/20,4/16/20,4/17/20,4/18/20
3250,84090055,US,USA,840,90055.0,Unassigned,Wisconsin,US,0.0,0.0,...,0,0,0,0,0,1,1,0,0,0
3251,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3252,84099999,US,USA,840,99999.0,,Grand Princess,US,0.0,0.0,...,103,103,103,103,103,103,103,103,103,103
3253,84070004,US,USA,840,,Michigan Department of Corrections (MDOC),Michigan,US,0.0,0.0,...,0,0,0,0,370,429,472,472,514,550
3254,84070005,US,USA,840,,Federal Correctional Institution (FCI),Michigan,US,0.0,0.0,...,0,0,0,0,21,23,36,36,44,45


In [6]:
usa_df.shape

(3255, 99)

In [8]:
usa_df.columns

Index(['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Province_State',
       'Country_Region', 'Lat', 'Long_', 'Combined_Key', '1/22/20', '1/23/20',
       '1/24/20', '1/25/20', '1/26/20', '1/27/20', '1/28/20', '1/29/20',
       '1/30/20', '1/31/20', '2/1/20', '2/2/20', '2/3/20', '2/4/20', '2/5/20',
       '2/6/20', '2/7/20', '2/8/20', '2/9/20', '2/10/20', '2/11/20', '2/12/20',
       '2/13/20', '2/14/20', '2/15/20', '2/16/20', '2/17/20', '2/18/20',
       '2/19/20', '2/20/20', '2/21/20', '2/22/20', '2/23/20', '2/24/20',
       '2/25/20', '2/26/20', '2/27/20', '2/28/20', '2/29/20', '3/1/20',
       '3/2/20', '3/3/20', '3/4/20', '3/5/20', '3/6/20', '3/7/20', '3/8/20',
       '3/9/20', '3/10/20', '3/11/20', '3/12/20', '3/13/20', '3/14/20',
       '3/15/20', '3/16/20', '3/17/20', '3/18/20', '3/19/20', '3/20/20',
       '3/21/20', '3/22/20', '3/23/20', '3/24/20', '3/25/20', '3/26/20',
       '3/27/20', '3/28/20', '3/29/20', '3/30/20', '3/31/20', '4/1/20',
       '4/2/20', '4/3/20', '4

In [17]:
usa_case_df = usa_df.filter(like='/20' ) # regex

# Transform data

In [38]:
usa_case_arr = usa_case_df.to_numpy()
case_max = np.max(usa_case_arr)
usa_case_arr = np.log10(1 + usa_case_arr)/np.log10(1 + case_max)
case_mean = np.mean(  usa_case_arr )
usa_case_arr = usa_case_arr - case_mean
np.max(usa_case_arr), np.min(usa_case_arr)

(0.949140021936568, -0.05085997806343206)

# Add features

In [67]:
import datetime
import numpy as np

day_vec = np.array( [  datetime.datetime.strptime(d, '%m/%d/%y').weekday()  for d in usa_case_df.columns ] )

# Create Dataset and DataLoader

In [128]:
def get_pred_no(no_region, no_day):
    pred_range = list( range(9, no_day) ) # the 10th day, 7 day input + 3 day ahead
    # given i in pred_range, use range( i - 9 : i - 2  ) # 7 days to predict i
    no_pred = no_region * len(pred_range)
    return no_pred,   pred_range

class covid_dataset(Dataset):
    def __init__(self,  arr, day_vec):
        self.arr = arr
        no_region, no_day = arr.shape
        no_pred, pred_range = get_pred_no(no_region, no_day)
        self.no_region = no_region
        self.no_day = no_day
        # predict index in days
        self.pred_range = pred_range
        self.n_len = no_pred
        self.day_vec = day_vec
    def __len__(self):
        return  self.n_len
    def __getitem__(self, idx):
        # predict 3 day ahead
        reg_idx = idx//len(self.pred_range) # region index
        day_idx = self.pred_range[ idx %   len(self.pred_range)  ]
        
        x = torch.tensor( np.concatenate( [self.arr[reg_idx, (day_idx - 9):(day_idx - 2)].reshape(-1, 1),  
                                           self.day_vec[(day_idx - 9):(day_idx - 2)].reshape((-1, 1))/7,   
                                           np.arange((day_idx - 9), (day_idx - 2)).reshape(-1, 1)/self.no_day ], axis = 1), dtype=torch.float).view( -1,  3)
        y = torch.tensor(  self.arr[reg_idx, day_idx], dtype=torch.float).view(-1)
        return x, y
    
def my_collate(batch):
    xs,ys  = zip(*batch)
    return torch.stack(xs),torch.stack(ys)

In [168]:
np.random.seed(2020)
n_sample = usa_case_arr.shape[0]
idx = np.random.permutation( n_sample )
no_tr = round(n_sample * .6)
no_va = round(n_sample * 0.2 )
usa_case_arr_tr = usa_case_arr[ :no_tr , :]
usa_case_arr_va = usa_case_arr[no_tr: (no_tr + no_va), :]
usa_case_arr_te = usa_case_arr[(no_tr + no_va):, :]

batch_size = 128




def to_device(data, device):
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
    def __iter__(self):
        for b in self.dl:
            yield to_device(b, self.device)
    def __len__(self):
        return len(self.dl)


tr_ds = covid_dataset(usa_case_arr_tr, day_vec)
va_ds = covid_dataset(usa_case_arr_va, day_vec)
te_ds = covid_dataset(usa_case_arr_te, day_vec)
tr_dl = DataLoader(tr_ds,  batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=my_collate)
va_dl = DataLoader(va_ds,  batch_size=len(va_ds), shuffle=False, drop_last=False, collate_fn=my_collate)
te_dl = DataLoader(te_ds,  batch_size=len(te_ds), shuffle=False, drop_last=False, collate_fn=my_collate)


device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

tr_dl = DeviceDataLoader(tr_dl, device)
va_dl = DeviceDataLoader(va_dl, device)
te_dl = DeviceDataLoader(te_dl, device)




In [169]:
x, y = next( iter(tr_dl) )

In [170]:
x.shape

torch.Size([128, 7, 3])

In [171]:
y.shape

torch.Size([128, 1])

In [173]:
len(tr_ds)

154287

In [172]:
len(tr_dl)

1205

In [174]:
len(te_dl)

1

# Model

In [163]:
input_dim = 3
output_dim = 1 
hidden_dim = 4
num_layers = 1
dropout = 0.5 if num_layers > 1 else 0

class rnn_seq2scalar(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, batch_size, num_layers, dropout, rnn_model='gru'):
        super(rnn_seq2scalar, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers = num_layers, batch_first = True, dropout=dropout)
        self.rnn = nn.GRU(input_dim, hidden_dim, num_layers = num_layers, batch_first = True, dropout=dropout)
        # The linear layer that maps from hidden state space to tag space
        self.fc1 = nn.Linear(hidden_dim, output_dim)
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.rnn_model = rnn_model
        
    def forward(self, x):
        if self.rnn_model == 'gru':
            _, h_n = self.rnn(x)
            h_n = h_n[-1].view(-1, self.hidden_dim)
        else:
            lstm_out, (h_n, c_n) = self.lstm(x)
            if self.num_layers > 1:
                h_n = h_n.narrow(0, self.num_layers - 1, 1).view(-1, self.hidden_dim)
        ypred = self.fc1( h_n  )
        return ypred.view(-1)

    
model = rnn_seq2scalar(input_dim, hidden_dim, output_dim, batch_size, num_layers, dropout).to(device)



In [164]:
list(  model.parameters() )

[Parameter containing:
 tensor([[ 0.2360,  0.4627,  0.4139],
         [-0.3929,  0.2864, -0.4096],
         [-0.1732, -0.2209, -0.3831],
         [ 0.4124, -0.3919,  0.0720],
         [-0.0559,  0.3628,  0.0066],
         [ 0.4141, -0.3865, -0.4502],
         [-0.1323, -0.4867,  0.3779],
         [-0.4063, -0.2893, -0.1291],
         [ 0.1879, -0.0672, -0.1819],
         [ 0.0485, -0.3925, -0.1920],
         [ 0.1578, -0.1727, -0.0711],
         [ 0.4360, -0.3341,  0.0584],
         [-0.4776, -0.4319, -0.2273],
         [-0.3043, -0.0542, -0.4767],
         [ 0.1525, -0.2570, -0.3368],
         [ 0.1731,  0.3477,  0.1181]], requires_grad=True),
 Parameter containing:
 tensor([[-0.2015, -0.0342,  0.2576, -0.1201],
         [-0.3979, -0.3582, -0.3989, -0.2033],
         [ 0.1907, -0.0365,  0.2553,  0.0515],
         [ 0.1436,  0.3092, -0.3693, -0.1821],
         [-0.4706,  0.1698, -0.3658,  0.0457],
         [ 0.2029,  0.0898, -0.1253, -0.2499],
         [-0.3750, -0.3850,  0.0979, -0.47

# Loss and Optim

In [165]:
lr = 1e-3
from torch.optim import lr_scheduler


loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

# Run epochs

In [179]:
n_epochs= 5

def run_epoch(model, n_epochs):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    for epoch in range(n_epochs):
        model.train()
        loss_val = 0
        for i, (x, y) in enumerate(tr_dl):
            model.zero_grad()
            ypred = model(x)
            loss = loss_function(ypred.view(-1),  y.view(-1) )
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_val += loss.item()

        model.eval()
        va_loss_val = 0
        for i, (x, y) in enumerate(va_dl):
            ypred = model(x)
            loss = loss_function(ypred.view(-1),  y.view(-1) )
            va_loss_val += loss.item()


        te_loss_val = 0
        for i, (x, y) in enumerate(te_dl):
            ypred = model(x)
            loss = loss_function(ypred.view(-1),  y.view(-1) )
            te_loss_val += loss.item()

    #     scheduler.step(va_loss_val)
        print_str = f'epoch={epoch}, tr_loss={loss_val:.6f}, va_loss={va_loss_val:.6f}, te_loss={te_loss_val:.6f}'
        print(print_str)

model = rnn_seq2scalar(input_dim, hidden_dim, output_dim, batch_size, num_layers, dropout).to(device)
run_epoch(model, n_epochs)

epoch=0, tr_loss=3.239356, va_loss=0.000496, te_loss=0.000752
epoch=1, tr_loss=0.638326, va_loss=0.000453, te_loss=0.000720
epoch=2, tr_loss=0.599095, va_loss=0.000453, te_loss=0.000739
epoch=3, tr_loss=0.581335, va_loss=0.000430, te_loss=0.000714
epoch=4, tr_loss=0.572574, va_loss=0.000424, te_loss=0.000709


# LSTM

In [180]:
model = rnn_seq2scalar(input_dim, hidden_dim, output_dim, batch_size, num_layers, dropout, rnn_model='lstm').to(device)
run_epoch(model, n_epochs)

epoch=0, tr_loss=11.940415, va_loss=0.000687, te_loss=0.000908
epoch=1, tr_loss=0.893565, va_loss=0.000605, te_loss=0.000845
epoch=2, tr_loss=0.777006, va_loss=0.000529, te_loss=0.000785
epoch=3, tr_loss=0.669499, va_loss=0.000516, te_loss=0.000804
epoch=4, tr_loss=0.590504, va_loss=0.000430, te_loss=0.000725


# Compare with Random Forests

In [184]:
device = 'cpu'

tr_dl = DeviceDataLoader(tr_dl, device)
va_dl = DeviceDataLoader(va_dl, device)
te_dl = DeviceDataLoader(te_dl, device)


def convert_dl_to_x_y(dl):
    x_list=[]
    y_list=[]
    for i, (x, y) in enumerate(dl):
        x_list.append( x.numpy().reshape((-1, 3*7)) )
        y_list.append( y.numpy() )
    return np.row_stack(x_list), np.row_stack(y_list)
    
X_tr, y_tr = convert_dl_to_x_y(tr_dl)
X_va, y_va = convert_dl_to_x_y(va_dl)
X_te, y_te = convert_dl_to_x_y(te_dl)

In [186]:
X_tr.shape, y_tr.shape

((154240, 21), (154240, 1))

In [192]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error


def run_rf():
    regr = RandomForestRegressor(n_estimators = 100, max_depth=5, random_state=0)
    regr.fit(X_tr, y_tr.flatten())
    
    tr_loss = mean_squared_error( regr.predict(X_tr), y_tr.flatten()   )
    va_loss = mean_squared_error( regr.predict(X_va), y_va.flatten()   )
    te_loss = mean_squared_error( regr.predict(X_te), y_te.flatten()   )
    
    print_str = f'tr_loss={tr_loss:.6f}, va_loss={va_loss:.6f}, te_loss={te_loss:.6f}'
    print(print_str)

run_rf()

tr_loss=0.000392, va_loss=0.000352, te_loss=0.000681
