In [None]:
from __future__ import print_function

import collections
import copy
import json
import os
import random
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, ConcatDataset

from custom_dataset import SlidingWindowDataset
from nn.lstm import SequenceLSTM
from nn.utils import RMSLELoss

In [None]:
# Model training variables
#
# Everything a reader might want to tune lives in this cell; later cells only
# read these names.

DATASET = "demand"      # sub-directory of the data folder that is loaded
SCALE_DIVISOR = 1       # series values are divided by this before being fed to the model
BATCH_SIZE = 1          # batch size of the per-partition training DataLoader
VALUE_COLUMN = "y"      # data frame column holding the time series values
WINDOW_SIZE = 1         # number of past values per training sequence
LOSS_FUNCTION = "mae"   # training loss: one of "mae", "mse", "rmsle"
#LOSS_FUNCTION = "mse"
#LOSS_FUNCTION = "rmsle"

LEARNING_RATE = 3e-4 # Chosen since Karpathy recommends 3e-4 for Adam
#DATA_FORMAT = "csv"
DATA_FORMAT = "feather"  # NOTE(review): the loading cells call pd.read_feather directly and do not read this

ROUNDS = 1      # number of federated averaging rounds
FRACTION = 0.1  # fraction of training partitions sampled in each round

# Derive the suffix from LOSS_FUNCTION so that runs with a different loss do
# not write their checkpoints into the directory of the MAE run.
MODEL_NAME = "lr{}_{}".format(LEARNING_RATE, LOSS_FUNCTION)

# Use the GPU when one is available, unless explicitly disabled.
DISABLE_CUDA = False
device = torch.device(
    "cuda" if (not DISABLE_CUDA and torch.cuda.is_available()) else "cpu"
)

In [None]:
# Seed PyTorch, NumPy and the stdlib RNG (in that order) with the same value
# so that partition sampling and weight initialisation are repeatable.
SEED = 4
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(SEED)

In [None]:
# Tensor of shape [n, m] -> Tensor of shape [m, n, 1]
def shape(tensor: torch.Tensor):
    """Swap the two axes of a 2-D tensor and append a trailing axis of size 1."""
    swapped = tensor.permute(1, 0)
    return swapped.unsqueeze(2)

In [None]:
# Credit: https://www.jessicayung.com/lstms-for-time-series-in-pytorch/
class SequenceLSTM(nn.Module):
    """One-layer LSTM followed by a linear layer that predicts a single next value.

    Args:
        input_size_is_num_features: number of features per timestep passed to
            ``nn.LSTM``. ``forward`` reshapes its input with ``shape`` so that
            every timestep carries exactly one feature, hence this is 1 in
            practice.
        hidden_size: size of the LSTM hidden state (arbitrary).
    """

    def __init__(self, input_size_is_num_features, hidden_size):
        super(SequenceLSTM, self).__init__()
        self.input_size = input_size_is_num_features
        self.hidden_size = hidden_size
        # One LSTM layer, sequence-first layout (nn.LSTM default).
        self.lstm1 = nn.LSTM(input_size_is_num_features, hidden_size, 1)
        # Infer just 1 future step.
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, input):
        """Predict the next value for every sequence in the batch.

        Args:
            input: 2-D tensor of shape (batch, timesteps).

        Returns:
            1-D tensor with one prediction per sequence.
        """
        # (batch, timesteps) -> (timesteps, batch, 1), the layout nn.LSTM expects.
        input = shape(input)
        batch_size = input.size(1)

        # new_zeros inherits both device and dtype from the input, so the
        # initial states match the data whether the model runs in float32 or
        # float64 (previously the dtype was hard-coded to double).
        h_t = input.new_zeros(1, batch_size, self.hidden_size)  # 1 is num of layers
        c_t = input.new_zeros(1, batch_size, self.hidden_size)

        lstm_out, _ = self.lstm1(input, (h_t, c_t))

        # Only take the output from the final timestep - lstm_out[-1]
        y_pred = self.linear(lstm_out[-1].view(batch_size, -1))
        return y_pred.view(-1)

In [None]:
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks-in-python
def chunks(l, n):
    """Lazily yield consecutive slices of ``l`` of length ``n`` (the last one may be shorter)."""
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
class SlidingWindowDataset(torch.utils.data.Dataset):
    """Sliding dataset over an array with window size n, return n+1 as target.

    Item ``i`` is the pair ``(data[i : i + n], data[i + n])``.

    Args:
        tensor: the indexable sequence to slide over.
        size: window size ``n``.
    """

    def __init__(self, tensor, size):
        self.data = tensor
        self.window_size = size

    def __len__(self):
        # Number of complete (window, target) pairs. Clamp at zero: a series
        # no longer than the window holds no pair, and a negative value would
        # make len() raise ValueError.
        return max(len(self.data) - self.window_size, 0)

    def __getitem__(self, idx):
        # Reject indices outside [0, len) explicitly instead of returning a
        # truncated or wrapped-around window.
        if not 0 <= idx < len(self):
            raise IndexError("index {} is out of range".format(idx))
        target_idx = idx + self.window_size
        # Get single sequence by idx and split the last element away as target
        return self.data[idx:target_idx], self.data[target_idx]

In [None]:
from google.colab import drive
from pathlib import Path

# Mount Google Drive so the dataset directory becomes visible to the notebook.
drive.mount('/content/drive')

# Adjust path based on needs
BASE_PATH = "/content/drive/My Drive/Colab Notebooks/data/" + DATASET


def _read_manifest(manifest_name, subdir):
    """Read a JSON list of file names and prefix each with BASE_PATH + subdir."""
    with open(BASE_PATH + '/' + manifest_name) as manifest:
        return [BASE_PATH + subdir + entry for entry in json.load(manifest)]


train_paths = _read_manifest('train.json', "/train/")
valid_paths = _read_manifest('valid.json', "/valid/")

print(len(train_paths), "training files")
print(len(valid_paths), "validation files")

In [None]:
# https://discuss.pytorch.org/t/rmsle-loss-function/67281
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error: sqrt(MSE(log(pred + 1), log(actual + 1)))."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return the RMSLE between ``pred`` and ``actual`` as a scalar tensor."""
        log_pred = (pred + 1).log()
        log_actual = (actual + 1).log()
        mean_squared = self.mse(log_pred, log_actual)
        return mean_squared.sqrt()

In [None]:
# Map the LOSS_FUNCTION setting to the loss module used for training.
_LOSS_FACTORIES = {
    "mse": nn.MSELoss,
    "rmsle": RMSLELoss,
    "mae": nn.L1Loss,
}

# Fail right here on a mistyped setting instead of leaving loss_fn as None,
# which would only surface later as "'NoneType' object is not callable".
if LOSS_FUNCTION not in _LOSS_FACTORIES:
    raise ValueError(
        "Unknown LOSS_FUNCTION {!r}; expected one of {}".format(
            LOSS_FUNCTION, sorted(_LOSS_FACTORIES)
        )
    )

loss_fn = _LOSS_FACTORIES[LOSS_FUNCTION]()

In [None]:
# The validation files are all loaded up front, one data frame per path.
valid_dfs = list(map(pd.read_feather, valid_paths))

In [None]:
def _to_window_dataset(frame):
    """Turn one data frame into a sliding-window dataset of scaled float64 values on `device`."""
    scaled = frame[VALUE_COLUMN].to_numpy() / SCALE_DIVISOR
    series = torch.from_numpy(scaled).double().to(device=device)
    return SlidingWindowDataset(series, WINDOW_SIZE)


# Frames that are not longer than the window cannot produce a single sample, so skip them.
valid_dss = [
    _to_window_dataset(frame) for frame in valid_dfs if len(frame) > WINDOW_SIZE
]

valid_ds = ConcatDataset(valid_dss)
valid_dl = DataLoader(valid_ds, batch_size=100, shuffle=True, num_workers=0)

# A single shuffled batch serves as the fixed validation set for every round.
valid_input, valid_target = next(iter(valid_dl))

In [None]:
# Make sure the checkpoint directory for this model exists.
# os.makedirs also creates the intermediate "params" directory, and
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(BASE_PATH + "/params/" + MODEL_NAME, exist_ok=True)

In [None]:
print(MODEL_NAME)
# float64 parameters, to match the .double() tensors the datasets are built from.
global_model = SequenceLSTM(1, 100).double().to(device=device)


def _train_partition(path):
    """Train a copy of the current global model for one pass over a single partition file.

    Returns ``(state_dict, n_samples, losses)``, or ``None`` when the file
    cannot be read or is too short to produce a training sample.
    """
    try:
        partition_df = pd.read_feather(path)
    except Exception as exc:  # best effort: skip this partition, but say why
        print("Failure to read df", path, exc)
        return None
    # Same guard as for the validation data: no complete window, no samples.
    if len(partition_df) <= WINDOW_SIZE:
        return None

    series = torch.from_numpy(
        partition_df[VALUE_COLUMN].to_numpy() / SCALE_DIVISOR
    ).double().to(device=device)
    partition_ds = SlidingWindowDataset(series, WINDOW_SIZE)
    sys.stdout.write(".")
    sys.stdout.flush()

    # Every partition starts from the same global weights.
    local_model = copy.deepcopy(global_model)
    optimizer = optim.Adam(local_model.parameters(), lr=LEARNING_RATE)
    local_n = 0
    losses = []
    for inputs, targets in DataLoader(partition_ds, batch_size=BATCH_SIZE):
        optimizer.zero_grad()
        loss = loss_fn(local_model(inputs), targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        local_n += len(inputs)
    return local_model.state_dict(), local_n, losses


def _federated_average(states, sample_counts):
    """Average state dicts, weighting each by its share of the total sample count."""
    total = sum(sample_counts)
    averaged = collections.OrderedDict()
    for state, count in zip(states, sample_counts):
        weight = count / total
        for key, value in state.items():
            if key in averaged:
                averaged[key] += weight * value
            else:
                averaged[key] = weight * value
    return averaged


for t in range(ROUNDS):
    print(" ROUND {}".format(t))
    m = int(max(FRACTION * len(train_paths), 1))
    print(m, "partitions per round")
    selected_partitions = random.sample(train_paths, m)
    local_ns = []  # Store n_k here
    partition_weights = []  # Store the final model of the partition here for averaging
    training_losses = []
    for path in selected_partitions:
        result = _train_partition(path)
        if result is None:
            continue
        state, local_n, losses = result
        partition_weights.append(state)
        local_ns.append(local_n)
        training_losses.extend(losses)
    total_n = sum(local_ns)

    # Average the models
    if total_n > 0:
        global_model.load_state_dict(_federated_average(partition_weights, local_ns))
    else:
        # Nothing was trained this round; dividing by total_n would fail.
        print("No usable partitions in this round; keeping the previous global model")

    with torch.no_grad():
        pred = global_model(valid_input)
        print("Valid loss MSE:", nn.MSELoss()(pred * SCALE_DIVISOR, valid_target * SCALE_DIVISOR))
        print("Valid loss RSMLSE:", RMSLELoss()(pred * SCALE_DIVISOR, valid_target * SCALE_DIVISOR))
        print("Valid loss MAE:", nn.L1Loss()(pred * SCALE_DIVISOR, valid_target * SCALE_DIVISOR))

    # One checkpoint per round.
    torch.save(
        global_model.state_dict(),
        BASE_PATH + "/params/{}/{}_round{}.pt".format(MODEL_NAME, MODEL_NAME, t),
    )
