# Training the model defined in lstm_model.py

In [2]:
import os
import json

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR

import preproc

In [None]:
# Training hyperparameters shared by the cells below.
BATCH_SIZE = 64   # samples per mini-batch (also the batch dimension used when drawing the model graph)
N_EPOCHS = 200    # number of full passes over the training set

## Load the dataset

In [3]:
# Directory holding the preprocessed splits and the bounds file that goes with them.
data_dir = 'dataset/reduced_preprocessed'
train = np.load(os.path.join(data_dir, 'train.npy'))
valid = np.load(os.path.join(data_dir, 'valid.npy'))
test = np.load(os.path.join(data_dir, 'test.npy'))

# bounds.json is keyed by column name; it is passed to preproc.unprocess further down.
# Read it with an explicit encoding so the result does not depend on the platform default.
bounds_path = os.path.join(data_dir, 'bounds.json')
with open(bounds_path, encoding='utf-8') as bounds_file:
    bounds = json.load(bounds_file)

# the names of the columns in our dataset
cols = list(bounds.keys())
cols

['latitude', 'longitude', 'timestamp']

## Initialize the model

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Dimensions handed to the model constructor.
seq_len = preproc.WINDOW_SIZE    # the length of the input sequence
event_vec_dim = train.shape[-1]  # the length of each vector in the sequence (last axis of the data)
num_layers = 4                   # number of LSTM layers
hidden_dim = 1024                # number of neurons in the LSTM layers

In [None]:
from lstm_model import FirePredictor

# actually create the model
model = FirePredictor(event_vec_dim, seq_len, hidden_dim, num_layers)

# Put the parameters on the same device the batches and validation tensors are
# sent to. No other cell moves the model explicitly, so without this a CUDA run
# only works if something else happens to relocate the model first.
# (assumes FirePredictor is an nn.Module -- it is used with .parameters(),
# .eval() and .train() below)
model = model.to(device)

In [None]:
# visualize model structure
# https://github.com/mert-kurttutan/torchview

import graphviz
graphviz.set_jupyter_format('png')

from torchview import draw_graph
# Trace on the notebook's `device` instead of a hard-coded 'cuda' so this cell
# also runs on machines without a GPU.
model_graph = draw_graph(model, input_size=(BATCH_SIZE, seq_len, event_vec_dim), device=device)
model_graph.visual_graph

## Write the loss function
NOTE: I ended up discarding the custom loss functions below because they kept producing NaNs. A loss function can still be used to deal with some of the variance in the dataset, so they may be worth revisiting later.

In [None]:
# def euclidean_dist(t1, t2, dim):
#     return (t1 - t2).pow(2).sum(dim).sqrt()
#
# def avg_distance(x, y):
#     if x.isnan().any():
#         print(x)
#         raise RuntimeError("Loss func: x contains NAN")
#     # use nearest neighbor upsampling
#     reshaped = torch.nn.functional.interpolate(x[:,:,None,:], size=y.shape[2:], mode='nearest')
#     if reshaped.isnan().any():
#         print(reshaped)
#         raise RuntimeError("Loss func: reshaped x contains NAN")
#     result = (euclidean_dist(reshaped, y, dim=2)).sum() / np.array(y.shape[0:-1]).prod()
#     if result.isnan().any():
#         print(result)
#         raise RuntimeError("Loss func: result contains NAN")
#     return result
#
#
# x = torch.Tensor(sequences[0:3,0:64,0,:])
# y = torch.Tensor(sequences[0:3,0:64,1:,:])
#
# print(avg_distance(torch.Tensor(x), torch.Tensor(y)))
# print(f'{x.shape=}, {y.shape=}')

## Prepare Optimizer, Scheduler, and Data Loader

In [None]:
# Each sample holds WINDOW_SIZE input steps followed by the step to predict:
# the first `window` steps become the input, the step at index `window` the label.
window = preproc.WINDOW_SIZE

# The training set is too large to keep on the GPU, so it stays on the CPU and
# is moved over one batch at a time.
x_train = torch.Tensor(train[:, :window, :])
y_train = torch.Tensor(train[:, window, :])

# The validation set fits in GPU memory, so put it on the device up front.
x_valid = torch.Tensor(valid[:, :window, :]).to(device)
y_valid = torch.Tensor(valid[:, window, :]).to(device)

In [None]:
# sanity-check the shapes of the validation inputs and labels
# (both fields use the `=` specifier so each shape is printed with its name)
print(f'{x_valid.shape=}, {y_valid.shape=}')

In [None]:
# SGD with momentum over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=0.03, momentum=0.9)

# halve the learning rate every 10 scheduler steps
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)

# The data is already cut into self-contained sequences, so shuffling the
# samples does not break any temporal ordering.
train_dataset = TensorDataset(x_train, y_train)
data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# mean squared error between the predicted and the actual next event
loss_function = nn.MSELoss()

## Training Loop
This is all very standard.

In [None]:
# Per-epoch metrics: mean training loss and validation loss.
losses_valid = np.zeros(N_EPOCHS)
losses_train = np.zeros(N_EPOCHS)

# Total number of training samples, used to turn the summed loss into a mean.
n_train_samples = len(data_loader.dataset)

# Make sure we start in training mode even if an evaluation cell ran before this one.
model.train()

for epoch in range(N_EPOCHS):
    # Sum of per-sample losses over the whole epoch. Kept as a plain Python
    # float: accumulating the loss tensor itself would keep every batch's
    # autograd graph alive and steadily grow memory use.
    acc_train_loss = 0.0

    # loop over batches
    for x_batch, y_batch in data_loader:
        # send our batch over to the GPU (if we have one)
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()

        preds = model(x_batch)

        loss = loss_function(preds, y_batch)
        loss.backward()

        optimizer.step()

        # MSELoss returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        acc_train_loss += loss.item() * x_batch.shape[0]

    # record the mean loss over all training samples seen in this epoch
    losses_train[epoch] = acc_train_loss / n_train_samples

    # compute validation loss
    model.eval()  # switch layers such as dropout / batch norm to inference behaviour
    with torch.no_grad():  # disable gradient calculation
        v_preds = model(x_valid)
        losses_valid[epoch] = loss_function(v_preds, y_valid).item()
    model.train()

    # Advance the learning-rate schedule once per epoch. Without this call the
    # StepLR created alongside the optimizer never changes the learning rate.
    scheduler.step()

    # print out the losses so we can see it update as we train
    print(f'Epoch {epoch} -- train loss: {losses_train[epoch]} valid loss: {losses_valid[epoch]}')

In [None]:
# Smoke test: run the first training sequence through the model as a batch of
# one and show the raw prediction.
first_window = x_train[0].unsqueeze(0)
model(first_window.to(device))

# Analyze the results

## Plot the losses

In [None]:
import matplotlib.pyplot as plt

# One curve per metric; each entry pairs the per-epoch values with its plot styling.
loss_curves = (
    (losses_train, {'label': 'train loss'}),
    (losses_valid, {'label': 'validation loss', 'color': 'r'}),
)
for values, styling in loss_curves:
    plt.plot(values, **styling)

plt.legend()
plt.show()

## Compare predictions against ground truth real quick
See eval_model.ipynb for more detailed analysis.

In [None]:
# calculate some predictions for the first 128 training sequences
model.eval()
with torch.no_grad():
    # no_grad already yields a tensor without gradient history, so it can be
    # converted straight to NumPy once it is back on the CPU
    res = model(x_train[0:128].to(device)).cpu().numpy()

# Build the frame from a NumPy array (as is done for `actual` below) rather than
# from a torch.Tensor, which pandas is not guaranteed to unpack into plain floats.
res = pd.DataFrame(res, columns=cols)

# undo preprocessing
prediction = preproc.unprocess(res, bounds)
actual = preproc.unprocess(pd.DataFrame(y_train[0:128].numpy(), columns=cols), bounds)

# display the dataframes
display(prediction)
display(actual)

In [None]:
# Show predictions and ground truth ordered by timestamp so they are easier to compare.
for frame in (prediction, actual):
    display(frame.sort_values('timestamp'))

In [None]:
# Per-column standard deviation of predictions vs. ground truth: a rough check
# that the model's outputs cover about the same range as the real data.
for frame in (prediction, actual):
    display(frame.std(axis=0))

# Save the model's state_dict to a JSON file to evaluate later
WARNING: These are 600+MiB on a single line. Be careful when opening them!

In [None]:
model_path = "models/reduced_lstm.json"
# Make sure the target directory exists before saving, in case to_json does not
# create it itself (e.g. on a fresh checkout without a models/ folder).
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.to_json(model_path)

In [None]:
# load the model, just to be sure that we can
# NOTE(review): the return value is discarded, so this only exercises the round
# trip if from_json updates `model` in place -- confirm against lstm_model.py.
# If it returns a new model instead, later uses of `model` still run on the
# original in-memory weights.
model.from_json(model_path)

In [None]:
# use the model to make sure that the loaded one is the same as the original one
model.eval()
with torch.no_grad():
    # no_grad already yields a tensor without gradient history, so it can be
    # converted straight to NumPy once it is back on the CPU
    res = model(x_train[0:128].to(device)).cpu().numpy()

# Build the frame from a NumPy array rather than from a torch.Tensor, which
# pandas is not guaranteed to unpack into plain floats.
res = pd.DataFrame(res, columns=cols)

pred2 = preproc.unprocess(res, bounds)

In [None]:
# Show the pre-save and post-load predictions, both ordered by timestamp, for a
# side-by-side comparison.
for frame in (prediction, pred2):
    display(frame.sort_values('timestamp'))