In [0]:
import torch
from torch.utils.data import DataLoader
from datetime import datetime
from torch.utils.data import random_split
from torch import nn
import numpy as np
from torch.optim.lr_scheduler import ExponentialLR
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
### FUNCTIONS TO LOAD DATA FROM CHRIS'S THING (don't run this cell) ###
from CountyDataset import (
    CountyDataset,
)
def get_dataset(master_path):
    dataset = CountyDataset(master_path)
    X, Y = next(iter(dataset))

    print('-'*89)
    print('Total Dataset Summary')
    print('-'*89)

    print(f'total samples {len(dataset):,} '
          f'X {X.size()} | Y {Y.size()}')

    return dataset

def get_numpy(dataset):
    data = []
    for i in range(len(dataset)):
        X, Y = dataset[i]
        data.append((X.numpy(), Y.numpy()))

    print('-'*89)
    print(f'Numpy Dataset Summary')
    print('-'*89)
    X, Y = zip(*data)
    print(f'total samples {len(X)} '
          f'| input time periods {len(X[0])} '
          f'| input features {X[0].shape[1]} '
          f'| output time periods {len(Y[0])}'
          f'| target col {dataset.target_col}')

    return data, dataset.xcols, dataset.target_col, dataset.date_index



In [2]:
### LOAD DATASET FROM CHRIS'S THING (also don't run this cell) ###
master_path = 'utils/data/county_table.csv'
data = get_numpy(get_dataset(master_path))

# save data to pickle file
import pickle
with open('dataset.pickle', 'wb') as handle:
    pickle.dump(data[0], handle)

NameError: ignored

In [0]:
### LOAD DATA FROM PICKLE FILES ###
# change file paths to wherever the pickled data files are stored
import pickle
with open('drive/My Drive/DL project/datasets/test_24_1.pickle', 'rb') as handle:
    test_dataset = pickle.load(handle)
with open('drive/My Drive/DL project/datasets/train_24_1.pickle', 'rb') as handle:
    train_dataset = pickle.load(handle)
with open('drive/My Drive/DL project/datasets/val_24_1.pickle', 'rb') as handle:
    val_dataset = pickle.load(handle)

num_months = 24 ## change this line as necessary to the number of training months per instance

# this bit gets rid of all the incomplete series (i.e. those with less than num_months months)
train_dataset = [data for data in train_dataset if len(data[0]) == num_months]
test_dataset = [data for data in test_dataset if len(data[0]) == num_months]
val_dataset = [data for data in val_dataset if len(data[0]) == num_months]

In [0]:
# make DataLoaders for each of the splits of the data
train_loader = DataLoader(train_dataset, batch_size=512,
                          shuffle=False, pin_memory=False,
                          num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=1,
                          shuffle=False, pin_memory=False,
                          num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=512,
                          shuffle=False, pin_memory=False,
                          num_workers=4)

In [0]:
### train, val, and test functions for main training loop ###
def train(model, train_loader, optimizer, loss_function, epoch, device=None):
    model.train()
    for idx, (X_train, y_train) in enumerate(train_loader):
        if device is not None:
            X_train, y_train = X_train.to(device), y_train.to(device)

        optimizer.zero_grad()
        y_pred = model(X_train)
        loss = torch.sqrt(loss_function(y_pred, y_train)) # loss is RMSE of log prices
        loss.backward()
        optimizer.step()

    return loss.item()

def val(model, val_loader, loss_function, device=None):
    model.eval()
    val_loss = []
    with torch.no_grad():
        for idx, (X_val, y_val) in enumerate(val_loader):
            if device is not None:
                X_val, y_val = X_val.to(device), y_val.to(device)
            y_pred = model(X_val)
            loss = torch.sqrt(loss_function(y_pred, y_val)).item()
            val_loss += [loss]
    return np.mean(val_loss)

def test(model, test_loader, loss_function, device=None):
    model.eval()
    test_loss = []
    # mape = 0  # commented out stuff gives MAPE metric - 
                # maybe we record this as well since it's more interpratable
    with torch.no_grad():
        for idx, (X_test, y_test) in enumerate(test_loader):
            if device is not None:
                X_test, y_test = X_test.to(device), y_test.to(device)
            y_pred = model(X_test)
            y_pred_actual = torch.exp(y_pred)
            y_test_actual = torch.exp(y_test)
            # mape += torch.abs(y_test_actual - y_pred_actual) / y_test_actual
            # if idx % 1000 == 0: # this bit just prints a few actual prices
            #     print('{}\t{}'.format(torch.exp(y_pred).item(), torch.exp(y_test).item()))
            loss = torch.sqrt(loss_function(y_pred, y_test)).item()
            test_loss += [loss]
        # mape /= len(test_loader.dataset)
    # return np.mean(test_loss), mape.item()
    return np.mean(test_loss)



In [6]:
### TRAINING ###
device = 'cuda' # use GPU; change from 'cuda' to None for CPU

# training parameters
num_epochs = 10 # pick 10, 50, 100
hidden_dim = 10 # pick 10, 50, 100
learning_rate = .01 # pick .01, .001
learning_rate_decay = .95 # pick .95

### PICK A MODEL ###
# don't forget to update the import line to whatever path the model file is stored at
from LSTM import LSTM
model = LSTM(input_dim=235, hidden_dim=hidden_dim, output_dim=1, num_layers=2)

# from RNN import RNN
# model = RNN(D_in=235, H=hidden_dim, D_out=1, L=1, dropout=0.0, device=device)

if device is not None:
    model.to(device)

# set up optimizer, etc.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = ExponentialLR(optimizer, gamma=learning_rate_decay)

print('Epoch\tTrain RMSE\tVal RMSE\tTime Elapsed')
print('----------------------------------------------------')
# main training loop
for i in range(num_epochs):
    start = time.time()
    train_loss = train(model, train_loader, optimizer, loss_function, i+1, device=device)
    val_loss = val(model, val_loader, loss_function, device=device)
    scheduler.step()
    time_elapsed = time.time() - start
    print('{}\t{:.6f}\t{:.6f}\t{:.3f}s'.format(i+1, train_loss, val_loss, time_elapsed))

Epoch	Train RMSE	Val RMSE	Time Elapsed
----------------------------------------------------
1	0.353139	0.429124	3.912s
2	0.360261	0.355582	3.783s
3	0.298506	0.249291	3.731s
4	0.208733	0.230087	3.657s
5	0.159793	0.212679	3.628s
6	0.196586	0.200727	3.779s
7	0.143031	0.271291	3.647s
8	0.124488	0.193582	3.663s
9	0.126722	0.190084	3.612s
10	0.133099	0.185574	3.725s


In [12]:
### TESTING ###
start = time.time()
test_loss = test(model, test_loader, loss_function, device=device)
time_elapsed = time.time() - start
print('Test RMSE: {}'.format(test_loss))
print('Time Elapsed: {:.3f}'.format(time_elapsed))

Test RMSE: 0.1528758873138454
Time Elapsed: 43.098
