# PyTorch, a cleaner story

In [07_pytorch_exploration.ipynb](07_pytorch_exploration.ipynb) we built our first network _actually_ using PyTorch, but it was pretty messy and hard to follow. Let's continue that work and see what else we can learn/optimize (both the human and the model 🙃).

In [1]:
import json
import os
import pandas as pd
import sys
import time
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Index appended to tokenized clues to pad them; it matches '<pad>', the first
# special token passed when the clues vocabulary is built in DataHandler.createVocabs.
PADDING_TOKEN_INDEX = 0
# Fixed number of token indices every clue is padded to before batching. The
# model's first linear layer takes embed_dim * PAD_TO_SIZE input features.
PAD_TO_SIZE = 45

class CrosswordClueAnswersDataset(Dataset):
    """Map-style dataset backed by the rows of a crossword CSV file.

    The whole file is loaded into memory once; every item is a 2-tuple made
    of the first two fields of the selected row (consumers in this file
    unpack it as ``(answer, clue)``).
    """

    def __init__(self, csv_file):
        """
        Args:
            csv_file (string): Path to the csv file with clues and answers.
        """
        # keep_default_na=False keeps strings such as "NA" or "null" as text
        # instead of letting pandas convert them to NaN.
        frame = pd.read_csv(csv_file, keep_default_na=False)
        self.values = frame.values

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.values)

    def __getitem__(self, idx):
        """Return the first two fields of row ``idx``.

        Tensor indices are converted to plain Python values first.
        """
        index = idx.tolist() if torch.is_tensor(idx) else idx
        row = self.values[index, :]
        return (row[0], row[1])

class CrosswordModel(nn.Module):
    """Feed-forward classifier: embedding -> flatten -> linear -> tanh -> linear.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
        padding_size: number of token indices per input example.
        hidden_size: width of the hidden layer.
        num_class: number of output logits.
        device: device the layers' parameters are created on.
    """

    def __init__(self, vocab_size, embed_dim, padding_size, hidden_size, num_class, device):
        super().__init__()
        # every example is flattened to one vector of this many features
        flattened_size = embed_dim * padding_size
        self.C = nn.Embedding(vocab_size, embed_dim, device=device)
        self.W1 = nn.Linear(flattened_size, hidden_size, device=device)
        self.M = nn.Tanh()
        self.W2 = nn.Linear(hidden_size, num_class, device=device)
        self.init_weights()

    def init_weights(self):
        """Draw all weights uniformly from [-0.5, 0.5] and zero the biases."""
        initrange = 0.5
        self.C.weight.data.uniform_(-initrange, initrange)
        for layer in (self.W1, self.W2):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.zero_()

    def forward(self, text):
        """Return logits of shape (rows, num_class) for a tensor of token indices."""
        embedded = self.C(text)
        # collapse the per-token embeddings of each example into a single row
        flat = embedded.view(-1, self.W1.in_features)
        hidden = self.M(self.W1(flat))
        return self.W2(hidden)
    
class DataHandler():
    """Owns the tokenizer, the dataset splits, the vocabularies and batch collation.

    Call order matters: ``createDatasets`` stores the splits that
    ``createVocabs`` reads, and ``createVocabs`` stores the vocabularies that
    the pipelines and ``collate_batch`` read.
    """

    def __init__(self, device):
        """
        Args:
            device: torch device (or device string) collated batches are moved to.
        """
        self.tokenizer = get_tokenizer('basic_english')
        self.device = device

    def yield_clues(self, data_iter):
        """Yield the token list of every clue in an iterable of (answer, clue) pairs."""
        for _, clue in data_iter:
            yield self.tokenizer(clue)

    def yield_answers(self, data_iter):
        """Yield the token list of every answer in an iterable of (answer, clue) pairs."""
        for answer, _ in data_iter:
            yield self.tokenizer(answer)

    def clue_pipeline(self, x):
        """Tokenize clue text ``x`` and return its list of clue-vocabulary indices."""
        return self.clues_vocab(self.tokenizer(x))

    def answer_pipeline(self, x):
        """Return the answers-vocabulary index of the answer string ``x``."""
        # NOTE(review): the answers vocab is built from tokenizer output
        # (yield_answers) but looked up with the raw string here - presumably
        # answers are already single normalised tokens; confirm against the data.
        return self.answers_vocab([x])[0]

    def collate_batch(self, batch):
        """Turn a list of (answer, clue) pairs into two tensors on ``self.device``.

        Returns:
            (answers, clues): answers holds one class index per example;
            clues holds PAD_TO_SIZE token indices per example.
        """
        answer_list, clue_list = [], []

        for answer, clue in batch:
            # Truncate before padding: a clue with more than PAD_TO_SIZE tokens
            # would otherwise produce a ragged batch that cannot become a tensor.
            clue_indices = self.clue_pipeline(clue)[:PAD_TO_SIZE]
            clue_indices += [PADDING_TOKEN_INDEX] * (PAD_TO_SIZE - len(clue_indices))
            clue_list.append(clue_indices)

            answer_list.append(self.answer_pipeline(answer))

        # Use the device this handler was built with rather than a module-level
        # global, so the class also works outside the notebook.
        answer_list = torch.tensor(answer_list).to(self.device)
        clue_list = torch.tensor(clue_list).to(self.device)

        return answer_list, clue_list

    def createDatasets(self, csv_file="cleaned_data/clean_2.csv"):
        """
        Creates train, test, and dev datasets (80% / remainder / 10%).

        Args:
            csv_file (string): Path to the csv file with clues and answers.

        Returns a tuple of (train, test, dev, all)
        """
        self.dataset = CrosswordClueAnswersDataset(csv_file)
        train_size = int(0.8 * len(self.dataset))
        dev_size = int(0.1 * len(self.dataset))
        # test takes whatever is left so the three sizes always sum to len(dataset)
        test_size = len(self.dataset) - train_size - dev_size
        # this manual_seed is important to ensure that we consistently split the dataset
        g = torch.Generator().manual_seed(42)
        self.train_dataset, self.test_dataset, self.dev_dataset = torch.utils.data.random_split(
            self.dataset, [train_size, test_size, dev_size], generator=g)

        return (self.train_dataset, self.test_dataset, self.dev_dataset, self.dataset)

    def createVocabs(self):
        """
        Creates answers and clues vocabularies from the training split only.
        Requires createDatasets() to have been called first.
        Returns tuple of (answers_vocab, clues_vocab)
        """
        self.answers_vocab = build_vocab_from_iterator(self.yield_answers(self.train_dataset), specials=['<unk>'])
        # add this in case we see an answer in the test/dev set that we don't have in the training set
        self.answers_vocab.set_default_index(0)
        self.clues_vocab = build_vocab_from_iterator(self.yield_clues(self.train_dataset), specials=['<pad>', '<unk>'])
        # unseen clue tokens map to '<unk>' (second special), not to '<pad>'
        self.clues_vocab.set_default_index(1)
        return (self.answers_vocab, self.clues_vocab)

def train(model, optimizer, criterion, dataloader, epoch):
    """Run one optimisation pass over ``dataloader`` and print periodic progress.

    Args:
        model: module called as ``model(clue)`` to produce per-class scores.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predicted_answer, answer)``.
        dataloader: sized iterable of ``(answer, clue)`` batches.
        epoch: epoch number, only used in the progress line.

    Returns:
        (accuracy, loss): correct predictions divided by examples seen, and
        the sum of per-batch losses divided by the number of batches.
    """
    model.train()

    log_interval = 200

    # totals over the whole pass
    total_correct, total_samples, total_loss = 0, 0, 0

    # totals since the last progress line
    window_batches, window_samples, window_loss, window_correct = 0, 0, 0, 0
    window_started = time.time()

    for idx, (answer, clue) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_answer = model(clue)
        loss = criterion(predicted_answer, answer)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        correct = (predicted_answer.argmax(1) == answer).sum().item()
        samples = answer.size(0)

        total_samples += samples
        total_loss += loss_value
        total_correct += correct

        window_batches += 1
        window_samples += samples
        window_loss += loss_value
        window_correct += correct

        if idx % log_interval != 0:
            continue

        # report the window that just ended, then start a new one
        elapsed = time.time() - window_started
        print('| epoch {:3d} | {:5d}/{:5d} batches '
              '| accuracy {:8.3f} | loss {:8.7f} | time {:5.2f}s '.format(
                  epoch, idx, len(dataloader),
                  window_correct / window_samples,
                  window_loss / window_batches,
                  elapsed))
        window_batches, window_samples, window_correct, window_loss = 0, 0, 0, 0
        window_started = time.time()

    return total_correct / total_samples, total_loss / len(dataloader)

class Trainer:
    """Builds a CrosswordModel, trains it epoch by epoch and saves the results."""

    def evaluate(self, model, criterion, dataloader):
        """
        Evaluate the model against a dataset without updating it.

        Args:
            model: module called as ``model(clue)``.
            criterion: loss called as ``criterion(predicted_answer, answer)``.
            dataloader: sized iterable of ``(answer, clue)`` batches.

        Returns a tuple of (accurate_pct, loss): correct predictions divided by
        examples seen, and summed batch losses divided by the number of batches.
        """
        model.eval()
        total_acc, total_count, running_loss = 0, 0, 0

        with torch.no_grad():
            for answer, clue in dataloader:
                predicted_answer = model(clue)
                running_loss += criterion(predicted_answer, answer).item()
                total_acc += (predicted_answer.argmax(1) == answer).sum().item()
                total_count += answer.size(0)
        return total_acc / total_count, running_loss / len(dataloader)

    def start(self, hyperparameters, data_handler, previous_model):
        """
        Creates a model and trains it based on the hyperparameters.

        After every epoch the model's state_dict and a JSON summary of the run
        so far are written to a timestamped directory under ``training_results``.

        Args:
            hyperparameters: dict with the keys 'BATCH_SIZE', 'EMBEDDING_LAYER_SIZE',
                'HIDDEN_SIZE', 'LR' and 'EPOCHS'.
            data_handler: DataHandler whose datasets and vocabularies are already built.
            previous_model: path of a saved state_dict to start from, or None.

        Returns (model, dev_accu_pcts, dev_losses, train_accu_pcts, train_losses,
        elapsed_times); every list holds one entry per epoch.
        """
        # Build the model on the same device the handler collates batches onto,
        # instead of depending on a module-level ``device`` global.
        device = data_handler.device

        # shuffle the training dataloader so we go through different batches each time
        train_dataloader = DataLoader(data_handler.train_dataset, batch_size=hyperparameters['BATCH_SIZE'], shuffle=True, collate_fn=data_handler.collate_batch)
        dev_dataloader = DataLoader(data_handler.dev_dataset, batch_size=hyperparameters['BATCH_SIZE'], shuffle=False, collate_fn=data_handler.collate_batch)

        num_class = len(data_handler.answers_vocab)
        vocab_size = len(data_handler.clues_vocab)
        model = CrosswordModel(vocab_size, hyperparameters['EMBEDDING_LAYER_SIZE'], PAD_TO_SIZE, hyperparameters['HIDDEN_SIZE'], num_class, device)

        # targets equal to 0 (the answers vocab's '<unk>' default index) do not contribute to the loss
        criterion = torch.nn.CrossEntropyLoss(ignore_index=0)
        optimizer = torch.optim.Adam(model.parameters(), lr=hyperparameters['LR'])

        # load previous model if we have one
        if previous_model is not None:
            #TODO: would be nice to just continue this run - we'd need to save state for the optimizer, load up the training dict, and maybe some other stuff
            print('Loading from previous model:', previous_model)
            # map_location lets a checkpoint saved on another device load onto this one
            model.load_state_dict(torch.load(previous_model, map_location=device))
            dev_accu_pct, dev_loss = self.evaluate(model, criterion, dev_dataloader)
            print(f'Starting with model: {dev_accu_pct} accuracy; {dev_loss} loss;')

        # setup directories and files for output
        timestr = time.strftime("%Y%m%d-%H%M%S")
        train_dir = os.path.join('training_results', timestr + '-training')
        os.makedirs(train_dir, exist_ok=True)
        results_filename = os.path.join(train_dir, 'training-results.json')

        # stats to track for each epoch
        dev_accu_pcts, dev_losses = [], []
        train_accu_pcts, train_losses = [], []
        lrs, elapsed_times = [], []

        for epoch in range(1, hyperparameters['EPOCHS'] + 1):
            epoch_start_time = time.time()

            train_accu_pct, train_loss = train(model, optimizer, criterion, train_dataloader, epoch)
            train_accu_pcts.append(train_accu_pct)
            train_losses.append(train_loss)

            dev_accu_pct, dev_loss = self.evaluate(model, criterion, dev_dataloader)
            dev_accu_pcts.append(dev_accu_pct)
            dev_losses.append(dev_loss)
            # learning rate doesn't change for now. Maybe we should log some detail from the optimizer?
            lrs.append(hyperparameters['LR'])

            epoch_elapsed_time = time.time() - epoch_start_time
            elapsed_times.append(epoch_elapsed_time)

            print('-' * 59)
            print('| end of epoch {:3d} | time: {:5.2f}s | '
                  'dev accuracy {:8.3f} | loss {:8.7f} '.format(epoch,
                                                   epoch_elapsed_time,
                                                   dev_accu_pct, dev_loss))
            print('-' * 59)

            # save results and model for this epoch; the JSON file is rewritten
            # each time so it always describes the run up to the latest epoch
            model_filename = os.path.join(train_dir, 'model-epoch-' + str(epoch) + '.pt')
            training_dict = {
                'hyperparameters': hyperparameters,
                'dev_accu_pcts': dev_accu_pcts,
                'dev_losses': dev_losses,
                'train_accu_pcts': train_accu_pcts,
                'train_losses': train_losses,
                'elapsed_times': elapsed_times,
                'learning_rates': lrs,
                'model': model_filename,
                'previous_model': previous_model
            }

            with open(results_filename, 'w') as file:
                file.write(json.dumps(training_dict))
            torch.save(model.state_dict(), model_filename)

        return model, dev_accu_pcts, dev_losses, train_accu_pcts, train_losses, elapsed_times

In [2]:
%%time
# Prefer the mps backend (GPU work on macOS) when PyTorch reports it as
# available; otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print('Running on:', device)

data_handler = DataHandler(device)

# read the CSV and split it; the handler also keeps a reference to every split
train_dataset, test_dataset, dev_dataset, dataset = data_handler.createDatasets()
print(f'{len(dataset)=}\n{len(train_dataset)=}\n{len(test_dataset)=}\n{len(dev_dataset)=}\n')

# build the answer and clue vocabularies
answers_vocab, clues_vocab = data_handler.createVocabs()

print(f'{len(answers_vocab)=},{len(clues_vocab)=}\n')

Running on: mps
len(dataset)=770361
len(train_dataset)=616288
len(test_dataset)=77037
len(dev_dataset)=77036

len(answers_vocab)=60323,len(clues_vocab)=77891

CPU times: user 4.2 s, sys: 68.5 ms, total: 4.27 s
Wall time: 4.27 s


In [3]:
trainer = Trainer()

# One training run per entry. Each entry is
# (embedding size, hidden size[, path of a saved model to start from]).
embed_hidden_sizes = [(50,100)]

# iterate over the entries directly instead of indexing through range(len(...))
for sizes in embed_hidden_sizes:
    HYPERPARAMETERS = {
        'EPOCHS': 25,
        'LR': 0.001,
        'BATCH_SIZE': 256,
        'EMBEDDING_LAYER_SIZE': sizes[0],
        'HIDDEN_SIZE': sizes[1]
    }
    # the optional third element is the checkpoint to warm-start from
    previous_model = sizes[2] if len(sizes) > 2 else None

    print(HYPERPARAMETERS)
    trainer.start(HYPERPARAMETERS, data_handler, previous_model)

{'EPOCHS': 25, 'LR': 0.001, 'BATCH_SIZE': 256, 'EMBEDDING_LAYER_SIZE': 50, 'HIDDEN_SIZE': 100}
| epoch   1 |     0/ 2408 batches | accuracy    0.000 | loss 14.1939583 | time  0.19s 
| epoch   1 |   200/ 2408 batches | accuracy    0.000 | loss 12.4046675 | time 18.03s 
| epoch   1 |   400/ 2408 batches | accuracy    0.000 | loss 11.5938124 | time 18.03s 
| epoch   1 |   600/ 2408 batches | accuracy    0.002 | loss 11.1976735 | time 18.07s 
| epoch   1 |   800/ 2408 batches | accuracy    0.002 | loss 10.9545483 | time 18.10s 
| epoch   1 |  1000/ 2408 batches | accuracy    0.005 | loss 10.7679338 | time 18.03s 
| epoch   1 |  1200/ 2408 batches | accuracy    0.007 | loss 10.6005633 | time 18.05s 
| epoch   1 |  1400/ 2408 batches | accuracy    0.012 | loss 10.4701159 | time 18.02s 
| epoch   1 |  1600/ 2408 batches | accuracy    0.015 | loss 10.3548116 | time 18.01s 
| epoch   1 |  1800/ 2408 batches | accuracy    0.021 | loss 10.2325546 | time 18.05s 
| epoch   1 |  2000/ 2408 batches |

KeyboardInterrupt: 

## Sample from the Model

In [6]:
# sample from the model
def predict(clue):
    """Return the answers-vocabulary index the loaded model scores highest for ``clue``.

    Relies on the notebook globals ``data_handler`` (for the clue vocabulary)
    and ``trained_model``.

    Args:
        clue: clue text; it is tokenized, truncated or padded to PAD_TO_SIZE
            indices and fed to the model as a single example.

    Returns:
        int: index into ``data_handler.answers_vocab``.
    """
    with torch.no_grad():
        # Truncate before padding: the model flattens exactly PAD_TO_SIZE
        # embeddings per example, so a longer clue would break the reshape.
        clue_indices = data_handler.clue_pipeline(clue)[:PAD_TO_SIZE]
        clue_indices += [PADDING_TOKEN_INDEX] * (PAD_TO_SIZE - len(clue_indices))
        # build the input on whatever device the model's parameters live on
        model_device = next(trained_model.parameters()).device
        output = trained_model(torch.tensor(clue_indices, device=model_device))
        return output.argmax(1).item()
    
def load_model(training_dir):
    """Rebuild the model saved after the last recorded epoch of a training run.

    Relies on the notebook globals ``clues_vocab``, ``answers_vocab`` and
    ``device`` to size and place the model.

    Args:
        training_dir: directory holding 'training-results.json' and the
            'model-epoch-N.pt' state_dict files.

    Returns:
        CrosswordModel with the saved weights loaded.
    """
    # only the JSON read needs the file open; close it before the heavy work
    with open(os.path.join(training_dir, 'training-results.json'), 'r') as f:
        result = json.load(f)

    # one dev loss is recorded per completed epoch, so the count is the last epoch number
    last_epoch = len(result['dev_losses'])
    embed_size = result['hyperparameters']['EMBEDDING_LAYER_SIZE']
    hidden_size = result['hyperparameters']['HIDDEN_SIZE']

    trained_model = CrosswordModel(len(clues_vocab), embed_size, PAD_TO_SIZE, hidden_size, len(answers_vocab), device)
    checkpoint_path = os.path.join(training_dir, f'model-epoch-{last_epoch}.pt')
    # map_location lets a checkpoint saved on another device load onto this one
    trained_model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    return trained_model

# Restore the last checkpoint of a finished run, move it to the CPU and
# switch it to evaluation mode before sampling.
training_dir = 'training_results/20230321-100932-training'
trained_model = load_model(training_dir).to("cpu")
trained_model.eval()

test_clue = 'capital of canada'
# predict() returns a class index; translate it back to the answer string
predicted_index = predict(test_clue)
predicted_answer = data_handler.answers_vocab.get_itos()[predicted_index]
print(f'{test_clue}: {predicted_answer}')

capital of canada: hanoi
