In [1]:
# Import library
import random
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import numpy as np
import spacy
from utils import lemmatize_sentence, bleu, save_checkpoint, load_checkpoint
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data import Field, BucketIterator, TabularDataset, Dataset
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("./data/eng-lem.csv", sep="\t" , header=0, names=["Sentence", "Lemmatized"])

In [3]:
df.head()

Unnamed: 0,Sentence,Lemmatized
0,10 Years of Time Team presented a round- up wh...,10 year of Time team present a round- up what ...
1,13 October 1962 marked the initial working ses...,13 October 1962 mark the initial working sessi...
2,"1945 Overhauled, Indianapolis joined Vice Admi...","1945 overhaul, Indianapolis join vice Admiral ..."
3,1965 was also the deadline for AAFSS selection...,"1965 be also the deadline for AAFSS selection,..."
4,"1969 to 1982 The British Telecom"" T symbol log...","1969 to 1982 the British Telecom"" T symbol log..."


In [4]:
train, valid = train_test_split(df, test_size=0.4)
valid, test = train_test_split(valid, test_size=0.5)

In [5]:
train.to_json("./data/train.json", orient="records", lines=True)
valid.to_json("./data/valid.json", orient="records", lines=True)
test.to_json("./data/test.json", orient="records", lines=True)

In [6]:
spacy_eng = spacy.load("en")

def tokenize(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]

In [7]:
sentences = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize, lower=True)
lemmatized = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize, lower=True)

In [8]:
fields = {"Sentence": ("sentences", sentences), "Lemmatized": ("lemmatized", lemmatized)}

In [9]:
train_data, valid_data = TabularDataset.splits(
    path="data", train="train.json", test="valid.json", format="json", fields=fields
)

test_data = TabularDataset.splits(
    path="data", test="test.json", format="json", fields=fields
)

In [10]:
sentences.build_vocab(train_data, max_size=10000, min_freq=2)
lemmatized.build_vocab(train_data, max_size=10000, min_freq=2)

In [11]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, sentences):
        src_mask = sentences.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, sentences, lemmatized):
        src_seq_length, N = sentences.shape
        trg_seq_length, N = lemmatized.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(sentences) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(lemmatized) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(sentences)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [12]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = False
save_model = True

# Training hyperparameters
num_epochs = 1000
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(sentences.vocab)
trg_vocab_size = len(lemmatized.vocab)
embedding_size = 512    # default: 512
num_heads = 8
num_encoder_layers = 3  # 6 in paper
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
src_pad_idx = sentences.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

In [13]:
device

device(type='cuda')

In [14]:
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.sentences),
    device=device,
)

In [15]:
batch = next(iter(train_iterator))
print(batch.sentences)

tensor([[   2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2],
        [ 103,    5,    5,  838,  790,   47,    0, 2065,   42,    0,   92,    5,
           74,   16,  688,   54,  263,    0,   22, 4930,   27, 3488,  335, 5588,
           79,    0,   99,  141,  186,  182,   92,    0],
        [  40,    0, 1700,    0,  140,  530,   40, 2263, 4543,   23,    5, 3854,
            5,  719,    6,    7,   10,   63,   45, 4333,   12,   10,   13, 5919,
           50,    0,  304,  216,   10,  811,  328,   35],
        [1729, 1480, 7519,    0, 1853, 5289,   51,    5,    6,   56,    0, 1335,
         1443,    6,    0,   60,    5, 7326,   28, 1536,  155,    5, 1626, 2182,
           78,   32,   40,    0,  484,   16, 5354,   68],
        [ 109,  439, 1167,    9,   16,    6, 1329,  108, 2986,  815,  258, 7445,
            6,   11,    9,    0, 5098, 

In [16]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

In [17]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [19]:
pad_idx = sentences.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [20]:
if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

sentence = "A horse is standing under a bridge beside a boat"

In [21]:
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }
        save_checkpoint(checkpoint)

    model.eval()
    lemma_sentence = lemmatize_sentence(
        model, sentence, sentences, lemmatized, device, max_length=10
    )

    print(f"Lemmatize example sentence: \n {lemma_sentence}")
    model.train()
    losses = []

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.sentences.to(device)
        target = batch.lemmatized.to(device)

        # Forward prop
        output = model(inp_data, target[:-1, :])

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin.
        # Let's also remove the start token while we're at it
        output = output.reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()

        loss = criterion(output, target)
        losses.append(loss.item())

        # Back prop
        loss.backward()
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

[Epoch 0 / 1000]
=> Saving checkpoint
Lemmatize example sentence: 
 ['young', 'young', 'young', 'young', 'young', 'young', 'young', 'young', 'young', 'young']


RuntimeError: CUDA error: device-side assert triggered