In [1]:
import os
import spacy
import torch
import wandb
from tqdm import tqdm
from src.model import Transformer
from torchtext import data, datasets
from torchtext import data, datasets
from src.loss import MultiGPULossCompute
from src.training import (
    LabelSmoothing, DataIterator, rebatch,
    batch_size_function, NoamOptimizer, train_step
)
from secret import WANDB_API_KEY

In [2]:
# Publish the W&B API key and the notebook name through the process
# environment, then start the W&B run for this notebook.
os.environ.update({
    'WANDB_API_KEY': WANDB_API_KEY,
    'WANDB_NOTEBOOK_NAME': 'German-English-IWSLT',
})
wandb.init(name="German-English-IWSLT", project="transformer-pytorch")

W&B Run: https://app.wandb.ai/19soumik-rakshit96/transformer-pytorch/runs/3eb9a3vu

In [3]:
# One spaCy pipeline per side of the translation pair:
# 'de' backs the source tokenizer, 'en' backs the target tokenizer.
spacy_source, spacy_target = spacy.load('de'), spacy.load('en')

In [4]:
def tokenize_source(text):
    """Split a source-language string into token strings.

    Runs ``text`` through the tokenizer of the notebook-level
    ``spacy_source`` pipeline and returns the ``.text`` of each token,
    in order, as a list.
    """
    pieces = []
    for token in spacy_source.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_target(text):
    """Split a target-language string into token strings.

    Runs ``text`` through the tokenizer of the notebook-level
    ``spacy_target`` pipeline and returns the ``.text`` of each token,
    in order, as a list.
    """
    pieces = []
    for token in spacy_target.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [5]:
# Source-side field: tokenized by `tokenize_source`, with "<blank>" as pad token.
source = data.Field(
    pad_token="<blank>",
    tokenize=tokenize_source,
)

# Target-side field: tokenized by `tokenize_target`, with the same pad token
# plus "<s>" / "</s>" declared as the init and end-of-sequence tokens.
target = data.Field(
    pad_token="<blank>",
    init_token="<s>",
    eos_token="</s>",
    tokenize=tokenize_target,
)

In [6]:
# Largest allowed length of an example's `src` and `trg` entries.
max_length = 100

# Load the IWSLT '.de' -> '.en' splits, keeping only the examples whose
# longer side does not exceed `max_length`.
train, val, test = datasets.IWSLT.splits(
    fields=(source, target),
    exts=('.de', '.en'),
    filter_pred=lambda example: max(
        len(vars(example)['src']), len(vars(example)['trg'])
    ) <= max_length,
)

downloading de-en.tgz


de-en.tgz: 100%|██████████| 24.2M/24.2M [00:22<00:00, 1.09MB/s]


.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.dev2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2010.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2011.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TED.tst2014.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.dev2012.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2013.de-en.en.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.de.xml
.data/iwslt/de-en/IWSLT16.TEDX.tst2014.de-en.en.xml
.data/iwslt/de-en/train.tags.de-en.de
.data/iwslt/de-en/train.tags.de-en.en


In [7]:
# Frequency threshold handed to both build_vocab calls as `min_freq`.
min_frequency = 2

# Build each field's vocabulary from the matching column of the training
# split only (`train` is still the dataset at this point in the notebook).
source.build_vocab(train.src, min_freq=min_frequency)
target.build_vocab(train.trg, min_freq=min_frequency)

In [8]:
# Index of the "<blank>" padding token in the target vocabulary; reused below
# by the loss criterion and by `rebatch`.
pad_idx = target.vocab.stoi["<blank>"]
# Build the project's Transformer from the two vocabulary sizes.
# n=6 is presumably the number of stacked layers — TODO confirm in src.model.
model = Transformer(len(source.vocab), len(target.vocab), n=6)
# Move the model to the GPU; this notebook assumes a CUDA device is available.
model.cuda()

print(model)

  torch.nn.init.xavier_uniform(p)


EncoderDecoder(
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attention): MultiHeadedAttention(
          (linear_layers): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Linear(in_features=512, out_features=512, bias=True)
            (2): Linear(in_features=512, out_features=512, bias=True)
            (3): Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): PositionWiseFeedForward(
          (w_1): Linear(in_features=512, out_features=2048, bias=True)
          (w_2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayer): ModuleList(
          (0): ResidualConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): ResidualConnection(
            (norm): 

In [9]:
# Label-smoothing criterion sized to the target vocabulary; `pad_idx` is
# passed as the padding index and the smoothing factor is 0.1.
criterion = LabelSmoothing(
    smoothing=0.1,
    padding_index=pad_idx,
    size=len(target.vocab),
)
# Kept as the final expression so the cell still displays the module.
criterion.cuda()



LabelSmoothing(
  (criterion): KLDivLoss()
)

In [10]:
# Iterator over the training split on device 0, non-repeating.
# Batches are sized through `batch_size_function`, so batch_size=1200 is
# presumably a token budget rather than a sentence count — TODO confirm.
train_iter = DataIterator(
    train,
    batch_size=1200,
    batch_size_fn=batch_size_function,
    sort_key=lambda example: (len(example.src), len(example.trg)),
    device=0,
    repeat=False,
    train=True,
)



In [11]:
# Iterator over the validation split; same settings as the training
# iterator except that it is created with train=False.
valid_iter = DataIterator(
    val,
    batch_size=1200,
    batch_size_fn=batch_size_function,
    sort_key=lambda example: (len(example.src), len(example.trg)),
    device=0,
    repeat=False,
    train=False,
)



In [11]:
# Despite the name, this is the model itself wrapped in DataParallel
# (restricted to GPU 0); it is the callable later passed to `train_step`.
model_parameters = torch.nn.DataParallel(model, device_ids=[0])

In [12]:
# Wrap Adam in the project's NoamOptimizer. Adam is created with lr=0, so the
# effective learning rate is presumably set by the wrapper on every step —
# TODO confirm against src.training.NoamOptimizer.
model_optimizer = NoamOptimizer(
    # Positional arguments: the source embedding's d_model, then 1 and 2000
    # (presumably the scale factor and the warm-up step count — TODO confirm).
    model.source_embedding[0].d_model, 1, 2000,
    torch.optim.Adam(
        model.parameters(), lr=0,
        betas=(0.9, 0.98), eps=1e-9
    )
)

In [12]:
# NOTE(review): this statement already appears verbatim in an earlier cell of
# this notebook; the model is simply wrapped in DataParallel a second time.
# Consider deleting one of the two copies.
model_parameters = torch.nn.DataParallel(model, device_ids=[0])

In [13]:
# NOTE(review): this cell repeats an earlier cell of this notebook verbatim, so
# a fresh Adam/NoamOptimizer pair replaces the one built there. Consider
# deleting one of the two copies.
model_optimizer = NoamOptimizer(
    # Positional arguments: the source embedding's d_model, then 1 and 2000
    # (presumably the scale factor and the warm-up step count — TODO confirm).
    model.source_embedding[0].d_model, 1, 2000,
    torch.optim.Adam(
        model.parameters(), lr=0,
        betas=(0.9, 0.98), eps=1e-9
    )
)

In [14]:
def train(epochs):
    """Run `epochs` rounds of training followed by validation.

    Each round switches `model_parameters` to training mode and feeds the
    re-batched `train_iter` through `train_step` with a loss-compute object
    that receives `opt=model_optimizer`. It then switches to evaluation mode
    and runs the same step over `valid_iter` with `opt=None`.

    Args:
        epochs: Number of train/validate rounds to run.

    Returns:
        A list holding, in epoch order, whatever `train_step` returned for
        each validation pass (presumably the validation loss — TODO confirm
        in src.training). This value used to be assigned to a local and
        discarded; it is now also printed after every epoch.

    NOTE(review): this definition rebinds the notebook-level name `train`,
    which before this cell runs refers to the training split returned by
    `datasets.IWSLT.splits`. Cells that need the dataset (vocabulary
    building, `train_iter` construction) must run before this one; re-running
    them afterwards would see this function instead of the dataset.
    """
    validation_losses = []
    for epoch in range(epochs):
        print('Epochs:', (epoch + 1))

        # Training pass: the optimizer is handed to the loss computation.
        model_parameters.train()
        train_step(
            (rebatch(pad_idx, b) for b in train_iter),
            model_parameters, MultiGPULossCompute(
                model.generator, criterion,
                devices=[0], opt=model_optimizer
            ), log_on_wandb=True
        )

        # Validation pass: same step function, but without an optimizer.
        model_parameters.eval()
        loss = train_step(
            (rebatch(pad_idx, b) for b in valid_iter),
            model_parameters, MultiGPULossCompute(
                model.generator, criterion,
                devices=[0], opt=None
            ), log_on_wandb=True
        )

        # Surface the validation result instead of silently dropping it.
        print('Validation loss:', loss)
        validation_losses.append(loss)
    return validation_losses

In [15]:
# Train for 10 epochs.
# NOTE(review): the output recorded for this cell shows one progress bar of
# 3754 iterations and then "NameError: name 'devices' is not defined".
# This notebook only ever uses `devices` as a keyword argument, so the error
# presumably comes from imported project code (e.g. src.loss or
# src.training) — confirm and fix it there before relying on this run.
train(10)

0it [00:00, ?it/s]

Epochs: 1


3754it [13:47,  4.54it/s]


NameError: name 'devices' is not defined