#### Setup Codes

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported .py files take effect without
# a kernel restart.
%load_ext autoreload
%autoreload 2

##### Google Colab Setup
We need to run a few commands to set up our environment on Google Colab. If you are running this notebook on a local machine, you can skip this section. Run the following cell to mount your Google Drive.

In [None]:
# Colab-only: mount the user's Google Drive at /content/drive (prompts for
# authorization the first time it runs in a session).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys

# TODO: Fill in the Google Drive path where you uploaded the assignment
# Example: If you create a 'Test' folder and put all the files under 'example' folder, then 'Test/example'
# GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Test/example'
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'GIT/tutorials/utils/'

# Anchor the path at the absolute mount point passed to drive.mount() so the
# lookup does not depend on the notebook's current working directory.
GOOGLE_DRIVE_PATH = os.path.join('/content/drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Make the custom packages importable; the guard keeps re-runs of this cell
# from appending the same entry to sys.path again.
if GOOGLE_DRIVE_PATH not in sys.path:
    sys.path.append(GOOGLE_DRIVE_PATH)

# Listing the folder doubles as a check that the path is correct.
print(os.listdir(GOOGLE_DRIVE_PATH))

['__pycache__', 'for_knn.py', 'linear_classifier.py', 'word_classification.py', 'word2vec.py', 'custom_model_utils', 'image_captioning', 'Convolutional_Neural_Network', '_modules.py', '_utils.py', 'save.py', '_word_processing.py', '_layers.py', 'enc2dec', 'data', '_data.py', 'seq2seq.py', 'image_captioning.py']


##### NLP Setup Codes

In [None]:
!pip install 'portalocker>=2.0.0'

Collecting portalocker>=2.0.0
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [None]:
import torch
import torchtext
import torchdata

# Record the library versions this notebook was run with.
print(f'torch version: {torch.__version__}')
print(f'torchtext version: {torchtext.__version__}')
# Label fixed: this line reports the torchdata package, not torchtext.
print(f'torchdata version: {torchdata.__version__}')

torch version: 2.2.1+cu121
torchtext version: 0.17.1+cpu
torchtext data: 0.7.1


##### Import Packages

In [None]:
import random
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim

# custom packages
import data.word_processing as wp
import data.multi30k as multi30k
import enc2dec.utils as utils

### Multi30k Datasets

In [None]:
# Download spaCy's small German pipeline (de_core_news_sm).
# NOTE(review): presumably required by the German tokenizer built in the next
# cell -- confirm against data/multi30k.py.
!python -m spacy download de_core_news_sm

In [None]:
# Load the train / validation splits through the project's Multi30k helper,
# passing the current directory as `root`.
train_datasets, val_datasets = multi30k.load_Multi30k(root='.')
# Build tokenizers and vocabularies from the training split. Later cells index
# `tokenizer` and `vocab` by language code ('de', 'en').
# NOTE(review): min_freq=2 presumably drops tokens seen fewer than twice, and
# `ln_idx` is not used anywhere in this notebook -- confirm both in the helper.
tokenizer, vocab, ln_idx = multi30k.build_Multi30k_vocab(train_datasets, min_freq=2)

In [None]:
# Per-language transforms built from each language's tokenizer and vocabulary
# mapping (presumably raw text -> token indices; see data/word_processing.py).
de_transform = wp.build_transform(tokenizer['de'], vocab['de'].token_to_idx)
en_transform = wp.build_transform(tokenizer['en'], vocab['en'].token_to_idx)

# The padding index is read from the English vocabulary.
pad_idx = vocab['en'].stoi['<pad>']

# German is the source side, English the target side; batches are batch-first.
collate_fn = multi30k.Multi30kCollate(
    src_transform=de_transform,
    tgt_transform=en_transform,
    PAD_IDX=pad_idx,
    batch_first=True,
)

### seq2seq

In [None]:
class Encoder(nn.Module):
    """LSTM encoder that summarises a source sequence into its final states.

    Args:
        input_dim: Number of rows in the embedding table (source vocab size).
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            the LSTM for its inter-layer dropout.
        batch_first: Forwarded to ``nn.LSTM``; selects the input layout.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, batch_first):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
            batch_first=batch_first,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` (token indices) and return ``(hidden, cell)``.

        Only the LSTM's final hidden and cell states are returned; the
        per-step outputs are discarded.
        """
        embedded = self.dropout(self.embedding(src))
        _, final_state = self.lstm(embedded)
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that predicts the next target token.

    Args:
        input_dim: Target vocabulary size; used both for the embedding table
            and for the width of the output projection.
        emb_dim: Embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, applied to the embeddings and passed to
            the LSTM for its inter-layer dropout.
        batch_first: Forwarded to ``nn.LSTM``; also decides where the
            length-1 time axis is inserted in ``forward``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, batch_first):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=batch_first)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim, input_dim)

    def forward(self, input, prev_h, prev_c):
        """Run one decoding step.

        Args:
            input: 1-D tensor of token indices, one per batch element.
            prev_h: Previous LSTM hidden state.
            prev_c: Previous LSTM cell state.

        Returns:
            ``(pred, hidden, cell)`` where ``pred`` holds one row of vocabulary
            scores per batch element and ``hidden`` / ``cell`` are the updated
            LSTM states.
        """
        # The length-1 time axis goes at dim 1 for a batch-first LSTM and at
        # dim 0 otherwise; a hard-coded dim 1 breaks batch_first=False.
        seq_dim = 1 if self.lstm.batch_first else 0

        input = input.unsqueeze(seq_dim)
        embed = self.embedding(input)
        embed = self.dropout(embed)
        output, (hidden, cell) = self.lstm(embed, (prev_h, prev_c))
        pred = self.fc(output.squeeze(seq_dim))

        return pred, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder LSTM translation model with teacher forcing.

    Args:
        enc_in: Source vocabulary size (encoder embedding rows).
        dec_in: Target vocabulary size (decoder embedding rows / output width).
        emb_dim: Embedding dimension shared by encoder and decoder.
        hid_dim: LSTM hidden size shared by encoder and decoder.
        n_layers: Number of stacked LSTM layers in each network.
        dropout: Dropout probability used in both networks.
        teacher_forcing_ratio: Probability, per decoding step, of feeding the
            ground-truth token instead of the model's own prediction.
        BOS_IDX: Token index used to start decoding in ``inference``.
        EOS_IDX: End-of-sequence index; stored on the module but not read by
            ``forward`` or ``inference``.
        batch_first: Forwarded to the encoder and decoder. Note that
            ``forward`` and ``inference`` index ``src`` / ``tgt`` as
            ``(batch, seq)`` regardless of this flag.
    """

    def __init__(self, enc_in, dec_in, emb_dim, hid_dim,
                 n_layers=2, dropout=0.5, teacher_forcing_ratio=0.5,
                 BOS_IDX=2, EOS_IDX=3, batch_first=True):
        super().__init__()

        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.BOS_IDX = BOS_IDX
        self.EOS_IDX = EOS_IDX

        self.encoder = Encoder(input_dim=enc_in,
                               emb_dim=emb_dim,
                               hid_dim=hid_dim,
                               n_layers=n_layers,
                               dropout=dropout,
                               batch_first=batch_first)

        self.decoder = Decoder(input_dim=dec_in,
                               emb_dim=emb_dim,
                               hid_dim=hid_dim,
                               n_layers=n_layers,
                               dropout=dropout,
                               batch_first=batch_first)

    def forward(self, src, tgt):
        """Decode ``tgt`` step by step and return the per-step vocabulary scores.

        The first target column seeds the decoder; each later step is fed
        either the ground-truth token or the previous argmax prediction,
        chosen at random according to ``teacher_forcing_ratio``.

        Returns:
            Tensor of scores stacked along dim 1, one slice for each of the
            ``tgt.size(1) - 1`` decoding steps.
        """
        prev_h, prev_c = self.encoder(src)

        output_list = []
        input = tgt[:, 0]
        for t in range(1, tgt.size(1)):
            output, prev_h, prev_c = self.decoder(input, prev_h, prev_c)
            output_list.append(output)

            # teacher forcing
            top1 = output.argmax(dim=1)
            input = tgt[:, t] if random.random() < self.teacher_forcing_ratio else top1

        return torch.stack(output_list, dim=1)

    def inference(self, src, max_length=15):
        """Greedily decode ``max_length - 1`` tokens for every row of ``src``.

        Decoding runs in eval mode (dropout disabled) and without gradient
        tracking; the module's previous train/eval mode is restored afterwards.

        Returns:
            int64 tensor of predicted token indices, stacked along dim 1.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                input = torch.full((src.size(0),), self.BOS_IDX,
                                   dtype=torch.int64, device=src.device)
                prev_h, prev_c = self.encoder(src)

                pred_tokens = []
                for _ in range(1, max_length):
                    output, prev_h, prev_c = self.decoder(input, prev_h, prev_c)

                    # Greedy choice; it also becomes the next decoder input.
                    input = output.argmax(dim=1)
                    pred_tokens.append(input)
        finally:
            # Hand the module back in the mode the caller left it in.
            self.train(was_training)

        return torch.stack(pred_tokens, dim=1)

### Sanity check

In [None]:
# Report the split sizes and the English (target) vocabulary size.
num_train, num_val = len(train_datasets), len(val_datasets)

print(f"number of train : {num_train}")
print(f"number of val : {num_val}")
print(f'size of target vocab : {len(vocab["en"])}')

number of train : 29001
number of val : 1015
number of train_sample : 2901
number of val_sample : 102
size of target vocab : 5893


In [None]:
# One loader per split; only the training loader shuffles.
data_loaders = {
    'train': DataLoader(train_datasets, batch_size=64, collate_fn=collate_fn, shuffle=True),
    'val': DataLoader(val_datasets, batch_size=64, collate_fn=collate_fn),
}

In [None]:
# Padding positions in the target are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['en'].stoi['<pad>'])

model = Seq2Seq(
    enc_in=len(vocab['de']),
    dec_in=len(vocab['en']),
    emb_dim=256,
    hid_dim=512,
    n_layers=2,
    dropout=0.5,
    teacher_forcing_ratio=0.5,
)

# One forward/backward pass on a single batch of an untrained model.
src, tgt = next(iter(data_loaders['train']))

# CrossEntropyLoss expects the class axis at dim 1, so swap the last two axes.
# The model predicts tokens 1..T-1, hence the target is shifted by one.
out = model(src, tgt).permute(0, 2, 1)
loss = criterion(out, tgt[:, 1:])

loss.backward()
print(f"loss : {loss.item()}")

# Greedy decoding for the same batch; print the first sentence.
pred = model.inference(src)

tokens = vocab['en'].idx_to_sentence(pred[0])
print(' '.join(tokens))

loss : 8.682872772216797
gated installing voice musician guitar guitar uphill salmon met falling alone furniture miners awe


### Train Net

In [None]:
# Start from a freshly initialised model for the real training run.
model = Seq2Seq(
    enc_in=len(vocab['de']),
    dec_in=len(vocab['en']),
    emb_dim=256,
    hid_dim=512,
    n_layers=2,
    dropout=0.5,
    teacher_forcing_ratio=0.5,
)

# Padding positions are excluded from the loss; Adam with lr=1e-3.
criterion = nn.CrossEntropyLoss(ignore_index=vocab['en'].stoi['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# The project runner trains for 20 epochs and returns the training history.
history = utils.runner(vocab['en'], model, criterion, optimizer, data_loaders, num_epochs=20)

Train using cuda
Epoch [1/20]          time: 0:01:04          train Loss: 4.8136          train BLEU: 0.0076          val Loss: 4.2762          val BLEU: 0.0136          
Epoch [2/20]          time: 0:01:04          train Loss: 4.2033          train BLEU: 0.0178          val Loss: 3.8586          val BLEU: 0.0240          
Epoch [3/20]          time: 0:01:06          train Loss: 3.8751          train BLEU: 0.0301          val Loss: 3.6618          val BLEU: 0.0347          
Epoch [4/20]          time: 0:01:04          train Loss: 3.6463          train BLEU: 0.0444          val Loss: 3.3704          val BLEU: 0.0490          
Epoch [5/20]          time: 0:01:05          train Loss: 3.4663          train BLEU: 0.0559          val Loss: 3.3109          val BLEU: 0.0622          
Epoch [6/20]          time: 0:01:05          train Loss: 3.3102          train BLEU: 0.0673          val Loss: 3.2067          val BLEU: 0.0772          
Epoch [7/20]          time: 0:01:03          train Loss: 3.

In [None]:
# Persist only the model's parameters (its state_dict) to "seq2seq.pth" in the
# current working directory.
torch.save(model.state_dict(), "seq2seq.pth")

In [None]:
# Rebuild the architecture with the hyper-parameters used for training; the
# checkpoint stores parameters only.
model = Seq2Seq(enc_in=len(vocab['de']),
                dec_in=len(vocab['en']),
                emb_dim=256, hid_dim=512,
                n_layers=2, dropout=0.5, teacher_forcing_ratio=0.5)

# The fresh model lives on the CPU, so map the checkpoint there too; without
# map_location a checkpoint saved from a CUDA model cannot be loaded on a
# CPU-only runtime.
model.load_state_dict(torch.load("seq2seq.pth", map_location="cpu"))

<All keys matched successfully>

In [None]:
src, tgt = next(iter(data_loaders['val']))

# A freshly constructed module starts in train mode; switch to eval so dropout
# is disabled, and skip autograd bookkeeping while decoding.
model.eval()
with torch.no_grad():
    pred = model.inference(src)

# randrange excludes the upper bound. randint(0, n) could return n itself and
# index one past the last sample in the batch.
idx = random.randrange(src.size(0))

infer_sentence = vocab['en'].idx_to_sentence(pred[idx])
gt_sentence = vocab['en'].idx_to_sentence(tgt[idx])
src_sentence = vocab['de'].idx_to_sentence(src[idx])

print(f"src sentence : {src_sentence}")
print(f"gt sentence : {gt_sentence}")
print(f"infer sentence : {infer_sentence}")

src sentence : ['eine', 'frau', 'mit', 'pinkfarbener', 'tasche', 'sitzt', 'auf', 'einer', 'bank', '.']
gt sentence : ['a', 'woman', 'with', 'a', 'pink', 'purse', 'is', 'sitting', 'on', 'a', 'bench', '.']
infer sentence : ['a', 'woman', 'with', 'a', 'hat', 'is', 'sitting', 'on', 'a', 'bench', 'bench', '.']
