In [1]:
# !pip install spacy
# !python -m spacy download en
# !python -m spacy download de

# !pip install nltk
# !pip install bert_score
# !pip install torchmetrics
# !pip install torchtext==0.9.0
# !pip install moverscore

In [2]:
# from mover_custom.moverscore import get_idf_dict, word_mover_score
# # from moverscore import get_idf_dict, word_mover_score
# import numpy as np
# # Hypotheses and references
# hyp_list = ['The cat sat on the mat.', 'The cat lay on the mat.']
# ref_list = ['The cat is sitting on the mat.', 'The cat is lying on the mat.']

# # IDF dictionary
# idf_dict_hyp = get_idf_dict(hyp_list) 
# idf_dict_ref = get_idf_dict(ref_list)

# # Compute MoverScore
# scores = word_mover_score(ref_list, hyp_list, idf_dict_ref, idf_dict_hyp, stop_words=[], n_gram=1, remove_subwords=True)

# print(np.mean(scores))

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import random

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
SEED = 42
torch.manual_seed(SEED)
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')


In [5]:
import spacy
spacy_en = spacy.load('en_core_web_md')
spacy_de = spacy.load('de_core_news_md')

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]
def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

# Source field (German)
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)

# Target field (English)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)

train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'), fields=(SRC, TRG))

train_data.examples = train_data.examples[:len(train_data.examples)//4]
valid_data.examples = valid_data.examples[:len(valid_data.examples)//4]
test_data.examples = test_data.examples[:len(test_data.examples)//4]

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell


class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = trg[0, :]
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs


In [7]:
input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(input_dim, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(output_dim, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

In [8]:
def indices_to_text(indices, field):
    tokens = [field.vocab.itos[i] for i in indices]
    # Remove <sos> and <eos> tokens
    tokens = [token for token in tokens if token not in ['<sos>', '<eos>']]
    text = ' '.join(tokens)
    return text

In [9]:
import logging
from tqdm import tqdm

logging.basicConfig(filename='all_logs.txt', level=logging.INFO, format='%(message)s')

train_logger = logging.getLogger('train')
train_logger.addHandler(logging.FileHandler('trainlog.txt'))

eval_logger = logging.getLogger('eval')
eval_logger.addHandler(logging.FileHandler('evallog.txt'))

In [10]:
criterion = nn.CrossEntropyLoss()

from bert_score import BERTScorer
bert_based_scorer = BERTScorer(lang="en", rescale_with_baseline=False, device=device)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
from nltk.translate.bleu_score import sentence_bleu

# # Hypothesis and reference
# # Note: For sentence_bleu, the reference needs to be a list of lists and the hypothesis needs to be a list.
# hyp = ['The', 'cat', 'sat', 'on', 'the', 'mat']
# ref = [['The', 'cat', 'is', 'sitting', 'on', 'the', 'mat']]

# # Compute BLEU score
# bleu_score = sentence_bleu(ref, hyp)

# print(bleu_score)

6.147254555356275e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [20]:
optimizer = optim.Adam(model.parameters())

def train(model, iterator, optimizer, criterion, scorer, clip=1):
    model.train()
    epoch_loss = 0

    for batch in tqdm(iterator, total=len(iterator), desc="Training"):
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        optimizer.zero_grad()

        output = model(src, trg)
        output_dim = output.shape[-1]

        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        loss = criterion(output, trg)

        if scorer == "CrossEntropyLoss":
            pass

        elif scorer == "BERTScore":
            # Convert output and targets to text strings
            output_text = indices_to_text(torch.argmax(output, dim=1).cpu().numpy(), TRG)
            trg_text = indices_to_text(trg.cpu().numpy(), TRG)
            # print(f'Output: {output_text}')
            # print(f'Target: {trg_text}')

            P, R, F1 = bert_based_scorer.score([output_text], [trg_text])
            reward = F1.mean().item()
            # print("previous loss:", loss)
            # print("previous loss:", loss.item())
            loss *= (1 - reward)
            # print("post loss:", loss.item())

        elif scorer == "BLEU":
            output_text = indices_to_text(torch.argmax(output, dim=1).cpu().numpy(), TRG)
            trg_text = indices_to_text(trg.cpu().numpy(), TRG)

            # Compute BLEU score
            bleu_score = sentence_bleu([trg_text.split()], output_text.split())
            loss *= (1 - bleu_score)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()
        
        # break

    avg_loss = epoch_loss / len(iterator)
    return avg_loss


In [12]:
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    for batch in tqdm(iterator, total=len(iterator), desc="Evaluating"):
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        output = model(src, trg, 0) # Turn off teacher forcing
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)

        output_text = indices_to_text(torch.argmax(output, dim=1).cpu().numpy(), TRG)
        trg_text = indices_to_text(trg.cpu().numpy(), TRG)
        
        scores = criterion(output_text, trg_text)
        loss = (scores['rouge1_fmeasure'] + scores['rouge2_fmeasure'] + scores['rougeL_fmeasure']) / 3
        epoch_loss += loss

    avg_loss = epoch_loss / len(iterator)
    return avg_loss

In [13]:
from torchmetrics.text.rouge import ROUGEScore
rouge = ROUGEScore()

  warn(


In [14]:
N_EPOCHS = 10

input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(input_dim, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(output_dim, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

best_val_loss = float('inf')

train_logger.info("cross training")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, "CrossEntropyLoss")
    valid_loss = evaluate(model, valid_iterator, rouge)

    if valid_loss < best_val_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'cross_model.pt')

    train_logger.info(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss}')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

Training: 100%|██████████| 114/114 [00:55<00:00,  2.07it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.17it/s]


Epoch: 01 | Train Loss: 2.963


Training: 100%|██████████| 114/114 [00:54<00:00,  2.09it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


Epoch: 02 | Train Loss: 2.508


Training: 100%|██████████| 114/114 [00:58<00:00,  1.94it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


Epoch: 03 | Train Loss: 2.349


Training: 100%|██████████| 114/114 [00:54<00:00,  2.11it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.19it/s]


Epoch: 04 | Train Loss: 2.269


Training: 100%|██████████| 114/114 [00:59<00:00,  1.92it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]


Epoch: 05 | Train Loss: 2.169


Training: 100%|██████████| 114/114 [00:58<00:00,  1.94it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.14it/s]


Epoch: 06 | Train Loss: 2.072


Training: 100%|██████████| 114/114 [01:01<00:00,  1.86it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


Epoch: 07 | Train Loss: 2.006


Training: 100%|██████████| 114/114 [00:56<00:00,  2.00it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.12it/s]


Epoch: 08 | Train Loss: 1.932


Training: 100%|██████████| 114/114 [01:01<00:00,  1.85it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


Epoch: 09 | Train Loss: 1.874


Training: 100%|██████████| 114/114 [00:56<00:00,  2.03it/s]
Evaluating: 100%|██████████| 4/4 [00:03<00:00,  1.14it/s]

Epoch: 10 | Train Loss: 1.818





In [None]:
N_EPOCHS = 10

input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(input_dim, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(output_dim, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

best_val_loss = float('inf')

train_logger.info("bert training")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, "BERTScore")
    valid_loss = evaluate(model, valid_iterator, rouge)

    if valid_loss > best_val_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'bert_model.pt')

    train_logger.info(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss}')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

In [None]:
N_EPOCHS = 10

input_dim = len(SRC.vocab)
output_dim = len(TRG.vocab)

ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(input_dim, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(output_dim, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

best_val_loss = float('-inf')

train_logger.info("Bleu training")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, "BLEU")
    valid_loss = evaluate(model, valid_iterator, rouge)

    if valid_loss > best_val_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'bleu_model.pt')

    train_logger.info(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss}')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

# now transfer the learning

In [None]:
N_EPOCHS = 10

model.load_state_dict(torch.load('cross_model.pt'))

best_val_loss = float('-inf')

train_logger.info("Bleu with transfer learning training")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, "BLEU")
    valid_loss = evaluate(model, valid_iterator, rouge)

    if valid_loss > best_val_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'bleu_transfer_model.pt')

    train_logger.info(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss}')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

In [None]:
N_EPOCHS = 10

model.load_state_dict(torch.load('cross_model.pt'))

best_val_loss = float('-inf')

train_logger.info("Bert with transfer learning training")
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iterator, optimizer, criterion, "BERTScore")
    valid_loss = evaluate(model, valid_iterator, rouge)

    if valid_loss > best_val_loss:
        best_val_loss = valid_loss
        torch.save(model.state_dict(), 'bert_transfer_model.pt')

    train_logger.info(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss}')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

In [None]:
model.load_state_dict(torch.load('model.pt'))
test_loss = evaluate(model, test_iterator, rouge)
eval_logger.info(f'Test Evaluation Loss: {test_loss}')
print(f'Test Loss: {test_loss:.3f}')