In [None]:
# imports
import os
import sys
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchtext
import transformers
import time
import random

sys.path.append(os.path.abspath('../input/math-problem/data'))
import evaluator

# Ask huggingface_hub not to warn about cache directories that lack symlink support.
# NOTE(review): the library may read this flag at import time, in which case this
# line has to run before `import transformers` to have any effect - confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


In [None]:
# Utils

# Creating directory
def create_dir(addr):
    """Create the directory ``addr`` unless something already exists at that path.

    Missing parent directories are created too, and the call is safe to repeat
    (or to race with another process creating the same directory).

    Args:
        addr: Path of the directory to create.
    """
    if os.path.exists(addr):
        # Same as before: an existing path (directory or not) is left untouched.
        return
    # exist_ok covers the window between the check above and the creation.
    os.makedirs(addr, exist_ok=True)

# Delete everything inside a folder (the folder itself is kept)
def remove_folder_contents(folder):
    """Recursively delete everything inside ``folder``; ``folder`` itself is kept.

    Deletion is best-effort: an entry that cannot be removed is reported with
    ``print`` and skipped so the remaining entries are still processed.

    Args:
        folder: Path of the directory to empty.
    """
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            # Check links first: a symlink to a directory must be unlinked, not
            # descended into, otherwise the link *target* would be emptied.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except OSError as e:
            print(e)

# Addresses

# Raw data locations
raw_address = "../input/math-problem/data"
raw_train = "../input/math-problem/data/train.json"
raw_test = "../input/math-problem/data/test.json"
raw_dev = "../input/math-problem/data/dev.json"

# Temp
temp = "temp/"

# Model output folders (one per experiment, all below `result`)
result = "results/"
model_glove_add = "results/model_glove"
model_glove_attention_add1 = "results/model_glove_attention_0.3"
model_glove_attention_add2 = "results/model_glove_attention_0.6"
model_glove_attention_add3 = "results/model_glove_attention_0.9"
model_bert_frozen_add = "results/model_bert_frozen"
model_bert_adaptive_add = "results/model_bert_adaptive"

# Creating Directory (the parent `result` folder comes first)
for _folder in (
    result,
    temp,
    model_glove_add,
    model_glove_attention_add1,
    model_glove_attention_add2,
    model_glove_attention_add3,
    model_bert_frozen_add,
    model_bert_adaptive_add,
):
    create_dir(_folder)
del _folder

# HyperParameters
# These module-level values are used as default arguments and read directly by the
# model, training and reporting code further down in this notebook.
embedding_size = 200   # width of the token embeddings (also the GloVe dimension requested)
hidden_size = 256      # encoder LSTM hidden size per direction; the decoder defaults to 2*hidden_size
lr = 1e-3              # default learning rate handed to the Adam optimizer
num_epoch = 40         # number of training epochs (also the x-axis length of the loss plot)
dropout = 0.5          # dropout probability for the Dropout layers and between LSTM layers
n_layers = 2           # number of stacked LSTM layers
clip = 1.0             # maximum gradient norm passed to clip_grad_norm_
max_out_size = 100     # number of <UNK> placeholder targets for examples without 'linear_formula'

# Special Tokens
NUM = "<NUM>"          # stands in for purely numeric words (digits and '.')
UNK = "<UNK>"          # stands in for out-of-vocabulary words / formula steps
SOS = "<SOS>"          # start-of-sequence marker
EOS = "<EOS>"          # end-of-sequence marker
PAD = "<PAD>"          # padding token used when batching sequences
BERT_PAD_VALUE = 0     # padding value used for the BERT input ids when batching

# Random Seed
# Seed both PyTorch and Python's `random` (the latter drives teacher forcing).
torch.random.manual_seed(68)
random.seed(68)

In [None]:
# cuda
# Prefer the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Working with {device}")

In [None]:
# Creating DataSet, DataLoader and Vocab

def remove_commas_from_numbers(text):
    """Strip thousands separators from numbers in ``text`` ("1,234,567" -> "1234567").

    Only commas that are followed by exactly three digits are treated as
    separators; a comma in a malformed group such as "1,2345" or in a list
    such as "1,2,3" is left alone.

    Args:
        text: Input string.

    Returns:
        The string with the thousands-separator commas removed.
    """
    # 1-3 digits, then one or more ",ddd" groups, and no digit directly after
    # the last group (so ",2345" is not mistaken for a thousands group).
    pattern = r'\d{1,3}(?:,\d{3})+(?!\d)'

    return re.sub(pattern, lambda match: match.group(0).replace(',', ''), text)

def create_vocab(data, threshold = 3):
    """Build the source and target vocabularies from a list of examples.

    Args:
        data: Iterable of records with a 'Problem' string (whitespace separated
            words) and a 'linear_formula' string (steps separated by "|").
        threshold: Minimum number of occurrences for a problem word to be kept.

    Returns:
        ``(x_vocab, x_stoi, y_vocab, y_stoi)``: the sorted source words followed by
        the SOS/EOS/PAD/UNK/NUM tokens, its word->index map, the sorted formula
        steps followed by SOS/EOS/PAD/UNK, and its step->index map.
    """
    def is_numeric(token):
        # Purely numeric words are represented by NUM later, so they are not counted.
        return all(ch.isdigit() or ch == '.' for ch in token)

    word_counts = {}
    formula_steps = set()
    for example in data:
        for token in example['Problem'].split():
            token = token.lower()
            if token and not is_numeric(token):
                word_counts[token] = word_counts.get(token, 0) + 1
        for step in example['linear_formula'].split("|"):
            step = step.lower()
            if step:
                formula_steps.add(step)

    # Source side: frequent words in alphabetical order, special tokens last.
    x_vocab = sorted(token for token, count in word_counts.items() if count >= threshold)
    x_vocab += [SOS, EOS, PAD, UNK, NUM]
    x_stoi = {token: index for index, token in enumerate(x_vocab)}

    # Target side: every formula step seen, special tokens last.
    y_vocab = sorted(formula_steps) + [SOS, EOS, PAD, UNK]
    y_stoi = {token: index for index, token in enumerate(y_vocab)}

    return x_vocab, x_stoi, y_vocab, y_stoi

class Dataset(torch.utils.data.Dataset):
    """Math word problems read from a JSON file, stored as a list of dict records.

    After :meth:`tokenize` every record additionally carries:
      * ``'x'``      - LongTensor of source vocabulary ids, wrapped in SOS/EOS,
      * ``'bert_x'`` - the BERT tokenizer output with tensor ``input_ids`` and
        ``attention_mask``,
      * ``'y'``      - LongTensor of target vocabulary ids, wrapped in SOS/EOS.
    """

    def __init__(self, json_file):
        """Load ``json_file`` and normalise every 'Problem' string."""
        with open(json_file, "rb") as file:
            self.data = pd.DataFrame(json.load(file))
        # Remove thousands separators, then lower-case the whole problem text.
        self.data['Problem'] = self.data['Problem'].apply(remove_commas_from_numbers)
        self.data['Problem'] = self.data['Problem'].apply(lambda x: x.lower())
        self.data = self.data.to_dict(orient='records')

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record (a dict) at position ``idx``."""
        ret_data = self.data[idx]
        return ret_data

    def tokenize(self, x_stoi, y_stoi):
        """Add the 'x', 'bert_x' and 'y' tensors to every record, in place.

        Args:
            x_stoi: Source word -> index map (must contain SOS, EOS, UNK, NUM).
            y_stoi: Target step -> index map (must contain SOS, EOS, UNK).
        """
        bert_tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        for dat in self.data:
            bert_sentence = []
            dat['x'] = [x_stoi[SOS]]
            for word in dat['Problem'].split():
                if word in x_stoi:
                    dat['x'].append(x_stoi[word])
                    bert_sentence.append(word)
                elif all(char.isdigit() or char=='.' for char in word):
                    # Purely numeric word -> NUM placeholder.
                    dat['x'].append(x_stoi[NUM])
                    bert_sentence.append(NUM)
                else:
                    dat['x'].append(x_stoi[UNK])
                    bert_sentence.append(UNK)
            # BERT sees the same sentence, with the NUM/UNK placeholders substituted.
            bert_sentence = " ".join(bert_sentence)
            dat['bert_x'] = bert_tokenizer(bert_sentence)
            dat['bert_x']['input_ids'] = torch.tensor(dat['bert_x']['input_ids'])
            dat['bert_x']['attention_mask'] = torch.tensor(dat['bert_x']['attention_mask'])
            dat['x'].append(x_stoi[EOS])
            dat['x'] = torch.tensor(dat['x'])
            if 'linear_formula' in dat:
                # Split exactly like create_vocab does: lower-case the steps and drop
                # the empty pieces produced by leading/trailing/double "|", which
                # would otherwise become spurious UNK targets.
                steps = [word.lower() for word in dat['linear_formula'].split("|") if word]
                dat['y'] = [y_stoi.get(word, y_stoi[UNK]) for word in steps]
            else:
                # No gold formula: fixed-length UNK placeholder target.
                dat['y'] = [y_stoi[UNK] for _ in range(max_out_size)]
            dat['y'] = [y_stoi[SOS]] + dat['y'] + [y_stoi[EOS]]
            dat['y'] = torch.tensor(dat['y'])
            
class DataLoader:
    """Bundles the train/dev/test datasets, their batch loaders and the vocabularies.

    The vocabularies are built from the training split only and reused to
    tokenise the dev and test splits.
    """

    def __init__(self, batch_size = 256, shuffle = True):
        """Load and tokenise the three splits; only the train loader may shuffle."""
        # Train split (also the source of the vocabularies)
        self.data_train = Dataset(raw_train)
        vocab = create_vocab(self.data_train.data)
        self.x_vocab, self.x_stoi, self.y_vocab, self.y_stoi = vocab
        self.data_train.tokenize(self.x_stoi, self.y_stoi)
        self.train = self._loader(self.data_train, batch_size, shuffle)

        # Dev split
        self.data_dev = Dataset(raw_dev)
        self.data_dev.tokenize(self.x_stoi, self.y_stoi)
        self.dev = self._loader(self.data_dev, batch_size, False)

        # Test split
        self.data_test = Dataset(raw_test)
        self.data_test.tokenize(self.x_stoi, self.y_stoi)
        self.test = self._loader(self.data_test, batch_size, False)

    def _loader(self, dataset, batch_size, shuffle):
        # Every split is batched with the same collate function.
        return torch.utils.data.DataLoader(
            dataset,
            batch_size = batch_size,
            collate_fn = self.collate,
            shuffle = shuffle,
        )

    def collate(self, batch):
        """Pad the tensors of ``batch`` to a common length and stack them.

        Returns:
            Dict with 'x' and 'y' (padded with the respective PAD index) and
            'bert_x' holding the padded BERT ids ('x') and attention mask ('a_mask').
        """
        pad = torch.nn.utils.rnn.pad_sequence

        source_ids = [example['x'] for example in batch]
        target_ids = [example['y'] for example in batch]
        bert_ids = [example['bert_x']['input_ids'] for example in batch]
        bert_masks = [example['bert_x']['attention_mask'] for example in batch]

        # Taking pad value of bert to be zero (default value)
        padded_bert = {
            'x': pad(bert_ids, padding_value=BERT_PAD_VALUE),
            'a_mask': pad(bert_masks, padding_value=0),
        }
        return {
            'x': pad(source_ids, padding_value=self.x_stoi[PAD]),
            'y': pad(target_ids, padding_value=self.y_stoi[PAD]),
            'bert_x': padded_bert,
        }

# Build the datasets, loaders and vocabularies once; the model classes and the
# training/evaluation helpers below read this module-level `data` object.
data = DataLoader()


In [None]:
# Models
class Attention(torch.nn.Module):
    """Additive attention: scores are ``Va(tanh(Wa(query) + Ua(keys)))``.

    Assumes ``query`` is (1, batch, hidden) and ``keys`` is (seq, batch, hidden),
    which is how Decoder.forward in this notebook calls it - TODO confirm for
    other callers.
    """

    def __init__(self, hidden_size):
        super().__init__()

        self.Wa = torch.nn.Linear(hidden_size, hidden_size)
        self.Ua = torch.nn.Linear(hidden_size, hidden_size)
        self.Va = torch.nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return the attention-weighted sum of ``keys``, shaped like ``query``."""
        # One scalar score per key position and batch element.
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        scores = self.Va(energy).squeeze(2)

        # Normalise over the first (key position) axis, then go batch-first for bmm.
        weights = torch.softmax(scores, dim=0)
        weights = weights.transpose(0, 1).unsqueeze(1)
        keys_batch_first = keys.transpose(0, 1)
        context = torch.bmm(weights, keys_batch_first)

        # Back to the (1, batch, hidden) layout.
        return context.transpose(0, 1)

class Encoder(torch.nn.Module):
    """Embedding + dropout + multi-layer bidirectional LSTM over the source ids.

    With ``embedding="glove"`` the embedding table is initialised from the
    GloVe '6B' vectors; words without a GloVe vector get a random vector drawn
    with the mean/std of the whole GloVe table.
    """

    def __init__(self, embedding = None, input_size = len(data.x_vocab), embedding_size = embedding_size, hidden_size = hidden_size, n_layers = n_layers, dtype = torch.float):
        super().__init__()

        self.dtype = dtype
        self.embedding = torch.nn.Embedding(input_size, embedding_size)
        self.lstm = torch.nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            bidirectional=True,
            dropout=dropout,
            dtype=self.dtype,
        )
        self.dropout = torch.nn.Dropout(dropout)

        if embedding == "glove":
            self.embedding.weight.data = self._glove_weights(input_size, embedding_size)

    @staticmethod
    def _glove_weights(input_size, embedding_size):
        # One row per vocabulary index, in the order of data.x_vocab.
        glove = torchtext.vocab.GloVe(name='6B', dim=embedding_size)
        mean, std = torch.mean(glove.vectors), torch.std(glove.vectors)
        rows = []
        for index in range(input_size):
            token = data.x_vocab[index]
            if token in glove.stoi:
                row = glove.vectors[glove.stoi[token]]
            else:
                # Unknown to GloVe: sample around the global statistics instead.
                row = mean + std*torch.randn((embedding_size,))
            rows.append(row)
        return torch.FloatTensor(torch.stack(rows))

    def forward(self, x):
        """Return ``(outputs, hidden, cell)`` of the LSTM for the id tensor ``x``."""
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell

class BertEncoder(torch.nn.Module):
    """Encoder built on 'bert-base-uncased' that mimics the LSTM encoder's outputs.

    The per-token BERT outputs are projected to ``hidden_size`` features and the
    pooled output is projected to ``2*n_layers*hidden_size`` features, which are
    reshaped into ``(2*n_layers, batch, hidden_size)`` initial states.

    NOTE(review): the token projection is ``hidden_size`` wide, whereas the
    attention decoder in this notebook is built with ``2*hidden_size`` - the two
    look incompatible when attention is enabled; confirm before combining them.

    Args:
        freeze: When true, the BERT parameters are excluded from training.
        hidden_size: Feature size of each returned state.
        n_layers: Half the number of returned state layers.
        device: Device the inputs are moved to in ``forward``.
        dtype: Stored on the instance; not used otherwise in this class.
    """

    def __init__(self, freeze, hidden_size=hidden_size, n_layers=n_layers, device=device, dtype=torch.float):
        super(BertEncoder, self).__init__()

        self.dtype = dtype
        self.device = device
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased')
        if freeze:
            for param in self.bert.parameters():
                param.requires_grad = False
        self.dropout = torch.nn.Dropout(dropout)
        self.linear = torch.nn.Linear(768, 2*n_layers*hidden_size)
        self.linear_2 = torch.nn.Linear(768, hidden_size)

    def forward(self, x):
        """Encode a collated BERT batch.

        Args:
            x: Dict with 'x' (padded input ids) and 'a_mask' (attention mask), both
                with the sequence axis first, as produced by DataLoader.collate.

        Returns:
            ``(outputs, hidden, cell)`` where ``outputs`` is (seq, batch, hidden_size)
            and ``hidden``/``cell`` are (2*n_layers, batch, hidden_size).
        """
        x, mask = x['x'].to(self.device), x['a_mask'].to(self.device)
        # BERT expects batch-first tensors.
        input = x.permute(1, 0)
        mask = mask.permute(1, 0)
        token_outputs, pooled_outputs = self.bert(input, attention_mask=mask, return_dict=False)
        token_outputs = self.linear_2(token_outputs)
        linear_output = self.linear(self.dropout(pooled_outputs))

        outputs = token_outputs.permute(1, 0, 2)

        # linear_output is (batch, 2*n_layers*hidden_size). Split the feature axis
        # per example first and only then move the layer axis to the front;
        # reshaping straight to (2*n_layers, batch, hidden_size) would hand each
        # example slices of other examples' features.
        batch_size = linear_output.shape[0]
        states = linear_output.reshape(batch_size, 2*self.n_layers, self.hidden_size)
        states = states.permute(1, 0, 2).contiguous()
        hidden = states
        cell = states

        return outputs, hidden, cell

class Decoder(torch.nn.Module):
    """Single-step LSTM decoder, optionally conditioned on an attention context.

    With ``attention=True`` the LSTM input is the token embedding concatenated
    with a context vector computed from the encoder outputs.
    """

    def __init__(self, attention = False, output_size = len(data.y_vocab), embedding_size = embedding_size, hidden_size = 2*hidden_size, n_layers = n_layers, dtype = torch.float):
        super().__init__()

        self.dtype = dtype
        self.attention = attention
        self.embedding = torch.nn.Embedding(output_size, embedding_size)

        # The context vector (hidden_size wide) is appended to the embedding.
        lstm_input_size = embedding_size + hidden_size if self.attention else embedding_size
        self.lstm = torch.nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            dropout=dropout,
            dtype=self.dtype,
        )
        self.fc = torch.nn.Linear(hidden_size, output_size)
        self.dropout = torch.nn.Dropout(dropout)
        if attention:
            self.att = Attention(hidden_size=hidden_size)

    def forward(self, input, hidden, cell, enc_out = None):
        """Run one decoding step.

        Args:
            input: Token ids for the current step, one per batch element.
            hidden, cell: Current LSTM state.
            enc_out: Encoder outputs; only read when attention is enabled.

        Returns:
            ``(pred, hidden, cell)``: vocabulary scores and the updated state.
        """
        # Add the length-1 time axis expected by the LSTM.
        step_ids = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(step_ids))

        if not self.attention:
            dec_inp = embedded
        else:
            # The top layer's hidden state is the attention query.
            top_hidden = hidden[-1].unsqueeze(0)
            context = self.att(top_hidden, enc_out)
            dec_inp = torch.concat((embedded, context), dim=-1)

        out, (hidden, cell) = self.lstm(dec_inp, (hidden, cell))
        return self.fc(out.squeeze(0)), hidden, cell

class Seq2Seq(torch.nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: Module returning ``(enc_output, hidden, cell)`` where the first
            axis of ``hidden``/``cell`` has an even length.
        decoder: Module called as ``decoder(dec_input, hidden, cell, enc_output)``.
        tf_ratio: Default probability of feeding the gold token at each step.
        device: Device on which the output tensor is allocated.
        dtype: Stored on the instance; not used otherwise in this class.
    """

    def __init__(self, encoder, decoder, tf_ratio = 0.6, device = device, dtype = torch.float):
        super(Seq2Seq, self).__init__()

        self.dtype = dtype
        self.encoder = encoder
        self.decoder = decoder
        self.tf_ratio = tf_ratio
        self.device = device

    def forward(self, input, target, tf_ratio = None):
        """Decode ``target.shape[0] - 1`` steps and return the per-step scores.

        Args:
            input: Encoder input (passed through unchanged).
            target: Gold target ids, time axis first; row 0 seeds the decoder.
            tf_ratio: Teacher-forcing probability for this call; ``None`` means
                use ``self.tf_ratio``. ``0`` always feeds back the argmax.

        Returns:
            Tensor of shape (target_len, batch, len(data.y_vocab)); row 0 stays zero.
        """
        # Preparing output
        outputs = torch.zeros(target.shape[0], target.shape[1], len(data.y_vocab), device=self.device)

        # Encoding
        enc_output, hidden, cell = self.encoder(input)

        # Preparing Input for decoder: fold the two halves of the state's first
        # axis into the feature axis. The split point is taken from the tensor
        # itself rather than from the module-level n_layers, so encoders built
        # with another depth work too.
        # NOTE(review): nn.LSTM documents h_n as layer-major (layer0-fwd,
        # layer0-bwd, layer1-fwd, ...), so this pairs states by half rather than
        # by layer; pairing per layer would be hidden[0::2] with hidden[1::2] - confirm.
        n_layer = hidden.shape[0]//2
        hidden = torch.concat((hidden[:n_layer], hidden[n_layer:]), dim=-1)
        cell = torch.concat((cell[:n_layer], cell[n_layer:]), dim=-1)
        dec_input = target[0, :]

        # Setting Teacher Forcing ratio
        if tf_ratio is None:
            tf_ratio = self.tf_ratio

        # Decoding
        for t in range(1, target.shape[0]):
            dec_output, hidden, cell = self.decoder(dec_input, hidden, cell, enc_output)
            outputs[t] = dec_output

            # Teacher Forcing: gold token with probability tf_ratio, else own prediction
            dec_input = target[t] if random.random() < tf_ratio else dec_output.argmax(1)

        return outputs


In [None]:
def evaluate(model: Seq2Seq, dataloader, bert = False):
    """Return the mean per-batch cross-entropy of ``model`` on ``dataloader``.

    The model is put in eval mode and decoded without teacher forcing
    (``tf_ratio`` 0); PAD targets are ignored by the loss.

    Args:
        model: The sequence-to-sequence model to score.
        dataloader: Iterable of collated batches.
        bert: When true, feed the batch's 'bert_x' dict instead of the 'x' ids.
    """
    criterion = torch.nn.CrossEntropyLoss(ignore_index=data.y_stoi[PAD])
    model.eval()

    total_loss = 0
    n_batches = 0
    with torch.no_grad():
        for batch in dataloader:
            source = batch['bert_x'] if bert else batch['x'].to(device)
            target = batch['y'].to(device)

            scores = model(source, target, 0)

            # Skip the SOS position and flatten time x batch for the loss.
            flat_scores = scores[1:].reshape(-1, scores.shape[-1])
            flat_target = target[1:].reshape(-1)

            total_loss += criterion(flat_scores, flat_target).item()
            n_batches += 1

    return total_loss/n_batches

def train(model: Seq2Seq, data: DataLoader, model_address: str, lr=lr, bert = False):
    """Train ``model`` for ``num_epoch`` epochs and checkpoint the best dev loss.

    Args:
        model: The sequence-to-sequence model to optimise.
        data: Object exposing the ``train`` / ``dev`` loaders and ``y_stoi``.
        model_address: Folder in which 'param.pt' is saved.
        lr: Learning rate for Adam.
        bert: When true, feed the batch's 'bert_x' dict instead of the 'x' ids.

    Returns:
        ``(best_loss, [train_losses, dev_losses])`` with one entry per epoch.
    """
    started_at = time.time()
    print("learning rate:", lr)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=data.y_stoi[PAD])
    checkpoint_path = os.path.join(model_address, 'param.pt')
    train_curve, dev_curve = [], []

    # Lowest dev loss seen so far; decides when the checkpoint is overwritten.
    best_loss = float("inf")

    for epoch in range(num_epoch):
        model.train()
        running_loss, n_batches = 0, 0

        for batch in data.train:
            source = batch['bert_x'] if bert else batch['x'].to(device)
            target = batch['y'].to(device)

            optimizer.zero_grad()
            scores = model(source, target)

            # Skip the SOS position and flatten time x batch for the loss.
            loss = criterion(
                scores[1:].reshape(-1, scores.shape[-1]),
                target[1:].reshape(-1),
            )

            # Back-propagation with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        train_loss = running_loss/n_batches
        print(f"Epoch: {epoch} Loss: {train_loss}\tTime: {time.time()-started_at}")

        dev_loss = evaluate(model, data.dev, bert=bert)
        if dev_loss < best_loss:
            best_loss = dev_loss
            torch.save(model.state_dict(), checkpoint_path)

        train_curve.append(train_loss)
        dev_curve.append(dev_loss)

        print(f"Validation Loss: {dev_loss}\n")

    return best_loss, [train_curve, dev_curve]

def beam_search(model: Seq2Seq, data: DataLoader, dev_add, test_add, beam_size = 10, bert = False):
    """Decode the test and dev splits and write the predictions to JSON files.

    For each split the raw JSON is re-read, a 'predicted' field (formula steps
    joined with "|") is added to every record, and the result is written to
    ``test_add`` / ``dev_add``.

    The model is run once per batch without teacher forcing; the beam then
    works on those per-step log-probabilities (the decoder is not re-run per
    hypothesis), and the number of decoding steps equals the batch's target length.

    Args:
        model: Trained sequence-to-sequence model.
        data: Object exposing the dev/test loaders, datasets and target vocabulary.
        dev_add: Output path for the dev predictions.
        test_add: Output path for the test predictions.
        beam_size: Number of hypotheses kept per step; 1 selects plain argmax.
        bert: When true, feed the batch's 'bert_x' dict instead of the 'x' ids.
    """
    special_tokens = [SOS, EOS, UNK, PAD]

    def keep_best(pool, loss, seq_factory):
        # Insert a hypothesis into the sorted pool unless it is already worse
        # than the current worst entry of a full pool.
        if len(pool) >= beam_size and loss > pool[-1][0]:
            return pool
        pool.append((loss, seq_factory()))
        pool.sort()
        return pool[:beam_size]

    # Defining Beam Search Per Batch
    def beam_search_per_batch(batch):
        # Loading Data
        if bert:
            x = batch['bert_x']
        else:
            x = batch['x'].to(device)
        y = batch['y'].to(device)

        # Getting Output: log-probabilities for every step after SOS
        y_pred = model(x, y, 0)
        y_pred = prob(y_pred[1:])
        n_steps, batch_size = y_pred.shape[0], y_pred.shape[1]

        # Greedy Approach
        if beam_size == 1:
            best_ids = y_pred.argmax(-1).tolist()
            sentence_arr = []
            for ind in range(batch_size):
                sequence = []
                for t in range(n_steps):
                    word = data.y_vocab[best_ids[t][ind]]
                    sequence.append(word)
                    if word == EOS:
                        break
                sequence = [word for word in sequence if word not in special_tokens]
                sentence_arr.append("|".join(sequence))
            return sentence_arr

        # Candidate tokens per step, moved to Python lists once per batch.
        y_top_val, y_top_ind = torch.topk(y_pred, beam_size)
        top_vals, top_inds = y_top_val.tolist(), y_top_ind.tolist()

        # Converting to sentence
        sentence_arr = []
        for ind in range(batch_size):
            # Each hypothesis is (negative log-likelihood, list of words).
            sequences = [(0, [])]
            for t in range(n_steps):
                new_sequences = []
                for loss, seq in sequences:
                    if seq and seq[-1] == EOS:
                        # Finished hypotheses are carried over unchanged.
                        new_sequences = keep_best(new_sequences, loss, lambda: list(seq))
                    else:
                        for word_ind, log_p in zip(top_inds[t][ind], top_vals[t][ind]):
                            word = data.y_vocab[word_ind]
                            new_sequences = keep_best(new_sequences, loss - log_p, lambda: seq + [word])
                sequences = new_sequences
            best_sequence = [word for word in sequences[0][1] if word not in special_tokens]
            sentence_arr.append("|".join(best_sequence))
            ct[0] += 1
        print(f"Sentence: {ct[0]}/{tot}\tTime: {time.time() - start_time}")
        return sentence_arr

    def predict_split(loader, raw_path, out_path):
        # Decode one split and store the predictions next to the raw records.
        sentences = []
        for batch in loader:
            sentences.extend(beam_search_per_batch(batch))
        with open(raw_path, 'rb') as raw_file:
            records = json.load(raw_file)
        for i in range(len(sentences)):
            records[i]['predicted'] = sentences[i]
        # Closing the file guarantees the predictions are flushed before anyone reads them.
        with open(out_path, 'w') as out_file:
            json.dump(records, out_file)

    start_time = time.time()

    # Evaluation
    model.eval()

    # For Calculating log softmax
    prob = torch.nn.LogSoftmax(dim=2)

    with torch.no_grad():
        ct = [0]
        tot = len(data.data_dev) + len(data.data_test)
        # Test
        predict_split(data.test, raw_test, test_add)
        # Dev
        predict_split(data.dev, raw_dev, dev_add)
        
def learn_model(model: Seq2Seq, model_add, model_name, bert = False, lr=lr):
    """Train ``model``, then evaluate the best checkpoint and write a report.

    Artifacts written into ``model_add``: the loss curve figure, 'dev.json' and
    'test.json' with predictions, and 'report_pc.txt' with losses, accuracies
    and hyper-parameters.

    Args:
        model: The sequence-to-sequence model to train.
        model_add: Output folder for this experiment.
        model_name: Title of the loss plot.
        bert: When true, the model is fed the BERT inputs.
        lr: Learning rate forwarded to ``train`` and recorded in the report.
    """
    def plot(arr, title):
        # arr[0] holds the train losses, arr[1] the validation losses.
        epochs = list(range(1, num_epoch+1))
        train_curve, dev_curve = arr[0], arr[1]
        fig, ax = plt.subplots()
        ax.plot(epochs, train_curve, label = 'Train')
        ax.plot(epochs, dev_curve, label = 'Validation')
        ax.set_xlabel("num_epochs")
        ax.set_ylabel("Loss")
        ax.set_title(title)
        ax.legend()
        plt.savefig(os.path.join(model_add, 'loss_curve'))

    param_path = os.path.join(model_add, 'param.pt')
    dev_add = os.path.join(model_add, 'dev.json')
    test_add = os.path.join(model_add, 'test.json')
    report_path = os.path.join(model_add, 'report_pc.txt')

    # Training; the best-loss return value is recomputed below from the checkpoint.
    _, loss_arr = train(model, data, model_add, bert=bert, lr=lr)
    plot(loss_arr, model_name)

    # Score the best checkpoint rather than the last epoch's weights.
    model.load_state_dict(torch.load(param_path))
    val_loss = evaluate(model, data.dev, bert=bert)
    print("Val Loss:", val_loss)
    test_loss = evaluate(model, data.test, bert=bert)
    print("Test Loss:", test_loss)

    beam_search(model, data, dev_add, test_add, bert=bert)
    print("Test:")
    acc_test = evaluator.main(test_add)
    print("Val:")
    acc_dev = evaluator.main(dev_add)

    report_lines = [
        f"Val Loss: {val_loss}\n",
        f"Exact Val Accuracy: {acc_dev[1]}\n",
        f"Execution Val Accuracy: {acc_dev[0]}\n",
        f"Test Loss: {test_loss}\n",
        f"Exact Test Accuracy: {acc_test[1]}\n",
        f"Execution Test Accuracy: {acc_test[0]}\n",
        '\nLoss Arr:\n',
        '\nHyperParameters:\n',
        f'\tNum Epochs: {num_epoch}\n',
        f"\tEmbedding Dim: {embedding_size}\n",
        f"\tHidden Dim: {hidden_size}\n",
        f"\tLearning Rate: {lr}\n",
        f"\tDropout Probability: {dropout}\n",
        f"\tNum Layers: {n_layers}\n",
        f"\tGradient Clip: {clip}\n",
    ]
    with open(report_path, 'w') as report:
        report.write("".join(report_lines))

def access_model_beam_size(model: Seq2Seq, model_add, bert = False, beam_sizes = (1, 10, 20)):
    """Evaluate a saved checkpoint with several beam sizes, one report per size.

    For every beam size the dev/test predictions in ``model_add`` are
    regenerated (overwriting 'dev.json' / 'test.json'), scored with the
    evaluator, and summarised in 'report_<beam_size>.txt'.

    Args:
        model: Model whose weights are loaded from ``model_add``/'param.pt'.
        model_add: Folder holding the checkpoint and receiving the outputs.
        bert: When true, the model is fed the BERT inputs.
        beam_sizes: Beam sizes to try, in this order. The prediction files left
            on disk afterwards belong to the last one.
    """
    # The checkpoint and the losses do not depend on the beam size, so they are
    # loaded and computed once instead of once per beam size.
    model.load_state_dict(torch.load(os.path.join(model_add, 'param.pt')))
    val_loss = evaluate(model, data.dev, bert=bert)
    print("Val Loss:", val_loss)
    test_loss = evaluate(model, data.test, bert=bert)
    print("Test Loss:", test_loss)
    dev_add, test_add = os.path.join(model_add, 'dev.json'), os.path.join(model_add, 'test.json')

    # An ordered sequence (not a set) keeps the iteration order well defined.
    for beam_size in beam_sizes:
        beam_search(model, data, dev_add, test_add, bert=bert, beam_size=beam_size)
        print("Test:")
        acc_test = evaluator.main(test_add)
        print("Val:")
        acc_dev = evaluator.main(dev_add)
        with open(os.path.join(model_add, f'report_{beam_size}.txt'), 'w') as report:
            report.write(f"Val Loss: {val_loss}\n")
            report.write(f"Exact Val Accuracy: {acc_dev[1]}\n")
            report.write(f"Execution Val Accuracy: {acc_dev[0]}\n")
            report.write(f"Test Loss: {test_loss}\n")
            report.write(f"Exact Test Accuracy: {acc_test[1]}\n")
            report.write(f"Execution Test Accuracy: {acc_test[0]}\n")
            report.write('\nHyperParameters:\n')
            report.write(f'\tNum Epochs: {num_epoch}\n')
            report.write(f"\tEmbedding Dim: {embedding_size}\n")
            report.write(f"\tHidden Dim: {hidden_size}\n")
            # NOTE: this is the module-level `lr` at call time, which is not
            # necessarily the rate the checkpoint was trained with.
            report.write(f"\tLearning Rate: {lr}\n")
            report.write(f"\tDropout Probability: {dropout}\n")
            report.write(f"\tNum Layers: {n_layers}\n")
            report.write(f"\tGradient Clip: {clip}\n")


In [None]:
# Glove Model: GloVe-initialised encoder with the default (attention-free) decoder
model_glove = Seq2Seq(
    encoder=Encoder(embedding="glove"),
    decoder=Decoder(),
    device=device,
).to(device)
learn_model(
    model=model_glove,
    model_add=model_glove_add,
    model_name='Seq2Seq model with GloVe embeddings',
)

In [None]:
# Glove Model with Attention (Seq2Seq's default teacher-forcing ratio of 0.6)
model_glove_attention2 = Seq2Seq(
    encoder=Encoder(embedding="glove"),
    decoder=Decoder(attention=True),
    device=device,
).to(device)
learn_model(
    model=model_glove_attention2,
    model_add=model_glove_attention_add2,
    model_name='Seq2Seq+Attention model with GloVe embeddings, tf_ratio = 0.6',
)

In [None]:
# Impact of Teacher Forcing: same architecture, trained with tf_ratio 0.3 and 0.9
model_glove_attention1 = Seq2Seq(
    encoder=Encoder(embedding="glove"),
    decoder=Decoder(attention=True),
    tf_ratio=0.3,
    device=device,
).to(device)
learn_model(
    model=model_glove_attention1,
    model_add=model_glove_attention_add1,
    model_name='Seq2Seq+Attention model with GloVe embeddings, tf_ratio = 0.3',
)

model_glove_attention3 = Seq2Seq(
    encoder=Encoder(embedding="glove"),
    decoder=Decoder(attention=True),
    tf_ratio=0.9,
    device=device,
).to(device)
learn_model(
    model=model_glove_attention3,
    model_add=model_glove_attention_add3,
    model_name='Seq2Seq+Attention model with GloVe embeddings, tf_ratio = 0.9',
)

In [None]:
# Learning rate for the BERT experiments. This rebinds the module-level `lr`,
# which later cells also read.
lr = 1e-5

# Frozen BERT Model
# NOTE(review): Decoder() is built with its default attention=False, so no
# attention is used here even though the plot title says "+Attention" - confirm
# which of the two is intended.
model_bert_frozen = Seq2Seq(BertEncoder(freeze=True), Decoder(), device=device).to(device)
# The title names the checkpoint that is actually loaded ('bert-base-uncased').
learn_model(model_bert_frozen, model_bert_frozen_add, "A Seq2Seq+Attention model using a pre-trained frozen BERT-base-uncased Encoder", bert=True, lr=lr)

In [None]:
# Adaptive (fine-tuned) BERT Model
# NOTE(review): Decoder() is built with its default attention=False, so no
# attention is used here even though the plot title says "+Attention" - confirm
# which of the two is intended.
model_bert_adaptive = Seq2Seq(BertEncoder(freeze=False), Decoder(), device=device).to(device)
# The title names the checkpoint that is actually loaded ('bert-base-uncased').
learn_model(model_bert_adaptive, model_bert_adaptive_add, "A Seq2Seq+Attention model using a pre-trained adaptive BERT-base-uncased Encoder", bert=True, lr=lr)
access_model_beam_size(model_bert_adaptive, model_bert_adaptive_add, bert=True)

In [None]:
# Beam-size comparison for the GloVe + attention model (tf_ratio 0.6).
# NOTE(review): access_model_beam_size writes the module-level `lr` into its
# reports, and an earlier cell rebinds `lr` to 1e-5 for the BERT runs, so these
# reports will not show the rate this model was trained with - confirm/fix.
access_model_beam_size(model_glove_attention2, model_glove_attention_add2, bert=False)