In [1]:
import json
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.nn.utils.rnn import pad_sequence
from transformers import BertModel, BertForMaskedLM, BertConfig, EncoderDecoderModel
from transformers import BertTokenizer

2023-07-21 18:34:24.206381: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-21 18:34:24.253773: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
torch.cuda.is_available()

True

In [3]:
from tokenizers.processors import BertProcessing
from tokenizers.implementations import BertWordPieceTokenizer


def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    tokenizer = BertWordPieceTokenizer()
    tokenizer.do_lower_case = False
    tokenizer.train(files=[filename], vocab_size=vocabsize, min_frequency=min_freq) #, special_tokens = special_tokens)

    sep_id = tokenizer.token_to_id("[SEP]")
    cls_id = tokenizer.token_to_id("[CLS]")
    tokenizer._tokenizer.post_processor = BertProcessing(sep=("[SEP]", sep_id), cls=("[CLS]", cls_id), )
    tokenizer.enable_truncation(max_length=max_length)
    print("Saving tokenizer ... " + save_location)
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location + "vocab.txt")
    return tokenizer

def count_parameters(mdl):
    return sum(p.numel() for p in mdl.parameters() if p.requires_grad)


def compute_loss(predictions, targets):
    """Compute our custom loss"""
    predictions = predictions[:, :-1, :].contiguous()
    targets = targets[:, 1:]

    rearranged_output = predictions.view(predictions.shape[0] * predictions.shape[1], -1)
    rearranged_target = targets.contiguous().view(-1)

    loss = criterion(rearranged_output, rearranged_target)

    return loss


def train_model():
    model.train()
    epoch_loss = 0

    for i, (en_input, en_masks, de_output, de_masks) in enumerate(train_dataloader):
        optimizer.zero_grad()

        en_input = en_input.to(device)
        de_output = de_output.to(device)
        en_masks = en_masks.to(device)
        de_masks = de_masks.to(device)
        
        lm_labels = de_output.clone()
        out = model(input_ids=en_input, attention_mask=en_masks,
                    decoder_input_ids=de_output, decoder_attention_mask=de_masks, labels=lm_labels)
        prediction_scores = out[1]
        predictions = F.log_softmax(prediction_scores, dim=2)
        loss = compute_loss(predictions, de_output)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        epoch_loss += loss.item()

    print("Mean epoch loss:", (epoch_loss / num_train_batches))


def eval_model():
    model.eval()
    epoch_loss = 0

    for i, (en_input, en_masks, de_output, de_masks) in enumerate(train_dataloader):
        optimizer.zero_grad()

        en_input = en_input.to(device)
        de_output = de_output.to(device)
        en_masks = en_masks.to(device)
        de_masks = de_masks.to(device)

        lm_labels = de_output.clone()

        out = model(input_ids=en_input, attention_mask=en_masks,
                    decoder_input_ids=de_output, decoder_attention_mask=de_masks, labels=lm_labels)

        prediction_scores = out[1]
        predictions = F.log_softmax(prediction_scores, dim=2)
        loss = compute_loss(predictions, de_output)
        epoch_loss += loss.item()

    print("Mean validation loss:", (epoch_loss / num_valid_batches))


class TranslationDataset(data.Dataset):

    def __init__(self, inp_file, targ_file, inp_tokenizer, targ_tokenizer, inp_maxlength, targ_maxlength):

        self.inp_tokenizer = inp_tokenizer
        self.targ_tokenizer = targ_tokenizer
        self.inp_maxlength = inp_maxlength
        self.targ_maxlength = targ_maxlength

        print("Loading and Tokenizing the data ...")
        self.encoded_inp = []
        self.encoded_targ = []

        # Read the EN lines
        num_inp_lines = 0
        with open(inp_file, "r", encoding="utf-8") as ef:
            for line in ef:
                stripped_line = line.strip()
                # print(stripped_line)
                enc = self.inp_tokenizer.encode(stripped_line) #, add_special_tokens=True, max_length=self.inp_maxlength)
                # print(enc)
                self.encoded_inp.append(torch.tensor(enc))
                # self.encoded_inp.append(enc)
                num_inp_lines += 1

        # read the DE lines
        num_targ_lines = 0
        with open(targ_file, "r", encoding="utf-8") as df:
            for line in df:
                enc = self.targ_tokenizer.encode(line.strip()) # , add_special_tokens=True, max_length=self.targ_maxlength)
                self.encoded_targ.append(torch.tensor(enc))
                # self.encoded_targ.append(enc)
                num_targ_lines += 1

        assert (num_inp_lines == num_targ_lines), "Mismatch in Nl and Code lines"
        print("Read", num_inp_lines, "lines from Nl and Code files.")

    def __getitem__(self, offset):
        en = self.encoded_inp[offset]
        de = self.encoded_targ[offset]

        return en, en.shape[0], de, de.shape[0]

    def __len__(self):
        return len(self.encoded_inp)

    def collate_function(self, batch):

        (inputs, inp_lengths, targets, targ_lengths) = zip(*batch)

        padded_inputs = self._collate_helper(inputs, self.inp_tokenizer)
        padded_targets = self._collate_helper(targets, self.targ_tokenizer)

        max_inp_seq_len = padded_inputs.shape[1]
        max_out_seq_len = padded_targets.shape[1]

        input_masks = [[1] * l + [0] * (max_inp_seq_len - l) for l in inp_lengths]
        target_masks = [[1] * l + [0] * (max_out_seq_len - l) for l in targ_lengths]

        input_tensor = padded_inputs.to(torch.int64)
        target_tensor = padded_targets.to(torch.int64)
        input_masks = torch.Tensor(input_masks)
        target_masks = torch.Tensor(target_masks)

        return input_tensor, input_masks, target_tensor, target_masks

    def _collate_helper(self, examples, tokenizer):
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            return torch.stack(examples, dim=0)
        else:
            if tokenizer._pad_token is None:
                raise ValueError(
                    "You are attempting to pad samples but the tokenizer you are using"
                    f" ({tokenizer.__class__.__name__}) does not have one."
                )
            return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)


In [4]:
configfile = r"config.json"

In [5]:
# Read the params
with open(configfile, "r") as f:
    config = json.load(f)

globalparams = config["global_params"]
encparams = config["encoder_params"]
decparams = config["decoder_params"]

# Get the dataset files
train_en_file = globalparams["train_en_file"]
train_de_file = globalparams["train_de_file"]

# Train the tokenizers
test_en_tokenizer = train_tokenizer(train_en_file, encparams)
test_de_tokenizer = train_tokenizer(train_de_file, decparams)





Saving tokenizer ... tokenizers/en_tok/



Saving tokenizer ... tokenizers/de_tok/


In [6]:
from transformers import BertTokenizerFast

globalparams = config["global_params"]
encparams = config["encoder_params"]
decparams = config["decoder_params"]
modelparams = config["model_params"]

# Load the tokenizers
# additional_special_tokens = {"additional_token": 999}
# ["[S]","[/S]"]

special_tokens = [
    ("[PAD]", 0),
    ("[UNK]", 1),
    ("[MASK]", 2),
    ("[SEP]", 3),
    ("[CLS]", 4)
]

en_tok_path = encparams["tokenizer_path"]
en_tokenizer = BertTokenizerFast(os.path.join(en_tok_path, "vocab.txt"), special_tokens=special_tokens, tokenizer_object = test_en_tokenizer)
#en_tokenizer = BertTokenizerFast.from_pretrained(os.path.join(en_tok_path, "vocab.txt"))

de_tok_path = decparams["tokenizer_path"]
#de_tokenizer = BertTokenizerFast.from_pretrained(os.path.join(de_tok_path, "vocab.txt"))
de_tokenizer = BertTokenizerFast(os.path.join(de_tok_path, "vocab.txt"), special_tokens=special_tokens, tokenizer_object = test_en_tokenizer)

# Init the dataset
train_en_file = globalparams["train_en_file"]
train_de_file = globalparams["train_de_file"]
valid_en_file = globalparams["valid_en_file"]
valid_de_file = globalparams["valid_de_file"]

enc_maxlength = encparams["max_length"]
dec_maxlength = decparams["max_length"]
batch_size = modelparams["batch_size"]

In [7]:
train_dataset = TranslationDataset(train_en_file, train_de_file, en_tokenizer, de_tokenizer, enc_maxlength,
                                   dec_maxlength)
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False, \
                                               drop_last=True, num_workers=1, collate_fn=train_dataset.collate_function)

Loading and Tokenizing the data ...
Read 100000 lines from Nl and Code files.


In [8]:
valid_dataset = TranslationDataset(valid_en_file, valid_de_file, en_tokenizer, de_tokenizer, enc_maxlength,
                                   dec_maxlength)
valid_dataloader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False, \
                                               drop_last=True, num_workers=1, collate_fn=valid_dataset.collate_function)

Loading and Tokenizing the data ...
Read 10000 lines from Nl and Code files.


In [9]:
from transformers import BertLMHeadModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

print("Loading models ..")
vocabsize = encparams["vocab_size"]
max_length = encparams["max_length"]
encoder_config = BertConfig(vocab_size=vocabsize,
                            max_position_embeddings=max_length + 64,  # this shuold be some large value
                            num_attention_heads=encparams["num_attn_heads"],
                            num_hidden_layers=encparams["num_hidden_layers"],
                            hidden_size=encparams["hidden_size"],
                            type_vocab_size=1)

encoder = BertModel(config=encoder_config)

vocabsize = decparams["vocab_size"]
max_length = decparams["max_length"]
decoder_config = BertConfig(vocab_size=vocabsize,
                            add_cross_attention=True,
                            max_position_embeddings=max_length + 64,  # this shuold be some large value
                            num_attention_heads=decparams["num_attn_heads"],
                            num_hidden_layers=decparams["num_hidden_layers"],
                            hidden_size=decparams["hidden_size"],
                            type_vocab_size=1,
                            is_decoder=True)  # Very Important

decoder = BertLMHeadModel(config=decoder_config)


# Define encoder decoder model
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
model.to(device)

Using device: cuda
Loading models ..


EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(25000, 512, padding_idx=0)
      (position_embeddings): Embedding(576, 512)
      (token_type_embeddings): Embedding(1, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementw

In [10]:
print(f'The encoder has {count_parameters(encoder):,} trainable parameters')
print(f'The decoder has {count_parameters(decoder):,} trainable parameters')
print(f'The model has {count_parameters(model):,} trainable parameters')

optimizer = optim.Adam(model.parameters(), lr=modelparams['lr'])
criterion = nn.NLLLoss(ignore_index=de_tokenizer.pad_token_id)

num_train_batches = len(train_dataloader)
num_valid_batches = len(valid_dataloader)

The encoder has 46,974,976 trainable parameters
The decoder has 55,283,112 trainable parameters
The model has 102,258,088 trainable parameters


In [11]:
display(num_train_batches)
display(num_valid_batches)

6250

625

In [None]:
# MAIN TRAINING LOOP
for epoch in range(modelparams['num_epochs']):
    print("Starting epoch", epoch + 1)
    train_model()
    eval_model()

print("Saving model ..")
save_location = modelparams['model_path']
model_name = modelparams['model_name']
if not os.path.exists(save_location):
    os.makedirs(save_location)
save_location = os.path.join(save_location, model_name)
torch.save(model, save_location)

Starting epoch 1
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Mean epoch loss: 4.343282131824493
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mean validation loss: 36.171367586517334
Starting epoch 2
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mean epoch loss: 3.457275470571518
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mean validation loss: 3