<a href="https://colab.research.google.com/github/reconrus/DS_Project/blob/dev/UNMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
# Default project root: the absolute path of the current working directory.
# The Colab cell below overwrites this with a Google Drive location.
os.environ['PROJECT_PATH'] = os.path.abspath(os.curdir)

**Mount Google Drive**

It appears that Google Colab cannot be used for this notebook: it relies on the torchtext package, whose `Field.build_vocab` method is broken in the latest version available for Python 3.6, whereas everything works under Python 3.7.

In [0]:
from google.colab import drive
# Colab only: point the project root at a folder on Google Drive, then mount the
# drive under /content/ydrive/ so that folder becomes reachable.
os.environ['PROJECT_PATH']='/content/ydrive/My Drive/Study/UNMT'
drive.mount('/content/ydrive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/ydrive/


In [0]:
# Directory layout, derived from PROJECT_PATH (set by an earlier cell).
os.environ.update(
    TOOLS=os.path.join(os.environ['PROJECT_PATH'], 'tools'),
    RESOURCES=os.path.join(os.environ['PROJECT_PATH'], 'resources'),
    MODELS=os.path.join(os.environ['PROJECT_PATH'], 'models'),
)
# DATA lives under RESOURCES, so it can only be set once RESOURCES exists.
os.environ['DATA'] = os.path.join(os.environ['RESOURCES'], 'data')

# DATA VARIABLES
os.environ.update(
    VOCAB_SIZE="60000",
    L1='ba',
    L2='ru',
    L1_DATA="ba.sentesized",
    L2_DATA="news.2016.ru.shuffled",
)
# Full paths of the prepared corpora: <DATA>/<corpus name>.<vocab size>
os.environ.update(
    L1_DATA_PREPARED=os.path.join(
        os.environ['DATA'], os.environ['L1_DATA'] + '.' + os.environ['VOCAB_SIZE']),
    L2_DATA_PREPARED=os.path.join(
        os.environ['DATA'], os.environ['L2_DATA'] + '.' + os.environ['VOCAB_SIZE']),
)

# Embeddings: directory and base file name
os.environ.update(
    EMBEDDINGS_DIR=os.path.join(os.environ["RESOURCES"], "embeddings"),
    BPE_EMBEDDINGS="crosslingualbpe",
)


# Download data

## Bashkir Language (source)

In [0]:
%%bash
# Collect raw Bashkir text: concatenate every .txt file of the bashkir-corpus
# repository into "$BASHKIR/ba", then download a Bashkir Wikipedia dump and
# extract it as JSON into "$BASHKIR/ba_wiki".
BASHKIR="$DATA/bashkir"
# Skip the clone when the checkout already exists so the cell can be re-run.
[ -d "$BASHKIR-corpus" ] || git clone https://github.com/nevmenandr/bashkir-corpus "$BASHKIR-corpus"
# One sequential call creates both levels; the previous "mkdir A & mkdir A/raw"
# ran the first mkdir in the background and could race with the second.
mkdir -p "$BASHKIR/raw"
find "$BASHKIR-corpus" -name "*.txt" -print0 | xargs -0 -I file cat file > "$BASHKIR/ba"
# rm -rf -d  "$BASHKIR-corpus"

WIKIEXTRACTOR="$TOOLS/wikiextractor"
[ -d "$WIKIEXTRACTOR" ] || git clone https://github.com/ptakopysk/wikiextractor "$WIKIEXTRACTOR"
# The path is quoted: the project root may contain spaces (e.g. ".../My Drive/...").
[ -f "$BASHKIR/bawiki-latest-pages-articles.xml.bz2" ] || wget http://download.wikimedia.org/bawiki/latest/bawiki-latest-pages-articles.xml.bz2 -P "$BASHKIR"
python3 "$WIKIEXTRACTOR/WikiExtractor.py"  --json -o "$BASHKIR/ba_wiki" "$BASHKIR/bawiki-latest-pages-articles.xml.bz2"
# rm "$BASHKIR/bawiki-latest-pages-articles.xml.bz2"

In [0]:
import json

# Append the text of every extracted Wikipedia article to the raw Bashkir corpus.
input_folder = os.path.join(os.environ['DATA'], 'bashkir', 'ba_wiki')
output_path = os.path.join(os.environ['DATA'], 'bashkir', 'ba')

# Append mode: the file is extended, never truncated, so whatever is already in
# it is kept. Context managers close both files even if a JSON line is malformed.
with open(output_path, "a+", encoding='utf-8') as output_file:
    for path, subdirs, files in os.walk(input_folder):
        for name in files:
            with open(os.path.join(path, name), 'r', encoding='utf-8') as file:
                # Every line is parsed as one JSON document; iterate lazily
                # instead of loading the whole file with readlines().
                for line in file:
                    dump = json.loads(line)
                    # Skip documents whose text is empty or only newlines.
                    if dump["text"].strip('\n'):
                        output_file.write("%s\n" % dump["text"])

# !rm -rf -d "$DATA/bashkir/ba_wiki"

In [0]:
!pip install razdel
from razdel import sentenize

# Split the raw Bashkir corpus into one sentence per line.
raw_data_path = os.path.join(os.environ['DATA'], 'bashkir', 'ba')
sentenized_data_path = os.path.join(os.environ['DATA'], 'ba.sentesized')

# Both files are closed (and the output flushed) when the block ends; the
# following cells read the output file from disk, so it must be complete.
with open(raw_data_path, 'r', encoding='utf-8') as raw_data, \
        open(sentenized_data_path, 'w+', encoding='utf-8') as sentenized_data:
    for line in raw_data:
        sentences = sentenize(line)
        sentenized_data.writelines(["%s\n" % sentence.text for sentence in sentences])

## Russian Language

In [0]:
%%bash
# Download the Russian news corpus and unpack it to "$DATA/news.2016.ru.shuffled".
# Guarded so that re-running the cell neither downloads a second copy nor makes
# gzip stop on an already existing output file.
if [ ! -f "$DATA/news.2016.ru.shuffled" ]; then
    [ -f "$DATA/news.2016.ru.shuffled.gz" ] || wget http://data.statmt.org/wmt17/translation-task/news.2016.ru.shuffled.gz -P "$DATA"
    gzip -d "$DATA/news.2016.ru.shuffled.gz"
fi

# Preprocessing

## Text cleaning and tokenization

In [0]:
!pip install -U sacremoses
from sacremoses import MosesPunctNormalizer, MosesTokenizer

def preprocess_file(filepath, language):
    """Normalize punctuation and tokenize a text file line by line.

    The result is written to ``<filepath>.cleaned``: one output line per input
    line, tokens separated by single spaces.

    :param filepath: path of the UTF-8 text file to process
    :param language: language code handed to the Moses normalizer and tokenizer
    """
    normalizer = MosesPunctNormalizer(language, pre_replace_unicode_punct=True, post_remove_control_chars=True)
    tokenizer = MosesTokenizer(language)

    # Both files are managed by the `with` block so the output is flushed and
    # closed before anything else (e.g. the BPE cell) reads it from disk.
    with open(filepath, 'r', encoding='utf-8') as input_file, \
            open('%s.cleaned' % filepath, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            line = normalizer.normalize(line)
            tokens = tokenizer.tokenize(line)
            output_file.write("{}\n".format(' '.join(tokens)))


# Clean and tokenize both monolingual corpora; the corpus file name and its
# language code are read from the environment variables named in each pair.
for corpus_var, lang_var in (('L1_DATA', 'L1'), ('L2_DATA', 'L2')):
    corpus_path = os.path.join(os.environ['DATA'], os.environ[corpus_var])
    preprocess_file(corpus_path, os.environ[lang_var])

## BPE codes

In [0]:
%%bash
# Build fastBPE, learn joint BPE codes on both cleaned corpora, apply the codes
# to each corpus and extract the per-language and joint vocabularies.
FASTBPE="$TOOLS/fastBPE"
FAST="$FASTBPE/fast"
# Skip the clone when the checkout already exists so the cell can be re-run.
[ -d "$FASTBPE" ] || git clone https://github.com/glample/fastBPE "$FASTBPE"
# Absolute include path: the build no longer depends on the current directory.
g++ -std=c++11 -pthread -O3 "$FASTBPE/fastBPE/main.cc" -I"$FASTBPE/fastBPE" -o "$FAST"
"$FAST" learnbpe "$VOCAB_SIZE" "$DATA/${L1_DATA}.cleaned" "$DATA/${L2_DATA}.cleaned" > "$DATA/BPE_codes"
# L1_DATA_PREPARED / L2_DATA_PREPARED are already full paths under $DATA (they
# are built with os.path.join(DATA, ...) in the configuration cell), so they
# must not be prefixed with "$DATA/" a second time.
"$FAST" applybpe "$L1_DATA_PREPARED" "$DATA/${L1_DATA}.cleaned" "$DATA/BPE_codes"
"$FAST" applybpe "$L2_DATA_PREPARED" "$DATA/${L2_DATA}.cleaned" "$DATA/BPE_codes"

# Vocabulary file names follow VOCAB_SIZE instead of a hard-coded 60000.
"$FAST" getvocab "$L1_DATA_PREPARED" > "$DATA/vocab.${L1_DATA}.${VOCAB_SIZE}"
"$FAST" getvocab "$L2_DATA_PREPARED" > "$DATA/vocab.${L2_DATA}.${VOCAB_SIZE}"
"$FAST" getvocab "$L1_DATA_PREPARED" "$L2_DATA_PREPARED" > "$DATA/vocab.full.${VOCAB_SIZE}"

## Cross-lingual Embeddings

In [0]:
%%bash
# Build fastText and train 256-dimensional skip-gram embeddings on the shuffled
# concatenation of both BPE-encoded corpora.
FASTTEXT_DIR="$TOOLS/fastText"
FASTTEXT="$FASTTEXT_DIR/fasttext"
# Skip the clone when the checkout already exists so the cell can be re-run.
[ -d "$FASTTEXT_DIR" ] || git clone https://github.com/facebookresearch/fastText.git "$FASTTEXT_DIR"
# Stop here if the checkout is missing rather than running make elsewhere.
cd "$FASTTEXT_DIR" || exit 1
[ -f "$FASTTEXT" ] || make

CONCAT_BPE="$DATA/concatenated.6000"
N_THREADS=$(grep -c ^processor /proc/cpuinfo)
echo "$N_THREADS"
# L1_DATA_PREPARED / L2_DATA_PREPARED are already full paths under $DATA (they
# are built with os.path.join(DATA, ...) in the configuration cell), so they
# must not be prefixed with "$DATA/" a second time.
cat "$L1_DATA_PREPARED" "$L2_DATA_PREPARED" | shuf > "$CONCAT_BPE"
chmod +x "$FASTTEXT"
# Nothing else in the notebook creates the output directory.
mkdir -p "$EMBEDDINGS_DIR"
"$FASTTEXT" skipgram -dim 256 -thread "$N_THREADS" -input "$CONCAT_BPE" -output "$EMBEDDINGS_DIR/$BPE_EMBEDDINGS"

# Model implementation

## Tools

In [0]:
import copy
import math
import torch
from torch import nn

#src https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is precomputed for ``max_len`` positions and stored as the
    non-trainable buffer ``pe`` of shape (max_len, 1, d_model).
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * frequencies

        # Even feature indices carry the sine, odd ones the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # (max_len, d_model) -> (max_len, 1, d_model): the middle axis broadcasts
        # against the second dimension of the input in forward().
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        """Return dropout(x + pe[:x.size(0)]); positions run along dim 0 of x."""
        encoded = x + self.pe[:x.size(0), :]
        return self.dropout(encoded)

def get_module_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = nn.ModuleList()
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return clones

def get_mask(inputs, pad_mask): #[2]
    """Build a boolean mask that is True where a position index is below a length.

    ``inputs`` must be 2-D; only its first dimension (``slen``) is used.
    ``lengths`` is ``slen`` minus the sum of ``pad_mask`` over its dim 0, and
    the result has one row per entry of ``lengths`` and ``slen`` columns.

    NOTE(review): the callers in this file pass ``pad_mask`` transposed to
    (batch, slen), so the dim-0 sum counts padded sentences per position rather
    than pad tokens per sentence -- confirm that this is what is intended.
    """
    seq_len, _ = inputs.size()
    n_padded = torch.sum(pad_mask, 0)
    lengths = seq_len - n_padded
    positions = torch.arange(seq_len, dtype=torch.long, device=lengths.device)
    return positions < lengths.unsqueeze(1)

## Encoder

In [0]:
import torch.nn.functional as F
from torch.nn import TransformerDecoder, TransformerDecoderLayer, \
                     TransformerEncoder, TransformerEncoderLayer


class Encoder(nn.Module):
    """Transformer encoder with language-specific bottom layers and shared top layers.

    ``self.layers[lang_id]`` is the full stack for one language: first
    ``nlayers - shared_nlayers`` layers owned by that language, then
    ``shared_nlayers`` layer objects that are the same for both languages.
    """

    def __init__(self, field, d_model=256, nlayers=4, nheads=8, dropout=0.1, shared_nlayers=2, freeze_embs=True):
        """
        :param field: object whose ``vocab`` supports ``len()`` and carries the
            pretrained embedding matrix in ``vocab.vectors``
        :param d_model: model width; it has to match the width of
            ``field.vocab.vectors`` because the embeddings feed the layers directly
        :param nlayers: number of layers in each language's stack
        :param nheads: number of attention heads per layer
        :param dropout: dropout probability inside the layers and on the embeddings
        :param shared_nlayers: number of layers that are shared for both languages
        :param freeze_embs: if True the pretrained embeddings are not trained
        """
        super(Encoder, self).__init__()

        self.voc_size = len(field.vocab)

        self.d_model = d_model
        self.dropout = dropout
        # from_pretrained is a classmethod: call it on the class instead of
        # first allocating a throw-away (voc_size x d_model) embedding.
        self.embeddings = nn.Embedding.from_pretrained(field.vocab.vectors, freeze=freeze_embs)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = TransformerEncoderLayer(d_model, nheads, dim_feedforward=4*d_model, dropout=dropout, activation='gelu')
        self.layers = nn.ModuleList()
        # Layers for the source language
        self.layers.append(get_module_clones(encoder_layer, nlayers-shared_nlayers))
        # Layers for the target language
        self.layers.append(get_module_clones(encoder_layer, nlayers-shared_nlayers))

        # The very same layer objects are appended to both stacks, which is
        # what makes them shared.
        shared_layers = get_module_clones(encoder_layer, shared_nlayers)
        self.layers[0].extend(shared_layers)
        self.layers[1].extend(shared_layers)

    def forward(self, src, pad_mask, lang_id):
        """Encode a batch of token ids with the stack of language ``lang_id``.

        :param src: token ids; the callers in this file pass (seq_len, batch)
        :param pad_mask: boolean mask, True at padding; the callers in this file
            pass it transposed to (batch, seq_len)
        :param lang_id: index (0 or 1) of the layer stack to use
        :return: output of the last layer of the selected stack
        """
        x = self.embeddings(src)
        x = self.pos_encoder(x)
        x = F.dropout(x, self.dropout, training=self.training)

        for layer in self.layers[lang_id]:
            # The encoder is bidirectional: only padded keys are hidden. The
            # previous code also passed the get_mask() result as the attention
            # mask; with pad_mask laid out as (batch, seq_len) that mask was
            # derived from per-position pad counts across the batch and
            # blocked arbitrary positions.
            x = layer(x, src_key_padding_mask=pad_mask)

        return x


In [0]:
class Decoder(nn.Module):
    """Transformer decoder with shared bottom layers and language-specific top layers.

    The token embeddings are the encoder's embedding module, and one output
    projection is shared by both languages.
    """

    def __init__(self, field, encoder, d_model=256, nlayers=4, nheads=8, dropout=0.1, shared_nlayers=2):
        """
        :param field: object whose ``vocab`` supports ``len()`` and maps the
            '<sos>', '<eos>' and '<pad>' tokens through ``vocab.stoi``
        :param encoder: module whose ``embeddings`` attribute is reused here
        :param d_model: model width
        :param nlayers: number of layers in each language's stack
        :param nheads: number of attention heads per layer
        :param dropout: dropout probability inside the layers and on the embeddings
        :param shared_nlayers: number of bottom layers shared by both languages
        """
        super(Decoder, self).__init__()
        self.sos_idx = field.vocab.stoi['<sos>']
        self.eos_idx = field.vocab.stoi['<eos>']
        self.pad_idx = field.vocab.stoi['<pad>']
        self.d_model = d_model
        self.dropout = dropout
        self.embeddings = encoder.embeddings
        self.pos_encoder = PositionalEncoding(d_model)

        decoder_layer = TransformerDecoderLayer(d_model, nheads, dim_feedforward=4*d_model, dropout=dropout, activation='gelu')
        shared_layers = get_module_clones(decoder_layer, shared_nlayers)
        self.layers = nn.ModuleList()
        for _ in range(2):
            # A fresh container per language that starts with the SAME shared
            # layer objects. Appending `shared_layers` itself twice made both
            # entries one list, so every "language-specific" layer ended up in
            # both stacks.
            lang_layers = nn.ModuleList(shared_layers)
            lang_layers.extend(get_module_clones(decoder_layer, nlayers-shared_nlayers))
            self.layers.append(lang_layers)

        # share proj_layer
        proj_layer = nn.Linear(self.embeddings.embedding_dim, len(field.vocab))
        proj_layers = [proj_layer]*2
        self.proj_layers = nn.ModuleList(proj_layers)

    def forward(self, previous_tokens, encoded, enc_pad_mask, lang_id):
        """Score the next token for every position of ``previous_tokens``.

        :param previous_tokens: token ids of the target prefix, time on dim 0
        :param encoded: encoder output used as attention memory
        :param enc_pad_mask: boolean mask for the memory, True at padding
        :param lang_id: index (0 or 1) of the layer stack and projection to use
        :return: vocabulary scores, one vector per input position
        """
        x = self.embeddings(previous_tokens)
        x = self.pos_encoder(x)
        x = F.dropout(x, self.dropout, training=self.training)

        # Causal mask: position t may only attend to positions <= t. Without it
        # the decoder sees the tokens it is trained to predict.
        tgt_len = previous_tokens.size(0)
        causal_mask = torch.triu(
            torch.full((tgt_len, tgt_len), float('-inf'), dtype=x.dtype, device=x.device),
            diagonal=1,
        )

        for layer in self.layers[lang_id]:
            x = layer(x, encoded, tgt_mask=causal_mask, memory_key_padding_mask=enc_pad_mask)

        x = self.proj_layers[lang_id](x)
        return x

    def generate_sequence(self, encoded, enc_pad_mask, lang_id, sequence_len=256):
        """Greedily decode one sequence per batch element.

        :param encoded: encoder output; its dim 1 is taken as the batch size
        :param enc_pad_mask: boolean mask for the memory, True at padding
        :param lang_id: index (0 or 1) of the language to generate
        :param sequence_len: number of rows of the returned tensor
        :return: LongTensor (sequence_len, batch) that starts with <sos> and is
            padded with <pad> after a sentence has produced <eos>
        """
        cur_len = 1
        bs = encoded.size(1)
        device = encoded.device
        decoded = torch.full((sequence_len, bs), self.pad_idx, dtype=torch.long, device=device)
        decoded[0] = self.sos_idx

        # 1 while a sentence is still being generated, 0 once it emitted <eos>.
        # Created on the same device as the scores it is multiplied with.
        unfinished_sents = torch.ones(bs, dtype=torch.long, device=device)

        while cur_len < sequence_len:
            scores = self.forward(decoded[:cur_len], encoded, enc_pad_mask, lang_id)
            # Only the scores of the last position are needed.
            next_words = scores.detach()[-1, :, :].argmax(dim=-1)
            assert next_words.size() == (bs,)

            # Finished sentences keep receiving <pad>.
            decoded[cur_len] = next_words*unfinished_sents + self.pad_idx*(1-unfinished_sents)
            unfinished_sents.mul_(next_words.ne(self.eos_idx).long())
            cur_len += 1

            if unfinished_sents.max() == 0:
                break

        # Length limit reached: close every sentence that is still open.
        if cur_len == sequence_len:
            decoded[sequence_len - 1].masked_fill_(unfinished_sents.bool(), self.eos_idx)

        return decoded

# Training

1) https://github.com/pytorch/fairseq/blob/7b3df95f287bc0d844f64fe45717123d06dacb97/fairseq/data/noising.py

In [0]:
import numpy as np
# [1]
class Noising:
    """Noise model for denoising auto-encoding: local shuffle, word dropout, word blanking.

    All operations work on whole words: a word is a run of BPE tokens in which
    every token except the last ends with '@@'.
    """

    def __init__(self, vocab):
        """
        :param vocab: object with ``itos`` (index -> token), ``stoi``
            (token -> index) and ``len()``
        """
        self.vocab = vocab
        # Historical misspelling, kept as an alias for code that may still read it.
        self.vocav = vocab
        # True for tokens that end a word (i.e. do not carry the '@@' marker).
        self.bpe_ends_mask = np.array([not vocab.itos[i].endswith('@@') for i in range(len(vocab))])

        self.pad_idx = vocab.stoi['<pad>']
        self.sos_idx = vocab.stoi['<sos>']
        self.eos_idx = vocab.stoi['<eos>']
        self.sep_idx = vocab.stoi['<sep>']
        self.mask_idx = vocab.stoi['<mask>']

    def noise(self, inp):
        """Return a noised CPU copy of ``inp`` (token ids, time on dim 0).

        Applies, in order: local word shuffle, word dropout, word blanking with
        the '<mask>' token.
        """
        x = inp.cpu()
        pad_mask = x.eq(self.pad_idx)
        lengths = x.size(0) - pad_mask.sum(0)

        x = self.shuffle(x, lengths)
        x, lengths = self.dropout(x, lengths)
        x, lengths = self.dropout(x, lengths, blank_idx=self.mask_idx)
        return x

    def get_word_idx(self, x):
        """Return, for every token of ``x``, the index of the word it belongs to.

        The result is a numpy array with the shape of ``x``; indices start at 0
        in every column.
        """
        bpe_end = self.bpe_ends_mask[np.asarray(x)]
        # Reverse cumulative sum = number of word ends at or after a position ...
        word_idx = bpe_end[::-1].cumsum(0)[::-1]
        # ... flipped so that the first word of every column gets index 0.
        word_idx = word_idx.max(0)[None, :] - word_idx
        return word_idx

    def dropout(self, x, lengths, dropout_rate=0.1, blank_idx=None):
        """Drop whole words, or replace their tokens with ``blank_idx``.

        :param x: CPU LongTensor of token ids, time on dim 0
        :param lengths: number of leading tokens to consider in each column
        :param dropout_rate: probability of dropping / blanking a word
        :param blank_idx: if None dropped tokens are removed, otherwise they are
            replaced by this index
        :return: (new tensor with the shape of ``x`` padded with <pad>,
            LongTensor with the new length of every column)
        """
        sentences = []
        modified_lengths = []
        word_idx = self.get_word_idx(x)
        # <sos> and <eos> are never dropped.
        not_dropout_mask = (x.eq(self.sos_idx) | x.eq(self.eos_idx)).numpy()

        for i in range(lengths.size(0)):
            num_words = int(word_idx[:, i].max()) + 1
            keep = np.random.rand(num_words) >= dropout_rate
            # Unprotected positions map to word 0, so word 0 is always kept
            # along with the words containing <sos> / <eos>.
            do_not_dropout_words_idx = word_idx[:, i]*not_dropout_mask[:, i]
            keep[do_not_dropout_words_idx] = True
            words = x[:int(lengths[i]), i].tolist()
            new_s = [
                w if keep[word_idx[j, i]] else blank_idx
                for j, w in enumerate(words)
            ]
            # With blank_idx=None the dropped tokens are None and removed here.
            new_s = [w for w in new_s if w is not None]
            sentences.append(new_s)
            modified_lengths.append(len(new_s))

        # re-construct input
        modified_x = torch.full((x.size(0), x.size(1)), self.pad_idx, dtype=torch.long)
        for i, sentence in enumerate(sentences):
            modified_x[:len(sentence), i] = torch.tensor(sentence, dtype=torch.long)

        return modified_x, torch.LongTensor(modified_lengths)

    def shuffle(self, x, lengths, max_shuffle_distance=3):
        """Locally permute the words of every column.

        <sos> keeps its place and the trailing <eos> is left out of the
        permutation.

        :param x: CPU LongTensor of token ids, time on dim 0
        :param lengths: number of non-pad tokens per column; NOT modified
        :param max_shuffle_distance: upper bound of the uniform noise added to
            the word indices before sorting; 0 disables shuffling
        :return: shuffled copy of ``x`` (``x`` itself when shuffling is disabled)
        """
        if max_shuffle_distance == 0:
            return x
        eos_mask = x.eq(self.eos_idx)
        # Out-of-place on purpose: the previous `lengths -= ...` modified the
        # caller's tensor, so noise() afterwards cut <eos> off every sentence.
        lengths_no_eos = lengths - eos_mask.sum(0)

        noise = np.random.uniform(
            0,
            max_shuffle_distance,
            size=(x.size(0), x.size(1)),
        )

        sos_mask = x.eq(self.sos_idx).numpy()
        noise[sos_mask] = -1 # do not move <sos> symbols
        word_idx = self.get_word_idx(x)

        x2 = x.clone()
        for i in range(lengths_no_eos.size(0)):
            length = int(lengths_no_eos[i])
            # All tokens of a word share the word's noise, so words move as units.
            scores = word_idx[:length, i] + noise[word_idx[:length, i], i]
            # Tiny position-dependent term keeps the order stable inside a word.
            scores += 1e-6 * np.arange(length)
            permutation = scores.argsort()
            x2[:length, i].copy_(
                x2[:length, i][torch.from_numpy(permutation)]
            )
        return x2

In [0]:
from collections import OrderedDict
from torch.optim import Adam

class Trainer:
    """Runs the training steps (denoising, back-translation) for an encoder/decoder pair."""

    def __init__(self, encoder, decoder, field, params, logger, clip=1.0, lr=0.0001, n_iter=0, n_epoch=0):
        """
        :param encoder: encoder module, called as ``encoder(tokens, pad_mask, lang_id)``
        :param decoder: decoder module, called as
            ``decoder(prev_tokens, encoded, memory_pad_mask, lang_id)``
        :param field: object whose ``vocab`` supports ``len()`` and ``stoi``
        :param params: object providing ``device``
        :param logger: object providing ``log(message)``
        :param clip: maximum gradient norm, applied to each module separately
        :param lr: learning rate of both Adam optimizers
        :param n_iter: initial value of the update and iteration counters
        :param n_epoch: initial epoch number
        """
        self.encoder = encoder
        self.decoder = decoder

        self.pad_idx = field.vocab.stoi['<pad>']
        self.vocab_size = len(field.vocab)

        # Padding targets do not contribute to the loss.
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.pad_idx)
        self.clip = clip
        self.n_updates = n_iter

        self.noising = Noising(field.vocab)

        self.enc_optimizer = Adam(encoder.parameters(), lr=lr)
        self.dec_optimizer = Adam(decoder.parameters(), lr=lr)

        self.n_total_iter = n_iter
        self.epoch = n_epoch

        self.device = params.device
        self.logger = logger

    def get_denoising_loss_weight(self, init_weight=1, decrease_slower_iter=10**5, weight_slower=0.1, set_to_zero_iter=3*10**5):
        """Return the current weight of the denoising loss.

        The weight falls linearly from ``init_weight`` to ``weight_slower`` over
        the first ``decrease_slower_iter`` updates, then linearly to 0 at
        ``set_to_zero_iter`` and stays at 0 afterwards.
        """
        if self.n_updates < decrease_slower_iter:
            return init_weight - ((init_weight-weight_slower)/decrease_slower_iter)*self.n_updates

        weight = weight_slower - (weight_slower/(set_to_zero_iter - decrease_slower_iter))*(self.n_updates - decrease_slower_iter)
        # Clamp: past set_to_zero_iter the line would go negative, which would
        # turn the objective into maximizing the denoising loss.
        return max(0.0, weight)

    def backprop(self, loss):
        """Run one optimization step of both modules on ``loss``."""
        self.enc_optimizer.zero_grad()
        self.dec_optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.encoder.parameters(), self.clip)
        nn.utils.clip_grad_norm_(self.decoder.parameters(), self.clip)
        self.enc_optimizer.step()
        self.dec_optimizer.step()

    @staticmethod
    def _raise_on_nan(loss):
        """Abort the step with an exception when the loss is NaN."""
        if torch.isnan(loss).any():
            print("NaN detected")
            # Raise instead of exit(): exit() tears down the notebook kernel
            # and gives the caller no chance to react.
            raise FloatingPointError("NaN detected")

    def denoising_step(self, inp, lang_id):
        """Train on reconstructing ``inp`` from a noised copy of itself.

        :param inp: clean token ids on the training device, time on dim 0
        :param lang_id: language index of the batch
        :return: OrderedDict describing the step (type, loss, counters)
        :raises FloatingPointError: if the loss is NaN
        """
        x = self.noising.noise(inp).to(self.device)
        pad_mask = x.eq(self.pad_idx).transpose_(0, 1)
        self.encoder.train()
        self.decoder.train()
        encoded = self.encoder(x, pad_mask, lang_id)
        # Teacher forcing uses the CLEAN sequence: the noised one goes to the
        # encoder only. Feeding x[:-1] here misaligned inputs and targets.
        scores = self.decoder(inp[:-1], encoded, pad_mask, lang_id)
        loss = self.criterion(scores.view(-1, self.vocab_size), inp[1:].view(-1))

        loss = self.get_denoising_loss_weight() * loss

        self._raise_on_nan(loss)
        self.backprop(loss)

        self.n_updates += 1
        progress_state = OrderedDict(
            step_type='denoising',
            loss=loss.item(),
            sentences=inp.size(1),
            n_total_iter=self.n_total_iter,
            epoch=self.epoch,
            lang_id=lang_id
            )

        return progress_state

    def backtranslation_step(self, src, trg, src_lang_id, trg_lang_id):
        """Train on reconstructing ``src`` from its machine translation ``trg``.

        :param src: original token ids, time on dim 0
        :param trg: translation of ``src``, time on dim 0
        :param src_lang_id: language index of ``src``
        :param trg_lang_id: language index of ``trg``
        :return: OrderedDict describing the step (type, loss, counters)
        :raises FloatingPointError: if the loss is NaN
        """
        # src -> trg -> src
        self.encoder.train()
        self.decoder.train()

        trg_pad_mask = trg.eq(self.pad_idx).transpose_(0, 1)
        encoded = self.encoder(trg, trg_pad_mask, trg_lang_id)

        # The attention memory is the encoded translation, so its padding mask
        # has to come from trg (the mask of src was passed here before).
        scores = self.decoder(src[:-1], encoded, trg_pad_mask, src_lang_id)
        loss = self.criterion(scores.view(-1, self.vocab_size), src[1:].view(-1))

        self._raise_on_nan(loss)
        self.backprop(loss)

        progress_state = OrderedDict(
            step_type='backtranslation',
            loss=loss.item(),
            sentences=src.size(1),
            n_total_iter=self.n_total_iter,
            epoch=self.epoch,
            backtranslation_direction='{}->{}->{}'.format(src_lang_id, trg_lang_id, src_lang_id)
            )

        return progress_state

    def generate_translation(self, src, lang1_id, lang2_id):
        """Translate ``src`` from language ``lang1_id`` to ``lang2_id`` without gradients."""
        self.encoder.eval()
        self.decoder.eval()

        pad_mask = src.eq(self.pad_idx).transpose_(0, 1)

        with torch.no_grad():
            encoded = self.encoder(src, pad_mask, lang1_id)
            trg = self.decoder.generate_sequence(encoded, pad_mask, lang2_id)

        return trg

    def save_model(self, dump_dir, name):
        """Pickle models, optimizers, criterion and counters to ``<dump_dir>/<name>.pth``."""
        # The dump directory is not created anywhere else.
        os.makedirs(dump_dir, exist_ok=True)
        path = os.path.join(dump_dir, '%s.pth' % name)
        self.logger.log('Saving model to %s ...' % path)
        torch.save({
            'encoder': self.encoder,
            'decoder': self.decoder,
            'enc_optimizer': self.enc_optimizer,
            'dec_optimizer': self.dec_optimizer,
            'epoch': self.epoch,
            'n_total_iter': self.n_total_iter,
            'criterion': self.criterion
        }, path)

    def reload_checkpoint(self, dump_dir, name):
        """Restore the state written by ``save_model``; do nothing if the file is missing.

        Unlike ``save_model``, ``name`` is used as the complete file name here
        (no '.pth' is appended).
        """
        checkpoint_path = os.path.join(dump_dir, name)
        if not os.path.isfile(checkpoint_path):
            return

        print('Reloading checkpoint from %s ...' % checkpoint_path)
        # map_location lets a checkpoint written on GPU be loaded on a CPU-only machine.
        checkpoint_data = torch.load(checkpoint_path, map_location=self.device)
        self.encoder = checkpoint_data['encoder']
        self.decoder = checkpoint_data['decoder']
        self.enc_optimizer = checkpoint_data['enc_optimizer']
        self.dec_optimizer = checkpoint_data['dec_optimizer']
        self.epoch = checkpoint_data['epoch']
        self.n_total_iter = checkpoint_data['n_total_iter']
        # self.best_metrics = checkpoint_data['best_metrics']
        # self.best_stopping_criterion = checkpoint_data['best_stopping_criterion']
        self.criterion = checkpoint_data['criterion']

        self.logger.log('Checkpoint reloaded. Resuming at epoch %i ...' % self.epoch)

In [0]:
from torchtext import data

class CustomDataset(data.Dataset):
    """Dataset with one example per line of a text file and a single 'text' field.

    Lines that are empty once their newline characters are stripped are skipped.
    """

    def __init__(self, path, text_field, newline_eos=True,
                 encoding='utf-8', **kwargs):
        """
        :param path: path of the text file to read
        :param text_field: field used to preprocess every line and attached
            to the examples as 'text'
        :param newline_eos: accepted but not used
        :param encoding: encoding of the file
        :param kwargs: forwarded to the base class constructor
        """
        fields = [('text', text_field)]
        sentences = []
        with open(path, encoding=encoding) as f:
            for line in f:
                if not line.strip('\n'):
                    continue
                sentences.append(text_field.preprocess(line))
        examples = []
        for sentence in sentences:
            examples.append(data.Example.fromlist([sentence], fields))
        super().__init__(examples, fields, **kwargs)

In [0]:
from itertools import zip_longest
from torchtext.datasets import LanguageModelingDataset
from torchtext.vocab import Vectors

class Logger:
    """Minimal logger that writes to a file, or to stdout when no path is given."""

    def __init__(self, path=None):
        """
        :param path: file to append to; None (or empty) logs to stdout
        """
        self.log_file = open(path, 'a+', encoding='utf-8') if path else None

    def log(self, info):
        """Write ``info`` to the log.

        A string is written followed by a blank line. A dict (e.g. the
        OrderedDict returned by the training steps) is written as a key line
        and a value line per item, followed by blank lines. Any other type is
        ignored.
        """
        if isinstance(info, str):
            print("%s\n" % info, file=self.log_file, flush=True)
        elif isinstance(info, dict):
            for k, v in info.items():
                print("%s: " % str(k), file=self.log_file)
                print("%s\n" % str(v), file=self.log_file)

            print('\n\n\n', file=self.log_file, flush=True)

    def close(self):
        """Close the log file, if one is open; later messages go to stdout."""
        if self.log_file:
            self.log_file.close()
            self.log_file = None


def main(params):
    """Build the data pipeline and the models, then run the training loop.

    Every batch of either language goes through one denoising step and one
    back-translation step; a checkpoint is written every
    ``params.save_every_ith_iter`` iterations.

    :param params: configuration object; the attributes read here are
        ``sequence_length``, ``l1_data_path``, ``l2_data_path``, ``embs_file``,
        ``embs_dir``, ``batch_size``, ``device``, ``n_epoch``,
        ``save_every_ith_iter`` and ``dump_dir``
    """
    logger = Logger()

    try:
        TEXT = data.Field(
            init_token='<sos>',
            eos_token='<eos>',
            fix_length=params.sequence_length
        )
        logger.log("Loaded Field")
        train_l1_dataset = CustomDataset(params.l1_data_path, TEXT)
        logger.log("Loaded L1 Dataset")
        train_l2_dataset = CustomDataset(params.l2_data_path, TEXT)
        logger.log("Loaded L2 Dataset")

        vectors = Vectors(name=params.embs_file, cache=params.embs_dir)
        logger.log("Loaded Vectors")

        TEXT.build_vocab(
            train_l1_dataset,
            train_l2_dataset,
            specials=['<sep>', '<mask>'],
            vectors=vectors
        )
        logger.log("Loaded Vocab")

        # DEBUG
        print(TEXT.vocab['<pad>'])
        print(TEXT.vocab['<sep>'])
        print(TEXT.vocab['<mask>'])

        # repeat=False: one pass over an iterator is one epoch. With
        # repeat=True the iterators never stop, so the loop over zip_longest
        # below never finished and the epoch counter never advanced.
        l1_iter = data.BucketIterator(
            dataset=train_l1_dataset,
            batch_size=params.batch_size,
            repeat=False,
            shuffle=True,
            device=params.device
        )
        logger.log("Created L1 Iterator")

        l2_iter = data.BucketIterator(
            dataset=train_l2_dataset,
            batch_size=params.batch_size,
            repeat=False,
            shuffle=True,
            device=params.device
        )
        logger.log("Created L2 Iterator")

        # The batches are created on params.device, so the models have to live
        # there too. They are moved before the Trainer builds its optimizers.
        encoder = Encoder(TEXT).to(params.device)
        logger.log("Created Encoder")
        decoder = Decoder(TEXT, encoder).to(params.device)
        logger.log("Created Decoder")

        trainer = Trainer(encoder, decoder, TEXT, params, logger)
        logger.log("Created Trainer")

        encoder.train()
        decoder.train()

        logger.log("===================TRAINING STARTED===================")
        while trainer.epoch <= params.n_epoch:
            logger.log("===================EPOCH%d===================" % trainer.epoch)
            # zip_longest pads the shorter corpus with None once it runs out.
            for batches in zip_longest(l1_iter, l2_iter, fillvalue=None):
                for src_id in [0, 1]:
                    if batches[src_id] is None: # no batch left for this language
                        continue
                    trg_id = 0 if src_id == 1 else 1
                    src_text = batches[src_id].text
                    progress_state = trainer.denoising_step(src_text, src_id)
                    logger.log(progress_state)
                    translation = trainer.generate_translation(src_text, src_id, trg_id)
                    progress_state = trainer.backtranslation_step(src_text, translation, src_id, trg_id)
                    logger.log(progress_state)

                if trainer.n_total_iter % params.save_every_ith_iter == 0:
                    trainer.save_model(params.dump_dir, 'checkpoint-{}-{}'.format(trainer.epoch, trainer.n_total_iter))

                trainer.n_total_iter += 1

            trainer.epoch += 1
            # TODO add bleu scoring
            # And saving best model
            # Add loading model if it exists
    finally:
        # `log_file` was never defined in this function; the logger owns the
        # file handle. Closing in `finally` also covers interrupted runs.
        logger.close()


In [0]:
class Parameters:
    """Run configuration; paths and names are read from environment variables."""

    def __init__(self):
        env = os.environ

        # Embeddings
        self.embs_file = "{}.vec".format(env["BPE_EMBEDDINGS"])
        self.embs_dir = env["EMBEDDINGS_DIR"]

        # Dataset: the '.cut' variants of the prepared corpora
        self.l1_data_path = env["L1_DATA_PREPARED"] + '.cut'
        self.l2_data_path = env["L2_DATA_PREPARED"] + '.cut'

        # self.l1_vocab =
        # self.l2_vocab =

        # Training
        self.sequence_length = 256
        self.batch_size = 64
        self.lr = 0.25
        self.clip = 1.0

        self.n_epoch = 40
        self.save_epoch = 5

        # Prefer the GPU when one is available.
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)

        # Logs and checkpoints both go to the models directory.
        models_dir = env['MODELS']
        self.log_file = os.path.join(models_dir, 'log.txt')
        self.dump_dir = models_dir

        self.save_every_ith_iter = 1000


# Start training with the default configuration defined by Parameters.
main(Parameters())

NameError: ignored