In [0]:
import os

# Default project root: the directory the notebook kernel is currently running in.
os.environ['PROJECT_PATH'] = os.getcwd()

**Mount Google Drive** 
Note: tokenization will not work in Colab because it relies on Perl scripts.

In [0]:
# Optional Colab step: mount Google Drive and keep the project there.
# Outside Colab the `google.colab` package cannot be imported, so the cell is
# skipped and PROJECT_PATH keeps whatever value it already has.
try:
    from google.colab import drive
except ImportError:
    print('google.colab is not available - skipping the Google Drive mount')
else:
    drive.mount('/content/ydrive/')
    os.environ['PROJECT_PATH']='/content/ydrive/My Drive/Study/DS_Project'

In [0]:
# Derive the working directories from the project root and export them so that
# the shell cells below can refer to $TOOLS, $RESOURCES, $DATA and $MODELS.
project_root = os.environ['PROJECT_PATH']
resources_dir = os.path.join(project_root, 'resources')

os.environ.update({
    'TOOLS': os.path.join(project_root, 'tools'),
    'RESOURCES': resources_dir,
    'DATA': os.path.join(resources_dir, 'data'),
    'MODELS': os.path.join(project_root, 'models'),
})

## Preprocessing

### Download data

In [0]:
![ -f "$RESOURCES/data/news.2017.fi.shuffled.deduped" ] || wget http://data.statmt.org/wmt18/translation-task/news.2017.fi.shuffled.deduped.gz -P "$RESOURCES/data"
![ -f "$RESOURCES/data/news.2017.et.shuffled.deduped" ] || wget http://data.statmt.org/wmt18/translation-task/news.2017.et.shuffled.deduped.gz -P "$RESOURCES/data"
!gzip -d "$RESOURCES/data/news.2017.fi.shuffled.deduped.gz"
!gzip -d "$RESOURCES/data/news.2017.et.shuffled.deduped.gz"
os.environ['L1']='fi'
os.environ['L2']='et'
os.environ['L1_DATA']="news.2017.fi.shuffled.deduped"  
os.environ['L2_DATA']="news.2017.et.shuffled.deduped"

### Text cleaning and tokenization
**DOES NOT WORK IN COLAB DUE TO PERL SCRIPTS USAGE** \\
[Source](https://github.com/facebookresearch/XLM/blob/master/tools/tokenize.sh)

In [0]:
%%bash
# Clean and tokenize both monolingual corpora with the Moses Perl scripts.
# Abort on the first failing command, including failures inside a pipeline.
set -eo pipefail

MOSES="$TOOLS/mosesdecoder"
# Clone only once so the cell can be re-run.
[ -d "$MOSES" ] || git clone https://github.com/moses-smt/mosesdecoder "$MOSES"

REPLACE_UNICODE_PUNCT="$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl"
NORM_PUNC="$MOSES/scripts/tokenizer/normalize-punctuation.perl"
REM_NON_PRINT_CHAR="$MOSES/scripts/tokenizer/remove-non-printing-char.perl"
TOKENIZER="$MOSES/scripts/tokenizer/tokenizer.perl"

# clean LANG: read raw text on stdin, write cleaned and tokenized text to stdout.
# The tokenizer uses one thread per processor entry in /proc/cpuinfo.
clean () {
  "$REPLACE_UNICODE_PUNCT" | "$NORM_PUNC" -l "$1" | \
    "$REM_NON_PRINT_CHAR" | "$TOKENIZER" \
    -no-escape -threads "$(grep -c ^processor /proc/cpuinfo)" -l "$1"
}

clean "$L1" < "$DATA/$L1_DATA" > "$DATA/${L1_DATA}.cleaned"
clean "$L2" < "$DATA/$L2_DATA" > "$DATA/${L2_DATA}.cleaned"

# TODO change to https://github.com/alvations/sacremoses

### Generating BPE codes

In [0]:
%%bash
# Build fastBPE, learn 40000 joint BPE codes on both cleaned corpora and apply
# them to each corpus.
set -e

FASTBPE="$TOOLS/fastBPE"
FAST="$FASTBPE/fast"
# Clone only once so the cell can be re-run.
[ -d "$FASTBPE" ] || git clone https://github.com/glample/fastBPE "$FASTBPE"

# The upstream build command compiles fastBPE/main.cc relative to the repository
# root, i.e. the sources sit in a fastBPE/ sub-directory of the clone. Fall back
# to the clone root in case main.cc is located there instead.
SRC_DIR="$FASTBPE/fastBPE"
[ -f "$SRC_DIR/main.cc" ] || SRC_DIR="$FASTBPE"

g++ -std=c++11 -pthread -O3 "$SRC_DIR/main.cc" -I"$SRC_DIR" -o "$FAST"
"$FAST" learnbpe 40000 "$DATA/${L1_DATA}.cleaned" "$DATA/${L2_DATA}.cleaned" > "$DATA/BPE_codes"
"$FAST" applybpe "$DATA/${L1_DATA}.40000" "$DATA/${L1_DATA}.cleaned" "$DATA/BPE_codes"
"$FAST" applybpe "$DATA/${L2_DATA}.40000" "$DATA/${L2_DATA}.cleaned" "$DATA/BPE_codes"

### Vocabulary will be calculated after dataset loading
# "$FAST" getvocab "$DATA/${L1_DATA}.40000" > "$DATA/vocab.${L1_DATA}.40000" 
# "$FAST" getvocab "$DATA/${L2_DATA}.40000" > "$DATA/vocab.${L2_DATA}.40000" 

# TODO Add splitting data on train, valid and test

### Inferring the N-gram Translation Table

In [0]:
![ -f $RESOURCES/cc.cs.300.vec ] || wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.cs.300.vec.gz -P  "$RESOURCES"
![ -f $RESOURCES/cc.en.300.vec ] || wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz -P "$RESOURCES"
!gzip -d "$RESOURCES/cc.cs.300.vec.gz"
!gzip -d "$RESOURCES/cc.en.300.vec.gz"
!git clone https://github.com/artetxem/vecmap.git "$TOOLS"
!python3 "$TOOLS/map_embeddings.py" --unsupervised "$RESOURCES/cc.cs.300.vec" "$RESOURCES/cc.en.300.vec" "$RESOURCES/cs_mapped.vec" "$RESOURCES/en_mapped.vec" 

# TODO add the table inferring from above cross-lingual embeddings

## Model implementation

### Tools

In [0]:
import math

import torch
from torch import nn

#src https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    Args:
        d_model: size of the last (feature) dimension of the inputs.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns,
        # so the frequency vector is truncated to the number of odd columns.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the second dimension.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions, after dropout."""
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

### BaseModel



Hyperparameters are temporarily hard-coded for better readability.


Sources:
1. https://pytorch.org/tutorials/beginner/transformer_tutorial.html
2. https://github.com/facebookresearch/XLM/blob/master/src/model/transformer.py
3. https://discuss.pytorch.org/t/memory-mask-in-nn-transformer/55230/5
4. https://github.com/tkmaroon/pytorch-xlm/blob/master/models/transformer.py

In [0]:
import torch.nn.functional as F
from torch.nn import TransformerDecoder, TransformerDecoderLayer, \
                     TransformerEncoder, TransformerEncoderLayer


class BaseModel(nn.Module):
    """Transformer encoder with a masked-language-model (MLM) training loss.

    Token tensors are sequence-first: ``(slen, bs)``.

    Args:
        field: object exposing ``vocab.itos`` (index -> token list) and
            ``vocab.stoi`` (token -> index mapping); the mapping must contain
            ``<pad>``, ``<bos>``, ``<eos>``, ``<sep>`` and ``<mask>``.
        d_model: embedding / hidden size.
        nlayers: number of encoder layers.
        nheads: number of attention heads.
        dropout: dropout probability used by the positional encoder and the
            encoder layers.
    """

    def __init__(self, field, d_model=1024, nlayers=6, nheads=8, dropout=0.1):
        super(BaseModel, self).__init__()

        # [4]
        self.voc_size = len(field.vocab.itos)
        self.pad_idx = field.vocab.stoi['<pad>']
        self.bos_idx = field.vocab.stoi['<bos>']
        self.eos_idx = field.vocab.stoi['<eos>']
        self.sep_idx = field.vocab.stoi['<sep>']
        self.mask_idx = field.vocab.stoi['<mask>']

        self.d_model = d_model
        self.dropout = dropout
        self.embedding = nn.Embedding(self.voc_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(
            d_model, nheads, dim_feedforward=4*d_model, dropout=dropout, activation='gelu')
        self.encoder = TransformerEncoder(encoder_layers, nlayers)

        self.init_weights()

    def init_weights(self):
        """Initialise the embedding matrix from N(0, d_model ** -0.5)."""
        nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model ** -0.5) #[2] L46

    def forward(self, src):
        """Encode a ``(slen, bs)`` batch of token ids into ``(slen, bs, d_model)`` states.

        Attention is bidirectional; only padding positions are hidden from it,
        through the encoder's ``src_key_padding_mask`` argument.
        """
        # nn.TransformerEncoder expects the key padding mask as (bs, slen).
        pad_mask = src.eq(self.pad_idx).transpose(0, 1)
        src = self.embedding(src)
        # The positional encoder applies the embedding dropout itself.
        src = self.pos_encoder(src)
        # The encoder is run exactly once.
        return self.encoder(src, src_key_padding_mask=pad_mask)

    def get_mask(self, inputs, pad_mask): #[2]
        """Build a ``(bs, slen)`` mask that is True for the first ``length`` positions.

        ``pad_mask`` must be ``(slen, bs)`` and True at padding positions; the
        per-sequence length is ``slen`` minus its padding count, so the result
        marks the real tokens only when sequences are padded at the end.
        ``forward`` does not use this helper.
        """
        slen, bs = inputs.size()
        lengths = slen-torch.sum(pad_mask, 0)

        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)

        return alen < lengths[:, None]

    # [4]
    def mlm_loss(self, inputs, criterion, sampling_rate=0.15, masked_rate=0.8, replaced_rate=0.1, unchanged_rate=0.1):
        """Compute the masked-language-model loss for a ``(slen, bs)`` batch.

        A subset of the non-special tokens is sampled with probability
        ``sampling_rate``. Of the sampled tokens a ``masked_rate`` share is
        replaced by ``<mask>``, a ``replaced_rate`` share by a random
        non-special token, and the remainder is left unchanged
        (``unchanged_rate`` is kept for interface compatibility and is not read).

        The model is trained to recover the ORIGINAL token at every sampled
        position. Vocabulary scores are obtained by projecting the hidden
        states onto the input embedding matrix (tied weights), so the model
        has no additional parameters.

        Returns:
            The scalar value of ``criterion`` over the sampled positions, or a
            zero that is still attached to the graph when nothing was sampled.
        """
        sampler = self.sampling(inputs, sampling_rate)
        rnd = torch.rand(inputs.size(), device=inputs.device)

        to_mask = sampler & (rnd <= masked_rate)
        to_replace = sampler & (rnd > masked_rate) & (rnd <= masked_rate + replaced_rate)

        # Corrupt a copy; `inputs` stays intact because it provides the targets.
        corrupted = torch.where(to_mask, torch.full_like(inputs, self.mask_idx), inputs)
        corrupted = torch.where(
            to_replace,
            # assumes every special token has an index <= mask_idx
            torch.randint_like(inputs, self.mask_idx+1, self.voc_size),
            corrupted,
        )

        hidden = self.forward(corrupted)
        logits = F.linear(hidden, self.embedding.weight)  # (slen, bs, voc_size)

        if not sampler.any():
            # A mean over zero targets would be NaN; return a differentiable zero.
            return logits.sum() * 0.0

        return criterion(logits[sampler], inputs[sampler])

    # [4]
    def sampling(self, inputs, sampling_rate):
        """Return a boolean ``(slen, bs)`` mask of positions selected for prediction.

        Each position is selected with probability ``sampling_rate``; tokens
        whose index is ``<= mask_idx`` (the special tokens) are never selected.
        The mask is created directly on the device of ``inputs``.
        """
        rnd = torch.rand(inputs.size(), device=inputs.device)
        return (rnd < sampling_rate) & (inputs > self.mask_idx)

### Pretraining

In [0]:
!pip install torchtext==0.5.0
import math
from collections import OrderedDict
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchtext import data, datasets
from tqdm import tqdm


class Trainer:
    """Run single optimisation steps of a model that exposes ``mlm_loss``.

    Args:
        model: module providing ``mlm_loss(inputs, criterion)``, ``parameters()``
            and the ``training`` flag.
        criterion: loss function forwarded to ``model.mlm_loss``.
        optimizer: optimizer updating the model parameters.
        clip: maximum gradient norm used for clipping.
        n_iter: initial value of the update counter.
    """

    def __init__(self, model, criterion, optimizer, clip, n_iter=0):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.clip = clip
        self.n_updates = n_iter

    def get_lr(self):
        """Return the learning rate of the optimizer's first parameter group."""
        return self.optimizer.param_groups[0]['lr']

    def step(self, inputs):
        """Compute the loss for ``inputs`` and, in training mode, update the model.

        In evaluation mode the loss is computed with gradients disabled, so no
        autograd graph is built for a loss that is never back-propagated.

        Returns:
            The loss tensor returned by ``model.mlm_loss``.
        """
        training = self.model.training
        self.optimizer.zero_grad()
        with torch.set_grad_enabled(training):
            loss = self.model.mlm_loss(inputs, self.criterion)

        if training:
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            self.optimizer.step()
            self.n_updates += 1
        return loss
    

def pretrain(data_path, device, min_vocab_freq=1, batch_size=64, sequence_length=256, lr=0.25, clip=1.0, n_epoch=40, save_epoch=2):
    """Pretrain a ``BaseModel`` with the MLM objective on a plain-text corpus.

    Args:
        data_path: path of the text file to train on.
        device: torch device for the batches and the model.
        min_vocab_freq: minimum token frequency for the vocabulary.
        batch_size: number of parallel sequences per batch.
        sequence_length: length (in tokens) of each BPTT chunk.
        lr: learning rate passed to Adam.
        clip: maximum gradient norm.
        n_epoch: number of epochs to run (epochs are numbered from 0).
        save_epoch: a numbered checkpoint is written every ``save_epoch`` epochs.

    Checkpoints are written into the directory named by ``$MODELS``:
    ``checkpoint_best.pt`` (lowest training loss so far),
    ``checkpoint_{epoch}.pt`` and ``checkpoint_last.pt``.
    """
    # NOTE(review): the default lr=0.25 looks very high for Adam - confirm.
    TEXT = data.Field(
        init_token='<bos>',
        eos_token='<eos>',
    )

    train_dataset = datasets.LanguageModelingDataset(data_path, TEXT)

    # build_vocab populates TEXT.vocab in place.
    TEXT.build_vocab(
        train_dataset,
        min_freq=min_vocab_freq,
        specials=['<sep>', '<mask>']
    )
    train_iter = data.BPTTIterator(
        train_dataset,
        batch_size=batch_size,
        bptt_len=sequence_length,  # honour the argument instead of a hard-coded 256
        train=True,
        repeat=False,
        shuffle=True,
        device=device
    )

    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(os.environ['MODELS'], exist_ok=True)

    model = BaseModel(TEXT).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer = Adam(model.parameters(), lr=lr)
    best_loss = math.inf
    trainer = Trainer(model, criterion, optimizer, clip)

    # Exactly n_epoch epochs: 0 .. n_epoch - 1.
    for epoch in range(n_epoch):
        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            model.train()
            for samples in pbar:
                srcs = samples.text.to(device)
                loss = trainer.step(srcs)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f'epoch {str(epoch).zfill(3)}')
                progress_state = OrderedDict(
                    loss=loss.item(),
                    bsz=srcs.size(1),
                    lr=trainer.get_lr(),
                    clip=clip,
                    num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        # Average over the batches; guard against an iterator that yields none.
        train_loss /= max(len(train_iter), 1)

        print(f'| epoch {str(epoch).zfill(3)} | train ', end='')
        print(f'| loss {train_loss:.{4}} ', end='')
        print(f'| lr {trainer.get_lr():.1e} ', end='')
        print(f'| clip {clip} ', end='')
        print(f'| num_updates {trainer.n_updates} |')

        # saving model
        is_best = train_loss < best_loss
        if is_best:
            best_loss = train_loss
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': best_loss,
            'weights': model.state_dict()
        }

        if is_best:
            filename = os.path.join(os.environ['MODELS'], 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % save_epoch == 0:
            filename = os.path.join(os.environ['MODELS'], f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(os.environ['MODELS'], 'checkpoint_last.pt')
        torch.save(save_vars, filename)

In [0]:
# Smoke test: pretrain on the first 64 lines of the cleaned L1 corpus.
l1_cleaned_path = os.path.join(os.environ['DATA'], "%s.cleaned" % os.environ['L1_DATA'])
l1_data_path = os.path.join(os.environ['DATA'], "%s.cut" % os.environ['L1_DATA'])

# Python equivalent of `head -64 <cleaned> > <cut>`; the slice is created only
# when it is missing, so the cell also works on a fresh run.
if not os.path.exists(l1_data_path):
    with open(l1_cleaned_path, encoding='utf-8') as full_corpus, \
         open(l1_data_path, 'w', encoding='utf-8') as cut_corpus:
        for line_no, line in enumerate(full_corpus):
            if line_no >= 64:
                break
            cut_corpus.write(line)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pretrain(l1_data_path, device, n_epoch=1)