In [None]:
import os
# The notebook's working directory is the project root; every other path is derived from it.
os.environ['PROJECT_PATH'] = os.path.abspath(os.path.curdir)

**Mount Google Drive** 
Tokenization will not work due to Perl script usage

In [None]:
from google.colab import drive
# Colab only: attach Google Drive, then use a folder on the mounted volume as the project root.
drive.mount('/content/ydrive/')
os.environ['PROJECT_PATH'] = '/content/ydrive/My Drive/Study/DS_Project'

In [None]:
# Directory layout, exported as environment variables so that both the Python
# cells and the %%bash / ! shell cells can read them.
project_root = os.environ['PROJECT_PATH']
resources_dir = os.path.join(project_root, 'resources')

os.environ.update({
    'TOOLS': os.path.join(project_root, 'tools'),
    'RESOURCES': resources_dir,
    'DATA': os.path.join(resources_dir, 'data'),
    'MODELS': os.path.join(project_root, 'models'),
})

## Download data

### Bashkir language

In [None]:
%%bash
BASHKIR="$DATA/bashkir"
CORPUS_CLONE="$BASHKIR-corpus"

# Create the whole output tree in one call. The previous `mkdir A & mkdir A/raw`
# started the first mkdir in the background, so the second could run before
# its parent directory existed.
mkdir -p "$BASHKIR/raw"

# Bashkir corpus: concatenate every .txt file of the repository into one raw
# text file, then drop the clone.
git clone https://github.com/nevmenandr/bashkir-corpus "$CORPUS_CLONE"
find "$CORPUS_CLONE" -name "*.txt" -print0 | xargs -0 -I file cat file > "$BASHKIR/ba"
rm -rf "$CORPUS_CLONE"

# Bashkir Wikipedia: download the latest articles dump and extract it as JSON
# lines into $BASHKIR/ba_wiki (merged into the corpus by the next cell).
WIKIEXTRACTOR="$TOOLS/wikiextractor"
git clone https://github.com/ptakopysk/wikiextractor "$WIKIEXTRACTOR"
wget http://download.wikimedia.org/bawiki/latest/bawiki-latest-pages-articles.xml.bz2 -P "$BASHKIR"
"$WIKIEXTRACTOR/WikiExtractor.py"  --json -o "$BASHKIR/ba_wiki" "$BASHKIR/bawiki-latest-pages-articles.xml.bz2"
rm "$BASHKIR/bawiki-latest-pages-articles.xml.bz2"

In [None]:
import json

input_folder = os.path.join(os.environ['DATA'], 'bashkir', 'ba_wiki')
output_path = os.path.join(os.environ['DATA'], 'bashkir', 'ba')

output_file = open(output_path, "a+", encoding='utf-8')

for path, subdirs, files in os.walk(input_folder):
    for name in files:
        file = open(os.path.join(path, name), 'r', encoding='utf-8')
        for line in file.readlines():
            dump = json.loads(line)
            output_file.write("%s\n" % dump["text"])
        file.close()

output_file.close()

!rm -rf -d "$DATA/bashkir/ba_wiki"

In [None]:
!pip install razdel
from razdel import sentenize

raw_data_path = os.path.join(os.environ['DATA'], 'bashkir', 'ba')
sentenized_data_path = os.path.join(os.environ['DATA'], 'ba.sentesized')

# Split every corpus line into sentences and write one sentence per line.
# Both files are closed by the context manager: without that the output was
# never flushed explicitly and later cells could read a truncated file.
with open(raw_data_path, 'r', encoding='utf-8') as raw_data, \
        open(sentenized_data_path, 'w+', encoding='utf-8') as sentenized_data:
    for line in raw_data:
        sentences = sentenize(line)
        sentenized_data.writelines("%s\n" % sentence.text for sentence in sentences)

### Russian language

In [None]:
!wget http://data.statmt.org/wmt17/translation-task/news.2016.ru.shuffled.gz -P "$DATA"
!gzip -d "$DATA/news.2016.ru.shuffled.gz"

## Preprocessing

In [None]:
# Language codes and the corresponding monolingual data files (relative to $DATA),
# exported so that the shell cells below can use them.
os.environ.update(
    L1='ba',
    L2='ru',
    L1_DATA="ba.sentesized",
    L2_DATA="news.2016.ru.shuffled",
)

### Text cleaning and tokenization

In [None]:
!pip install -U sacremoses
from sacremoses import MosesPunctNormalizer, MosesTokenizer

def preprocess_file(filepath, language):
    """Normalize punctuation and tokenize a text file line by line.

    Each input line is passed through ``MosesPunctNormalizer`` and then
    ``MosesTokenizer``; the tokens are joined with single spaces and written
    to ``<filepath>.cleaned``, one output line per input line.

    Args:
        filepath: Path of the UTF-8 text file to process.
        language: Language code handed to the Moses normalizer and tokenizer.
    """
    normalizer = MosesPunctNormalizer(language, pre_replace_unicode_punct=True, post_remove_control_chars=True)
    tokenizer = MosesTokenizer(language)

    # Both files are managed by the `with` block: the output used to be left
    # open, so its tail could still sit in the write buffer when the BPE
    # step read the file.
    with open(filepath, 'r', encoding='utf-8') as input_file, \
            open('%s.cleaned' % filepath, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            line = normalizer.normalize(line)
            tokens = tokenizer.tokenize(line)
            output_file.write("{}\n".format(' '.join(tokens)))


# Clean and tokenize both monolingual corpora (L1 first, then L2).
for data_key, language_key in (('L1_DATA', 'L1'), ('L2_DATA', 'L2')):
    corpus_path = os.path.join(os.environ['DATA'], os.environ[data_key])
    preprocess_file(corpus_path, os.environ[language_key])

### Generating BPE codes

In [None]:
%%bash
FASTBPE="$TOOLS/fastBPE"
FAST="$FASTBPE/fast"
N_CODES=40000  # number of BPE merge operations; also used as the output file suffix
git clone https://github.com/glample/fastBPE "$FASTBPE"

# Point the include path at the clone itself: the previous relative
# `-IfastBPE` only resolved when the cell happened to run from $TOOLS.
g++ -std=c++11 -pthread -O3 "$FASTBPE/main.cc" -I"$FASTBPE" -o "$FAST"

# Learn joint BPE codes on both cleaned corpora, then apply them to each corpus.
"$FAST" learnbpe "$N_CODES" "$DATA/${L1_DATA}.cleaned" "$DATA/${L2_DATA}.cleaned" > "$DATA/BPE_codes"
"$FAST" applybpe "$DATA/${L1_DATA}.$N_CODES" "$DATA/${L1_DATA}.cleaned" "$DATA/BPE_codes"
"$FAST" applybpe "$DATA/${L2_DATA}.$N_CODES" "$DATA/${L2_DATA}.cleaned" "$DATA/BPE_codes"

### Vocabulary will be calculated after dataset loading
# "$FAST" getvocab "$DATA/${L1_DATA}.40000" > "$DATA/vocab.${L1_DATA}.40000" 
# "$FAST" getvocab "$DATA/${L2_DATA}.40000" > "$DATA/vocab.${L2_DATA}.40000" 

# TODO: Split the data into train, validation and test sets

### Inferring the N-gram Translation Table

In [None]:
# Working directory for the embeddings and the inferred translation table.
os.environ.update(
    TRANSLATION_TABLE=os.path.join(os.environ['RESOURCES'], 'translation_table')
)

In [None]:
%%bash
# All embedding files live in $TRANSLATION_TABLE. The previous version checked
# for and decompressed the vectors under $RESOURCES while downloading them to
# (and mapping them from) $TRANSLATION_TABLE, so gzip never found its input.
mkdir -p "$TRANSLATION_TABLE"

for LANG_CODE in ba ru; do
    VEC="$TRANSLATION_TABLE/cc.$LANG_CODE.300.vec"
    # Download and unpack only when the decompressed vectors are missing,
    # so the cell can be re-run safely.
    if [ ! -f "$VEC" ]; then
        wget -nc "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.$LANG_CODE.300.vec.gz" -P "$TRANSLATION_TABLE"
        gzip -d "$VEC.gz"
    fi
done

# Map both embedding spaces into a shared space with vecmap (unsupervised mode).
[ -d "$TOOLS/vecmap" ] || git clone https://github.com/artetxem/vecmap.git "$TOOLS/vecmap"
python3 "$TOOLS/vecmap/map_embeddings.py" --unsupervised "$TRANSLATION_TABLE/cc.ba.300.vec" "$TRANSLATION_TABLE/cc.ru.300.vec" "$TRANSLATION_TABLE/ba_mapped.vec" "$TRANSLATION_TABLE/ru_mapped.vec"

In [None]:
!pip install gensim
!pip install sklearn
import time
from datetime import timedelta

import numpy as np
from gensim.models.keyedvectors import KeyedVectors

start_time = time.time()

# File names of the mapped embeddings written by vecmap in the previous cell.
l1_embs, l2_embs = "ba_mapped.vec", "ru_mapped.vec"

l1_path, l2_path = (
    os.path.join(os.environ["TRANSLATION_TABLE"], embs_name)
    for embs_name in (l1_embs, l2_embs)
)

# Load both embedding files (word2vec text format), L1 first.
l1_mapping, l2_mapping = (
    KeyedVectors.load_word2vec_format(embs_path)
    for embs_path in (l1_path, l2_path)
)

vectors_loading_time = time.time()
elapsed = timedelta(seconds=vectors_loading_time - start_time)
print('Loading vectors time: ', elapsed)


In [None]:
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Tunables for the translation-table inference below.
ROWS_IN_SLICE, SAVE_EVERY_N_ROWS, K_NN = 15, 50, 5

# Output file of the slice loop below: one row per L1 vector with K_NN columns.
# Opened in append-binary mode, so rows are added to whatever the file already holds.
translation_table_file_path = os.path.join(os.environ["TRANSLATION_TABLE"], "translation_table")
translation_table_file = open(translation_table_file_path, mode='ab+')


def calculate_denominator_part(source_vectors, target_mapping, filename=None):
    """Sum each source vector's top-K_NN similarities in the target space.

    For every row of ``source_vectors`` the ``K_NN`` most similar entries of
    ``target_mapping`` are looked up with ``similar_by_vector`` and their
    cosine similarities are summed; the sums are divided by ``2 * K_NN``.

    Args:
        source_vectors: 2-D array with one source embedding per row.
        target_mapping: Object exposing ``similar_by_vector(vector, topn=...)``
            that returns ``(key, similarity)`` pairs.
        filename: Optional backup path. When given, the raw (undivided) sums
            are appended to it as text every ``SAVE_EVERY_N_ROWS`` rows, and
            ``<filename>_index`` holds the index of the last row written.

    Returns:
        1-D array with one value per source row.
    """
    n_rows = source_vectors.shape[0]
    result = np.empty(n_rows)
    backup_file = index_file = None
    start_index_to_save = 0

    def flush_backup(last_index):
        # Append the not-yet-saved rows and overwrite (not append to) the
        # marker, so the index file always holds exactly one number.
        np.savetxt(backup_file, result[start_index_to_save:last_index + 1])
        index_file.seek(0)
        index_file.truncate()
        index_file.write(str(last_index))
        index_file.flush()

    if filename:
        backup_file = open(filename, 'ab+')
        index_file = open('%s_index' % filename, 'w+')

    try:
        for i, vector in enumerate(tqdm(source_vectors)):
            similar_vectors = target_mapping.similar_by_vector(vector, topn=K_NN)
            # Second element of each pair is the cosine similarity.
            result[i] = sum(similarity for _key, similarity in similar_vectors)
            if backup_file is not None and i % SAVE_EVERY_N_ROWS == 0:
                flush_backup(i)
                start_index_to_save = i + 1

        # Write the tail in the same text format as the periodic chunks
        # (np.save used to put a binary .npy blob into the text file).
        # Guarding on the row count also covers empty input, where the loop
        # index was never bound.
        if backup_file is not None and start_index_to_save < n_rows:
            flush_backup(n_rows - 1)
    finally:
        if backup_file is not None:
            backup_file.close()
        if index_file is not None:
            index_file.close()

    return result / (2 * K_NN)

start_time = time.time()
l2_denominator_file_path = os.path.join(os.environ["TRANSLATION_TABLE"], "l2_denominator")
if os.path.exists(l2_denominator_file_path):
    # np.loadtxt is the counterpart of the np.savetxt call below
    # (np.loadtext does not exist and raised AttributeError on every cached run).
    l2_denominator = np.loadtxt(l2_denominator_file_path)
    print("L2 denominator has been loaded from ", l2_denominator_file_path)
else:
    l2_denominator = calculate_denominator_part(l2_mapping.vectors, l1_mapping, "%s_backup" % l2_denominator_file_path)
    np.savetxt(l2_denominator_file_path, l2_denominator)
    print("L2 denominator calculation time: ", timedelta(seconds=time.time() - start_time))

start_time = time.time()
n_l1_vectors = l1_mapping.vectors.shape[0]

try:
    # Process the L1 embeddings in small slices so that the
    # (slice rows x L2 vocabulary) similarity matrix stays small.
    for slice_start in tqdm(range(0, n_l1_vectors, ROWS_IN_SLICE)):
        slice_end = min(slice_start + ROWS_IN_SLICE, n_l1_vectors)
        l1_slice = l1_mapping.vectors[slice_start:slice_end]

        cos_sims = cosine_similarity(l1_slice, l2_mapping.vectors)

        l1_denominator = calculate_denominator_part(l1_slice, l2_mapping)

        # Broadcast to (slice rows, L2 vocabulary): per-row L1 term plus per-column L2 term.
        denominator = l1_denominator[:, None] + l2_denominator

        similarities = cos_sims / denominator

        # Row indices (into l2_mapping.vectors) of the K_NN highest-scoring L2
        # entries, in ascending score order.
        most_similar_indices = similarities.argsort(axis=1)[:, -K_NN:]

        # Store the indices themselves. The previous np.take(...) call had no
        # `axis`, so it indexed the *flattened* embedding matrix and wrote
        # unrelated scalar components instead of neighbours.
        # NOTE(review): assumes the table is meant to hold L2 row indices,
        # which matches the documented (#l1_vectors, K_NN) shape - confirm.
        np.savetxt(translation_table_file, most_similar_indices, fmt='%d')
finally:
    # Close the output even if a slice fails, so completed rows reach disk.
    translation_table_file.close()
print("Translation table inferring time: ", timedelta(seconds=time.time() - start_time))

## Model implementation

### Tools

In [None]:
import torch
from torch import nn

#src https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence and apply dropout.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        # Imported locally: the notebook only imports `math` in a later cell,
        # so this class must not depend on cell execution order.
        import math

        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Even feature columns carry the sine, odd columns the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so trim div_term to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch dimension.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions, after dropout.

        ``x`` is indexed with the sequence position in dimension 0.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

### BaseModel



Hyperparameters are temporarily hard-coded for better readability.


Sources:
1. https://pytorch.org/tutorials/beginner/transformer_tutorial.html
2. https://github.com/facebookresearch/XLM/blob/master/src/model/transformer.py
3. https://discuss.pytorch.org/t/memory-mask-in-nn-transformer/55230/5
4. https://github.com/tkmaroon/pytorch-xlm/blob/master/models/transformer.py

In [None]:
import torch.nn.functional as F
from torch.nn import TransformerDecoder, TransformerDecoderLayer, \
                     TransformerEncoder, TransformerEncoderLayer


class BaseModel(nn.Module):
    """Transformer encoder trained with a masked-language-model (MLM) objective.

    Inputs are token-id tensors of shape ``(slen, bs)`` (sequence first).

    Args:
        field: Object whose ``vocab`` exposes ``itos`` (index -> token) and
            ``stoi`` (token -> index), including the special tokens
            ``<pad>``, ``<bos>``, ``<eos>``, ``<sep>`` and ``<mask>``.
        d_model: Embedding / hidden size.
        nlayers: Number of encoder layers.
        nheads: Number of attention heads.
        dropout: Dropout probability applied to the embedded inputs.
    """

    def __init__(self, field, d_model=1024, nlayers=6, nheads=8, dropout=0.1):
        super(BaseModel, self).__init__()

        # [4]
        self.voc_size = len(field.vocab.itos)
        self.pad_idx = field.vocab.stoi['<pad>']
        self.bos_idx = field.vocab.stoi['<bos>']
        self.eos_idx = field.vocab.stoi['<eos>']
        self.sep_idx = field.vocab.stoi['<sep>']
        self.mask_idx = field.vocab.stoi['<mask>']

        self.d_model = d_model
        self.dropout = dropout
        self.embedding = nn.Embedding(self.voc_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = TransformerEncoderLayer(d_model, nheads, dim_feedforward=4*d_model, activation='gelu')
        self.encoder = TransformerEncoder(encoder_layers, nlayers)

        self.init_weights()

    def init_weights(self):
        """Initialise the embedding matrix from N(0, d_model ** -0.5)."""
        nn.init.normal_(self.embedding.weight, mean=0, std=self.d_model ** -0.5) #[2] L46

    def forward(self, src):
        """Encode ``src`` of shape ``(slen, bs)``; returns ``(slen, bs, d_model)`` states."""
        # (bs, slen), True at padding positions - the layout the encoder
        # expects for `src_key_padding_mask`.
        pad_mask = torch.transpose(src.eq(self.pad_idx), 0, 1)
        src = self.embedding(src)
        src = self.pos_encoder(src)
        src = F.dropout(src, self.dropout, training=self.training)

        # Single encoder pass. Previously the encoder ran twice (once without
        # any mask), and a mask built from per-position padding counts was
        # passed as the attention mask, where True means "may not attend".
        # The MLM encoder is bidirectional, so only padding is masked.
        output = self.encoder(src, src_key_padding_mask=pad_mask)
        return output

    def get_mask(self, inputs, pad_mask): #[2]
        """Return a ``(bs, slen)`` boolean mask that is True at non-padding positions.

        ``inputs`` has shape ``(slen, bs)`` and ``pad_mask`` shape ``(bs, slen)``
        with True at padding. Padding is assumed to trail the real tokens.
        """
        slen, bs = inputs.size()
        # Sum over the sequence axis (dim 1) for one length per batch element;
        # summing over dim 0 counted paddings per position instead.
        lengths = slen - torch.sum(pad_mask, 1)

        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)

        return alen < lengths[:, None]

    # [4]
    def mlm_loss(self, inputs, criterion, sampling_rate=0.15, masked_rate=0.8, replaced_rate=0.1, unchanged_rate=0.1):
        """Corrupt ``inputs`` for MLM and return the prediction loss on the sampled tokens.

        A fraction ``sampling_rate`` of the non-special tokens is sampled. Of
        those, ``masked_rate`` become ``<mask>``, ``replaced_rate`` become a
        random non-special token, and the rest stay unchanged.
        ``unchanged_rate`` is kept for interface compatibility; the unchanged
        share is implied by the other two rates.

        Args:
            inputs: Token ids of shape ``(slen, bs)``.
            criterion: Callable ``criterion(logits, targets)`` returning a scalar loss.

        Returns:
            Scalar loss tensor (zero if no token happened to be sampled).
        """
        slen, bs = inputs.size()
        # Keep the uncorrupted tokens: they are the prediction targets.
        targets = inputs

        sampler = self.sampling(inputs, sampling_rate)
        rnd = torch.rand((slen, bs), device=inputs.device)

        # replace mask tokens
        to_mask = (masked_rate >= rnd) & sampler
        corrupted = torch.where(
            to_mask,
            torch.full_like(inputs, self.mask_idx),
            inputs,
        )

        # replace random tokens
        th = masked_rate + replaced_rate
        to_replace = (th >= rnd) & (rnd > masked_rate) & sampler
        corrupted = torch.where(
            to_replace,
            torch.randint_like(inputs, self.mask_idx+1, self.voc_size),
            corrupted,
        )

        hidden = self.forward(corrupted).reshape(slen*bs, -1)
        selected = sampler.reshape(-1)

        # Project the hidden states onto the vocabulary by reusing the
        # embedding matrix (weight tying), so no new parameters are added.
        # The encoder states used to be scored directly against token ids,
        # i.e. over d_model "classes" instead of the vocabulary.
        logits = F.linear(hidden[selected], self.embedding.weight)
        if logits.size(0) == 0:
            # Nothing sampled in this batch: return a zero that is still part
            # of the graph instead of the NaN a mean over no targets gives.
            return logits.sum()

        # Score only the sampled positions, against the original tokens
        # (previously the corrupted inputs were used as targets).
        loss = criterion(logits, targets.reshape(-1)[selected])
        return loss

    # [4]
    def sampling(self, inputs, sampling_rate):
        """Return a boolean mask, shaped like ``inputs``, of positions selected for corruption."""
        # Draw on the inputs' device: a CPU mask indexed with a mask on the
        # inputs' device fails when the model runs on GPU.
        rnd = torch.rand(inputs.size(), device=inputs.device)
        mask = rnd.le(sampling_rate)
        # special tokens are not sampled (their ids are all <= mask_idx)
        mask = mask & (inputs > self.mask_idx)
        return mask

### Pretraining

In [None]:
!pip install torchtext==0.5.0
import math
from collections import OrderedDict
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchtext import data, datasets
from tqdm import tqdm


class Trainer:
    """Bundle a model, loss criterion and optimizer and run single MLM update steps.

    ``n_updates`` counts the optimizer steps taken so far, starting at ``n_iter``.
    """

    def __init__(self, model, criterion, optimizer, clip, n_iter=0):
        self.model, self.criterion = model, criterion
        self.optimizer, self.clip = optimizer, clip
        self.n_updates = n_iter

    def get_lr(self):
        """Return the learning rate of the optimizer's first parameter group."""
        first_group = self.optimizer.param_groups[0]
        return first_group['lr']

    def step(self, inputs):
        """Compute the MLM loss for ``inputs``; update the weights only in training mode.

        Returns the loss tensor in both modes.
        """
        self.optimizer.zero_grad()
        loss = self.model.mlm_loss(inputs, self.criterion)

        # Evaluation mode: report the loss without touching the weights.
        if not self.model.training:
            return loss

        loss.backward()
        # Clip the gradient norm to `clip` before the optimizer step.
        nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
        self.optimizer.step()
        self.n_updates += 1
        return loss
    

def pretrain(data_path, device, min_vocab_freq=1, batch_size=64, sequence_length=256, lr=0.25, clip=1.0, n_epoch=40, save_epoch=2):
    """Pretrain a ``BaseModel`` with the MLM objective on a tokenized text file.

    Checkpoints are written to ``$MODELS``: ``checkpoint_last.pt`` after every
    epoch, ``checkpoint_best.pt`` whenever the mean training loss improves,
    and ``checkpoint_<epoch>.pt`` every ``save_epoch`` epochs.

    Args:
        data_path: Path of the training text file.
        device: Device used for the batches and the model.
        min_vocab_freq: Minimum token frequency for inclusion in the vocabulary.
        batch_size: Number of sequences per batch.
        sequence_length: Length of each training sequence (BPTT length).
        lr: Adam learning rate.
            NOTE(review): 0.25 looks unusually large for Adam - confirm.
        clip: Maximum gradient norm.
        n_epoch: Number of training epochs.
        save_epoch: Interval, in epochs, between numbered checkpoints.
    """

    TEXT = data.Field(
        init_token='<bos>', 
        eos_token='<eos>',
    )

    train_dataset = datasets.LanguageModelingDataset(data_path, TEXT)

    # The vocabulary is read back through TEXT.vocab below, so the return
    # value of build_vocab is not needed.
    TEXT.build_vocab(
        train_dataset, 
        min_freq=min_vocab_freq, 
        specials=['<sep>', '<mask>']
    )
    train_iter = data.BPTTIterator(
        train_dataset, 
        batch_size=batch_size, 
        # Honour the `sequence_length` argument (256 used to be hard-coded here).
        bptt_len=sequence_length,
        train=True, 
        repeat=False, 
        shuffle=True,
        device=device
    )

    # makedirs also creates missing parent directories and tolerates an existing one.
    os.makedirs(os.environ['MODELS'], exist_ok=True)
    
    model = BaseModel(TEXT).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer = Adam(model.parameters(), lr=lr)
    best_loss = math.inf
    epoch=0
    trainer = Trainer(model, criterion, optimizer, clip)

    # Epochs are numbered from 0, so `<` (not `<=`) runs exactly n_epoch of them.
    while epoch < n_epoch:
        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            model.train()
            for samples in pbar:
                srcs = samples.text.to(device)
                loss = trainer.step(srcs)
                train_loss += loss.item()

                # setting of progressbar
                pbar.set_description(f'epoch {str(epoch).zfill(3)}')
                progress_state = OrderedDict(
                    loss=loss.item(),
                    bsz=srcs.size(1),
                    lr=trainer.get_lr(), 
                    clip=clip, 
                    num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        # Mean loss per batch; guard against an empty iterator.
        train_loss /= max(len(train_iter), 1)

        print(f'| epoch {str(epoch).zfill(3)} | train ', end='') 
        print(f'| loss {train_loss:.{4}} ', end='')
        print(f'| lr {trainer.get_lr():.1e} ', end='')
        print(f'| clip {clip} ', end='')
        print(f'| num_updates {trainer.n_updates} |')
        
        # saving model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': train_loss if train_loss < best_loss else best_loss,
            'weights': model.state_dict()
        }

        if train_loss < best_loss:
            best_loss = train_loss
            filename = os.path.join(os.environ['MODELS'], 'checkpoint_best.pt') 
            torch.save(save_vars, filename)
        if epoch % save_epoch == 0:
            filename = os.path.join(os.environ['MODELS'], f'checkpoint_{epoch}.pt') 
            torch.save(save_vars, filename)
        filename = os.path.join(os.environ['MODELS'], 'checkpoint_last.pt') 
        torch.save(save_vars, filename)

        epoch += 1