# Language modeling with a Transformer on Hermann Hesse bibliography data (in Russian)

### Based on the course ["Нейронные сети и обработка текста"](https://stepik.org/course/54098/) ("Neural Networks and Text Processing")

In [None]:
pip install pytorch-nlp

In [None]:
pip install youtokentome

## Required libraries, functions and classes:

In [None]:
from google.colab import drive
from google.colab import files

import sys

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F
from torchnlp.word_to_vector import BPEmb

import youtokentome as yttm

import random

import heapq

from tqdm.notebook import tqdm

import datetime

from traceback import format_exc

from copy import deepcopy

In [None]:
def init_random_seed(value=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for deterministic behaviour.

    Args:
        value: Integer seed shared by all generators.
    """
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(value)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False


init_random_seed()

In [None]:
def copy_data_to_device(data, device):
    """Recursively move a tensor, or a list/tuple of tensors, to ``device``.

    Lists and tuples (possibly nested) are rebuilt as plain lists.

    Raises:
        ValueError: If ``data`` or a nested element is neither a tensor
            nor a list/tuple.
    """
    if torch.is_tensor(data):
        return data.to(device)
    if not isinstance(data, (list, tuple)):
        raise ValueError('Invalid data type {}'.format(type(data)))
    moved = []
    for item in data:
        moved.append(copy_data_to_device(item, device))
    return moved

In [None]:
def get_params_number(model):
    """Return the total number of scalar elements over ``model.parameters()``."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

In [None]:
def divisors(n):
    """Return all positive divisors of ``n`` in ascending order.

    Candidates are tested only up to the square root of ``n``; every hit ``i``
    also contributes its co-divisor ``n // i``. The bound is checked with
    integer arithmetic (``i * i <= n``), so the result stays exact for
    arbitrarily large integers.

    Args:
        n: Number to factor. Values below 1 yield an empty list.

    Returns:
        Sorted list of divisors of ``n``.
    """
    found = []
    i = 1
    while i * i <= n:
        if n % i == 0:
            paired = n // i
            if paired == i:
                # Perfect square: count the root only once.
                found.append(i)
            else:
                found.extend([i, paired])
        i += 1
    return sorted(found)

In [None]:
def split_into_chunks(filename, chunk_size=200, encoding='utf-8'):
    """Read a text file and cut it into overlapping fixed-size character chunks.

    Consecutive chunks start ``chunk_size // 2`` characters apart (at least 1),
    so neighbouring chunks overlap by roughly half. The trailing chunks may be
    shorter than ``chunk_size``.

    Args:
        filename: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk; must be positive.
        encoding: Text encoding of the file. Given explicitly so that the
            result does not depend on the platform's default locale.

    Returns:
        List of string chunks in file order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be positive, got {}'.format(chunk_size))
    with open(filename, encoding=encoding) as f:
        full_text = f.read()
    # chunk_size == 1 would otherwise produce a zero step, which range() rejects.
    step = max(1, chunk_size // 2)
    return [full_text[start:start + chunk_size] for start in range(0, len(full_text), step)]

In [None]:
def ensure_length(txt, out_len, pad_value):
    """Pad or truncate a sequence to exactly ``out_len`` items.

    A sequence that is too short is returned as a new list right-padded with
    ``pad_value``; otherwise the first ``out_len`` items are returned as a
    slice of the input (keeping the input's type).
    """
    missing = out_len - len(txt)
    if missing > 0:
        return list(txt) + [pad_value] * missing
    return txt[:out_len]

class LanguageModelDataset(Dataset):
    """Dataset of (input, shifted-by-one target) windows for language modeling.

    Each item is produced from one tokenized text in ``sample``: a random
    window of up to ``chunk_length + 1`` tokens is taken, its first part is the
    model input and the same window shifted by one token is the target. Both
    parts are padded/truncated to ``chunk_length`` with ``pad_value``.
    """

    def __init__(self, sample, chunk_length=100, pad_value=0):
        self.sample = sample
        self.chunk_length = chunk_length
        self.pad_value = pad_value

    def __len__(self):
        return len(self.sample)

    def __getitem__(self, item):
        tokens = self.sample[item]

        # Latest offset that still leaves room for a full window (0 for short texts).
        last_start = max(0, len(tokens) - self.chunk_length - 1)
        offset = random.randint(0, last_start)
        window = tokens[offset : offset + self.chunk_length + 1]

        # Input drops the last token, target drops the first one.
        seed_part, target_part = (
            np.array(ensure_length(part, self.chunk_length, self.pad_value))
            for part in (window[:-1], window[1:])
        )
        return seed_part, target_part

In [None]:
def make_target_dependency_mask(length):
    """Build an additive causal attention mask of shape ``length x length``.

    Entries on and below the main diagonal are 0; entries above it are
    ``-inf``, so position ``i`` cannot attend to positions after ``i``.
    """
    above_diagonal = torch.ones(length, length, dtype=torch.bool).triu(diagonal=1)
    return torch.zeros(length, length).masked_fill(above_diagonal, float('-inf'))

In [None]:
def make_positional_encoding(max_length, embedding_size):
    """Build a sinusoidal positional encoding table.

    Position ``t`` is mapped to ``sin(pi * t / k)`` in the even columns and
    ``cos(pi * t / k)`` in the odd columns, for frequency dividers
    ``k = 1, 2, ...``. Odd embedding sizes are supported: the last (even)
    column then holds a sine without a matching cosine.

    Args:
        max_length: Number of positions (rows).
        embedding_size: Size of each position vector (columns).

    Returns:
        Float tensor of shape ``max_length x embedding_size``.
    """
    time = np.pi * torch.arange(0, max_length).float()
    # Even columns outnumber odd columns by one when embedding_size is odd.
    sin_columns = (embedding_size + 1) // 2
    cos_columns = embedding_size // 2
    freq_dividers = torch.arange(1, sin_columns + 1).float()
    inputs = time[:, None] / freq_dividers[None, :]
    result = torch.zeros(max_length, embedding_size)
    result[:, 0::2] = torch.sin(inputs)
    result[:, 1::2] = torch.cos(inputs[:, :cos_columns])
    return result

In [None]:
class LanguageModel(nn.Module):
    """Generic language model: embeddings -> ``backbone`` -> linear vocabulary head.

    ``backbone`` is the network architecture in use; it is called with the
    embedded batch plus ``mask`` and ``src_key_padding_mask`` keyword
    arguments. Token id 0 is treated as padding throughout.
    """

    def __init__(self, vocab_size, emb_size, backbone, emb_weights=None, freeze=True, emb_dropout=0.0):
        super().__init__()
        if emb_weights is None:
            self.embeddings = nn.Embedding(vocab_size, emb_size, padding_idx=0)
        else:
            # Start from pretrained vectors; `freeze` controls whether they are trained.
            self.embeddings = nn.Embedding.from_pretrained(emb_weights, freeze=freeze, padding_idx=0)
        self.embedding_size = emb_size
        self.emb_dropout = nn.Dropout(emb_dropout)
        self.backbone = backbone
        self.out = nn.Linear(emb_size, vocab_size)

    def forward(self, seed_tokenized_sample):
        """Map a ``BatchSize x MaxInLen`` batch of token ids to per-position logits."""
        _, seq_len = seed_tokenized_sample.shape
        device = seed_tokenized_sample.device

        padding_positions = seed_tokenized_sample == 0
        causal_mask = make_target_dependency_mask(seq_len).to(device)

        token_embs = self.embeddings(seed_tokenized_sample)  # BatchSize x MaxInLen x EmbSize
        positions = make_positional_encoding(seq_len, self.embedding_size)
        positions = positions.unsqueeze(0).to(token_embs.device)  # 1 x MaxInLen x EmbSize
        hidden = self.emb_dropout(token_embs + positions)

        # BatchSize x TargetLen x EmbSize
        features = self.backbone(hidden,
                                 mask=causal_mask,
                                 src_key_padding_mask=padding_positions)
        return self.out(features)  # BatchSize x TargetLen x VocabSize

In [None]:
def lm_cross_entropy(pred, target):
    """Token-level cross-entropy for language modeling, ignoring padding.

    Args:
        pred: Logits, BatchSize x TargetLen x VocabSize.
        target: Target token ids, BatchSize x TargetLen.

    Returns:
        Scalar loss averaged over all positions whose target id is not 0.
    """
    # reshape (unlike view) also accepts non-contiguous inputs, e.g. transposed logits.
    pred_flat = pred.reshape(-1, pred.shape[-1])  # BatchSize*TargetLen x VocabSize
    target_flat = target.reshape(-1)  # BatchSize*TargetLen
    return F.cross_entropy(pred_flat, target_flat, ignore_index=0)

In [None]:
class BatchFirstTransformerEncoder(nn.Module):
    """``nn.TransformerEncoder`` wrapper that takes and returns batch-first tensors.

    All constructor arguments are forwarded to ``nn.TransformerEncoder``; its
    matrix-shaped parameters are then re-initialized with Xavier uniform.
    """

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.impl = nn.TransformerEncoder(*args, **kwargs)
        self.initialize_weights()

    def forward(self, src, *args, **kwargs):
        # BatchSize x MaxInLen x EmbSize -> MaxInLen x BatchSize x EmbSize
        seq_first = src.transpose(0, 1).contiguous()
        encoded = self.impl(seq_first, *args, **kwargs)  # TargetLen x BatchSize x EmbSize
        # Back to BatchSize x TargetLen x EmbSize
        return encoded.transpose(0, 1).contiguous()

    def initialize_weights(self):
        # Biases and other 1-D parameters keep their default initialization.
        matrices = (p for p in self.impl.parameters() if p.dim() > 1)
        for weight in matrices:
            nn.init.xavier_uniform_(weight)

In [None]:
def train_eval_loop(model, train_dataset, val_dataset, criterion, lr=1e-3, epoch_n=100, batch_size_train=32,
                    batch_size_val=32, device=None, early_stopping_patience=10, l2_reg_alpha=0, data_loader_ctor=DataLoader,
                    optimizer_ctor=None, lr_scheduler_ctor=None, dataloader_workers_n=0, draw_loss=False, show_lr=False):
    """Train ``model`` and keep the snapshot with the lowest validation loss.

    Each epoch runs one pass over ``train_dataset`` (with optimizer steps) and
    one pass over ``val_dataset`` (without gradients). Training stops after
    ``epoch_n`` epochs, when the validation loss has not improved for more than
    ``early_stopping_patience`` epochs, on ``KeyboardInterrupt``, or on any
    other exception (which is printed, not re-raised).

    Args:
        model: Module to train; it is moved to ``device`` in place.
        train_dataset: Dataset yielding ``(x, y)`` pairs for training.
        val_dataset: Dataset yielding ``(x, y)`` pairs for validation.
        criterion: Callable ``criterion(pred, y)`` returning a scalar loss.
        lr: Learning rate passed to the optimizer.
        epoch_n: Maximum number of epochs.
        batch_size_train: Training batch size; must divide ``len(train_dataset)``.
        batch_size_val: Validation batch size; must divide ``len(val_dataset)``.
        device: Device name; defaults to "cuda" when available, else "cpu".
        early_stopping_patience: Number of epochs without improvement tolerated.
        l2_reg_alpha: Weight decay; only used by the default Adam optimizer.
        data_loader_ctor: Factory used to build both data loaders.
        optimizer_ctor: Optional factory ``(params, lr=...) -> optimizer``.
        lr_scheduler_ctor: Optional factory ``(optimizer) -> scheduler``.
        dataloader_workers_n: ``num_workers`` for both data loaders.
        draw_loss: If true, plot the per-batch training loss after each epoch.
        show_lr: If true, print the learning rate after each ``StepLR`` step.

    Returns:
        Tuple ``(best_val_loss, best_model)``, where ``best_model`` is a deep
        copy of the model taken at the best validation epoch (or of the initial
        model if no epoch completed).
    """
    assert len(train_dataset) % batch_size_train == 0, "len of train_dataset must be divisible by train_batch_size"
    assert len(val_dataset) % batch_size_val == 0, "len of val_dataset must be divisible by val_batch_size"

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)
    model.to(device)

    if optimizer_ctor is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2_reg_alpha)
    else:
        # A custom optimizer receives only the learning rate; l2_reg_alpha is not forwarded.
        optimizer = optimizer_ctor(model.parameters(), lr=lr)

    if lr_scheduler_ctor is not None:
        lr_scheduler = lr_scheduler_ctor(optimizer)
    else:
        lr_scheduler = None

    train_dataloader = data_loader_ctor(train_dataset, batch_size=batch_size_train, num_workers=dataloader_workers_n)
    val_dataloader = data_loader_ctor(val_dataset, batch_size=batch_size_val, num_workers=dataloader_workers_n)

    best_val_loss = float("inf")
    best_epoch_i = 0
    best_model = deepcopy(model)

    for epoch_i in range(epoch_n):
        try:
            epoch_start = datetime.datetime.now()
            print(f"Epoch {epoch_i}")

            model.train()
            mean_train_loss = 0
            train_batches_n = 0
            history = []

            for batch_i, (batch_x, batch_y) in enumerate(train_dataloader):
                batch_x = copy_data_to_device(batch_x, device)
                batch_y = copy_data_to_device(batch_y, device)

                pred = model(batch_x)
                loss = criterion(pred, batch_y)

                model.zero_grad()
                loss.backward()

                optimizer.step()

                # Detach to a plain float: keeping the loss tensor would retain its
                # autograd graph for the whole epoch and could not be plotted.
                loss_value = float(loss)
                mean_train_loss += loss_value
                train_batches_n += 1

                if draw_loss:
                    history.append(loss_value)

            mean_train_loss /= train_batches_n
            print('{} iterations, {:0.2f} sec'.format(train_batches_n,
                                                           (datetime.datetime.now() - epoch_start).total_seconds()))
            print('Average value of the train loss function:', mean_train_loss)

            if draw_loss:
                plt.plot(history, label="loss")
                plt.legend()
                plt.show()

            model.eval()
            mean_val_loss = 0
            val_batches_n = 0

            with torch.no_grad():
                for batch_i, (batch_x, batch_y) in enumerate(val_dataloader):

                    batch_x = copy_data_to_device(batch_x, device)
                    batch_y = copy_data_to_device(batch_y, device)

                    pred = model(batch_x)
                    loss = criterion(pred, batch_y)

                    mean_val_loss += float(loss)
                    val_batches_n += 1

            mean_val_loss /= val_batches_n
            print('Average value of the validation loss function:', mean_val_loss)

            if mean_val_loss < best_val_loss:
                best_epoch_i = epoch_i
                best_val_loss = mean_val_loss
                # Snapshot the weights; later epochs keep mutating `model`.
                best_model = deepcopy(model)
                print('New best model!')
            elif epoch_i - best_epoch_i > early_stopping_patience:
                print('The model has not improved over the last {} epochs, stop training'.format(
                    early_stopping_patience))
                break

            if lr_scheduler is not None:
                if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    # This scheduler needs the monitored metric.
                    lr_scheduler.step(mean_val_loss)
                elif isinstance(lr_scheduler, torch.optim.lr_scheduler.StepLR):
                    lr_scheduler.step()
                    if show_lr:
                        print(optimizer.param_groups[0]['lr'])
                else:
                    lr_scheduler.step()

            print()
        except KeyboardInterrupt:
            print('Stopped early by user')
            break
        except Exception as ex:
            # Best effort: report the failure and still return the best model so far.
            print('Error while training: {}\n{}'.format(ex, format_exc()))
            break

    return best_val_loss, best_model

In [None]:
class GreedyGenerator:
    """Text generator that always appends the most probable next token.

    Args:
        model: Language model mapping a ``1 x Len`` batch of token ids to
            ``1 x Len x VocabSize`` logits; it is moved to ``device``.
        tokenizer: Object with ``encode(list_of_texts)`` and
            ``decode(list_of_id_lists)`` methods.
        device: Device on which generation runs.
        eos_token_id: Token id that terminates generation.
    """

    def __init__(self, model, tokenizer, device='cuda', eos_token_id=3):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device(device)
        self.model.to(self.device)
        self.eos_token_id = eos_token_id

    def __call__(self, seed_text, max_steps_n=40):
        """Continue ``seed_text`` by at most ``max_steps_n`` tokens and return the text."""
        seed_tokens = list(self.tokenizer.encode([seed_text])[0])

        # Inference only: disable dropout and skip building autograd graphs.
        self.model.eval()
        with torch.no_grad():
            for _ in range(max_steps_n):
                in_batch = torch.tensor(seed_tokens).unsqueeze(0).to(self.device)
                # Convert to a plain int so the token list stays homogeneous for
                # torch.tensor() and tokenizer.decode().
                best_next_token = int(self.model(in_batch)[0, -1].argmax())
                if best_next_token == self.eos_token_id:
                    break

                seed_tokens.append(best_next_token)

        return self.tokenizer.decode([seed_tokens])[0]

In [None]:
class BeamGenerator:
    """Text generator based on best-first beam search.

    Hypotheses are scored by their negative log-probability divided by the
    square root of their length (lower is better).

    Args:
        model: Language model mapping a ``1 x Len`` batch of token ids to
            ``1 x Len x VocabSize`` logits; it is moved to ``device``.
        tokenizer: Object with ``encode(list_of_texts)`` and
            ``decode(list_of_id_lists)`` methods.
        device: Device on which generation runs.
        eos_token_id: Token id that finishes a hypothesis.
    """

    def __init__(self, model, tokenizer, device='cuda', eos_token_id=3):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device(device)
        self.model.to(self.device)
        self.eos_token_id = eos_token_id

    def __call__(self, seed_text, max_steps_n=40, return_hypotheses_n=5, beamsize=5):
        """Generate continuations of ``seed_text``.

        Args:
            seed_text: Text to continue.
            max_steps_n: Maximum number of generated tokens per hypothesis.
            return_hypotheses_n: Number of best hypotheses to return.
            beamsize: Number of continuations tried per step and maximum number
                of unfinished hypotheses kept; capped at the vocabulary size.

        Returns:
            List of ``(score, text)`` pairs sorted by ascending score.
        """
        seed_tokens = list(self.tokenizer.encode([seed_text])[0])
        initial_length = len(seed_tokens)

        # Min-heap of (score, tokens): the most promising hypothesis is expanded first.
        partial_hypotheses = [(0, seed_tokens)]
        final_hypotheses = []

        # Inference only: disable dropout and skip building autograd graphs.
        self.model.eval()
        with torch.no_grad():
            while len(partial_hypotheses) > 0:
                cur_partial_score, cur_partial_hypothesis = heapq.heappop(partial_hypotheses)

                in_batch = torch.tensor(cur_partial_hypothesis).unsqueeze(0).to(self.device)
                next_tokens_logits = self.model(in_batch)[0, -1]
                next_tokens_logproba = F.log_softmax(next_tokens_logits, dim=0)
                # topk raises if asked for more items than the vocabulary holds.
                continuations_n = min(beamsize, next_tokens_logproba.shape[0])
                topk_continuations = next_tokens_logproba.topk(continuations_n)

                # Undo the length normalization before adding the next token's score.
                old_denorm_score = cur_partial_score * np.sqrt(len(cur_partial_hypothesis))

                for token_score, token_idx in zip(topk_continuations.values, topk_continuations.indices):
                    token_score = float(token_score)
                    token_idx = int(token_idx)

                    new_score = (old_denorm_score - token_score) / np.sqrt(len(cur_partial_hypothesis) + 1)

                    new_hypothesis = cur_partial_hypothesis + [token_idx]
                    new_item = (new_score, new_hypothesis)

                    if token_idx == self.eos_token_id or len(new_hypothesis) - initial_length >= max_steps_n:
                        final_hypotheses.append(new_item)
                    else:
                        heapq.heappush(partial_hypotheses, new_item)

                # Prune the frontier back to the beam size.
                if len(partial_hypotheses) > beamsize:
                    partial_hypotheses = heapq.nsmallest(beamsize, partial_hypotheses)
                    heapq.heapify(partial_hypotheses)

        final_scores, final_token_lists = zip(*final_hypotheses)
        final_texts = self.tokenizer.decode(list(final_token_lists))

        result = list(zip(final_scores, final_texts))
        result.sort()
        result = result[:return_hypotheses_n]

        return result

In [None]:
class ProbGenerator:
    """Text generator that samples each next token from the model's distribution.

    Args:
        model: Language model mapping a ``1 x Len`` batch of token ids to
            ``1 x Len x VocabSize`` logits; it is moved to ``device``.
        tokenizer: Object with ``encode(list_of_texts)`` and
            ``decode(list_of_id_lists, ignore_ids=...)`` methods.
        device: Device on which generation runs.
        eos_token_id: Token id that terminates generation.
        max_steps_n: Maximum number of generated tokens.
        temperature: Divisor applied to the logits before softmax; values
            below 1 sharpen the distribution, values above 1 flatten it.
    """

    def __init__(self, model, tokenizer, device='cuda', eos_token_id=3, max_steps_n=40, temperature=1.0):
        self.model = model
        self.tokenizer = tokenizer
        self.device = torch.device(device)
        self.model.to(self.device)
        self.eos_token_id = eos_token_id
        self.max_steps_n = max_steps_n
        self.temperature = temperature

    def __call__(self, seed_text):
        """Continue ``seed_text`` by sampling and return the decoded text."""
        seed_tokens = list(self.tokenizer.encode([seed_text])[0])

        # Inference only: disable dropout so sampling reflects the trained model.
        self.model.eval()
        with torch.no_grad():
            for _ in range(self.max_steps_n):
                in_batch = torch.tensor(seed_tokens).unsqueeze(0).to(self.device)
                logits_next = self.model(in_batch)[0, -1]
                p_next = F.softmax(logits_next / self.temperature, dim=-1).cpu().numpy()
                # float32 probabilities can miss numpy's sum-to-one tolerance; renormalize in float64.
                p_next = p_next.astype(np.float64)
                p_next /= p_next.sum()
                # Sample over the model's own output size instead of a global tokenizer.
                next_token = int(np.random.choice(len(p_next), p=p_next))
                if next_token == self.eos_token_id:
                    break
                seed_tokens.append(next_token)

        return ''.join(self.tokenizer.decode([seed_tokens], ignore_ids=[0,2,3]))

## Loading the dataset and splitting it into training and validation samples:

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset path below lives under this mount point.
drive.mount('/content/gdrive')

In [None]:
# Location of the raw text corpus on the mounted Google Drive.
dataset_filename = "/content/gdrive/My Drive/ML/datasets/Hermann_Hesse_bibliography_ru.txt"
# Cut the corpus into 500-character chunks; consecutive chunks start 250 characters apart.
all_chunks = split_into_chunks(dataset_filename, chunk_size=500)
# Display the number of chunks as the cell output.
len(all_chunks)

In [None]:
# NOTE(review): neighbouring chunks overlap by half (see split_into_chunks), so after
# shuffling, validation chunks share text with training chunks; consider splitting the
# text before chunking if an uncontaminated validation loss is needed.
np.random.shuffle(all_chunks)

# 70% of the shuffled chunks are used for training, the rest for validation.
TRAIN_SPLIT = int(len(all_chunks) * 0.7)
train_sample = all_chunks[:TRAIN_SPLIT]
val_sample = all_chunks[TRAIN_SPLIT:]

print("Training sample size:", len(train_sample))
print("Validation sample size:", len(val_sample))

# Save train sample in file for further BPE training:
TRAIN_SAMPLE_FILENAME = "/tmp/train_sample.txt"

# Write explicitly as UTF-8 so the Cyrillic text does not depend on the platform default encoding.
with open(TRAIN_SAMPLE_FILENAME, 'w', encoding='utf-8') as f:
    f.write('\n'.join(train_sample))


## BPE tokenization using the [youtokentome library](https://pypi.org/project/youtokentome/):

In [None]:
# Target vocabulary size of the BPE model.
NUM_TOKENS_BPE = 1000
# Where the trained BPE model is stored.
BPE_MODEL_FILENAME = "/tmp/bpe_model.yttm"
# Train BPE on the training sample only (the file written in the previous cell).
yttm.BPE.train(data=TRAIN_SAMPLE_FILENAME, vocab_size=NUM_TOKENS_BPE, model=BPE_MODEL_FILENAME)

# Load the trained model as the tokenizer used by the rest of the notebook.
tokenizer = yttm.BPE(BPE_MODEL_FILENAME)

# Encode every chunk to a list of token ids, wrapped in begin/end-of-sentence markers.
train_tokenized_sample = tokenizer.encode(train_sample, bos=True, eos=True)
val_tokenized_sample = tokenizer.encode(val_sample, bos=True, eos=True)

In [None]:
print(train_tokenized_sample[0])

In [None]:
print(tokenizer.vocab())

In [None]:
# Histogram of the number of tokens per training chunk (log-scaled counts).
plt.hist([len(sent) for sent in train_tokenized_sample], bins=30)
plt.title('Distribution of tokenized fragment length')
plt.yscale('log');

In [None]:
# Count how often each token id occurs in the validation sample.
token_counts = np.bincount([token_id for chunk in val_tokenized_sample for token_id in chunk])

# Histogram of those per-token occurrence counts (log-scaled).
plt.hist(token_counts, bins=100)
plt.title('Tokens mention distribution')
plt.yscale('log');

## Creation of [datasets](#scrollTo=OYs7y8NkMDFJ&line=8&uniqifier=1) for PyTorch:

In [None]:
# Length (in tokens) of every input/target sequence produced by the datasets.
CHUNK_LENGTH = 200

# Both datasets use the default pad_value (0).
train_dataset = LanguageModelDataset(train_tokenized_sample,
                                     chunk_length=CHUNK_LENGTH)
val_dataset = LanguageModelDataset(val_tokenized_sample,
                                    chunk_length=CHUNK_LENGTH)

## Finding the appropriate batch size for train and validation:

In [None]:
# train_eval_loop asserts that each batch size divides its dataset size, so list the valid choices.
print(f"divisors of train dataset size ({len(train_dataset)}) are {divisors(len(train_dataset))}")
print(f"divisors of val dataset size ({len(val_dataset)}) are {divisors(len(val_dataset))}")

# NOTE(review): hard-coded picks from the printed lists; they must be re-chosen if the dataset size changes.
batch_size_train = 66
batch_size_val = 9

## Using PyTorch's `nn.TransformerEncoderLayer` and our [LanguageModel class](#scrollTo=QZBmKyALZ5lq&line=1&uniqifier=1) for model creation:

In [None]:
# Model hyperparameters.
emb_size = 300             # if use BPEmb: SUPPORTED_DIMS = [25, 50, 100, 200, 300]
heads_number = 15
dim_feedforward = 500
layers_number = 6
emb_dropout = 0.15
layer_dropout = 0.15

# Pretrained Russian BPEmb vectors; merge_ops is the listed size closest to the tokenizer's vocabulary size.
vectors = BPEmb(language='ru', dim=emb_size,  merge_ops=min([1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000], key=lambda x:abs(x-tokenizer.vocab_size())))
# Look up one vector per token of our own BPE vocabulary.
# NOTE(review): presumably tokens unknown to BPEmb get a default vector — confirm against torchnlp.
emb_weights = vectors[tokenizer.vocab()]

# Prototype layer; it is passed to nn.TransformerEncoder together with num_layers.
TransformerEncoderLayer = nn.TransformerEncoderLayer(d_model=emb_size, nhead=heads_number, dim_feedforward=dim_feedforward, dropout=layer_dropout, activation='gelu')

backbone = BatchFirstTransformerEncoder(TransformerEncoderLayer, num_layers=layers_number)

# freeze=False: the pretrained embeddings are fine-tuned together with the rest of the model.
torch_transf_model = LanguageModel(vocab_size=tokenizer.vocab_size(), emb_size=emb_size, backbone=backbone, emb_weights=emb_weights, freeze=False, emb_dropout=emb_dropout)

print('Number of parameters in model:', get_params_number(torch_transf_model))

In [None]:
#@title For loading model
# Rebuilds the same architecture as the cell above under the name best_torch_transf_model.
# NOTE(review): no load_state_dict call is visible in this cell, so the model keeps its freshly
# initialized weights; the training cell below also rebinds best_torch_transf_model.
emb_size = 300             # if use BPEmb: SUPPORTED_DIMS = [25, 50, 100, 200, 300]
heads_number = 15
dim_feedforward = 500
layers_number = 6
emb_dropout = 0.15
layer_dropout = 0.15

# Pretrained Russian BPEmb vectors; merge_ops is the listed size closest to the tokenizer's vocabulary size.
vectors = BPEmb(language='ru', dim=emb_size,  merge_ops=min([1000, 3000, 5000, 10000, 25000, 50000, 100000, 200000], key=lambda x:abs(x-tokenizer.vocab_size())))
emb_weights = vectors[tokenizer.vocab()]

TransformerEncoderLayer = nn.TransformerEncoderLayer(d_model=emb_size, nhead=heads_number, dim_feedforward=dim_feedforward, dropout=layer_dropout, activation='gelu')

backbone = BatchFirstTransformerEncoder(TransformerEncoderLayer, num_layers=layers_number)

best_torch_transf_model = LanguageModel(vocab_size=tokenizer.vocab_size(), emb_size=emb_size, backbone=backbone, emb_weights=emb_weights, freeze=False, emb_dropout=emb_dropout)

## [Training](#scrollTo=jx44W288jjRP&line=1&uniqifier=1):

In [None]:
# Scheduler factory handed to train_eval_loop, which calls it with the optimizer it creates.
# ReduceLROnPlateau is stepped with the mean validation loss after every epoch.
# NOTE(review): the `verbose` argument is deprecated in recent PyTorch releases — confirm the installed version.
lr_scheduler = lambda optim: \
    torch.optim.lr_scheduler.ReduceLROnPlateau(optim, patience=10, factor=0.75, verbose=True)

# Alternative schedule, kept for experiments:
#lr_scheduler = lambda optim: \
#    torch.optim.lr_scheduler.StepLR(optim, step_size=5, gamma=0.9)

# Train on the GPU; returns the lowest validation loss and a copy of the model at that epoch.
best_val_loss, best_torch_transf_model = train_eval_loop(torch_transf_model,
                                                         train_dataset,
                                                         val_dataset,
                                                         lm_cross_entropy,
                                                         lr=5e-4,
                                                         epoch_n=3000,
                                                         batch_size_train=batch_size_train,
                                                         batch_size_val=batch_size_val,
                                                         device='cuda',
                                                         early_stopping_patience=30,
                                                         lr_scheduler_ctor=lr_scheduler,
                                                         draw_loss=False,
                                                         dataloader_workers_n=5)

In [None]:
# The best validation loss is embedded in the file name to tell checkpoints apart.
MODEL_FILENAME = f"/tmp/best_torch_transf_model_{best_val_loss}.pth"
# Only the weights (state dict) are saved, not the model object itself.
torch.save(best_torch_transf_model.state_dict(), MODEL_FILENAME)
# Hand the checkpoint to the Colab file-download helper.
files.download(MODEL_FILENAME)

## Text generation using language modeling:

### Greedy generation:

In [None]:
# Uses the generator's defaults: device 'cuda' and eos_token_id 3.
greedy_generator = GreedyGenerator(best_torch_transf_model, tokenizer)
# Maximum number of tokens generated per prompt in the greedy examples below.
max_steps = 150

In [None]:
print(greedy_generator('Смысл жизни в том,', max_steps_n=max_steps))

In [None]:
print(greedy_generator('Наш мир - это', max_steps_n=max_steps))

In [None]:
print(greedy_generator('В конце концов,', max_steps_n=max_steps))

In [None]:
print(greedy_generator('Я ведь всего только и хотел пытаться жить тем, ', max_steps_n=max_steps))

### Generation using beam search:

In [None]:
# Uses the generator's defaults: device 'cuda' and eos_token_id 3.
beam_generator = BeamGenerator(best_torch_transf_model, tokenizer)

In [None]:
%%time

# Beam search with a narrow beam; the default max_steps_n (40) limits the continuation length.
beam_gen_variants = beam_generator('Я ведь всего только и хотел пытаться жить тем, ', beamsize=5, return_hypotheses_n=5)

# Hypotheses come back sorted by ascending score (lower is better).
for score, pred_txt in beam_gen_variants:
    print('****')
    print(score)
    print(pred_txt)
    print()

In [None]:
%%time

# Same prompt with a wider beam, for comparison with the beamsize=5 run above.
beam_gen_variants = beam_generator('Я ведь всего только и хотел пытаться жить тем, ', beamsize=20, return_hypotheses_n=20)

# Hypotheses come back sorted by ascending score (lower is better).
for score, pred_txt in beam_gen_variants:
    print('****')
    print(score)
    print(pred_txt)
    print()

In [None]:
# Sampling generator; logits are divided by the temperature, so 0.0001 makes the distribution
# extremely peaked and sampling almost always picks the most probable token.
gen = ProbGenerator(best_torch_transf_model, tokenizer, max_steps_n=50, temperature=0.0001)

In [None]:
print(gen("Я ведь всего только и хотел пытаться жить тем, "))