## Setup

In [2]:
!pip install transformers sentencepiece datasets translate-toolkit --quiet

[K     |████████████████████████████████| 5.5 MB 5.1 MB/s 
[K     |████████████████████████████████| 1.3 MB 56.2 MB/s 
[K     |████████████████████████████████| 451 kB 65.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 56.3 MB/s 
[K     |████████████████████████████████| 182 kB 62.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 51.0 MB/s 
[K     |████████████████████████████████| 212 kB 65.4 MB/s 
[K     |████████████████████████████████| 115 kB 69.7 MB/s 
[K     |████████████████████████████████| 127 kB 43.7 MB/s 
[?25h

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
from IPython.utils import io
import torch
from torch import optim
from torch.nn import functional as F
import pandas as pd 


from transformers import AdamW, AutoTokenizer, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

from transformers.models.mt5 import MT5Config, MT5ForConditionalGeneration

sns.set()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved to /root/.huggingface/token
Login successful


In [None]:
model.push_to_hub("makarshatilov/transliteration_v1")

CommitInfo(commit_url='https://huggingface.co/makarshatilov/transliteration_v1/commit/dc4be5ae2df0398ddeca9088910d4ae48146d15f', commit_message='Upload MT5ForConditionalGeneration', commit_description='', oid='dc4be5ae2df0398ddeca9088910d4ae48146d15f', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("makarshatilov/transliteration_v1")

CommitInfo(commit_url='https://huggingface.co/makarshatilov/transliteration_v1/commit/dfafc176a1881d70cff4be947eda0e6328feffa6', commit_message='Upload tokenizer', commit_description='', oid='dfafc176a1881d70cff4be947eda0e6328feffa6', pr_url=None, pr_revision=None, pr_num=None)

## Prepare dataset

In [4]:
train_dataset = pd.read_csv('/content/train_dataset.csv', sep=',',  index_col=0)
test_dataset = pd.read_csv('/content/test_dataset.csv', sep=',',  index_col=0)

len(train_dataset), len(test_dataset)

(100000, 4855)

In [5]:
train_dataset.head()

Unnamed: 0,tj,fa
0,ротибахӯр,راتبه‌خور
1,Дигар гашта аз ҷанг ҷустан сутӯҳ.,دگر گشته از جنگ جستن ستوه
2,нонтокт ммуриол,نانتاکت مموریال
3,Хуан круз реал,خوان کروز رئال
4,"Ҳама гӯш доред овои ман,",همه گوش دارید آوای من


## Model

####Будем использовать модель mT5. Это модель общего назначения (seq2seq) предобученная на данных на разных языках.

---

Статья:
[mT5: A Massively Multilingual Pre-trained Text-to-Text Transformer](https://aclanthology.org/2021.naacl-main.41/)

Код: [GitHub](https://github.com/google-research/multilingual-t5)

In [None]:
model_repo = 'makarshatilov/transliteration_v1'#'google/mt5-base'

config = MT5Config.from_pretrained('google/mt5-base')

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

In [None]:
##config.max_length = 40
max_seq_len = config.max_length

In [None]:
tokenizer = AutoTokenizer.from_pretrained('google/mt5-base')

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [None]:
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-base')
model = model.cuda()

Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/mt5_translation base v2.pt'))

<All keys matched successfully>

#### Что подается на вход модели

In [None]:
example_input_str = 'Гар кунӣ болои мо таште нигун».'
input_ids = tokenizer.encode(example_input_str,
                             return_tensors='pt',
                             padding='max_length',
                             truncation=True,
                             max_length=max_seq_len)
print(input_ids, "\n")

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(tokens)

tensor([[36407, 25044,  3202,  9493,   279,  3800,  1007, 37761,   259,   668,
         52298,  1836,     1,     0,     0,     0,     0,     0,     0,     0]]) 

['▁Гар', '▁кун', 'ӣ', '▁боло', 'и', '▁мо', '▁та', 'ште', '▁', 'ни', 'гун', '».', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


## Добавим специальные токены

In [None]:
LANG_TOKEN_MAPPING = {
    'tj': '<tj>',
    'fa': '<fa>'
}

In [None]:
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(250102, 768)

In [None]:
example_input_str = '<tj>Давайте токенизируем это предложение.'
input_ids = tokenizer.encode(example_input_str,
                             return_tensors='pt',
                             padding='max_length',
                             truncation=True,
                             max_length=max_seq_len)
print(input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(tokens)

tensor([[250100,   3348,  36456,   1049,   1625,  42547,  76660,   1436,  44616,
            324,    260,      1,      0,      0,      0,      0,      0,      0,
              0,      0]])
['<tj>', '▁Да', 'вайте', '▁то', 'ке', 'низ', 'ируем', '▁это', '▁предложени', 'е', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [None]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]
  
def encode_target_str(text, tokenizer, seq_len):
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
  return token_ids[0]

def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_len=128):
  # Choose a random 2 languages for in i/o
  #langs = list(lang_token_map.keys())
  input_lang, target_lang = 'tj','fa'#np.random.choice(langs, size=2, replace=False)

  # Get the translations for the batch
  input_text = translations[input_lang]
  target_text = translations[target_lang]

  # print(input_lang, input_text)
  # print(target_lang, target_text)

  if input_text is None or target_text is None:
    return None

  input_token_ids = encode_input_str(input_text, target_lang, tokenizer, seq_len, lang_token_map)
  target_token_ids = encode_target_str(target_text, tokenizer, seq_len)

  return input_token_ids, target_token_ids

def transform_batch(batch, lang_token_map, tokenizer):
  inputs = []
  targets = []
  for translation_set in batch:
    formatted_data = format_translation_data(translation_set, lang_token_map, tokenizer, max_seq_len)
    
    # print(formatted_data)
    
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))
    
  batch_input_ids = torch.cat(inputs).cuda()
  batch_target_ids = torch.cat(targets).cuda()

  return batch_input_ids, batch_target_ids

def get_data(dataset, lang_token_map, tokenizer, batch_size=32):
  np.random.shuffle(dataset)
  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i:i+batch_size]
    
    yield transform_batch(raw_batch, lang_token_map, tokenizer)

#Train

In [6]:
len(train_dataset)

100000

In [None]:
random.shuffle(train_dataset)

In [None]:
n_epochs = 1
batch_size = 20
print_freq = 100

lr = 1e-4

checkpoint_freq = 1000

n_batches = int(np.ceil(len(train_dataset) / batch_size))
total_steps = n_epochs * n_batches
n_warmup_steps = int(total_steps * 0.01)

print("n_batches", n_batches)
print("total_steps", total_steps)
print("n_warmup_steps", n_warmup_steps)

In [None]:
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps, total_steps)

losses = []
test_losses = []



In [None]:
def eval_model(model, dataset, max_iters=8):
  test_generator = get_data(dataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  eval_losses = []
  with torch.no_grad():
    for i, (input_batch, label_batch) in enumerate(test_generator):
      if i >= max_iters:
        break

      model_out = model.forward(
          input_ids = input_batch,
          labels = label_batch)
      eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [None]:
best_test_loss = float('inf')

In [None]:
for epoch_idx in range(n_epochs):
  data_generator = get_data(train_dataset, LANG_TOKEN_MAPPING, tokenizer, batch_size)

  for batch_idx, (input_batch, label_batch) in tqdm(enumerate(data_generator), total=n_batches):

    optimizer.zero_grad()

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)

    loss = model_out.loss
    losses.append(loss.item())
    
    loss.backward()

    optimizer.step()
    scheduler.step()

      # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {:.6f}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))
      
    if (batch_idx + 1) % checkpoint_freq == 0:
      test_loss = eval_model(model, test_dataset)
      test_losses.append(test_loss)
      print('Test loss {:.3f}'.format(test_loss))
      #if best_test_loss > test_loss:
        #print('Saving model with test loss of {:.3f}'.format(test_loss))
        #torch.save(model.state_dict(), model_path)
       # best_test_loss = test_loss

 # torch.save(model.state_dict(), model_path)

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/mt5_translation base v2.pt')

In [None]:
test_loss = eval_model(model, test_dataset)
test_loss

0.1330999657511711

In [None]:
# import gc
# gc.collect()

# import torch
# torch.cuda.empty_cache()

In [None]:
window_size = 50
smoothed_losses = [] 
for i in range(len(losses)-window_size):
  smoothed_losses.append(np.mean(losses[i:i+window_size]))

plt.plot(smoothed_losses[100:])

In [None]:
plt.plot(test_losses[:])

In [None]:
# tokenizer.save_pretrained('/content/mt5_translation_example_tokenizer')
#!zip -r '/content/mt5_translation_example_tokenizer.zip' '/content/mt5_translation_example_tokenizer'
# !cp '/content/mt5_translation_example_tokenizer.zip' '/content/gdrive/My Drive/mt5_translation_example_tokenizer2.zip'

# Test

In [None]:
test_sentence = test_dataset[16]['en']
print('Raw input text:', test_sentence)

input_ids = encode_input_str(
    text = test_sentence,
    target_lang = 'ru',
    tokenizer = tokenizer,
    seq_len = model.config.max_length,
    lang_token_map = LANG_TOKEN_MAPPING)

input_ids = input_ids.unsqueeze(0).cuda()

print('Truncated input text:', tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(input_ids[0])))

Raw input text: She called him on the phone. 
Truncated input text: <ru> She called him on the phone. </s><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [None]:
import re

def translate(text, lang_from, lang_to):
  # test_sentence = test_dataset[16][lang_from]
  # print('Raw input text:', test_sentence)

  input_ids = encode_input_str(
      text = text,
      target_lang = lang_to,
      tokenizer = tokenizer,
      seq_len = 35,#model.config.max_length,
      lang_token_map = LANG_TOKEN_MAPPING)

  input_ids = input_ids.unsqueeze(0).cuda()
  # res = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0]))

  output_tokens = model.generate(input_ids, num_beams=20, num_return_sequences=1)

  res = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

  return res

def translate_to_farci(text):
  sentences = list(filter(None, re.split('\.|\?|\!', text))) 
  print(sentences)
  res = [translate(sentence.capitalize(), 'tj', 'fa') for sentence in sentences]
  return '. '.join(res) #translate(text, 'tj', 'fa')

In [None]:
translate_to_farci('нахӯрем')

['нахӯрем']


'نخوریم'