# mGPT model (SberDevices)




# 1. Preparation

**1.1 Installing and importing the required libraries**

In [None]:
!pip install transformers==4.10.3

In [None]:
import torch
import torch.nn as nn 
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer

**1.2 Import data from local disk for model training**

In [None]:
# Colab-only: ask the user to upload the training CSV from the local machine.
from google.colab import files
uploaded_train = files.upload()
# Take the first entry of the upload result; it is used later as the CSV path
# (presumably the uploaded file's name — confirm against the Colab API).
uploaded_train_name = list(uploaded_train)[0]

In [None]:
# Colab-only: mount Google Drive at /content/drive; save_model/load_model
# read and write checkpoints below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**1.3 Model loading**

In [None]:
# Alternative kept for reference: download a weights archive and load it from
# a local folder instead of by model identifier.
# !wget https://files.sberdisk.ru/s/NzeBqYE84TAQDiS/download -O model.zip
# !unzip model.zip -d mgptxl
# model_name = "./mgptxl"
# Identifier handed to from_pretrained for both the tokenizer and the model.
model_name = "sberbank-ai/mGPT"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# gradient_checkpointing / use_cache are forwarded to from_pretrained unchanged;
# presumably they reduce memory use during fine-tuning — TODO confirm against
# the pinned transformers version.
model = GPT2LMHeadModel.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)

**1.4 Data conversion**

In [None]:
class TextDataset(Dataset):
    """Fixed-length token windows cut from a single text file.

    The whole file is tokenized once and split into consecutive,
    non-overlapping chunks of ``seq_length`` tokens. A trailing remainder
    shorter than ``seq_length`` is dropped, so a file with fewer than
    ``seq_length`` tokens yields an empty dataset.

    Args:
        path: Text file to read; it is decoded as UTF-8.
        tokenizer: Object exposing ``encode(text)`` that returns a list of
            integer token ids.
        seq_length: Number of tokens per sample; must be positive.

    Raises:
        ValueError: If ``seq_length`` is not positive.
    """

    def __init__(self, path, tokenizer, seq_length=512):
        if seq_length <= 0:
            raise ValueError('seq_length must be positive, got %r' % (seq_length,))
        # Explicit encoding: the corpus is Cyrillic/Persian text and the rest
        # of the notebook reads the same file as UTF-8, so do not depend on
        # the platform's default locale encoding.
        with open(path, encoding='utf-8') as f:
            data = f.read()
        tokens = tokenizer.encode(data)
        # Step by seq_length so the windows do not overlap; the range stops
        # early enough that every window is full length.
        examples = [
            tokens[start:start + seq_length]
            for start in range(0, len(tokens) - seq_length + 1, seq_length)
        ]
        self.samples = torch.LongTensor(examples)
        print('Loaded samples:', len(self.samples))

    def __len__(self):
        """Number of full-length samples."""
        return len(self.samples)

    def __getitem__(self, item):
        """Return the token-id tensor for sample ``item``."""
        return self.samples[item]

In [None]:
def convert_to_line(path):
  """Flatten a parallel CSV into a one-pair-per-line UTF-8 text file.

  Reads the ``tj`` and ``pers`` columns of the CSV at ``path``. For every row
  it writes ``"<tj> <pers>"`` on one line, with newlines inside either field
  replaced by spaces.

  Args:
    path: Path to the CSV file; it must contain ``tj`` and ``pers`` columns.

  Returns:
    Path of the written text file: ``path`` with a trailing ``.csv`` removed
    (if present) and ``_converted.txt`` appended.
  """
  df = pd.read_csv(path)
  # Build all lines first and write them in one go; repeated ``+=`` on one
  # growing string is quadratic in the corpus size.
  rows = [
    tj.replace('\n', ' ') + ' ' + pers.replace('\n', ' ') + '\n'
    for tj, pers in zip(df['tj'], df['pers'])
  ]
  # Strip only a trailing '.csv'; splitting on the first '.csv' anywhere in
  # the path would truncate names such as 'my.csv_dir/file.csv'.
  stem = path[:-len('.csv')] if path.endswith('.csv') else path
  file_name = stem + '_converted.txt'
  # Written as UTF-8 because the preview cell reads this file back as UTF-8.
  with open(file_name, 'w', encoding='utf-8') as f:
    f.writelines(rows)
  return file_name

In [None]:
# Convert the uploaded CSV to the plain-text training file and keep the new file's path.
up_train_con = convert_to_line(uploaded_train_name)

In [None]:
# Sanity check: show up to the first five converted lines.
with open(up_train_con, mode='r', encoding='utf-8') as f:
    lines = f.readlines()
# Slice instead of indexing 0..4 so a file with fewer than five lines
# prints what it has rather than raising IndexError.
for line in lines[:5]:
    print(line.replace('\n', ''))

ротибахӯр راتبه‌خور
Дигар гашта аз ҷанг ҷустан сутӯҳ. دگر گشته از جنگ جستن ستوه
нонтокт ммуриол نانتاکت مموریال
Хуан круз реал خوان کروز رئال
Ҳама гӯш доред овои ман, همه گوش دارید آوای من


In [None]:
# Tokenize the converted text into fixed-length samples (TextDataset's default seq_length of 512).
train_dataset = TextDataset(up_train_con, tokenizer)
# One sample per optimization step, in shuffled order.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, drop_last=True)

Loaded samples: 3587


# 2. Training

In [None]:
# Freeze every parameter of transformer blocks 1..22 except those whose name
# contains 'ln_'. Block 0, blocks numbered 23 and above, and everything outside
# 'transformer.h' stay trainable.
for n, p in model.named_parameters():
  if 'transformer.h' not in n:
    continue
  # Names look like 'transformer.h.<block>.<...>'; the third field is the block number.
  layer_num = int(n.split('.')[2])
  if 'ln_' in n:
    continue
  if 0 < layer_num < 23:
    p.requires_grad = False
    print('Freeze', n)

In [None]:
# Move the model to the GPU before building the optimizer over its parameters.
model.cuda()
# Plain SGD over all parameters (frozen ones included in the list), learning rate 1e-5.
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-5)

In [None]:
def save_model(epoch):
    """Save the global ``model``'s state dict as the checkpoint for ``epoch``.

    The file is written to
    ``/content/drive/MyDrive/Model/saved_model_e<epoch>.pth``. The ``Model``
    folder is created when it is missing, so a finished epoch is not lost to
    a missing directory.

    Args:
        epoch: Value embedded in the checkpoint file name via ``str(epoch)``.

    Raises:
        FileNotFoundError: If the parent of the ``Model`` folder does not
            exist (for example when Drive is not mounted).
    """
    import os  # local import so this cell does not depend on a notebook-level one

    path = '/content/drive/MyDrive/Model/saved_model_e' + str(epoch) + '.pth'
    checkpoint_dir = os.path.dirname(path)
    if not os.path.isdir(checkpoint_dir):
        # mkdir (not makedirs): create only the leaf folder, so an unmounted
        # Drive still fails loudly instead of silently saving to local disk.
        os.mkdir(checkpoint_dir)
    torch.save(model.state_dict(), path)

def load_model(epoch):
  """Load the checkpoint written for ``epoch`` into the global ``model``.

  The file read is ``/content/drive/MyDrive/Model/saved_model_e<epoch>.pth``,
  the same naming scheme ``save_model`` uses.
  """
  checkpoint_file = ''.join(['/content/drive/MyDrive/Model/saved_model_e', str(epoch), '.pth'])
  state = torch.load(checkpoint_file)
  model.load_state_dict(state)

In [None]:
# Fine-tune the model on train_loader and write one checkpoint per epoch.
model.train()
# Index of the most recent epoch whose checkpoint was saved; -1 means none yet.
# It is used after training to reload that checkpoint.
last_epoch = -1
max_epoch = 3
#max_epoch = 9
for epoch in range(0, max_epoch):
  print('Epoch', epoch)
  progressbar = tqdm(train_loader)
  losses = []
  for batch in progressbar:
    optimizer.zero_grad()

    batch = batch.to(model.device)
    # The input ids double as labels; the loss is read from outputs.loss.
    outputs = model(batch, labels=batch)
    loss = outputs.loss

    loss.backward()
    optimizer.step()

    losses.append(loss.detach().item())
    # Show the mean of the last 10 batch losses in the progress bar.
    progressbar.set_description("Loss: %.3f" % np.mean(losses[-10:]))
  save_model(epoch)
  # Update once per epoch, after the checkpoint exists. Incrementing inside
  # the batch loop made this a batch counter, so the later reload asked for a
  # checkpoint file that was never written.
  last_epoch = epoch

Epoch 7


Loss: 2.504: 100%|██████████| 3470/3470 [1:19:38<00:00,  1.38s/it]


Epoch 8


Loss: 2.409: 100%|██████████| 3470/3470 [1:19:36<00:00,  1.38s/it]


In [None]:
# Reload the weights from the checkpoint numbered `last_epoch`.
load_model(last_epoch)

# 3. Testing and metrics calculation

In [None]:
!pip install levenshtein
!pip install sacrebleu

In [None]:
from sacrebleu.metrics import BLEU
from Levenshtein import ratio

In [None]:
def cut(t, lenght):
  """Limit the length of the generated text.

  Takes the part of ``t`` between the first and (if present) second
  occurrence of the Persian marker, splits it on whitespace, and returns the
  first ``lenght`` words joined by single spaces. Fewer words are returned if
  fewer are available; a non-positive ``lenght`` gives an empty string.

  Raises:
    IndexError: If ``t`` does not contain the marker.
  """
  marker = 'متن فارسی:'
  after_marker = t.split(marker)[1]
  words = after_marker.split()
  # Clamp to [0, len(words)] so a negative count yields nothing.
  keep = max(0, min(lenght, len(words)))
  return ' '.join(words[:keep])

def transliteration(line):
  """Transliterate one Tajik line to Persian with the fine-tuned model.

  Builds the prompt ``'Матни тоҷикӣ:' + line + 'متن فارسی:'``, generates
  greedily (``do_sample=False``) with ``min_length == max_length == 100``, and
  returns as many generated words as ``line`` has, via ``cut``.

  Args:
    line: Tajik source text.

  Returns:
    The transliteration, truncated to ``len(line.split())`` words.
  """
  text = 'Матни тоҷикӣ:' + line + 'متن فارسی:'
  # Put the prompt on whatever device the model lives on (same convention as
  # the training loop) rather than assuming CUDA.
  input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)
  # The notebook leaves the model in train mode after fine-tuning. Switch to
  # eval mode so dropout does not perturb generation, then restore the
  # previous mode for the caller.
  was_training = model.training
  model.eval()
  try:
    # pad_token_id is the name generate() accepts; the former `pad_token=1`
    # is not a generate() argument. Whether 1 is this tokenizer's padding id
    # is taken from the original call — TODO confirm against
    # tokenizer.pad_token_id.
    # top_k / top_p are left as they were; presumably inert with
    # do_sample=False.
    # NOTE(review): max_length is presumably counted including the prompt, so
    # prompts near 100 tokens leave little or no room — confirm.
    out = model.generate(input_ids, min_length=100, max_length=100, eos_token_id=5, pad_token_id=1, top_k=10, top_p=0.0, no_repeat_ngram_size=5, do_sample=False)
  finally:
    model.train(was_training)
  generated_text = tokenizer.decode(out[0])
  return cut(generated_text, len(line.split()))

In [None]:
def get_bleu_score(pred, ref):
  """Score one prediction against one reference with sacrebleu's ``BLEU``.

  The pair is scored as a one-sentence corpus; the ``score`` attribute of the
  result is returned.
  """
  hypotheses = [pred]
  references = [[ref]]  # one reference stream holding the single reference
  scorer = BLEU()
  result = scorer.corpus_score(hypotheses, references)
  return result.score

def get_levenshtein_ratio(pred, ref):
  """Return ``Levenshtein.ratio(pred, ref)`` for the prediction/reference pair."""
  similarity = ratio(pred, ref)
  return similarity
  
def get_accuracy(ref, pred):
  """Position-wise character accuracy of ``pred`` against ``ref``.

  Characters are compared pairwise up to the shorter of the two strings, and
  the number of matches is divided by ``len(pred)``.

  Args:
    ref: Reference string. A non-string sequence is also accepted for
      backward compatibility; its first element is used as the reference.
    pred: Predicted string.

  Returns:
    Fraction of positions in ``pred`` that equal the character at the same
    position in ``ref``; ``0.0`` when ``pred`` is empty.
  """
  # Indexing a plain string with [0] kept only its first character, so at
  # most one position was ever compared. Unwrap only when ref really is a
  # container of references.
  if not isinstance(ref, str):
    ref = ref[0]
  if len(pred) == 0:
    # Nothing was predicted: report zero instead of dividing by zero.
    return 0.0
  matches = sum(r == p for r, p in zip(ref, pred))
  return matches / len(pred)

def get_translit_file(file_input, file_output):
  """Transliterate every row of a test CSV and save predictions with metrics.

  Reads the ``tj`` and ``pers`` columns of ``file_input``. For each row it
  runs ``transliteration`` on ``tj`` and scores the result against ``pers``
  with BLEU, Levenshtein ratio and accuracy. The table is written to
  ``<file_output>.xlsx``.

  Args:
    file_input: Path to a comma-delimited CSV with ``tj`` and ``pers`` columns.
    file_output: Output path without the ``.xlsx`` extension.
  """
  # Read the CSV once and keep only the two columns that are used.
  source = pd.read_csv(file_input, delimiter=',')
  df = source[['tj', 'pers']].copy()

  # Collect results in lists and assign them as whole columns afterwards.
  # Rows yielded by iterrows() may be copies, so values written into them are
  # not guaranteed to reach the DataFrame.
  predictions = []
  bleu_scores = []
  levenshtein_ratios = []
  accuracies = []
  total = df.shape[0]
  for done, (tj, pers) in enumerate(zip(df['tj'], df['pers']), start=1):
    prediction = transliteration(tj)
    predictions.append(prediction)
    bleu_scores.append(get_bleu_score(prediction, pers))
    levenshtein_ratios.append(get_levenshtein_ratio(prediction, pers))
    accuracies.append(get_accuracy(pers, prediction))
    print(f'''Rows completed: {done} / {total}''')

  df['pers_prediction'] = predictions
  df['bleu'] = bleu_scores
  df['levenshtein_ratio'] = levenshtein_ratios
  df['accuracy'] = accuracies

  file_output_name = f'{file_output}.xlsx'
  df.to_excel(file_output_name)
  print(f'Result saved to file: {file_output_name}')

In [None]:
# Colab-only: ask the user to upload the test CSV from the local machine.
up_test = files.upload() 
# Take the first entry of the upload result; it is passed on as the test file path.
test_name = list(up_test)[0]

In [None]:
# df= pd.read_csv(test_name, delimiter = ';')
# for index, row in df.iterrows():
#   row['tj'] = str(row['tj']).strip()
#   row['pers'] = str(row['pers']).strip()

# df.to_csv(test_name+'_converted.csv', index=False)

In [None]:
# Run the evaluation on the uploaded test file; results go to 'mGPT_examples.xlsx'.
get_translit_file(test_name, 'mGPT_examples')