In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

device = "cuda"
model_mgpt = GPT2LMHeadModel.from_pretrained('sberbank-ai/mGPT').cuda(device)
word_embeddings_mgpt = model_mgpt.transformer.wte.weight  # Word Token Embeddings 
position_embedding_mgpt = model_mgpt.transformer.wpe.weight  # Word Position Embeddings 

tokenizer_mgpt = GPT2Tokenizer.from_pretrained("sberbank-ai/mGPT")

In [2]:
import os 
import random
import numpy as np 

DEFAULT_RANDOM_SEED = 17

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
# torch random seed
import torch
def seedTorch(seed=DEFAULT_RANDOM_SEED):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
      
# basic + torch 
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    seedBasic(seed)
    seedTorch(seed)
    
seedEverything()

In [3]:
from googletrans import Translator
translator = Translator()

In [4]:
from random import randint

def random_tokens_retrieve_concat(k=8):
    result_str = ""
    for i in range(k):
        token_id = randint(0, 100000-1)
        result_str += tokenizer_mgpt.decode(token_id)
    token_ids = tokenizer_mgpt.encode(result_str, return_tensors="pt").cuda(device)
    return token_ids, result_str

In [None]:
# so we need 100 public and 100 private test generations
for split in ['public', 'private']:
    for k in [8, 9, 10]:
        for i in range(100):
            input_ids, result_str = random_tokens_retrieve_concat(k=k)
            out = model_mgpt.generate(
                    input_ids,
                    min_length=10, 
                    max_length=200,
                    eos_token_id=5,
                    temperature=0.0
            )
            generated_text = list(map(tokenizer_mgpt.decode, out))[0]
            tr = translator.translate(generated_text)
            research_entry = f"""--------------------\nНачальные токены:\n"\n{result_str}\n"
Генерация mGPT:\n"\n{generated_text}\n"
GoogleTranslate_eng:\n"\n{tr.text}\n"\n
--------------------\n"""
            with open(f'./circle_theory_gen/mGPT_{split}_{k}Tokens.txt', 'a') as f:
                f.write(research_entry)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attentio