# all text encoders

In [1]:
import pprint

# Shared pretty-printer for ad-hoc inspection of the dictionaries below:
# 4-space indent, 200-column lines, dict keys kept in insertion order.
# Arguments that are not listed keep their PrettyPrinter defaults.
pp = pprint.PrettyPrinter(
    indent=4,
    width=200,
    sort_dicts=False,
)

In [1]:
import json

# Read the complete synonym dictionary from the UTF-8 encoded JSON file into d.
with open('szinonimak.json', encoding='utf-8') as f:
    d = json.loads(f.read())

In [2]:
# Make a smaller test dict from the first (up to) 10 entries of d.
# The key list is sliced instead of being indexed with range(0,10), so a
# dictionary with fewer than 10 entries no longer raises IndexError.
d_keys_li=list(d.keys())
smaller_dict={key: d[key] for key in d_keys_li[:10]}
# pp.pprint(smaller_dict)

In [3]:
# Select which dictionary the get_avg_res* functions iterate over:
# d - full test dictionary
# smaller_dict - 10 key dictionary
DICT = smaller_dict

In [4]:
def get_avg_res(tokenizer, model, synonyms=None):
    """Print and return the mean cosine distance between base words and their synonyms.

    Every word is encoded on its own with ``tokenizer(word, return_tensors='pt')``
    and passed through ``model``. The vector that is compared is
    ``output[0][0][-1]``: first model output, first batch item, last position.
    NOTE(review): for BERT-style tokenizers the last position is presumably the
    trailing special token rather than a word piece, and for models with an LM
    head ``output[0]`` is presumably logits -- confirm this is the intended
    representation.

    Args:
        tokenizer: callable returning a mapping of keyword arguments for ``model``.
        model: callable whose first output can be indexed ``[0][-1]`` to obtain
            a 1-D torch tensor.
        synonyms: optional mapping ``base word -> entry`` where ``entry[1]`` is
            an iterable of synonym strings. Defaults to the notebook-level
            ``DICT`` (looked up at call time).

    Returns:
        The average cosine distance, or ``nan`` when there is no
        (base word, synonym) pair to evaluate. The value is also printed.
    """
    import torch
    from scipy.spatial import distance

    if synonyms is None:
        synonyms = DICT

    total = 0.0  # running sum of distances (avoids shadowing the builtin ``sum``)
    cnt = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for baseword, entry in synonyms.items():
            encoded_input_base = tokenizer(baseword, return_tensors='pt')
            output_base = model(**encoded_input_base)
            # The base embedding does not depend on the synonym, so extract it once.
            embedded_base = output_base[0][0][-1].detach().numpy()
            for synonym in entry[1]:
                encoded_input_synonym = tokenizer(synonym, return_tensors='pt')
                output_synonym = model(**encoded_input_synonym)
                embedded_synonym = output_synonym[0][0][-1].detach().numpy()
                total += distance.cosine(embedded_base, embedded_synonym)
                cnt += 1

    # overall result on synonym-dictionary; nan instead of ZeroDivisionError when empty
    avg = total / cnt if cnt else float("nan")
    print(avg)
    return avg

In [6]:
def get_avg_res_sentiment(tokenizer, model, synonyms=None):
    """Print and return the mean cosine distance for sequence-level model outputs.

    Works like ``get_avg_res`` but compares ``output[0][-1]`` (one index level
    fewer): the last row of the model's first output.
    NOTE(review): for sequence-classification models ``output[0]`` is presumably
    the class logits of shape (batch, num_labels), so the distance is taken
    between logit vectors -- confirm this is intended.

    Args:
        tokenizer: callable returning a mapping of keyword arguments for ``model``.
        model: callable whose first output can be indexed ``[-1]`` to obtain a
            1-D torch tensor.
        synonyms: optional mapping ``base word -> entry`` where ``entry[1]`` is
            an iterable of synonym strings. Defaults to the notebook-level
            ``DICT`` (looked up at call time).

    Returns:
        The average cosine distance, or ``nan`` when there is no
        (base word, synonym) pair to evaluate. The value is also printed.
    """
    import torch
    from scipy.spatial import distance

    if synonyms is None:
        synonyms = DICT

    total = 0.0  # running sum of distances (avoids shadowing the builtin ``sum``)
    cnt = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for baseword, entry in synonyms.items():
            encoded_input_base = tokenizer(baseword, return_tensors='pt')
            output_base = model(**encoded_input_base)
            # The base vector does not depend on the synonym, so extract it once.
            embedded_base = output_base[0][-1].tolist()
            for synonym in entry[1]:
                encoded_input_synonym = tokenizer(synonym, return_tensors='pt')
                output_synonym = model(**encoded_input_synonym)
                embedded_synonym = output_synonym[0][-1].tolist()
                total += distance.cosine(embedded_base, embedded_synonym)
                cnt += 1

    # overall result on synonym-dictionary; nan instead of ZeroDivisionError when empty
    avg = total / cnt if cnt else float("nan")
    print(avg)
    return avg

In [7]:
def get_avg_res_xlm100(tokenizer, model, synonyms=None, lang="hu"):
    """Print and return the mean cosine distance for an XLM model that takes language ids.

    Each word is encoded with ``tokenizer.encode`` and passed to ``model``
    together with a ``langs`` tensor of the same shape, filled with
    ``tokenizer.lang2id[lang]``. The vector that is compared is
    ``output[0][0][-1]``: first model output, first batch item, last position.
    NOTE(review): with an LM-head model ``output[0]`` is presumably logits
    rather than hidden states -- confirm this is the intended representation.

    Args:
        tokenizer: object exposing ``lang2id`` (mapping) and ``encode(word)``.
        model: callable accepting ``(input_ids, langs=...)``.
        synonyms: optional mapping ``base word -> entry`` where ``entry[1]`` is
            an iterable of synonym strings. Defaults to the notebook-level
            ``DICT`` (looked up at call time).
        lang: key into ``tokenizer.lang2id``; defaults to ``"hu"``.

    Returns:
        The average cosine distance, or ``nan`` when there is no
        (base word, synonym) pair to evaluate. The value is also printed.
    """
    # Imported locally so the function does not depend on a later cell's import.
    import torch
    from scipy.spatial import distance

    if synonyms is None:
        synonyms = DICT

    language_id = tokenizer.lang2id[lang]

    def _embed(word):
        # Encode one word and return the last-position vector as a plain list.
        input_ids = torch.tensor([tokenizer.encode(word)])
        # One language id per token, same (1, seq_len) shape as input_ids.
        langs = torch.full_like(input_ids, language_id)
        output = model(input_ids, langs=langs)
        return output[0][0][-1].tolist()

    total = 0.0  # running sum of distances (avoids shadowing the builtin ``sum``)
    cnt = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for baseword, entry in synonyms.items():
            # The base embedding does not depend on the synonym, so compute it once.
            embedded_base = _embed(baseword)
            for synonym in entry[1]:
                total += distance.cosine(embedded_base, _embed(synonym))
                cnt += 1

    # overall result on synonym-dictionary; nan instead of ZeroDivisionError when empty
    avg = total / cnt if cnt else float("nan")
    print(avg)
    return avg

In [8]:
# bert-base-multilingual-uncased
# Load the tokenizer and the bare BertModel for this checkpoint, then print the
# average base-word/synonym cosine distance over DICT via get_avg_res.
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
model = BertModel.from_pretrained("bert-base-multilingual-uncased")

get_avg_res(tokenizer,model)

  from .autonotebook import tqdm as notebook_tqdm
Downloading vocab.txt: 100%|██████████| 851k/851k [00:00<00:00, 1.10MB/s] 
Downloading tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.0kB/s]
Downloading config.json: 100%|██████████| 625/625 [00:00<00:00, 156kB/s]
Downloading pytorch_model.bin: 100%|██████████| 641M/641M [00:12<00:00, 53.1MB/s] 
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).


0.2021213493176869


In [9]:
# bert-base-multilingual-cased
# Same evaluation as the uncased checkpoint: load tokenizer and bare BertModel,
# then print the average synonym cosine distance over DICT via get_avg_res.
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertModel.from_pretrained("bert-base-multilingual-cased")

get_avg_res(tokenizer,model)

Downloading vocab.txt: 100%|██████████| 972k/972k [00:03<00:00, 278kB/s] 
Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 7.25kB/s]
Downloading config.json: 100%|██████████| 625/625 [00:00<00:00, 209kB/s]
Downloading pytorch_model.bin: 100%|██████████| 681M/681M [02:00<00:00, 5.95MB/s] 
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertMo

0.3758295446026082


In [10]:
# text-generation-news-gpt2-small-hungarian
# Load the checkpoint with a causal-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): with AutoModelForCausalLM, output[0] is presumably the LM-head
# logits rather than hidden states, so the distance would be taken between
# vocabulary-sized vectors -- confirm this is intended.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("NYTK/text-generation-news-gpt2-small-hungarian")
model = AutoModelForCausalLM.from_pretrained("NYTK/text-generation-news-gpt2-small-hungarian")

get_avg_res(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 596/596 [00:00<00:00, 149kB/s]
Downloading vocab.json: 100%|██████████| 591k/591k [00:01<00:00, 377kB/s] 
Downloading merges.txt: 100%|██████████| 376k/376k [00:01<00:00, 331kB/s]  
Downloading tokenizer.json: 100%|██████████| 1.01M/1.01M [00:03<00:00, 321kB/s]
Downloading special_tokens_map.json: 100%|██████████| 109/109 [00:00<00:00, 36.4kB/s]
Downloading config.json: 100%|██████████| 773/773 [00:00<00:00, 257kB/s]
Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:08<00:00, 53.4MB/s] 


0.04946080944976028


In [11]:
# text-generation-poem-petofi-gpt2-small-hungarian
# Load the checkpoint with a causal-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): with AutoModelForCausalLM, output[0] is presumably the LM-head
# logits rather than hidden states -- confirm this is intended.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("NYTK/text-generation-poem-petofi-gpt2-small-hungarian")
model = AutoModelForCausalLM.from_pretrained("NYTK/text-generation-poem-petofi-gpt2-small-hungarian")

get_avg_res(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 596/596 [00:00<00:00, 149kB/s]
Downloading vocab.json: 100%|██████████| 591k/591k [00:01<00:00, 355kB/s] 
Downloading merges.txt: 100%|██████████| 376k/376k [00:01<00:00, 311kB/s] 
Downloading tokenizer.json: 100%|██████████| 1.01M/1.01M [00:02<00:00, 405kB/s] 
Downloading special_tokens_map.json: 100%|██████████| 109/109 [00:00<00:00, 21.8kB/s]
Downloading config.json: 100%|██████████| 773/773 [00:00<00:00, 194kB/s]
Downloading pytorch_model.bin: 100%|██████████| 436M/436M [00:08<00:00, 54.4MB/s] 


0.04117110006663264


In [12]:
# hubert-base-cc
# Load the checkpoint through the Auto* classes (model first, then tokenizer)
# and print the average synonym cosine distance over DICT via get_avg_res.
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("SZTAKI-HLT/hubert-base-cc")
tokenizer = AutoTokenizer.from_pretrained("SZTAKI-HLT/hubert-base-cc")

get_avg_res(tokenizer,model)

Downloading config.json: 100%|██████████| 420/420 [00:00<00:00, 105kB/s]
Downloading pytorch_model.bin: 100%|██████████| 424M/424M [00:08<00:00, 52.8MB/s] 
Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from

0.0043750046467294496


In [13]:
# mGPT
# Load the checkpoint with a causal-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): with AutoModelForCausalLM, output[0] is presumably the LM-head
# logits rather than hidden states -- confirm this is intended.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/mGPT")
model = AutoModelForCausalLM.from_pretrained("sberbank-ai/mGPT")

get_avg_res(tokenizer,model)

Downloading config.json: 100%|██████████| 725/725 [00:00<00:00, 182kB/s]
Downloading vocab.json: 100%|██████████| 1.81M/1.81M [00:02<00:00, 654kB/s]
Downloading merges.txt: 100%|██████████| 1.15M/1.15M [00:01<00:00, 711kB/s] 
Downloading pytorch_model.bin: 100%|██████████| 3.21G/3.21G [00:55<00:00, 61.7MB/s]


0.007187895020660089


In [14]:
# sentiment-hts2-xlm-roberta-hungarian
# Load the sequence-classification checkpoint and evaluate it with
# get_avg_res_sentiment, which compares output[0][-1] for each word.
# NOTE(review): output[0] of a sequence-classification model is presumably the
# class logits, so the distance is between logit vectors -- confirm.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("NYTK/sentiment-hts2-xlm-roberta-hungarian")
model = AutoModelForSequenceClassification.from_pretrained("NYTK/sentiment-hts2-xlm-roberta-hungarian")

get_avg_res_sentiment(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 1.07k/1.07k [00:00<00:00, 549kB/s]
Downloading config.json: 100%|██████████| 761/761 [00:00<00:00, 380kB/s]
Downloading vocab.json: 100%|██████████| 500k/500k [00:02<00:00, 233kB/s]  
Downloading merges.txt: 100%|██████████| 305k/305k [00:00<00:00, 540kB/s]  
Downloading tokenizer.json: 100%|██████████| 864k/864k [00:01<00:00, 621kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 772/772 [00:00<00:00, 774kB/s]
Downloading pytorch_model.bin: 100%|██████████| 254M/254M [00:05<00:00, 48.2MB/s] 


0.7105878830825121


In [15]:
# sentiment-hts5-xlm-roberta-hungarian
# Load the sequence-classification checkpoint and evaluate it with
# get_avg_res_sentiment, which compares output[0][-1] for each word.
# NOTE(review): output[0] of a sequence-classification model is presumably the
# class logits, so the distance is between logit vectors -- confirm.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("NYTK/sentiment-hts5-xlm-roberta-hungarian")
model = AutoModelForSequenceClassification.from_pretrained("NYTK/sentiment-hts5-xlm-roberta-hungarian")

get_avg_res_sentiment(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 394/394 [00:00<00:00, 197kB/s]
Downloading config.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 511kB/s]
Downloading sentencepiece.bpe.model: 100%|██████████| 4.83M/4.83M [00:00<00:00, 9.55MB/s]
Downloading tokenizer.json: 100%|██████████| 8.66M/8.66M [00:04<00:00, 1.86MB/s]
Downloading special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 47.8kB/s]
Downloading pytorch_model.bin: 100%|██████████| 1.04G/1.04G [00:38<00:00, 29.0MB/s]


0.14574735969674552


In [16]:
# sentiment-hts5-hubert-hungarian
# Load the sequence-classification checkpoint and evaluate it with
# get_avg_res_sentiment, which compares output[0][-1] for each word.
# NOTE(review): output[0] of a sequence-classification model is presumably the
# class logits, so the distance is between logit vectors -- confirm.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("NYTK/sentiment-hts5-hubert-hungarian")
model = AutoModelForSequenceClassification.from_pretrained("NYTK/sentiment-hts5-hubert-hungarian")

get_avg_res_sentiment(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 346/346 [00:00<00:00, 69.3kB/s]
Downloading config.json: 100%|██████████| 911/911 [00:00<00:00, 228kB/s]
Downloading vocab.txt: 100%|██████████| 266k/266k [00:00<00:00, 598kB/s] 
Downloading tokenizer.json: 100%|██████████| 507k/507k [00:00<00:00, 902kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 28.0kB/s]
Downloading pytorch_model.bin: 100%|██████████| 422M/422M [00:09<00:00, 47.3MB/s] 


0.22346660085836195


In [17]:
# sentiment-hts2-hubert-hungarian
# Load the sequence-classification checkpoint and evaluate it with
# get_avg_res_sentiment, which compares output[0][-1] for each word.
# NOTE(review): output[0] of a sequence-classification model is presumably the
# class logits, so the distance is between logit vectors -- confirm.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("NYTK/sentiment-hts2-hubert-hungarian")
model = AutoModelForSequenceClassification.from_pretrained("NYTK/sentiment-hts2-hubert-hungarian")

get_avg_res_sentiment(tokenizer,model)

Downloading tokenizer_config.json: 100%|██████████| 346/346 [00:00<00:00, 49.4kB/s]
Downloading config.json: 100%|██████████| 681/681 [00:00<00:00, 136kB/s]
Downloading vocab.txt: 100%|██████████| 266k/266k [00:00<00:00, 571kB/s] 
Downloading tokenizer.json: 100%|██████████| 507k/507k [00:00<00:00, 927kB/s]  
Downloading special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 28.1kB/s]
Downloading pytorch_model.bin: 100%|██████████| 422M/422M [00:08<00:00, 51.6MB/s] 


0.41677919550617826


In [18]:
# roberta-base
# Load the tokenizer and the bare RobertaModel, then print the average synonym
# cosine distance over DICT via get_avg_res.
# torch is not referenced directly in this cell; get_avg_res_xlm100 reads the
# global name torch when it is called.
from transformers import RobertaTokenizer, RobertaModel
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

get_avg_res(tokenizer,model)

Downloading vocab.json: 100%|██████████| 878k/878k [00:00<00:00, 1.29MB/s]
Downloading merges.txt: 100%|██████████| 446k/446k [00:00<00:00, 833kB/s] 
Downloading config.json: 100%|██████████| 481/481 [00:00<00:00, 80.2kB/s]
Downloading pytorch_model.bin: 100%|██████████| 478M/478M [00:18<00:00, 26.7MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificatio

0.0015747680956003617


In [19]:
# xlm-roberta-base
# Load the checkpoint with a masked-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): with AutoModelForMaskedLM, output[0] is presumably the LM-head
# logits rather than hidden states -- confirm this is intended.
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")

get_avg_res(tokenizer,model)

0.0042058442320142475


In [20]:
# xlm-roberta-large
# Load the checkpoint with a masked-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): with AutoModelForMaskedLM, output[0] is presumably the LM-head
# logits rather than hidden states -- confirm this is intended.
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

get_avg_res(tokenizer,model)

Downloading config.json: 100%|██████████| 616/616 [00:00<00:00, 205kB/s]
Downloading sentencepiece.bpe.model: 100%|██████████| 4.83M/4.83M [00:06<00:00, 725kB/s] 
Downloading tokenizer.json: 100%|██████████| 8.68M/8.68M [00:03<00:00, 2.53MB/s]
Downloading pytorch_model.bin: 100%|██████████| 2.09G/2.09G [00:38<00:00, 58.7MB/s]


0.006280950137547084


In [21]:
# xlm-mlm-100-1280
# Load the XLM checkpoint with its LM head and evaluate it with
# get_avg_res_xlm100, which passes a langs tensor filled with
# tokenizer.lang2id["hu"] alongside the input ids.
# torch must be importable as a global here because get_avg_res_xlm100
# references the name torch.
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-100-1280")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-100-1280")

get_avg_res_xlm100(tokenizer,model)

Downloading vocab.json: 100%|██████████| 5.45M/5.45M [00:03<00:00, 1.83MB/s]
Downloading merges.txt: 100%|██████████| 2.84M/2.84M [00:01<00:00, 2.33MB/s]
Downloading tokenizer_config.json: 100%|██████████| 2.29k/2.29k [00:00<00:00, 470kB/s]
Downloading config.json: 100%|██████████| 41.0k/41.0k [00:00<00:00, 181kB/s] 
Downloading pytorch_model.bin: 100%|██████████| 1.06G/1.06G [00:27<00:00, 42.2MB/s]
Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.3686501758554123


In [5]:
# xlm-roberta-xl
# Load the checkpoint with a masked-LM head and print the average synonym
# cosine distance over DICT via get_avg_res.
# NOTE(review): this cell's execution count (5) is out of sequence with its
# position in the notebook; re-run top to bottom to confirm the result.
from transformers import AutoTokenizer, AutoModelForMaskedLM

# xxl - 40G+ (author's note; presumably the reason the xl checkpoint is used instead -- confirm)
tokenizer = AutoTokenizer.from_pretrained("facebook/xlm-roberta-xl")
model = AutoModelForMaskedLM.from_pretrained("facebook/xlm-roberta-xl")

get_avg_res(tokenizer,model)
# 10 key dict - 8 min (author's timing note for this cell)

  from .autonotebook import tqdm as notebook_tqdm


0.08749399410218608
