# all text encoders

In [1]:
import pprint
# Shared pretty-printer: wide lines, 4-space indent, dict keys kept in insertion order.
pp = pprint.PrettyPrinter(
    indent=4,
    width=200,
    sort_dicts=False,
)

In [2]:
import json

# Read the dictionary stored in szinonimak.json into `d`.
with open('../pdf_to_dictionary/szinonimak.json', encoding='utf-8') as synonyms_file:
    d = json.load(synonyms_file)

In [3]:
import numpy as np
from numpy.linalg import norm

def my_cosine_similarity(A, B):
    """Return the cosine similarity of vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.
    """
    magnitude_product = norm(A) * norm(B)
    return np.dot(A, B) / magnitude_product

In [4]:
# make smaller test dict
d_keys_li=list(d.keys())
# Slice the key list instead of indexing positions 0..9, so a dictionary with
# fewer than 10 entries yields a shorter test dict rather than an IndexError.
smaller_dict={key: d[key] for key in d_keys_li[:10]}
# pp.pprint(smaller_dict)

In [5]:
# d - full test dictionary
# smaller_dict - 10 key dictionary
# DICT is the module-level dictionary that the get_avg_res_* functions iterate over.
# DICT = smaller_dict
DICT = d

In [30]:
def get_avg_res_bert(tokenizer,model):
    """Average cosine similarity between every DICT key and its related words.

    For each ``key -> value`` entry of the module-level ``DICT``, the key and
    every element of ``value[1]`` are passed through ``tokenizer`` and ``model``
    separately. Each word is represented by the mean over axis 0 of
    ``last_hidden_state[0]``; word pairs are compared with
    ``my_cosine_similarity``.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(word, return_tensors='pt')``; the result is
        unpacked as keyword arguments into ``model``.
    model : callable
        Called as ``model(**encoded)``; the result must expose
        ``last_hidden_state``.

    Returns
    -------
    float
        Mean similarity over all (key, related word) pairs. The value is also
        printed.

    Raises
    ------
    ZeroDivisionError
        If ``DICT`` yields no (key, related word) pair.
    """
    import numpy as np
    import torch

    def _mean_embedding(word):
        # Mean of the per-position vectors of the first (only) batch item.
        encoded = tokenizer(word, return_tensors='pt')
        hidden = model(**encoded).last_hidden_state[0]
        return np.average(hidden.detach().numpy(), axis=0)

    total = 0.0
    pair_count = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for base_word, entry in DICT.items():
            # The base embedding is the same for all related words of this key,
            # so compute it once per key instead of once per pair.
            base_embedding = _mean_embedding(base_word)
            for related_word in entry[1]:
                similarity = my_cosine_similarity(base_embedding, _mean_embedding(related_word))
                # float(): keep the accumulator a plain Python float so the
                # result stays JSON-serializable whatever NumPy scalar type
                # the similarity comes back as.
                total += float(similarity)
                pair_count += 1
            # Progress indicator: number of pairs processed so far.
            print(pair_count, end="\r")

    # overall result on the dictionary
    avg = total / pair_count
    print(avg)
    return avg

In [6]:
def get_avg_res_other(tokenizer, model):
    """Average cosine similarity between every DICT key and its related words.

    Same procedure as the ``last_hidden_state`` based evaluation, but for models
    whose output is read through ``hidden_states``: the model is called with
    ``output_hidden_states=True`` and each word is represented by the mean over
    axis 0 of ``hidden_states[-1][0]``. Pairs are compared with
    ``my_cosine_similarity``.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(word, return_tensors='pt')``; the result is
        unpacked as keyword arguments into ``model``.
    model : callable
        Called as ``model(**encoded, output_hidden_states=True)``; the result
        must expose ``hidden_states``.

    Returns
    -------
    float
        Mean similarity over all (key, related word) pairs of the module-level
        ``DICT``. The value is also printed.

    Raises
    ------
    ZeroDivisionError
        If ``DICT`` yields no (key, related word) pair.
    """
    import numpy as np
    import torch

    def _mean_embedding(word):
        # Mean of the per-position vectors taken from the last entry of hidden_states.
        encoded = tokenizer(word, return_tensors='pt')
        output = model(**encoded, output_hidden_states=True)
        return np.average(output.hidden_states[-1][0].detach().numpy(), axis=0)

    total = 0.0
    pair_count = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for base_word, entry in DICT.items():
            # Averaging the base word once per key; it does not depend on the related word.
            base_embedding = _mean_embedding(base_word)
            for related_word in entry[1]:
                similarity = my_cosine_similarity(base_embedding, _mean_embedding(related_word))
                # float(): keep the accumulator a plain Python float so the
                # result stays JSON-serializable whatever NumPy scalar type
                # the similarity comes back as.
                total += float(similarity)
                pair_count += 1
            # Progress indicator: number of pairs processed so far.
            print(pair_count, end="\r")

    # overall result on the dictionary
    avg = total / pair_count
    print(avg)
    return avg

In [32]:
def get_avg_res_xlm100(tokenizer,model):
    """Average cosine similarity over DICT for a model that takes language ids.

    Each word is encoded with ``tokenizer.encode``; a ``langs`` tensor of the
    same length, filled with ``tokenizer.lang2id["hu"]``, is passed alongside
    the input ids. A word is represented by the vector at the last position of
    ``output[0][0]`` (no averaging over positions), and pairs are compared with
    ``my_cosine_similarity``.

    NOTE(review): no cell visible in this notebook calls this function; confirm
    that the intended tokenizer exposes ``lang2id`` with an ``"hu"`` entry.

    Parameters
    ----------
    tokenizer : object
        Must provide ``lang2id`` (mapping with key ``"hu"``) and ``encode(word)``.
    model : callable
        Called as ``model(input_ids, langs=langs)``; the first element of its
        result is indexed as ``[0][-1]``.

    Returns
    -------
    float
        Mean similarity over all (key, related word) pairs of the module-level
        ``DICT``. The value is also printed.

    Raises
    ------
    ZeroDivisionError
        If ``DICT`` yields no (key, related word) pair.
    """
    # Local import: no cell before this definition brings torch into scope.
    import torch

    language_id_hu = tokenizer.lang2id["hu"]

    def _last_position_embedding(word):
        input_ids = torch.tensor([tokenizer.encode(word)])
        # One language id per input position, shaped (1, sequence length).
        langs = torch.tensor([language_id_hu] * input_ids.shape[1]).view(1, -1)
        output = model(input_ids, langs=langs)
        return output[0][0][-1].tolist()

    total = 0.0
    pair_count = 0

    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        for base_word, entry in DICT.items():
            # The base embedding does not depend on the related word: compute once per key.
            base_embedding = _last_position_embedding(base_word)
            for related_word in entry[1]:
                similarity = my_cosine_similarity(base_embedding, _last_position_embedding(related_word))
                total += float(similarity)
                pair_count += 1

    # overall result on the dictionary
    avg = total / pair_count
    print(avg)
    return avg

In [33]:
# Empty result template: one entry per evaluated model, filled in by the
# evaluation cells. Every model that a later cell writes to needs an entry here,
# otherwise the assignment dic_result[<model>][<key>] raises KeyError.
dic_result = {
    "distilbert-base-multilingual-cased" : {},
    "bert-base-multilingual-uncased" : {},
    "bert-base-multilingual-cased" : {},
    "NYTK/text-generation-news-gpt2-small-hungarian" : {},
    "NYTK/text-generation-poem-petofi-gpt2-small-hungarian" : {},
    "SZTAKI-HLT/hubert-base-cc" : {},
    "sberbank-ai/mGPT" : {},
    "NYTK/sentiment-hts2-xlm-roberta-hungarian" : {},
    "NYTK/sentiment-hts5-xlm-roberta-hungarian" : {},
    "NYTK/sentiment-hts5-hubert-hungarian" : {},
    "NYTK/sentiment-hts2-hubert-hungarian" : {},
    "roberta-base" : {},
    "xlm-roberta-base" : {},
    "xlm-roberta-large" : {},
    "xlm-mlm-100-1280" : {},
    "facebook/xlm-roberta-xl" : {}
}
# NOTE: this overwrites any existing results.json with the empty template.
with open("results.json", "w") as resultf:
    json.dump(dic_result, resultf)

In [9]:
# Reload the results stored in results.json and display them.
with open("results.json", "r") as results_file:
    dic_result = json.load(results_file)
dic_result

{'distilbert-base-multilingual-cased': {},
 'bert-base-multilingual-uncased': {'syn_hu': 0.722074735583503,
  'ant_hu': 0.717287334589222,
  'syn_en': 0.79346014996581,
  'ant_en': 0.7750938003989282},
 'bert-base-multilingual-cased': {'syn_hu': 0.5632927332691204,
  'ant_hu': 0.5609021603555736,
  'syn_en': 0.5289188860975226,
  'ant_en': 0.5018978259905483},
 'NYTK/text-generation-news-gpt2-small-hungarian': {'syn_hu': 0.5717219781714749,
  'ant_hu': 0.5415410494257994,
  'syn_en': 0.6182320676318803,
  'ant_en': 0.6159549340590915},
 'NYTK/text-generation-poem-petofi-gpt2-small-hungarian': {'syn_hu': 0.6306462456930798,
  'ant_hu': 0.6006082052976565,
  'syn_en': 0.6326308510037462,
  'ant_en': 0.6294172660219062},
 'SZTAKI-HLT/hubert-base-cc': {'syn_hu': 0.9410662500765817,
  'ant_hu': 0.939179996788529,
  'syn_en': 0.9354769090465983,
  'ant_en': 0.9323585145156933},
 'sberbank-ai/mGPT': {'syn_hu': 0.8669438558143153,
  'ant_hu': 0.85937414462522,
  'syn_en': 0.8728994038749214,
 

In [10]:
# distilbert-base-multilingual-cased
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-multilingual-cased")

# setdefault: the assignment also works when dic_result has no entry for this
# model yet, instead of raising KeyError after the whole evaluation has run.
dic_result.setdefault("distilbert-base-multilingual-cased", {})["syn_hu"] = get_avg_res_other(tokenizer,model)

with open("results.json", "w+") as resultf: json.dump(dic_result, resultf)

  from .autonotebook import tqdm as notebook_tqdm
Downloading tokenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 29.1kB/s]
Downloading config.json: 100%|██████████| 466/466 [00:00<00:00, 289kB/s]
Downloading vocab.txt: 100%|██████████| 972k/972k [00:09<00:00, 101kB/s]  
Downloading tokenizer.json: 100%|██████████| 1.87M/1.87M [00:22<00:00, 89.0kB/s]
Downloading pytorch_model.bin: 100%|██████████| 517M/517M [06:37<00:00, 1.36MB/s] 


0.6824023492810385


In [35]:
# bert-base-multilingual-uncased -> stored under "syn_hu"
# Run time: 21m
from transformers import BertModel, BertTokenizer

model_name = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.722074735583503


In [36]:
# bert-base-multilingual-cased -> stored under "syn_hu"
# Run time: 19m
from transformers import BertModel, BertTokenizer

model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.5632927332691204


In [37]:
# SZTAKI-HLT/hubert-base-cc -> stored under "syn_hu"
# Run time: 18m
from transformers import AutoModel, AutoTokenizer

model_name = "SZTAKI-HLT/hubert-base-cc"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.9410662500765817


In [38]:
# NYTK/text-generation-news-gpt2-small-hungarian -> stored under "syn_hu"
# Run time: 21m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NYTK/text-generation-news-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.5717219781714749


In [39]:
# NYTK/text-generation-poem-petofi-gpt2-small-hungarian -> stored under "syn_hu"
# Run time: 21m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NYTK/text-generation-poem-petofi-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.6306462456930798


In [40]:
# sberbank-ai/mGPT -> stored under "syn_hu"
# Run time: 234m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "sberbank-ai/mGPT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.8669438558143153


In [41]:
# NYTK/sentiment-hts2-xlm-roberta-hungarian -> stored under "syn_hu"
# Run time: 11m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts2-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.34177660500533863


In [42]:
# NYTK/sentiment-hts5-xlm-roberta-hungarian -> stored under "syn_hu"
# Run time: 18m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts5-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.8192366469365135


In [43]:
# NYTK/sentiment-hts5-hubert-hungarian -> stored under "syn_hu"
# Run time: 18m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts5-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.8705190223725047


In [44]:
# NYTK/sentiment-hts2-hubert-hungarian -> stored under "syn_hu"
# Run time: 17m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts2-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.8311294925939299


# This is english only
# roberta-base
# Run time: 22m
from transformers import RobertaTokenizer, RobertaModel
import torch

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaModel.from_pretrained("roberta-base")

# `get_avg_res` is not defined anywhere in this notebook; use the
# last_hidden_state based evaluation instead.
# NOTE(review): confirm this is the intended replacement for the old name.
get_avg_res_bert(tokenizer,model)

In [45]:
# xlm-roberta-base -> stored under "syn_hu"
# Run time: 43m
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.9935592033390457


In [46]:
# xlm-roberta-large -> stored under "syn_hu"
# Run time: 82m
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.994551146293982


In [47]:
# xlm-mlm-100-1280 -> stored under "syn_hu"
# Run time: 109m
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch

model_name = "xlm-mlm-100-1280"
tokenizer = XLMTokenizer.from_pretrained(model_name)
model = XLMWithLMHeadModel.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.576480388738062


In [48]:
# facebook/xlm-roberta-xl -> stored under "syn_hu"
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "facebook/xlm-roberta-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["syn_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.9443521488765352


In [63]:
import json

# Replace the evaluation data: `d` and DICT now hold the content of ellentetek.json,
# so every get_avg_res_* call after this cell iterates over that dictionary.
with open('../pdf_to_dictionary/ellentetek.json', encoding='utf-8') as antonyms_file:
    d = json.load(antonyms_file)

DICT = d

In [50]:
# bert-base-multilingual-uncased -> stored under "ant_hu"
# Run time: 21m
from transformers import BertModel, BertTokenizer

model_name = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.717287334589222


In [51]:
# bert-base-multilingual-cased -> stored under "ant_hu"
# Run time: 19m
from transformers import BertModel, BertTokenizer

model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.5609021603555736


In [52]:
# SZTAKI-HLT/hubert-base-cc -> stored under "ant_hu"
# Run time: 18m
from transformers import AutoModel, AutoTokenizer

model_name = "SZTAKI-HLT/hubert-base-cc"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_bert(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.939179996788529


In [53]:
# NYTK/text-generation-news-gpt2-small-hungarian -> stored under "ant_hu"
# Run time: 21m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NYTK/text-generation-news-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.5415410494257994


In [54]:
# NYTK/text-generation-poem-petofi-gpt2-small-hungarian -> stored under "ant_hu"
# Run time: 21m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "NYTK/text-generation-poem-petofi-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.6006082052976565


In [55]:
# sberbank-ai/mGPT -> stored under "ant_hu"
# Run time: 234m
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "sberbank-ai/mGPT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.85937414462522


In [56]:
# NYTK/sentiment-hts2-xlm-roberta-hungarian -> stored under "ant_hu"
# Run time: 11m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts2-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.2835821206325076


In [64]:
# NYTK/sentiment-hts5-xlm-roberta-hungarian -> stored under "ant_hu"
# Run time: 18m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts5-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.7659460038634724


In [57]:
# NYTK/sentiment-hts5-hubert-hungarian -> stored under "ant_hu"
# Run time: 18m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts5-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.826212971883551


In [58]:
# NYTK/sentiment-hts2-hubert-hungarian -> stored under "ant_hu"
# Run time: 17m
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "NYTK/sentiment-hts2-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.7088386121785658


In [59]:
# xlm-roberta-base -> stored under "ant_hu"
# Run time: 43m
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.993490363018344


In [60]:
# xlm-roberta-large -> stored under "ant_hu"
# Run time: 82m
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.994428756098792


In [61]:
# xlm-mlm-100-1280 -> stored under "ant_hu"
# Run time: 109m
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch

model_name = "xlm-mlm-100-1280"
tokenizer = XLMTokenizer.from_pretrained(model_name)
model = XLMWithLMHeadModel.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.568805721740109


In [62]:
# facebook/xlm-roberta-xl -> stored under "ant_hu"
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_name = "facebook/xlm-roberta-xl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

dic_result[model_name]["ant_hu"] = get_avg_res_other(tokenizer, model)

# Rewrite results.json with the updated dic_result.
with open("results.json", "w+") as resultf:
    json.dump(dic_result, resultf)

0.9432152557837755
