# all text encoders

In [1]:
import pprint
# Pretty-printer for inspecting nested dicts: wide lines, 4-space indent,
# keys kept in insertion order. All other options stay at their defaults.
pp = pprint.PrettyPrinter(indent=4, width=200, sort_dicts=False)

In [1]:
import json

# Read the synonym/antonym dictionary. The cells below access each value
# through its "syn" and "ant" keys.
with open('../pdf_to_dictionary/english_syn_ant.json', encoding='utf-8') as f:
    d = json.loads(f.read())

In [2]:
import numpy as np
from numpy.linalg import norm

def my_cosine_similarity(A, B):
    """Return the cosine similarity ``dot(A, B) / (||A|| * ||B||)``.

    Parameters
    ----------
    A, B : array-like
        Anything accepted by ``np.dot`` and ``numpy.linalg.norm``; the
        notebook passes equal-length 1-D vectors (lists or NumPy arrays).

    Returns
    -------
    float
        Cosine of the angle between ``A`` and ``B``. When either vector has
        zero norm the quotient is undefined; ``0.0`` is returned instead of
        the ``nan`` (plus RuntimeWarning) that the plain division produces,
        so a single degenerate vector cannot poison an accumulated average.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        return 0.0
    return np.dot(A, B) / denominator

In [3]:
# Make a smaller test dict from the first entries of d (insertion order).
SMALL_DICT_SIZE = 10
d_keys_li = list(d.keys())
# Slicing (rather than indexing over range(0, 10)) avoids an IndexError when
# d holds fewer than SMALL_DICT_SIZE entries.
smaller_dict = {key: d[key] for key in d_keys_li[:SMALL_DICT_SIZE]}
# pp.pprint(smaller_dict)

In [4]:
# DICT is the dictionary that the get_avg_res* functions iterate over.
# d - full test dictionary
# smaller_dict - 10 key dictionary
# Uncomment the next line (and comment out the one after it) for a quick run
# on the small dictionary.
# DICT = smaller_dict
DICT = d

In [8]:
# Show only the first entry of DICT to check the "syn" / "ant" layout.
first_entry = next(iter(DICT.items()), None)
if first_entry is not None:
    baseword, v = first_entry
    print(baseword, v["syn"], v["ant"])

aback ['backwards', 'rearwards', 'aft', 'abaft', 'astern', 'behind', 'back'] ['onwards', 'forwards', 'ahead', 'before', 'afront', 'beyond', 'afore']


In [9]:
def get_avg_res(tokenizer, model, word_dict=None):
    """Print and return the mean cosine similarity of base words to their synonyms and antonyms.

    Each word is passed through ``model`` on its own. Its vector is
    ``output[0][0][-1]``: the first element of the model output, row 0, last
    entry along the following axis (the last token position for the 3-D
    tensors shown in this notebook's outputs).

    NOTE(review): for tokenizers that append special tokens, the last position
    is presumably an end-of-sequence marker rather than a word piece, and for
    LM-head models the first output appears to be logits rather than hidden
    states -- confirm this is the representation you want to compare.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(word, return_tensors='pt')``; the result is
        unpacked into ``model(**encoded)``.
    model : callable
        Returns an object whose ``[0][0][-1]`` is a 1-D torch tensor.
    word_dict : dict, optional
        Mapping ``base word -> {"syn": [...], "ant": [...]}``. Defaults to the
        notebook-level ``DICT`` so existing two-argument calls are unchanged.

    Returns
    -------
    tuple of float
        ``(avg_syn, avg_ant)``. An average is ``nan`` when there were no pairs
        of that kind (previously this raised ZeroDivisionError). Both values
        are also printed, as before.
    """
    # Local import: the notebook only imports torch in later cells.
    import torch

    if word_dict is None:
        word_dict = DICT

    def _embed(word):
        # Vector for one word: last position of the model's first output.
        encoded = tokenizer(word, return_tensors='pt')
        # Inference only -- skip building the autograd graph.
        with torch.no_grad():
            output = model(**encoded)
        return output[0][0][-1].detach().numpy()

    # Python floats keep the accumulation in double precision even though the
    # individual similarities are computed from float32 tensors.
    sum_syn = 0.0
    sum_ant = 0.0
    cnt_syn = 0
    cnt_ant = 0

    for baseword, v in word_dict.items():
        embedded_base = _embed(baseword)

        for synonym in v["syn"]:
            sum_syn += float(my_cosine_similarity(embedded_base, _embed(synonym)))
            cnt_syn += 1

        for antonym in v["ant"]:
            sum_ant += float(my_cosine_similarity(embedded_base, _embed(antonym)))
            cnt_ant += 1

        # Progress indicator, overwritten in place via the carriage return.
        print(cnt_syn, cnt_ant, end="\r")

    avg_syn = sum_syn / cnt_syn if cnt_syn else float("nan")
    avg_ant = sum_ant / cnt_ant if cnt_ant else float("nan")
    print("syn", avg_syn)
    print("ant", avg_ant)
    return avg_syn, avg_ant

In [25]:
def get_avg_res_sentiment(tokenizer, model, word_dict=None):
    """Print and return mean base-word/synonym and base-word/antonym cosine similarity.

    Variant of the evaluation for models whose first output is indexed as
    ``output[0][-1]`` (the last row of the first model output). The notebook
    calls it with sequence-classification models.

    NOTE(review): for such models the first output is presumably the
    per-class logits, so the similarity is taken over class scores rather
    than over an embedding -- confirm that is intended.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(word, return_tensors='pt')``; the result is
        unpacked into ``model(**encoded)``.
    model : callable
        Returns an object whose ``[0][-1]`` is a 1-D torch tensor.
    word_dict : dict, optional
        Mapping ``base word -> {"syn": [...], "ant": [...]}``. Defaults to the
        notebook-level ``DICT`` so existing two-argument calls are unchanged.

    Returns
    -------
    tuple of float
        ``(avg_syn, avg_ant)``. An average is ``nan`` when there were no pairs
        of that kind (previously this raised ZeroDivisionError). Both values
        are also printed, as before.
    """
    # Local import: the notebook only imports torch in later cells.
    import torch

    if word_dict is None:
        word_dict = DICT

    def _embed(word):
        # Vector for one word: last row of the model's first output.
        encoded = tokenizer(word, return_tensors='pt')
        # Inference only -- skip building the autograd graph.
        with torch.no_grad():
            output = model(**encoded)
        return output[0][-1].detach().numpy()

    # Python floats keep the accumulation in double precision even though the
    # individual similarities are computed from float32 tensors.
    sum_syn = 0.0
    sum_ant = 0.0
    cnt_syn = 0
    cnt_ant = 0

    for baseword, v in word_dict.items():
        embedded_base = _embed(baseword)

        for synonym in v["syn"]:
            sum_syn += float(my_cosine_similarity(embedded_base, _embed(synonym)))
            cnt_syn += 1

        for antonym in v["ant"]:
            sum_ant += float(my_cosine_similarity(embedded_base, _embed(antonym)))
            cnt_ant += 1

        # Progress indicator, overwritten in place via the carriage return.
        print(cnt_syn, cnt_ant, end="\r")

    avg_syn = sum_syn / cnt_syn if cnt_syn else float("nan")
    avg_ant = sum_ant / cnt_ant if cnt_ant else float("nan")
    print("syn", avg_syn)
    print("ant", avg_ant)
    return avg_syn, avg_ant

In [26]:
def get_avg_res_xlm100(tokenizer, model, word_dict=None, lang="hu"):
    """Print and return mean synonym/antonym cosine similarity for an XLM model with language ids.

    Each word is encoded with ``tokenizer.encode`` and passed to the model
    together with a ``langs`` tensor that repeats the id of ``lang`` once per
    input token. The word vector is ``output[0][0][-1]`` converted to a list.

    NOTE(review): the default language is "hu" while the dictionary loaded in
    this notebook comes from ``english_syn_ant.json`` -- confirm which
    language id is intended.

    Parameters
    ----------
    tokenizer : object
        Must provide ``lang2id`` (mapping language code -> id) and
        ``encode(word)``.
    model : callable
        Called as ``model(input_ids, langs=langs)``.
    word_dict : dict, optional
        Mapping ``base word -> {"syn": [...], "ant": [...]}``. Defaults to the
        notebook-level ``DICT`` so existing two-argument calls are unchanged.
    lang : str, optional
        Key looked up in ``tokenizer.lang2id``. Defaults to ``"hu"``, the
        value that used to be hard-coded.

    Returns
    -------
    tuple of float
        ``(avg_syn, avg_ant)``. An average is ``nan`` when there were no pairs
        of that kind (previously this raised ZeroDivisionError). Both values
        are also printed, as before.
    """
    # Local import: the cell defining this function does not import torch; it
    # previously relied on an import made by a later cell.
    import torch

    if word_dict is None:
        word_dict = DICT

    language_id = tokenizer.lang2id[lang]

    def _embed(word):
        # Vector for one word, computed with a per-token language id.
        input_ids = torch.tensor([tokenizer.encode(word)])
        # One language id per token, reshaped to (1, number of tokens).
        langs = torch.tensor([language_id] * input_ids.shape[1]).view(1, -1)
        # Inference only -- skip building the autograd graph.
        with torch.no_grad():
            output = model(input_ids, langs=langs)
        return output[0][0][-1].tolist()

    sum_syn = 0.0
    sum_ant = 0.0
    cnt_syn = 0
    cnt_ant = 0

    for baseword, v in word_dict.items():
        embedded_base = _embed(baseword)

        for synonym in v["syn"]:
            sum_syn += float(my_cosine_similarity(embedded_base, _embed(synonym)))
            cnt_syn += 1

        for antonym in v["ant"]:
            sum_ant += float(my_cosine_similarity(embedded_base, _embed(antonym)))
            cnt_ant += 1

        # Progress indicator, overwritten in place via the carriage return.
        print(cnt_syn, cnt_ant, end="\r")

    avg_syn = sum_syn / cnt_syn if cnt_syn else float("nan")
    avg_ant = sum_ant / cnt_ant if cnt_ant else float("nan")
    print("syn", avg_syn)
    print("ant", avg_ant)
    return avg_syn, avg_ant

In [14]:
# bert-base-multilingual-uncased
# Run time: 21m
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder come from the same checkpoint; then score the dictionary.
checkpoint = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.7567041850302905


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0719,  0.0129,  0.1084,  ..., -0.0177,  0.0109, -0.0168],
         [ 0.0741,  0.4268,  0.2884,  ..., -0.1076,  0.2218, -0.0412],
         [-0.3344, -0.1295, -0.0935,  ...,  0.2953, -0.3330, -0.4336],
         [-0.1503, -0.7483, -0.0366,  ...,  0.0960,  0.3610,  0.2214],
         [ 0.0396,  0.5546,  0.1998,  ..., -0.1647, -0.0531, -0.2683],
         [-0.8048,  1.0132,  0.5755,  ...,  0.7749, -0.1809, -1.0477]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.5644e-01, -7.0881e-04,  2.2195e-01,  1.5884e-01,  1.9606e-01,
          4.1540e-01,  1.5336e-01, -1.3350e-01, -1.5227e-01,  2.8029e-01,
         -2.2688e-01, -2.1446e-01,  2.6676e-01, -1.9476e-01, -2.4235e-01,
          1.0252e-01,  1.7405e-01,  1.2447e-01,  2.1441e-01,  9.2159e-02,
         -3.6972e-02, -3.6511e-02,  8.7314e-02, -7.4431e-03,  2.7466e-01,
         -1.7619e-01,  2.4513e-01,  1.0325e-01,  3.8374e-01,  3.1987e-01,
       

In [15]:
# bert-base-multilingual-cased
# Run time: 19m
from transformers import BertTokenizer, BertModel

# Tokenizer and encoder come from the same checkpoint; then score the dictionary.
checkpoint = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.6126954996837073


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0352, -0.0060,  0.3350,  ...,  0.2529,  0.3051,  0.3122],
         [-0.4581, -0.3785,  0.2395,  ...,  0.4396,  0.6502,  0.4630],
         [ 0.0305, -0.5363,  0.1183,  ...,  0.1776,  0.3533,  0.5092],
         [-0.0905, -0.9662,  1.0015,  ...,  0.0550,  0.5291,  0.5825],
         [-0.0639, -0.1973,  1.1161,  ...,  0.4076,  0.0489,  0.4222],
         [-0.0270, -0.0967,  0.9523,  ...,  0.1293,  0.4513,  0.7500]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.8316e-01, -1.4452e-01,  2.4417e-01, -2.9282e-02, -8.9008e-03,
          3.3721e-01,  1.0858e-01,  2.7783e-01, -3.1669e-01,  1.9912e-01,
          7.7042e-02, -6.6136e-02, -8.7563e-02, -1.9693e-01,  2.8700e-01,
         -1.2130e-01,  3.6513e-01,  3.8714e-02,  6.1837e-02, -2.7817e-01,
         -9.9997e-01, -3.0202e-01, -1.9889e-01, -1.4710e-01, -3.0429e-01,
          1.0936e-01,  3.7754e-02,  1.2235e-01,  1.3897e-01, -1.5806e-01,
       

In [16]:
# text-generation-news-gpt2-small-hungarian
# Run time: 21m
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): the captured output of this cell shows a causal-LM output
# whose tensor field is `logits`, so get_avg_res presumably compares
# vocabulary logits rather than hidden states here -- confirm intended.
checkpoint = "NYTK/text-generation-news-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

0.9158712517996491


CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -7.6904,  -7.6700, -10.2259,  ...,  -4.4806,  -6.2410,  -2.9573],
         [-16.1794, -16.4799, -17.6944,  ...,  -5.7863,  -9.9725, -10.2662],
         [-27.4192, -27.5182, -25.3408,  ...,  -9.7856, -10.2534, -14.2252]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 1.4389, -0.9259,  3.1674,  ..., -0.1491,  0.7358,  0.4069],
          [ 2.1965,  0.1825,  2.4464,  ..., -1.2019,  2.2887,  1.0410],
          [ 2.1519, -0.5390,  2.3420,  ..., -1.7927,  4.5300,  1.1086]],

         [[-1.1474,  0.0534, -0.8365,  ..., -0.0897, -0.9978,  0.8247],
          [ 0.0865,  0.8130,  0.7836,  ..., -0.6327, -0.0927,  2.2687],
          [-1.3778, -0.4761, -1.7145,  ...,  0.4187, -0.2740,  2.0218]],

         [[ 1.0041, -1.1440,  0.6039,  ..., -0.1498,  0.7908,  0.7858],
          [ 0.2596, -0.2843,  0.5057,  ...,  0.3007,  0.4995,  0.3799],
          [ 1.0470, -0.5683, -0.1906,  ...,  0.2936, -0.6122,  1.0293]],

    

In [17]:
# text-generation-poem-petofi-gpt2-small-hungarian
# Run time: 21m
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): the captured output of this cell shows a causal-LM output
# whose tensor field is `logits`, so get_avg_res presumably compares
# vocabulary logits rather than hidden states here -- confirm intended.
checkpoint = "NYTK/text-generation-poem-petofi-gpt2-small-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

0.9276126429198216


CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -2.6028,  -3.4230,   2.0850,  ...,  -3.6229,  -5.5268,  -5.3679],
         [ -3.4194,  -5.7300,  -1.9777,  ...,  -5.3644,  -8.9462,  -8.1705],
         [ -2.5727,  -5.5653,  -2.8149,  ...,  -9.1339, -11.3441, -10.5183]]],
       grad_fn=<UnsafeViewBackward0>), past_key_values=((tensor([[[[ 6.2375e-01, -5.4565e-01,  3.0739e+00,  ...,  1.1495e+00,
            1.8797e-01, -1.0632e-01],
          [ 1.6363e+00, -6.1063e-01,  9.6761e-01,  ..., -2.6795e-01,
            1.7213e+00,  7.1853e-01],
          [ 1.3586e+00, -4.7224e-01,  2.1549e+00,  ..., -1.0640e+00,
            3.9966e+00,  9.6106e-01]],

         [[-9.4393e-01,  8.7675e-01, -4.7710e-01,  ...,  2.6136e-01,
           -1.0788e+00,  1.6523e+00],
          [ 1.0532e+00,  1.3930e-01,  4.8374e-01,  ..., -4.1034e-01,
            1.6854e-01,  1.6128e+00],
          [-1.7038e+00, -5.0348e-01, -2.0477e+00,  ...,  5.4685e-01,
           -2.0591e-01,  7.0547e-01]],

         [[ 

In [18]:
# hubert-base-cc
# Run time: 18m
from transformers import AutoModel, AutoTokenizer

# Encoder and tokenizer come from the same checkpoint; then score the dictionary.
checkpoint = "SZTAKI-HLT/hubert-base-cc"
model = AutoModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

Some weights of the model checkpoint at SZTAKI-HLT/hubert-base-cc were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.9959319631118843


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.7243,  0.1612,  1.0162,  ...,  0.0296, -1.0394, -0.6103],
         [ 0.1038,  0.0367, -0.2054,  ..., -0.3088, -0.2042,  0.0630],
         [-0.0236, -0.0078, -0.1343,  ..., -0.1722, -0.2199,  0.1102],
         [ 0.1544,  0.3130,  0.2988,  ...,  0.1123, -0.1340, -0.0057],
         [ 0.1782, -0.0350,  0.2302,  ...,  0.0378, -0.0184,  0.1245]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 0.2092,  0.8206,  0.6497, -0.8355,  0.1763,  0.8134,  0.6644, -0.8871,
          0.8742, -0.7669, -0.6076,  0.1038, -0.9331,  0.9627,  0.7116,  0.3723,
          0.9161, -0.9955, -0.6619, -0.4602,  0.8201, -0.7894, -0.3004,  0.5493,
         -0.9972, -0.7139,  0.5849,  0.7389,  0.9486, -0.3229, -0.6968, -0.7561,
          0.7681,  0.5077, -0.6867, -0.7087, -0.8041, -0.2222,  0.0110,  0.9801,
         -0.9728, -0.6081, -0.7769,  0.6002, -0.7114,  0.6427, -0.7834, -0.7847,
         -0.7656,  0.3371, -0.5005, 

In [27]:
# mGPT
# Run time: 234m
from transformers import AutoTokenizer, AutoModelForCausalLM

# NOTE(review): loaded with a causal-LM head, so the first model output is
# presumably logits rather than hidden states -- confirm intended.
checkpoint = "sberbank-ai/mGPT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

0.9930734286141195


In [28]:
# sentiment-hts2-xlm-roberta-hungarian
# Run time: 11m
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sequence-classification checkpoint, scored with the sentiment variant of the evaluation.
checkpoint = "NYTK/sentiment-hts2-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

get_avg_res_sentiment(tokenizer, model)

-0.060030954703928434


In [29]:
# sentiment-hts5-xlm-roberta-hungarian
# Run time: 18m
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sequence-classification checkpoint, scored with the sentiment variant of the evaluation.
checkpoint = "NYTK/sentiment-hts5-xlm-roberta-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

get_avg_res_sentiment(tokenizer, model)

0.7071057470567297


In [30]:
# sentiment-hts5-hubert-hungarian
# Run time: 18m
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sequence-classification checkpoint, scored with the sentiment variant of the evaluation.
checkpoint = "NYTK/sentiment-hts5-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

get_avg_res_sentiment(tokenizer, model)

0.49708486253667966


In [31]:
# sentiment-hts2-hubert-hungarian
# Run time: 17m
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Sequence-classification checkpoint, scored with the sentiment variant of the evaluation.
checkpoint = "NYTK/sentiment-hts2-hubert-hungarian"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

get_avg_res_sentiment(tokenizer, model)

-0.3236603515804651


In [32]:
# roberta-base
# Run time: 22m
from transformers import RobertaTokenizer, RobertaModel
import torch

# Tokenizer and encoder come from the same checkpoint; then score the dictionary.
checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaModel.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.9970597336727028


In [33]:
# xlm-roberta-base
# Run time: 43m
from transformers import AutoTokenizer, AutoModelForMaskedLM

# NOTE(review): loaded with a masked-LM head, so the first model output is
# presumably logits rather than hidden states -- confirm intended.
checkpoint = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

0.9956437026142038


In [34]:
# xlm-roberta-large
# Run time: 82m
from transformers import AutoTokenizer, AutoModelForMaskedLM

# NOTE(review): loaded with a masked-LM head, so the first model output is
# presumably logits rather than hidden states -- confirm intended.
checkpoint = 'xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)

0.9926321683722512


In [35]:
# xlm-mlm-100-1280
# Run time: 109m
from transformers import XLMTokenizer, XLMWithLMHeadModel
import torch

# XLM checkpoint, scored with the variant that passes explicit language ids.
checkpoint = "xlm-mlm-100-1280"
tokenizer = XLMTokenizer.from_pretrained(checkpoint)
model = XLMWithLMHeadModel.from_pretrained(checkpoint)

get_avg_res_xlm100(tokenizer, model)

Some weights of XLMWithLMHeadModel were not initialized from the model checkpoint at xlm-mlm-100-1280 and are newly initialized: ['transformer.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.6271096338710185


In [36]:
# xlm-roberta-xl
from transformers import AutoTokenizer, AutoModelForMaskedLM

# xxl - 40G+
# NOTE(review): loaded with a masked-LM head, so the first model output is
# presumably logits rather than hidden states -- confirm intended.
checkpoint = "facebook/xlm-roberta-xl"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

get_avg_res(tokenizer, model)
# 10 key dict - 8 min

0.866267628416661
