In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

#rugpt3large_based_on_gpt2
model_rugpt3large = GPT2LMHeadModel.from_pretrained('sberbank-ai/rugpt3large_based_on_gpt2')
word_embeddings_rugpt3large = model_rugpt3large.transformer.wte.weight  # Word Token Embeddings 
position_embeddings_rugpt3large = model_rugpt3large.transformer.wpe.weight  # Word Position Embeddings 

tokenizer_rugpt3large = GPT2Tokenizer.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2")

# MGPT
model_mgpt = GPT2LMHeadModel.from_pretrained('sberbank-ai/mGPT')
word_embeddings_mgpt = model_mgpt.transformer.wte.weight  # Word Token Embeddings 
position_embedding_mgpt = model_mgpt.transformer.wpe.weight  # Word Position Embeddings 

tokenizer_mgpt = GPT2Tokenizer.from_pretrained("sberbank-ai/mGPT")

# Far Away From Center with NGT ANN

In [140]:
import torch
word_embeddings_centroid_mgpt = torch.mean(word_embeddings_mgpt, dim=0)
word_embeddings_sum_mgpt = torch.sum(word_embeddings_mgpt, dim=0)

In [3]:
import ngtpy

# dim = 1536
dim = 2048

# ngtpy.create(b"mgpt_Word_Embs", dim)
# index = ngtpy.Index(b"mgpt_Word_Embs")
# index.batch_insert(word_embeddings_mgpt.detach().numpy())
# index.save()
index = ngtpy.Index(b"mgpt_Word_Embs")

In [11]:
# query = word_embeddings[468].detach().numpy()
query = word_embeddings_centroid_mgpt.detach().numpy()
results = index.search(query, 100000, epsilon=100.0, edge_size=30000)
# results = index.linear_search(query, 100000)
anomaly_tokens = []
for i, (id, distance) in enumerate(results) :
    if i > 99000:
        # print(str(i) + ": " + str(id) + ", " + str(distance))
        # print(tokenizer.decode([id]))
        anomaly_tokens.append(tokenizer_mgpt.decode([id]))
        # object = index.get_object(id)
with open("anomaly_tokens_mgpt.txt", "w") as f:
    f.write("\n".join(anomaly_tokens))

In [10]:
# vocab_decoded = []
# for i in range(50257):
#     vocab_decoded.append(tokenizer.decode([i]))

# K-MEANS Grouping method

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import os

for n_clusters in [50,200,500,2000]:
    # making k-means grouping
    # X = word_embeddings_mgpt.detach()
    X = word_embeddings_rugpt3large.detach()
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    
    # grouping in dict
    embs_groups = {}
    for idx, label in enumerate(kmeans.labels_):
        # tokens_list = embs_groups.get(label, []) + [tokenizer_mgpt.decode([idx])]
        tokens_list = embs_groups.get(label, []) + [tokenizer_rugpt3large.decode([idx])]
        embs_groups[label] = tokens_list
    
    # dumping in txt files
    os.makedirs(f'ruGPT3-large_{str(n_clusters)}clusters', exist_ok = True)
    for i in range(len(embs_groups)):
        cluster_num = str(i).rjust(4, '0')
        with open(f'ruGPT3-large_{str(n_clusters)}clusters/cluster_№{cluster_num}.txt', 'w') as f:
            f.write("\n".join(embs_groups[i]))

# Testing tokenizer

In [90]:
test_str = " " #Medium Mathematical Space (MMSP)
test_str = " " #No-Break Space (NBSP)
test_str = " " #Mac Space
test_str = "؜" # ARABIC LETTER MARK

tokenizer_mgpt.decode(tokenizer_mgpt.encode(test_str))

'\u061c'

In [95]:
# messing with generation placing spaces cluster1753
test_str = "ури нотерапия"
test_str = "уре тра"
test_str = "куры парня"
test_str = "шуры парня"
test_str = "уру сских"
# test_str = "ура нтоварищи"

for token in tokenizer_mgpt.encode(test_str):
    print(tokenizer_mgpt.decode([token]))

уру
 с
ских


In [91]:
# messing with generation placing spaces cluster1767
test_str = "вид экономики ؜"

for token in tokenizer_mgpt.encode(test_str):
    print(tokenizer_mgpt.decode([token]))

вид
 экономики
 �
�


In [96]:
# messing with generation placing spaces cluster1801
test_str = "<mask>"

for token in tokenizer_mgpt.encode(test_str):
    print(tokenizer_mgpt.decode([token]))

<
mas
k
>


## ruGPT3-large

In [97]:
# messing with generation placing spaces cluster1801
test_str = "<mask>"

for token in tokenizer_rugpt3large.encode(test_str):
    print(tokenizer_rugpt3large.decode([token]))

<
m
ask
>


# exploring inside

In [99]:
# model_mgpt.transformer

In [118]:
batch_size = 1
test_str = "уру сских"
ids = tokenizer_mgpt.encode(test_str, return_tensors="pt")
model_mgpt(ids)[0].view(batch_size,-1).shape

torch.Size([1, 300000])

# what if we just select closest embedding to sum of prompt embs?
we get embedding of words in it lol

In [137]:
test_str = "Российская федерация чувствует себя замечательно в этом экономическом цикле"

result_emb = torch.zeros((1,2048))
for token in tokenizer_mgpt.encode(test_str):
    result_emb += word_embeddings_mgpt[token]
    print(tokenizer_mgpt.decode([token]))
# result_emb /= len(tokenizer_mgpt.encode(test_str))

Р
оссий
ская
 федера
ция
 чув
ствует
 себя
 замеч
ательно
 в
 этом
 эконом
ическом
 цик
ле


In [139]:
results = index.search(result_emb.detach().numpy(), 15)
for i, (id, distance) in enumerate(results) :
    print(tokenizer_mgpt.decode([id]))

 в
 себя
 и
 на
 с
ская
 не
ле
 к
 от
 федера
 В
 этом
 по
 т


# Misha's idea with circle group

In [202]:
from glob import glob
from tqdm import tqdm

emb_clusters = {}
# distance_info_clusters = {}
word_clusters = sorted(glob('mGPT_2000clusters/*'))
for cluster_fn in tqdm(word_clusters, total=len(word_clusters)):
    # cluster_emb = torch.zeros((1,2048))
    with open(cluster_fn, 'r') as f:    
        tokens = f.read().split('\n')
    euclid_dist_embs = [0] * len(tokens)
    for token_utf8_idx in range(len(tokens)):
        for token_utf8_idx_next in range(token_utf8_idx+1, len(tokens)):
            # print(token_utf8)
            curr_token_emb = None
            next_token_emb = None
            token_curr = tokenizer_mgpt.encode(tokens[token_utf8_idx])
            token_next = tokenizer_mgpt.encode(tokens[token_utf8_idx_next])
            # print(token_curr, token_next)
            if len(token_curr) > 1 or len(token_next) > 1:
                continue
            try:
                curr_token_emb = word_embeddings_mgpt[token_curr]
                next_token_emb = word_embeddings_mgpt[token_next]
                emb_clusters[cluster_fn] = curr_token_emb
                
            except:
                pass
            # euclidean distance https://stackoverflow.com/questions/68220457/to-calculate-euclidean-distance-between-vectors-in-a-torch-tensor-with-multiple
            try:
                dist = (curr_token_emb - next_token_emb).pow(2).sum(1).sqrt()
            except:
                print(curr_token_emb.shape, next_token_emb.shape)
            try:
                euclid_dist_embs[token_utf8_idx] += float(dist)
                euclid_dist_embs[token_utf8_idx_next] += float(dist)
            except:
                print(dist, token)
        # distance_info_clusters[cluster_fn] = [token_curr, tokens, euclid_dist_embs]
        
    euclid_dist_embs = [i for i in euclid_dist_embs if i != 0.0]
#     print('\nв группе', tokens, '\n самый далекий ото всех', tokens[euclid_dist_embs.index(max(euclid_dist_embs))], 
#           '\nк самому близкому у него отношение', max(euclid_dist_embs)/min(euclid_dist_embs), 
#           '\n самый близкий ко всем', tokens[euclid_dist_embs.index(min(euclid_dist_embs))], )
    

 48%|████▊     | 955/2000 [14:19<07:54,  2.20it/s]  

tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBack

 77%|███████▋  | 1548/2000 [20:17<04:54,  1.54it/s]  

tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBackward0>)
tensor([], grad_fn=<SqrtBack

100%|██████████| 2000/2000 [26:04<00:00,  1.28it/s]


In [204]:
import pickle

with open('distance_info_clusters.pkl', 'wb') as f:
    pickle.dump(distance_info_clusters, f)

In [203]:
len(distance_info_clusters) # distance_info_clusters.keys()

2000

In [214]:
token_embs = []
for info_pack in distance_info_clusters.values():
    token_curr = tokenizer_mgpt.encode(info_pack[1][info_pack[2].index(max(info_pack[2]))])
    curr_token_emb = word_embeddings_mgpt[token_curr]
    token_embs.append(curr_token_emb)
    # print(info_pack[1][info_pack[2].index(max(info_pack[2]))])

In [215]:
# len(token_embs)

2000

In [None]:
test_str = "Carex"

result_emb = torch.zeros((1,2048))
for token in tokenizer_mgpt.encode(test_str):
    result_emb += word_embeddings_mgpt[token]
    print(tokenizer_mgpt.decode([token]))

In [216]:
dim = 2048

ngtpy.create(b"mgpt_Word_Embs_2kClusters_Misha_RingIdea", dim)
index = ngtpy.Index(b"mgpt_Word_Embs_2kClusters_Misha_RingIdea")
index.batch_insert(torch.cat(token_embs, 0).detach().numpy())
index.save()