In [1]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
text = ( 
    "Organgutans are native to the rainforest of Indonesia."
    "They are considered to be one of the most intelligent animals."
    "Gorillas are the largest primates in the world."
)

tokens = tokenizer(text, return_tensors="pt")
tokens


{'input_ids': tensor([[  101,  5812, 27920,  6962,  2024,  3128,  2000,  1996, 18951,  1997,
          6239,  1012,  2027,  2024,  2641,  2000,  2022,  2028,  1997,  1996,
          2087,  9414,  4176,  1012, 23526,  2015,  2024,  1996,  2922, 25662,
          2015,  1999,  1996,  2088,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
tokenizer.convert_ids_to_tokens(tokens["input_ids"][0])

['[CLS]',
 'organ',
 '##gut',
 '##ans',
 'are',
 'native',
 'to',
 'the',
 'rainforest',
 'of',
 'indonesia',
 '.',
 'they',
 'are',
 'considered',
 'to',
 'be',
 'one',
 'of',
 'the',
 'most',
 'intelligent',
 'animals',
 '.',
 'gorilla',
 '##s',
 'are',
 'the',
 'largest',
 'primate',
 '##s',
 'in',
 'the',
 'world',
 '.',
 '[SEP]']

In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("naver/splade-cocondenser-ensembledistil")
model = AutoModelForMaskedLM.from_pretrained("naver/splade-cocondenser-ensembledistil")

In [5]:
text = (
    "The Amazon Rainforest, also known as the Amazon Jungle, is a vast tropical rainforest in South America, encompassing an area of approximately 5.5 million square kilometers. It spans across nine countries, including Brazil, Peru, Colombia, and Venezuela, with Brazil containing the largest portion. The Amazon Rainforest is renowned for its unparalleled biodiversity, housing over 400 billion individual trees representing around 16,000 species. It is home to an estimated 10% of all known species on Earth, including iconic wildlife such as jaguars, sloths, toucans, and pink river dolphins. Additionally, the Amazon River, the second longest river in the world, flows through this region, supporting a complex and interdependent ecosystem. The rainforest plays a crucial role in regulating the global climate by absorbing vast amounts of carbon dioxide and producing oxygen. However, it faces significant threats from deforestation, logging, mining, and agriculture, which have led to substantial habitat loss and environmental degradation. Efforts are underway to protect and conserve this critical natural resource through international cooperation, sustainable practices, and indigenous stewardship."
)
text2 = (
    "The Amazon River, flowing through the Amazon Rainforest, is the second longest river in the world. It supports a diverse ecosystem and is crucial for the rainforest's biodiversity."
)
text3 = (
    "The Great Wall of China is an ancient series of walls and fortifications, totaling more than 13,000 miles in length, located in northern China. It was built to protect against invasions and raids."
)
text4 = (
    "Amazon is founded by Jeff Bezos in 1994. It started as an online bookstore and has since expanded into various industries, including e-commerce, cloud computing, and entertainment."
)

In [6]:
tokens = tokenizer(text, return_tensors="pt")
output = model(**tokens)

tokens2 = tokenizer(text2, return_tensors="pt", padding=True, truncation=True)
output2 = model(**tokens2)

tokens3 = tokenizer(text3, return_tensors="pt")
output3 = model(**tokens3)

tokens4 = tokenizer(text4, return_tensors="pt")
output4 = model(**tokens4)

In [7]:
import torch 

def get_max_logits(output, tokens):
    return torch.max(
        torch.log(
            1 + torch.relu(output.logits)
        ) * tokens.attention_mask.unsqueeze(-1),
        dim=1)[0].squeeze().detach().cpu().numpy()

vec = get_max_logits(output, tokens)
vec2 = get_max_logits(output2, tokens2)
vec3 = get_max_logits(output3, tokens3)
vec4 = get_max_logits(output4, tokens4)


In [8]:
print(vec)
print(vec2)

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [9]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity([vec, vec2, vec3, vec4])

array([[0.99999994, 0.48202875, 0.09869803, 0.17222472],
       [0.48202875, 1.        , 0.05827487, 0.26472694],
       [0.09869803, 0.05827487, 1.        , 0.00996668],
       [0.17222472, 0.26472694, 0.00996668, 1.        ]], dtype=float32)

In [10]:
idx2token = {
    idx: token for token, idx in tokenizer.get_vocab().items()
}

In [11]:
cols = vec.nonzero().squeeze().cpu().numpy()
cols2 = vec2.nonzero().squeeze().cpu().numpy()
cols3 = vec3.nonzero().squeeze().cpu().numpy()
print(len(cols))

weights = vec[cols].cpu().tolist()
weights2 = vec2[cols2].cpu().tolist()
weights3 = vec3[cols3].cpu().tolist()

sparse_dict = dict(zip(cols, weights))
sparse_dict2 = dict(zip(cols2, weights2))
sparse_dict3 = dict(zip(cols3, weights3))

AttributeError: 'tuple' object has no attribute 'squeeze'

In [12]:
sparse_dict_tokens = {
    idx2token[idx]: round(weight, 2) for idx, weight in zip(cols, weights)
}
sparse_dict_tokens2 = {
    idx2token[idx]: round(weight, 2) for idx, weight in zip(cols2, weights2)
}
sparse_dict_tokens3 = {
    idx2token[idx]: round(weight, 2) for idx, weight in zip(cols3, weights3)
}

sparse_dict_tokens = {
    k: v for k, v in sorted(
        sparse_dict_tokens.items(),
        key=lambda item: item[1],
        reverse=True
    )
}
sparse_dict_tokens

sparse_dict_tokens2 = {
    k: v for k, v in sorted(
        sparse_dict_tokens2.items(),
        key=lambda item: item[1],
        reverse=True
    )
}
sparse_dict_tokens2

sparse_dict_tokens3 = {
    k: v for k, v in sorted(
        sparse_dict_tokens3.items(),
        key=lambda item: item[1],
        reverse=True
    )
}

NameError: name 'cols' is not defined

In [13]:
print(vec.shape)
print(vec2.shape)

(30522,)
(30522,)
