# Embeddings Comparison

The idea is that we want to see if BERT produces ultrametric-like embeddings. Probabilistically, it should.

We create NUMBER_OF_TRIALS trials. Each trial has between FEWEST_EXEMPLARS and MOST_EXEMPLARS exemplars randomly
selected from monosemous (single-meaning) words.

For each trial we look at the whole vocabulary, and assign each word in the vocabulary to the nearest exemplar.
We do this using WordNet (path similarity) and BERT (cosine similarity between embeddings). Hopefully one day I'll turn this
into something p-adic too.

Should I use Resnik similarity?

In [1]:
# Experiment configuration.
# Number of independent random trials to run.
NUMBER_OF_TRIALS = 100
# Lower bound given to the random draw of the exemplar count for a trial.
FEWEST_EXEMPLARS = 5
# Upper bound given to the random draw of the exemplar count for a trial.
MOST_EXEMPLARS = 100

In [None]:
import nltk
from nltk.corpus import wordnet as wn
import random
import torch
from transformers import BertTokenizer, BertModel
import tqdm
import collections
from scipy.spatial.distance import cosine
import pandas
import matplotlib.pyplot
import seaborn

In [None]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Return the final-layer hidden vector of every token as plain Python lists.

    Adapted from https://towardsdatascience.com/3-types-of-contextualized-word-embeddings-from-bert-using-transfer-learning-81fcefe3fe6d

    Args:
        tokens_tensor: Torch tensor of token ids, passed to the model as its
            first positional argument. In this notebook it is built as a
            batch of one, i.e. shape [1, n_tokens].
        segments_tensors: Torch tensor of segment ids, passed to the model as
            its second positional argument; same layout as ``tokens_tensor``.
        model: Callable whose result, at index 2, is the sequence of hidden
            states (entry 0 being the input state). For a transformers BERT
            model this presumably requires ``output_hidden_states=True``.

    Returns:
        list: One list of floats per token, taken from the last hidden state.
    """
    # The model is only queried here, so gradient tracking is switched off.
    with torch.no_grad():
        model_output = model(tokens_tensor, segments_tensors)
        # Skip entry 0 (the input state); what remains are the later layers.
        later_layers = model_output[2][1:]

    # The last remaining entry is the output of the final layer.
    final_layer = later_layers[-1]
    # Remove the leading axis when it has size 1 (a batch of one sequence).
    per_token_vectors = final_layer.squeeze(dim=0)

    # Convert each token's tensor into a plain list of floats.
    list_token_embeddings = []
    for token_vector in per_token_vectors:
        list_token_embeddings.append(token_vector.tolist())
    return list_token_embeddings

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint. Its `vocab`
# is used to filter candidate words and it tokenizes the text fed to the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Load the pretrained BERT model. output_hidden_states=True requests the per-layer
# hidden states, which get_bert_embeddings reads from index 2 of the model output.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

In [None]:
%%time
# Collect every WordNet lemma that (a) has exactly one synset, (b) is a noun, and
# (c) is made only of fragments that are whole entries in BERT's vocabulary.
# Maps lemma name -> its single synset.
monosemous_nouns = {}
bert_vocab = tokenizer.vocab
for word in wn.all_lemma_names():
    synsets = wn.synsets(word)
    if len(synsets) != 1 or synsets[0].pos() != 'n':
        continue
    # Multi-word lemmas join their parts with underscores. Each part must be
    # checked on its own: testing the whole underscore-joined lemma against the
    # vocabulary rejected every multi-word lemma.
    if all(fragment in bert_vocab for fragment in word.split('_')):
        monosemous_nouns[word] = synsets[0]

In [None]:
# Materialise the dictionary's keys as a list so they can be sliced and sampled,
# then display how many monosemous nouns were kept.
monosemous_nouns_list = list(monosemous_nouns)
len(monosemous_nouns_list)

In [None]:
# Choose which nouns take part in the experiment. The commented-out line draws a
# random subset of MOST_EXEMPLARS * 5 words (presumably for quicker runs); the
# active line works on a copy of the full list.
#working_subset_of_nouns = random.sample(monosemous_nouns_list, MOST_EXEMPLARS * 5)
working_subset_of_nouns = monosemous_nouns_list[:]

In [None]:
# Compute one BERT vector per working noun, keyed by the WordNet lemma name.
bert_embeddings = {}
for word in working_subset_of_nouns:
    # Underscores in the lemma become spaces; the special markers are added by hand.
    marked_text = "[CLS] " + word.replace('_', ' ') + " [SEP]"
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(marked_text))
    # Both tensors are wrapped in an outer list, i.e. a batch holding one sequence.
    tokens_tensor = torch.tensor([indexed_tokens])
    # Every token gets segment id 1.
    segments_tensors = torch.tensor([[1] * len(indexed_tokens)])
    # Keep only the vector at position 0, which corresponds to the leading "[CLS]"
    # marker (assuming the tokenizer keeps it as a single first token).
    bert_embeddings[word] = get_bert_embeddings(tokens_tensor, segments_tensors, model)[0]

In [None]:
# Run the trials. Each trial draws a random set of exemplar words and assigns every
# working noun to its most similar exemplar twice: once by WordNet path similarity
# and once by cosine similarity of the BERT embeddings.
trials = []
# Build the membership set once; testing against the list for every word of every
# trial was a linear scan each time.
working_subset_lookup = set(working_subset_of_nouns)
for trial_number in tqdm.tqdm(range(NUMBER_OF_TRIALS)):
    # randint includes both endpoints, so MOST_EXEMPLARS itself can be drawn
    # (randrange excluded it, capping the count at MOST_EXEMPLARS - 1).
    exemplar_count = random.randint(FEWEST_EXEMPLARS, MOST_EXEMPLARS)
    exemplar_words = random.sample(working_subset_of_nouns, k=exemplar_count)
    exemplar_synsets = [monosemous_nouns[w] for w in exemplar_words]
    exemplars = {w: s for (w, s) in zip(exemplar_words, exemplar_synsets)}
    exemplar_embeddings = {w: bert_embeddings[w] for w in exemplar_words}
    # word -> chosen exemplar word, one mapping per similarity measure.
    wordnet_neighbours = {}
    bert_cosine_neighbours = {}
    for word, synset in monosemous_nouns.items():
        if word not in working_subset_lookup:
            continue

        # --- WordNet: exemplar with the highest path similarity ---
        best_exemplar = None
        best_score = float('-inf')
        for e_word, e_synset in exemplars.items():
            # Evaluate both argument orders and keep the larger value
            # (presumably a guard against the measure being asymmetric).
            similarity = max(wn.path_similarity(synset, e_synset), wn.path_similarity(e_synset, synset))
            if similarity > best_score:
                best_exemplar = e_word
                best_score = similarity
        wordnet_neighbours[word] = best_exemplar

        # --- BERT: exemplar with the highest cosine similarity ---
        best_exemplar = None
        # Cosine similarity can be zero or negative, so the running best must
        # start below every possible value; starting at 0.0 left the word
        # assigned to None whenever no similarity was strictly positive.
        best_score = float('-inf')
        word_embedding = bert_embeddings[word]
        for e_word, e_embedding in exemplar_embeddings.items():
            # scipy's cosine() is a distance, so 1 - distance is the similarity.
            similarity = 1 - cosine(word_embedding, e_embedding)
            if similarity > best_score:
                best_exemplar = e_word
                best_score = similarity
        bert_cosine_neighbours[word] = best_exemplar
    trial = {'exemplar_count': exemplar_count,
             'exemplar_words': exemplar_words,
             'wordnet_neighbours': wordnet_neighbours,
             'bert_cosine_neighbours': bert_cosine_neighbours}
    trials.append(trial)

In [None]:
# Histogram of how many exemplars each trial drew.
pandas.Series([x['exemplar_count'] for x in trials]).plot.hist(title="Number of exemplars in series")

In [None]:
def neighbour_analysis(trials, neighbour_key, words=None):
    """Count, for every pair of words, the trials in which both share an exemplar.

    Args:
        trials: Sequence of trial dicts. Each must hold, under
            ``neighbour_key``, a mapping from word to its assigned exemplar.
        neighbour_key: Key of the assignment to analyse, e.g.
            ``'wordnet_neighbours'`` or ``'bert_cosine_neighbours'``.
        words: Words to compare pairwise. Defaults to the notebook-level
            ``working_subset_of_nouns``, so existing two-argument calls behave
            as before.

    Returns:
        pandas.DataFrame: Columns ``word1``, ``word2`` and
        ``coexemplar_count``, with one row per pair (``word1 < word2``) that
        shared an exemplar in at least one trial. The columns are present even
        when there are no rows, so the result can always be merged on
        ``word1``/``word2``.

    Raises:
        KeyError: If a word is missing from a trial's assignment mapping.
    """
    if words is None:
        words = working_subset_of_nouns
    words = list(words)
    neighbour_dicts = [trial[neighbour_key] for trial in trials]

    # Look up each word's exemplar once per trial instead of once per pair.
    assignments = {w: [d[w] for d in neighbour_dicts] for w in words}

    word_similarities = []
    # Still quadratic in the number of words; an alternative would be to invert
    # each trial's mapping (exemplar -> set of words) and only pair up words
    # that fall inside the same set.
    for w1 in words:
        w1_exemplars = assignments[w1]
        for w2 in words:
            # Visit each unordered pair once and skip a word paired with itself.
            if w2 <= w1:
                continue
            coexemplar_count = sum(
                1 for a, b in zip(w1_exemplars, assignments[w2]) if a == b)
            if coexemplar_count > 0:
                word_similarities.append(
                    {'word1': w1, 'word2': w2, 'coexemplar_count': coexemplar_count})
    return pandas.DataFrame(
        word_similarities, columns=['word1', 'word2', 'coexemplar_count'])


In [None]:
# Per-pair co-assignment counts under each similarity measure, with the count
# column renamed so the two tables can sit side by side.
wn_df = neighbour_analysis(trials, 'wordnet_neighbours')
wn_df = wn_df.rename(columns={'coexemplar_count': 'wordnet_exemplar_count'})
bert_cosine_df = neighbour_analysis(trials, 'bert_cosine_neighbours')
bert_cosine_df = bert_cosine_df.rename(columns={'coexemplar_count': 'bert_cosine_exemplar_count'})
# Outer join keeps pairs that appear under only one measure; their missing count
# on the other side becomes 0.
df = pandas.merge(wn_df, bert_cosine_df, how="outer", on=['word1', 'word2'])
df = df.fillna(0)
# Show a reproducible sample of ten rows.
df.sample(10, random_state=12345)

In [None]:
# Side-by-side histograms (log-scaled y axis) of the per-pair co-assignment
# counts: BERT on the left axis, WordNet on the right axis.
fig, axes = matplotlib.pyplot.subplots(ncols=2, figsize=(16,6))
df.bert_cosine_exemplar_count.plot.hist(logy=True, ax=axes[0])
axes[0].set_title("BERT distribution of neighbour affinity counts")
df.wordnet_exemplar_count.plot.hist(logy=True, ax=axes[1])
axes[1].set_title("Wordnet distribution of neighbour affinity counts")

In [None]:
# Scatter plot of the BERT count against the WordNet count for every word pair.
# The very low alpha keeps heavily overplotted regions distinguishable.
df.plot.scatter(x='bert_cosine_exemplar_count', y='wordnet_exemplar_count', alpha=0.01)

In [None]:
# Bivariate kernel density estimate of BERT counts (x) against WordNet counts (y).
# x and y are passed by keyword because seaborn 0.12+ accepts only `data`
# positionally and rejects a second positional argument.
seaborn.kdeplot(x=df.bert_cosine_exemplar_count, y=df.wordnet_exemplar_count)

In [None]:
# Scatter plot with a fitted regression line: WordNet count as a function of BERT count.
seaborn.lmplot(data=df,  x='bert_cosine_exemplar_count', y='wordnet_exemplar_count')

In [None]:
# Show the word pairs that shared a BERT-assigned exemplar in exactly 10 trials.
df[df.bert_cosine_exemplar_count == 10]

In [None]:
# Show the word pairs that shared a WordNet-assigned exemplar in exactly 10 trials.
df[df.wordnet_exemplar_count == 10]