Initial Experiment to test complexity of BERT Embeddings by comparing Cosine Similarity and Euclidian distance

In [1]:
import tensorflow as tf

In [2]:
import tensorflow_hub as hub

In [3]:
from transformers import BertTokenizer, TFBertModel

In [4]:
import numpy as np

In [5]:
def get_model(model_name, max_seq_length):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
    input_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
    
    inputs = {
        'input_ids': input_word_ids,
        'attention_mask': input_mask,
        'token_type_ids': input_type_ids
    }
    
    bert_layer = TFBertModel.from_pretrained(model_name)
    outputs = bert_layer(inputs)
    
    pooled_output = outputs[1]  # Pooled output corresponds to the [CLS] token
    return tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=pooled_output), tokenizer

In [6]:
# Function call
max_seq_length = 128
model_name = "bert-base-uncased"

bert_model, tokenizer = get_model(model_name=model_name, max_seq_length=max_seq_length)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [7]:
def encode(input_text, model, tokenizer, max_seq_length):
    input_ids = tokenizer.encode(input_text, add_special_tokens=True, max_length=max_seq_length, padding='max_length', truncation=True, return_tensors="tf")
    attention_mask = tf.ones_like(input_ids)
    token_type_ids = tf.zeros_like(input_ids)
    return model([input_ids, attention_mask, token_type_ids])[0]

In [8]:
def euclidean_distance(embeddings, x, y):
    dst = np.linalg.norm(embeddings[x] - embeddings[y])
    return dst

In [9]:
from scipy.spatial import distance

In [10]:
def cosine_similarity(embeddings, x, y):
    sim = 1 - distance.cosine(embeddings[x], embeddings[y])
    return sim

In [11]:
real = "dont worry about it"
idiom = "dont sweat it"
sentences = [real, idiom,"dont perspiration it"]

# Encode the sentences
embeddings = [encode(sentence, bert_model, tokenizer, max_seq_length).numpy() for sentence in sentences]

In [12]:
# Calculate and print distances
for i in range(len(sentences)):
    for j in range(i + 1, len(sentences)):
        euclidean_dist = euclidean_distance(embeddings, i, j)
        cosine_sim = cosine_similarity(embeddings, i, j)
        cosine_dst=1-cosine_sim
        print("Euclidean Distance between '{}' & '{}' is {}".format(sentences[i], sentences[j], euclidean_dist))
        print("Cosine Similarity between '{}' & '{}' is {}".format(sentences[i], sentences[j], cosine_sim))
        print("Cosine Distance between '{}' & '{}' is {}".format(sentences[i], sentences[j], cosine_dst))
        print()

Euclidean Distance between 'dont worry about it' & 'dont sweat it' is 1.4594606161117554
Cosine Similarity between 'dont worry about it' & 'dont sweat it' is 0.9968292713165283
Cosine Distance between 'dont worry about it' & 'dont sweat it' is 0.0031707286834716797

Euclidean Distance between 'dont worry about it' & 'dont perspiration it' is 0.8992570638656616
Cosine Similarity between 'dont worry about it' & 'dont perspiration it' is 0.9984480738639832
Cosine Distance between 'dont worry about it' & 'dont perspiration it' is 0.0015519261360168457

Euclidean Distance between 'dont sweat it' & 'dont perspiration it' is 1.212147831916809
Cosine Similarity between 'dont sweat it' & 'dont perspiration it' is 0.9975985288619995
Cosine Distance between 'dont sweat it' & 'dont perspiration it' is 0.0024014711380004883



In [13]:
import nlpaug.augmenter.word as naw
import spacy
nlp = spacy.load("en_core_web_lg")
import random

In [14]:
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet

In [15]:
def select_random(sentence):
    doc = nlp(sentence)
    words = [token.text for token in doc if token.pos_ in ['VERB','ADJ','NOUN']]
    if(len(words)==0):
        return doc[0].text
    random_word_index = random.randrange(len(words))  # Generate a random index
    random_word = words[random_word_index]
    return random_word

In [16]:
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\siddh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Augmentation using SynonymAug
aug = naw.SynonymAug(
    aug_src='wordnet',
    model_path=None,
    name='Synonym_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.1,
    lang='eng',
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    force_reload=False,
    verbose=0
)