In [None]:
import numpy as np
from scipy.spatial import distance
import matplotlib.pyplot as plt
from scipy.spatial.distance import cityblock


from transformers import BertModel, BertTokenizer
import torch


In [None]:
# Single source of truth for the checkpoint: the tokenizer's vocabulary must
# match the model weights, so both are loaded from the same name.
MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# .eval() returns the model itself; calling it here makes the inference-only
# intent explicit (no dropout), so repeated calls give the same embeddings.
model = BertModel.from_pretrained(MODEL_NAME).eval()

In [None]:
def getEmbedding(text):
    """Return a mean-pooled embedding of ``text`` as a NumPy array.

    The text is tokenized with the notebook-level ``tokenizer``, passed through
    the notebook-level ``model``, and the last hidden states are averaged over
    dim 1 (the token axis), so every token the tokenizer emits contributes
    equally to the result.

    Args:
        text: Input handed straight to the tokenizer (the notebook passes a
            single string).

    Returns:
        numpy.ndarray: The averaged hidden state. The batch axis is kept, so
        for one input string this is presumably shaped (1, hidden_size);
        flatten it before using APIs that require 1-D vectors.
    """
    tokens = tokenizer(text, return_tensors='pt')
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)  # Transformer layers
    last_hidden_states = outputs.last_hidden_state
    sentence_embedding = torch.mean(last_hidden_states, dim=1)
    return sentence_embedding.detach().numpy()

# Variant of getEmbedding that returns the embedding of one token of the
# sentence instead of the average over all tokens.
def getEmbeddingWord(text, index):
    """Return the last-hidden-state vector of a single token of ``text``.

    Args:
        text: Sentence handed to the notebook-level ``tokenizer``.
        index: Position in the *tokenized* sequence, not the word number in
            the sentence. The tokenizer adds a start token at position 0
            (id 101), so the first token of the text itself is at index 1.

    Returns:
        numpy.ndarray: The hidden state of the first batch element at
        position ``index``.

    Raises:
        IndexError: If ``index`` is outside the tokenized sequence.
    """
    tokens = tokenizer(text, return_tensors='pt')
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**tokens)  # Transformer layers
    last_hidden_states = outputs.last_hidden_state
    # [0] selects the only sequence in the batch, [index] the requested token.
    word_embedding = last_hidden_states[0][index]
    return word_embedding.detach().numpy()

# Helper for inspecting how a piece of text is tokenized
def tokenizeText(text):
    """Run the notebook-level tokenizer on ``text`` and return its output as PyTorch tensors."""
    return tokenizer(text, return_tensors='pt')

# Sanity-check helper: translate one input id back into its vocabulary token
def getWordFromID(id):
    """Return the token string the tokenizer associates with the input id ``id``."""
    (word,) = tokenizer.convert_ids_to_tokens([id])
    return word
## Reminder: 101 and 102 are the start and end tokens, 0 is the padding token




In [None]:
# Result 1: pairwise angles between the embeddings of single words

# Row (vec1) and column (vec2) entries of the heatmap
vec1 = ["dog", "cat", "turtle"]
vec2 = ["dog", "cat", "turtle"]

# Mean-pooled embedding for every entry
vecEmbeds1 = [getEmbedding(vec) for vec in vec1]
vec2Embeds = [getEmbedding(vec) for vec in vec2]

# The distance routines need one 1-D vector per item, so flatten each
# embedding (dropping the batch axis) before stacking them into matrices.
rowVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds1])
colVectors = np.vstack([np.ravel(embed) for embed in vec2Embeds])

# Cosine similarity = 1 - cosine distance, for all pairs at once
cosineMatrix = 1 - distance.cdist(rowVectors, colVectors, metric='cosine')

# Rounding can push the similarity of identical vectors a hair outside
# [-1, 1], where arccos returns NaN; clip before converting to degrees.
angleMatrix = np.degrees(np.arccos(np.clip(cosineMatrix, -1.0, 1.0)))

# Plotting results as a heatmap
fig, ax = plt.subplots()
heatmap = ax.imshow(angleMatrix, cmap='autumn')

# Add cell values as text annotations (x is the column, y is the row)
for (i, j), angle in np.ndenumerate(angleMatrix):
    ax.annotate(f'{angle:.2f}', xy=(j, i), ha='center', va='center')

ax.set_title("Distance between vectors")
ax.set_xticks(np.arange(len(vec2)))
ax.set_xticklabels(vec2)
ax.set_yticks(np.arange(len(vec1)))
ax.set_yticklabels(vec1)
ax.tick_params(axis='x', labeltop=True, labelbottom=False)
fig.colorbar(heatmap, ax=ax, label="Angle (degrees)")
plt.show()


In [None]:
# Question 2: pairwise angles between the embeddings of full questions

### Creating sentence lists and embedding vectors ###
vec1 = ["Is a dog a mammal?", "Is a cat a mammal?", "Is a turtle a mammal?"]
vec2 = ["Is a dog a mammal?", "Is a cat a mammal?", "Is a turtle a mammal?"]

vecEmbeds1 = [getEmbedding(vec) for vec in vec1]
vec2Embeds = [getEmbedding(vec) for vec in vec2]

### Creating cosine similarity matrix ###
# The distance routines need one 1-D vector per item, so flatten each
# embedding (dropping the batch axis) before stacking them into matrices.
rowVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds1])
colVectors = np.vstack([np.ravel(embed) for embed in vec2Embeds])
cosineMatrix = 1 - distance.cdist(rowVectors, colVectors, metric='cosine')

# Rounding can push the similarity of identical vectors a hair outside
# [-1, 1], where arccos returns NaN; clip before converting to degrees.
angleMatrix = np.degrees(np.arccos(np.clip(cosineMatrix, -1.0, 1.0)))

### Plotting results in heatmap ###
fig, ax = plt.subplots()
heatmap = ax.imshow(angleMatrix, cmap='autumn')
ax.set_title("Distance between vectors")

# Add cell values as text annotations (x is the column, y is the row)
for (i, j), angle in np.ndenumerate(angleMatrix):
    ax.annotate(f'{angle:.2f}', xy=(j, i), ha='center', va='center')

# Column labels are wrapped onto two lines so they do not overlap
labels = ["Is a dog\na mammal?", "Is a cat\na mammal?", "Is a turtle\na mammal?"]

ax.set_xticks(np.arange(len(vec2)))
ax.set_xticklabels(labels)
ax.set_yticks(np.arange(len(vec1)))
ax.set_yticklabels(vec1)
ax.tick_params(axis='x', labeltop=True, labelbottom=False)
fig.colorbar(heatmap, ax=ax, label="Angle (degrees)")
plt.show()


In [None]:
# Bow

## Words with multiple meanings: compare each whole-sentence embedding (rows)
## with the contextual embedding of the token "bow" in each sentence (columns).
targetToken = "bow"
vec1 = ["The bow was damaged.", "She wears a bow.", "He took a bow.", "He used a bow"]

# Sentence-level (mean-pooled) embeddings
vecEmbeds1 = [getEmbedding(vec) for vec in vec1]

# Token-level embeddings of "bow". The position is looked up from the actual
# tokenization instead of being hard-coded: the tokenizer prepends a start
# token, so counting words by hand is easy to get wrong (a fixed index of 3
# selects "a" rather than "bow" in the last three sentences).
vecEmbeds2 = []
for sentence in vec1:
    sentenceTokens = tokenizer.convert_ids_to_tokens(tokenizer(sentence)['input_ids'])
    # .index raises ValueError if the token is missing, which is preferable
    # to silently embedding the wrong token.
    vecEmbeds2.append(getEmbeddingWord(sentence, sentenceTokens.index(targetToken)))

# The two helpers return differently shaped arrays, and the distance routines
# need one 1-D vector per item, so flatten everything before stacking.
rowVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds1])
colVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds2])

# Cosine similarity = 1 - cosine distance, for all pairs at once
cosineMatrix = 1 - distance.cdist(rowVectors, colVectors, metric='cosine')

# Rounding can push a similarity a hair outside [-1, 1], where arccos
# returns NaN; clip before converting to degrees.
angleMatrix = np.degrees(np.arccos(np.clip(cosineMatrix, -1.0, 1.0)))

fig, ax = plt.subplots()
heatmap = ax.imshow(angleMatrix, cmap='autumn')

# Tick labels (kept separate from the sentence list so vec1 is not rebound)
vec2 = ["bow -\n prompt 1", "bow -\nprompt 2", "bow -\nprompt 3", "bow -\nprompt 4"]
rowLabels = ["1. The bow was\ndamaged.", "2. She wears a bow.", "3. He took a bow.", "4. He used a bow"]

ax.set_title("Distance between vectors")
ax.set_xticks(np.arange(len(vec2)))
ax.set_xticklabels(vec2)
ax.set_yticks(np.arange(len(rowLabels)))
ax.set_yticklabels(rowLabels)
ax.tick_params(axis='x', labeltop=True, labelbottom=False)
fig.colorbar(heatmap, ax=ax, label="Angle (degrees)")

# Add angle values to the heatmap (x is the column, y is the row)
for (i, j), angle in np.ndenumerate(angleMatrix):
    ax.annotate(f'{angle:.2f}', xy=(j, i), ha='center', va='center')

plt.show()

In [None]:
## Same sentence frame, different final word: "bow" vs "rainbow"
vec1 = ["Can you take a bow?", "Can you take a rainbow?"]

# Sentence-level (mean-pooled) embeddings
vecEmbeds1 = [getEmbedding(vec) for vec in vec1]

# Token-level embeddings of the final word of each prompt. Index 5 counts the
# start token at position 0 and assumes every word maps to exactly one token
# -- TODO confirm that "rainbow" is not split into word pieces.
vecEmbeds2 = [
    getEmbeddingWord("Can you take a bow?", 5),
    getEmbeddingWord("Can you take a rainbow?", 5),
]

# The two helpers return differently shaped arrays, and the distance routines
# need one 1-D vector per item, so flatten everything before stacking.
rowVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds1])
colVectors = np.vstack([np.ravel(embed) for embed in vecEmbeds2])

# Cosine similarity = 1 - cosine distance, for all pairs at once
cosineMatrix = 1 - distance.cdist(rowVectors, colVectors, metric='cosine')

# Rounding can push a similarity a hair outside [-1, 1], where arccos
# returns NaN; clip before converting to degrees.
angleMatrix = np.degrees(np.arccos(np.clip(cosineMatrix, -1.0, 1.0)))

fig, ax = plt.subplots()
heatmap = ax.imshow(angleMatrix, cmap='autumn')

# Column labels for the two word-level embeddings
vec2 = ["bow - bow", "bow - rainbow"]

ax.set_title("Distance between vectors")
ax.set_xticks(np.arange(len(vec2)))
ax.set_xticklabels(vec2)
ax.set_yticks(np.arange(len(vec1)))
ax.set_yticklabels(vec1)
ax.tick_params(axis='x', labeltop=True, labelbottom=False)
fig.colorbar(heatmap, ax=ax, label="Angle (degrees)")

# Add angle values to the heatmap (x is the column, y is the row)
for (i, j), angle in np.ndenumerate(angleMatrix):
    ax.annotate(f'{angle:.2f}', xy=(j, i), ha='center', va='center')

plt.show()