In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers
from sklearn.metrics.pairwise import cosine_similarity
#!pip install gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
%load_ext tensorboard

In [3]:
# https://rare-technologies.com/word2vec-tutorial/
# https://radimrehurek.com/gensim/models/word2vec.html
# https://www.educative.io/edpresso/how-to-find-similarity-between-two-words-using-nlp
# https://openclassrooms.com/en/courses/6532301-introduction-to-natural-language-processing/7132231-train-your-first-embedding-models

'''
Unlike other implementations of Word2Vec I have seen, the one I am using here is basically a blank slate. So, in order to
use the Word2Vec model, I first need to train it on a series of words/sentences. And since I am already using
the Shakespeare text for part 1, I figured I can use it for part 2 as well.
'''

# Characters stripped from every line before tokenizing. Inside the character class,
# `,-.` is a range (0x2C-0x2E), so it removes ',', '-' and '.'.
PUNCTUATION_PATTERN = re.compile(r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]')
# A token is a maximal run of word characters.
WORD_PATTERN = re.compile(r'\b\w+\b')

# Read the corpus inside a context manager so the file handle is closed as soon as
# the bytes are in memory instead of whenever the garbage collector gets to it.
with open('shakespeare.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

lines = text.split('\n')
sentences = []
for line in lines:
    # remove punctuation
    line = PUNCTUATION_PATTERN.sub('', line).strip()
    # tokenizer
    tokens = WORD_PATTERN.findall(line)
    # Lines with fewer than two tokens are not used as training sentences.
    if len(tokens) > 1:
        sentences.append(tokens)

# Train 50-dimensional vectors with a context window of 7; sg=1 selects skip-gram and
# min_count=3 drops words seen fewer than three times (per the gensim Word2Vec docs).
model = Word2Vec(sentences=sentences, min_count=3, vector_size = 50, sg = 1, window = 7)
# Persist the trained model, then load it back from the saved file.
model.save("word2vec.model")

model = Word2Vec.load("word2vec.model")

In [15]:
def _prompt_for_known_word(model, first_prompt):
    """Ask on stdin until the user enters a word that is in the model's vocabulary.

    Args:
        model: Object exposing a ``key_to_index`` mapping of known words
            (e.g. gensim ``KeyedVectors``).
        first_prompt: Text shown for the first attempt; later attempts use a
            fixed "Please enter another word: " prompt.

    Returns:
        The first entered word that is present in ``model.key_to_index``.
    """
    word = input(first_prompt)
    # Dict lookup is O(1); scanning the ``index_to_key`` list is O(vocabulary size).
    while word not in model.key_to_index:
        print("I am sorry but ["+word+"] is not in the list of words")
        word = input("Please enter another word: ")
    return word


def calculate_word_embed(model):
    """Interactively compare two words and print their similarity scores.

    Prompts for two words (re-prompting until each is in the vocabulary), looks
    up their vectors with ``model[word]`` and prints:

    * the cosine similarity of the two vectors, and
    * a "dissimilarity" score ``sqrt(|w1 . w1^T - w2 . w2^T|)``.

    Args:
        model: Word-vector container supporting ``model[word]`` lookups and a
            ``key_to_index`` mapping (e.g. ``Word2Vec(...).wv``).

    Returns:
        None. Results are written to stdout.

    NOTE(review): for 1-D vectors ``.T`` is a no-op, so each ``np.dot(w, w.T)`` is
    the squared L2 norm and the dissimilarity only compares vector lengths, not
    directions. Confirm this is the intended measure.
    """
    word1 = _prompt_for_known_word(model, "Enter the first word to be compared: ")
    word2 = _prompt_for_known_word(model, "Enter the second word to be compared: ")

    w1 = model[word1]
    w2 = model[word2]
    # Named ``similarity`` so the sklearn ``cosine_similarity`` import is not shadowed.
    similarity = np.dot(w1, w2) / (np.linalg.norm(w1) * np.linalg.norm(w2))
    pip_loss = np.sqrt(abs(np.dot(w1, w1.T) - np.dot(w2, w2.T)))

    print()
    print("The Cosine Similarity between ["+word1+"] and ["+word2+"] is:", similarity)
    print("The dissimilarity between ["+word1+"] and ["+word2+"] is:", pip_loss)

In [16]:
# Interactive demo: prompts for two words on stdin and prints their scores,
# using the word vectors (`model.wv`) of the model trained above.
calculate_word_embed(model.wv)

Enter the first word to be compared: fool
Enter the second word to be compared: king

The Cosine Similarity between [fool] and [king] is: 0.52015287
The dissimilarity between [fool] and [king] is: 0.49285793


In [None]:
'''
When I first set out to calculate the dissimilarity value, I figured that subtracting the cosine similarity value
from 1 would be enough, but that felt lazy to me. I also did not have any kind of proof that this would be
a correct method for obtaining the score. Not being one to take chances, I decided to follow the advice of the homework
and started reading research papers to find out more about determining the dissimilarity between words. I then came across
a paper called "On the Dimensionality of Word Embedding"
(https://papers.nips.cc/paper/2018/file/b534ba68236ba543ae44b22bd110a1d6-Paper.pdf), which describes a measure called
Pairwise Inner Product Loss (PIP Loss). It involves taking both embedding matrices, multiplying each by
its transpose, subtracting one product from the other, and taking the norm (a square root) of that difference. I honestly
think this is a good way of identifying the dissimilarity between the two. Note that multiplying an embedding by its
transpose does not produce an identity matrix: it produces the matrix of pairwise inner products between the embedded
words. For a single word vector, as used in this notebook, that product is simply the vector's squared length, so the
score printed above shows how different the magnitudes of word1 and word2 are; it does not take the angle between them
into account, which is what the cosine similarity measures.
'''