In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [20]:
# Sample corpus used to build the vocabulary.
# Note: 'leader' is listed twice, so the vocabulary has 10 unique words.
texts = [
    'text',
    'the',
    'leader',
    'prime',
    'natural',
    'languages',
    'leader',
    'king',
    'man',
    'woman',
    'queen',
]

In [21]:
# Build the vocabulary: fit a Keras Tokenizer on the sample corpus, then
# report how many unique words it found and the index assigned to each.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# word_index maps every word seen during fitting to its integer index
word_index = tokenizer.word_index
print("Number of unique words in dictionary =", len(word_index))
print("Dictionary is =", word_index)


Number of unique words in dictionary = 10
Dictionary is = {'leader': 1, 'text': 2, 'the': 3, 'prime': 4, 'natural': 5, 'languages': 6, 'king': 7, 'man': 8, 'woman': 9, 'queen': 10}


In [22]:
# Define a function that builds the embedding matrix for our vocabulary.
# It reads GloVe word vectors from a text file and stores each known word's
# vector in the row given by that word's tokenizer index.
def embedding_for_vocab(filepath, word_index, embedding_dim):
    """Build an embedding matrix for a vocabulary from a GloVe-style text file.

    Every non-blank line of the file is read as a word followed by its
    whitespace-separated vector components. For each word that appears in
    ``word_index``, the first ``embedding_dim`` components are written to the
    matrix row given by that word's index.

    Parameters
    ----------
    filepath : str or path-like
        UTF-8 text file with one ``word v1 v2 ...`` entry per line.
    word_index : dict
        Mapping of word -> row index, such as ``Tokenizer.word_index``
        (whose indices start at 1).
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row 0 (the
        padding index) and the rows of words not found in the file stay zero.

    Raises
    ------
    ValueError
        If the line of a vocabulary word has fewer than ``embedding_dim``
        components, or one of the kept components is not a number.
    """
    # +1 because word_index starts at 1; row 0 is reserved for the padding token.
    vocab_size = len(word_index) + 1
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding='utf8') as f:
        for line_no, line in enumerate(f, start=1):
            parts = line.split()
            if not parts:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue

            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue

            # Reject short rows explicitly; a single value would otherwise be
            # silently broadcast across the whole row by NumPy.
            if len(vector) < embedding_dim:
                raise ValueError(
                    f"{filepath}:{line_no}: expected at least {embedding_dim} "
                    f"values for {word!r}, found {len(vector)}"
                )

            idx = word_index[word]
            # Only the leading embedding_dim components are parsed and stored.
            embedding_matrix_vocab[idx] = np.array(
                vector[:embedding_dim], dtype=np.float32
            )
    return embedding_matrix_vocab

In [23]:
# GloVe settings: location of the pre-trained vectors file and the number of
# vector components to keep per word
glove_path = './wiki_giga_2024_50_MFT20_vectors_seed_123_alpha_0.75_eta_0.075_combined.txt'
embedding_dim = 50

# Build the embedding matrix for the tokenizer's vocabulary
embedding_matrix_vocab = embedding_for_vocab(
    filepath=glove_path,
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [24]:
# Inspect the embedding matrix; row 0 is the padding row and stays all zeros
embedding_matrix_vocab

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-9.64

In [25]:
# Show the word -> index mapping; each index is that word's row in the embedding matrix
tokenizer.word_index

{'leader': 1,
 'text': 2,
 'the': 3,
 'prime': 4,
 'natural': 5,
 'languages': 6,
 'king': 7,
 'man': 8,
 'woman': 9,
 'queen': 10}

In [28]:
# Look up the embedding rows for the three words used in the
# "king - man + woman" analogy
k, m, w = (
    embedding_matrix_vocab[tokenizer.word_index[word]]
    for word in ('king', 'man', 'woman')
)

In [35]:
# Analogy vector king - man + woman, reshaped to a single-row 2-D array
r = np.reshape(k - m + w, (1, -1))

In [37]:
# Inspect the analogy vector (king - man + woman), stored as a single-row 2-D array
r

array([[-0.80521497,  1.044588  , -0.20857   , -1.57146201,  1.934645  ,
        -1.46445806, -1.002041  ,  0.68692103, -0.43490499, -0.16137403,
         1.38474503,  0.21790899,  0.59523898, -0.47586302, -0.60208303,
         0.01116103, -0.30609503,  0.29185   ,  0.509348  ,  0.13490802,
         0.40268199,  0.519704  , -1.02989805,  0.372182  , -0.354302  ,
         0.266682  , -0.26460701, -0.49018399, -0.66991395,  0.08989301,
        -0.649966  ,  0.36549696,  0.77612102,  0.55191   , -0.59623098,
         1.49567303, -0.04088402,  3.56015372,  0.13204199,  0.35943204,
        -0.44513898, -1.077436  ,  0.14889801, -0.052625  ,  0.17367002,
        -0.16945   , -1.35451305,  0.22412693,  1.398422  , -0.29126298]])

In [36]:
# Embedding of 'queen' as a single-row 2-D array, for comparison with r
q = np.reshape(embedding_matrix_vocab[tokenizer.word_index['queen']], (1, -1))

In [38]:
# Cosine similarity between the analogy vector r (king - man + woman) and
# q (queen). Both inputs are single-row 2-D arrays, so the result is a 1x1
# matrix and [0][0] extracts the scalar score.
similarity = cosine_similarity(r, q)
print("Cosine Similarity:", similarity[0][0])

Cosine Similarity: 0.8715485327153446


In [None]:
# References:
# https://www.geeksforgeeks.org/nlp/pre-trained-word-embedding-using-glove-in-nlp-models/
# https://nlp.stanford.edu/projects/glove/
# https://jonathan-hui.medium.com/nlp-word-embedding-glove-5e7f523999f6
# https://medium.com/@abhishekjainindore24/glove-global-vector-an-extension-to-word2vec-embedding-technique-359ce4289908

