# How To Use The Word Embedder

In [1]:
import nltk
from nltk.tokenize import word_tokenize

In [2]:
# word_tokenize needs NLTK's Punkt tokenizer data. NLTK 3.9+ loads it from the
# 'punkt_tab' package while older releases use 'punkt', so fetch both to keep
# the notebook runnable on a fresh environment with either version.
# quiet=True suppresses the progress log, which would otherwise write the
# local nltk_data path (including the user name) into the saved output.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shreyanshchordia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Setting the dataset

In [3]:
# Toy corpus used throughout this notebook: eight short English sentences.
sentences = [
    "Hey how are you?",
    "I am great.",
    "What's the issue in it?",
    "We survived the  day!",  # note the double space inside this string
    "I am tired.",
    "I need to rest.",
    "We need to resolve our differences",
    "How far are you going to go with this, huh?",
]

### Tokenizing the sentences

In [4]:
# Split every sentence into its word and punctuation tokens with NLTK.
tokenized_sentences = list(map(word_tokenize, sentences))

In [5]:
# Show the token list of each sentence, one sentence per line.
for tokens in tokenized_sentences:
    print(tokens)

['Hey', 'how', 'are', 'you', '?']
['I', 'am', 'great', '.']
['What', "'s", 'the', 'issue', 'in', 'it', '?']
['We', 'survived', 'the', 'day', '!']
['I', 'am', 'tired', '.']
['I', 'need', 'to', 'rest', '.']
['We', 'need', 'to', 'resolve', 'our', 'differences']
['How', 'far', 'are', 'you', 'going', 'to', 'go', 'with', 'this', ',', 'huh', '?']


## Word Embedder

In [6]:
from WordEmbedder import Embedder, token_seq_to_num_seq, num_seq_to_token_seq, generate_vocabulary

### As simple as it could be!

In [7]:
# Create an Embedder configured for 50-dimensional word vectors; the vectors
# printed in the following cell each have 50 components.
emb = Embedder(dimensions=50)

In [8]:
# Indexing the embedder with several words yields one vector per word:
# a known word gets its embedding, while an out-of-vocabulary word
# gets an array of zeros.
embedder = emb.get_embedder()
lookup_words = ('beautiful', 'ffnceo')  # a known word and an unknown one
print(embedder[lookup_words])


[[ 0.54623    1.2042    -1.1288    -0.1325     0.95529    0.040524
  -0.47863   -0.3397    -0.28056    0.71761   -0.53691   -0.0045698
   0.73217    0.12101    0.28093   -0.088097   0.59733    0.55264
   0.056646  -0.50247   -0.63204    1.1439    -0.31053    0.1263
   1.3155    -0.52444   -1.5041     1.158      0.68795   -0.85051
   2.3236    -0.41789    0.44519   -0.019216   0.28969    0.53258
  -0.023008   0.58958   -0.72397   -0.85216   -0.17761    0.14432
   0.40658   -0.52003    0.09081    0.082961  -0.021975  -1.6214
   0.34579   -0.010919 ]
 [ 0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.         0.         0.         0.         0.
   0.         0.       ]]

### Generating vocabulary for the dataset

In [9]:
# Build the vocabulary from the tokenized sentences. As the printout in the
# next cell shows, `vocab` lists the distinct (case-sensitive) tokens from most
# to least frequent, and `vocab_dict` maps each token to its occurrence count.
vocab, vocab_dict = generate_vocabulary(tokenized_sentences)

In [10]:
# Display the token list followed by the token -> count dictionary.
print(
    f'Vocabulary:\n{vocab}\n\n'
    f'Vocabulary Dictionary that maps words to the count of their occurances:\n{vocab_dict}'
)

Vocabulary:
['?', 'I', '.', 'to', 'are', 'you', 'am', 'the', 'We', 'need', 'Hey', 'how', 'great', 'What', "'s", 'issue', 'in', 'it', 'survived', 'day', '!', 'tired', 'rest', 'resolve', 'our', 'differences', 'How', 'far', 'going', 'go', 'with', 'this', ',', 'huh']

Vocabulary Dictionary that maps words to the count of their occurances:
{'?': 3, 'I': 3, '.': 3, 'to': 3, 'are': 2, 'you': 2, 'am': 2, 'the': 2, 'We': 2, 'need': 2, 'Hey': 1, 'how': 1, 'great': 1, 'What': 1, "'s": 1, 'issue': 1, 'in': 1, 'it': 1, 'survived': 1, 'day': 1, '!': 1, 'tired': 1, 'rest': 1, 'resolve': 1, 'our': 1, 'differences': 1, 'How': 1, 'far': 1, 'going': 1, 'go': 1, 'with': 1, 'this': 1, ',': 1, 'huh': 1}


### Generating embedding matrix for the vocabulary

An embedding matrix is required when you use the `Embedding` layer from `tensorflow.keras.layers` with pre-trained word embeddings.

The layer requires a NumPy matrix that contains the embeddings of all the words in the chosen vocabulary.

In [11]:
# num_words=10 caps the embedding vocabulary: the mappers printed in the next
# cell hold 10 entries -- id 0 is the '<oov>' placeholder and ids 1-9 are the
# nine most frequent tokens, lower-cased ('I' -> 'i', 'We' -> 'we').
embedding_matrix, id2token, token2id = emb.get_embedding_matrix(vocab, num_words=10)

In [12]:
# id 0 is '<oov>' in id2token, so row 0 is presumably the out-of-vocabulary
# vector (it prints as all zeros).
print(
    f"Embeddings of the first word of vocabulary:\n{embedding_matrix[0]}\n\n"
    f"ID-TO-TOKEN Mapper:\n{id2token}\n\n"
    f"TOKEN-TO-ID Mapper:\n{token2id}"
)

Embeddings of the first word of vocabulary:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]

ID-TO-TOKEN Mapper:
{0: '<oov>', 1: '?', 2: 'i', 3: '.', 4: 'to', 5: 'are', 6: 'you', 7: 'am', 8: 'the', 9: 'we'}

TOKEN-TO-ID Mapper:
{'<oov>': 0, '?': 1, 'i': 2, '.': 3, 'to': 4, 'are': 5, 'you': 6, 'am': 7, 'the': 8, 'we': 9}


### Converting Tokenized sentences to Number sequences

In [13]:
# Encode every token as its integer id from token2id; tokens without an id are
# encoded with the id of oov_token ('<oov>' -> 0 in the mapper printed above).
# NOTE(review): the output below encodes 'I' as 2 (the id of 'i'), so the
# lookup appears to ignore case -- confirm in WordEmbedder.
num_sequences = token_seq_to_num_seq(tokenized_sentences, token2id, oov_token='<oov>')

In [14]:
# An id of 0 marks a token that is outside the 10-entry vocabulary.
for encoded in num_sequences:
    print(encoded)

[0 0 5 6 1]
[2 7 0 3]
[0 0 8 0 0 0 1]
[9 0 8 0 0]
[2 7 0 3]
[2 0 4 0 3]
[9 0 4 0 0 0]
[0 0 5 6 0 4 0 0 0 0 0 1]


### Retrieving Tokenized sentences back from the Number sequences

In [15]:
# Inverse of the previous step: turn each id back into its token via id2token.
retrieved_tokenized_sequences = num_seq_to_token_seq(num_sequences, id2token)

In [16]:
# Because num_words is smaller than the full vocabulary, the dropped tokens
# cannot be recovered: they come back as '<oov>'.
for decoded in retrieved_tokenized_sequences:
    print(decoded)

['<oov>', '<oov>', 'are', 'you', '?']
['i', 'am', '<oov>', '.']
['<oov>', '<oov>', 'the', '<oov>', '<oov>', '<oov>', '?']
['we', '<oov>', 'the', '<oov>', '<oov>']
['i', 'am', '<oov>', '.']
['i', '<oov>', 'to', '<oov>', '.']
['we', '<oov>', 'to', '<oov>', '<oov>', '<oov>']
['<oov>', '<oov>', 'are', 'you', '<oov>', 'to', '<oov>', '<oov>', '<oov>', '<oov>', '<oov>', '?']


## Extra features of the Embedder Class

You can use an object of the `Embedder` class to find:

   1. Similarity score between two words

   2. Analogy score, which measures how good an analogy is.

   3. K most similar words

   4. K most apt words to satisfy the analogy

In [17]:
# A second Embedder with the same configuration as `emb` created earlier.
# NOTE(review): if Embedder keeps no per-vocabulary state, `emb` could
# presumably be reused here instead of building another instance -- confirm.
emb2 = Embedder(dimensions=50)

In [18]:
# The three most similar words for each of two query words.
similar_to_bad = emb2.most_similar_to('bad', k=3)
similar_to_baby = emb2.most_similar_to('baby', k=3)
print('bad -> ', similar_to_bad, '\n\nbaby -> ', similar_to_baby)

bad ->  ['worse', 'unfortunately', 'too'] 

baby ->  ['babies', 'boy', 'girl']


In [19]:
# Top-2 completions of the analogy  king : queen :: man : ?
analogy_completions = emb2.get_top_k_by_analogy('king', 'queen', 'man', k=2)
print('Analogy of\n\nking -> queen    man -> ', analogy_completions)

Analogy of

king -> queen    man ->  ['woman', 'girl']


In [20]:
# Cosine similarity for a closely related pair and for a loosely related pair.
# The scores are passed to print() as separate arguments so they are rendered
# with str(), exactly as before.
bad_vs_worse = emb2.cosine_similarity('bad', 'worse')
hello_vs_why = emb2.cosine_similarity('hello', 'why')
print('Similarity score:\n\nbad -> worse    ', bad_vs_worse, '\n\nhello -> why    ', hello_vs_why)

Similarity score:

bad -> worse     0.8878378 

hello -> why     0.41916144


In [21]:
# Analogy score for a sensible analogy and for a nonsensical one.
# The scores are passed to print() as separate arguments so they are rendered
# with str(), exactly as before.
sensible_score = emb2.cosine_sim_analogy('good', 'best', 'bad', 'worst')
nonsense_score = emb2.cosine_sim_analogy('good', 'woman', 'yes', 'hello')
print('Analogy Score:\n\ngood->best   bad->worst = ', sensible_score, '\n\ngood->woman  yes->hello = ', nonsense_score)

Analogy Score:

good->best   bad->worst =  0.75127333 

good->woman  yes->hello =  0.48011282
