In [115]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dot

import tensorflow as tf

In [11]:
# 1. Define the Dataset
dataset = [
    "COVID-19 is a novel coronavirus",
    "The virus spreads through droplets",
    "Symptoms include fever and cough",
    "Social distancing helps prevent spread",
    "Masks are essential for protection",
    "Vaccines have been developed",
    "COVID-19 affects the respiratory system",
    "Hand hygiene is crucial to prevent infection",
    "Quarantine and isolation are necessary",
    "The pandemic has impacted global health",
    "Travel restrictions have been implemented",
    "Testing and contact tracing are important",
    "Remote work has become common",
    "Lockdowns have been imposed in many areas",
    "Public health measures are vital",
    "The virus can survive on surfaces",
    "Research is ongoing for treatments",
    "The virus mutates, creating new strains",
    "Many people have recovered from COVID-19",
    "The pandemic has affected mental health"
]

In [81]:
# 2. Data Preparation
# Tokenize the dataset
tokenizer = Tokenizer(filters= '!"#$%&()*+,./:;<=>?@[\\]^`{|}~\t\n', # '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower  = False)

tokenizer.fit_on_texts(dataset)

In [82]:
print(tokenizer.word_counts)

OrderedDict([('COVID-19', 3), ('is', 3), ('a', 1), ('novel', 1), ('coronavirus', 1), ('The', 5), ('virus', 3), ('spreads', 1), ('through', 1), ('droplets', 1), ('Symptoms', 1), ('include', 1), ('fever', 1), ('and', 3), ('cough', 1), ('Social', 1), ('distancing', 1), ('helps', 1), ('prevent', 2), ('spread', 1), ('Masks', 1), ('are', 4), ('essential', 1), ('for', 2), ('protection', 1), ('Vaccines', 1), ('have', 4), ('been', 3), ('developed', 1), ('affects', 1), ('the', 1), ('respiratory', 1), ('system', 1), ('Hand', 1), ('hygiene', 1), ('crucial', 1), ('to', 1), ('infection', 1), ('Quarantine', 1), ('isolation', 1), ('necessary', 1), ('pandemic', 2), ('has', 3), ('impacted', 1), ('global', 1), ('health', 3), ('Travel', 1), ('restrictions', 1), ('implemented', 1), ('Testing', 1), ('contact', 1), ('tracing', 1), ('important', 1), ('Remote', 1), ('work', 1), ('become', 1), ('common', 1), ('Lockdowns', 1), ('imposed', 1), ('in', 1), ('many', 1), ('areas', 1), ('Public', 1), ('measures', 1), 

In [83]:
print(tokenizer.word_index)

{'The': 1, 'are': 2, 'have': 3, 'COVID-19': 4, 'is': 5, 'virus': 6, 'and': 7, 'been': 8, 'has': 9, 'health': 10, 'prevent': 11, 'for': 12, 'pandemic': 13, 'a': 14, 'novel': 15, 'coronavirus': 16, 'spreads': 17, 'through': 18, 'droplets': 19, 'Symptoms': 20, 'include': 21, 'fever': 22, 'cough': 23, 'Social': 24, 'distancing': 25, 'helps': 26, 'spread': 27, 'Masks': 28, 'essential': 29, 'protection': 30, 'Vaccines': 31, 'developed': 32, 'affects': 33, 'the': 34, 'respiratory': 35, 'system': 36, 'Hand': 37, 'hygiene': 38, 'crucial': 39, 'to': 40, 'infection': 41, 'Quarantine': 42, 'isolation': 43, 'necessary': 44, 'impacted': 45, 'global': 46, 'Travel': 47, 'restrictions': 48, 'implemented': 49, 'Testing': 50, 'contact': 51, 'tracing': 52, 'important': 53, 'Remote': 54, 'work': 55, 'become': 56, 'common': 57, 'Lockdowns': 58, 'imposed': 59, 'in': 60, 'many': 61, 'areas': 62, 'Public': 63, 'measures': 64, 'vital': 65, 'can': 66, 'survive': 67, 'on': 68, 'surfaces': 69, 'Research': 70, 'ong

In [84]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

83

In [85]:
word2index = tokenizer.word_index

index2word = {v: k for k, v in word2index.items()}

Transforms each text in texts to a sequence of integers.

Only top `num_words-1` most frequent words will be taken into account.
Only words known by the tokenizer will be taken into account.

In [86]:
# Generate skip-grams
sequences = tokenizer.texts_to_sequences(dataset)
sequences

[[4, 5, 14, 15, 16],
 [1, 6, 17, 18, 19],
 [20, 21, 22, 7, 23],
 [24, 25, 26, 11, 27],
 [28, 2, 29, 12, 30],
 [31, 3, 8, 32],
 [4, 33, 34, 35, 36],
 [37, 38, 5, 39, 40, 11, 41],
 [42, 7, 43, 2, 44],
 [1, 13, 9, 45, 46, 10],
 [47, 48, 3, 8, 49],
 [50, 7, 51, 52, 2, 53],
 [54, 55, 9, 56, 57],
 [58, 3, 8, 59, 60, 61, 62],
 [63, 10, 64, 2, 65],
 [1, 6, 66, 67, 68, 69],
 [70, 5, 71, 12, 72],
 [1, 6, 73, 74, 75, 76],
 [77, 78, 3, 79, 80, 4],
 [1, 13, 9, 81, 82, 10]]

#### Generate skipgram word pairs

**`skipgrams` Function in Keras**

The `skipgrams` function in Keras is used to generate training pairs for the Word2Vec model using the skip-gram approach. 

The skip-gram model predicts the surrounding context words given a target word. 

The `skipgrams` function generates pairs of `(target, context)` words from sequences of word indices, which are used for training the model.

**Parameters**
- **sequence:** A list of word indices.
- **vocabulary_size:** The total number of words in the vocabulary.
- **window_size:** The maximum distance between the current and predicted word within a sentence.

**Returns**
- **pairs:** A list of tuples where each tuple consists of a target word and a context word.
- **labels:** A list of labels (not commonly used in Keras implementation).

- (word, word in the same window), with label 1 (positive samples).
- (word, random word from the vocabulary), with label 0 (negative samples).

**Example**

Consider a sentence "The quick brown fox jumps over the lazy dog." After tokenization and indexing, the sequence might look like this:

In [87]:
sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9]

Here, each number corresponds to a word index in the vocabulary. Let's generate skip-gram pairs with window_size=2:

In [88]:
vocabulary_size = 10
window_size     = 2

In [89]:
# Generate skip-gram pairs
pairs, labels = skipgrams(sequence, vocabulary_size, window_size)

In [90]:
# Convert pairs and labels to DataFrame
df = pd.DataFrame(pairs, columns=['target', 'context'])
df['label'] = labels
df

Unnamed: 0,target,context,label
0,6,5,1
1,1,3,1
2,6,8,1
3,5,3,0
4,5,7,0
5,2,3,1
6,7,9,1
7,1,2,1
8,6,6,0
9,3,1,1


... back to the code

In [91]:
skip_grams = [skipgrams(seq, 
                        vocabulary_size= vocab_size, 
                        window_size    = 2) for seq in sequences]

In [92]:
# Flatten the list of skip-grams and extract targets and contexts
pairs = [pair for skip_gram in skip_grams for pair in skip_gram[0]]

In [93]:
targets, contexts = zip(*pairs)

In [94]:
targets  = np.array(targets,  dtype="int32")
contexts = np.array(contexts, dtype="int32")

In [95]:
# 3. Define a Simple Neural Network for Word2Vec Using Keras
embedding_dim = 10  # Size of the embedding vector

In [96]:
# Define the input and embedding layers
input_target  = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, 
                      embedding_dim, 
                      input_length=1, name="embedding")

#### look up embedding vectors

In [97]:
# retrieves the dense vector representation for each of these indices.
target = embedding(input_target)

In [98]:
target.shape

TensorShape([None, 1, 10])

In [99]:
target = Flatten()(target)

In [100]:
context = embedding(input_context)
context = Flatten()(context)

In [101]:
# Calculate the dot product of the embeddings
# The dot product measures the similarity between the target and context word embeddings. 
# It gives a scalar value representing how closely related the two word vectors are.
dot_product = Dot(axes=1)([target, context])

output = Dense(1, activation='sigmoid')(dot_product)

In [118]:
# Define a callback to print dots
class DotProgress(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print('.', end='', flush=True)

In [119]:
# Fit the model with dot printing and capture history
labels = np.ones(len(targets))  # Labels are all 1 for positive samples

model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

history = model.fit([targets, contexts], labels, epochs=100, verbose=0, callbacks=[DotProgress()])

# Print new lines after dots
print('\n')

....................................................................................................



In [121]:
# Access and print training loss
losses = history.history['loss']
for epoch, loss in enumerate(losses):
    print(f'Epoch {epoch + 1}: Training loss = {loss:.8f}')

Epoch 1: Training loss = 0.00018932
Epoch 2: Training loss = 0.00013609
Epoch 3: Training loss = 0.00010041
Epoch 4: Training loss = 0.00007747
Epoch 5: Training loss = 0.00006164
Epoch 6: Training loss = 0.00005026
Epoch 7: Training loss = 0.00004179
Epoch 8: Training loss = 0.00003543
Epoch 9: Training loss = 0.00003042
Epoch 10: Training loss = 0.00002642
Epoch 11: Training loss = 0.00002314
Epoch 12: Training loss = 0.00002051
Epoch 13: Training loss = 0.00001831
Epoch 14: Training loss = 0.00001640
Epoch 15: Training loss = 0.00001482
Epoch 16: Training loss = 0.00001347
Epoch 17: Training loss = 0.00001228
Epoch 18: Training loss = 0.00001125
Epoch 19: Training loss = 0.00001036
Epoch 20: Training loss = 0.00000956
Epoch 21: Training loss = 0.00000885
Epoch 22: Training loss = 0.00000822
Epoch 23: Training loss = 0.00000766
Epoch 24: Training loss = 0.00000715
Epoch 25: Training loss = 0.00000670
Epoch 26: Training loss = 0.00000628
Epoch 27: Training loss = 0.00000591
Epoch 28: 

In [104]:
# 5. Extract Embeddings
embedding_layer = model.get_layer('embedding')
embedding_weights = embedding_layer.get_weights()[0]

In [105]:
print(embedding_weights.shape)  # Should be (vocab_size, embedding_dim)

(83, 10)


In [106]:
print(len(index2word))  # Should be vocab_size - 1 if vocab_size includes padding

82


In [107]:
# Save the embeddings in a DataFrame
embeddings_df = pd.DataFrame(embedding_weights)
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.006641,-0.035035,-0.022339,-0.011309,-0.033168,0.043048,-0.017726,0.013206,-0.031161,-0.025686
1,-0.58833,-0.328001,-0.732139,-0.754788,0.750999,-0.484599,-0.525613,-0.770547,0.185323,0.755284
2,0.512723,-0.632475,-0.602117,-0.70169,0.671994,0.60315,-0.700471,-0.625682,-0.643193,0.745278
3,0.36842,-0.80494,0.357299,-0.591159,0.769499,0.48679,-0.759999,-0.812508,-0.61982,0.80619
4,-0.343359,-0.75853,-0.313048,-0.668963,0.750749,-0.326777,-0.60663,-0.678244,-0.142103,0.71789


In [108]:
# 6. Function to Retrieve Embedding for a Given Word
def get_embedding(word):
    idx = word2index.get(word)
    if idx is not None:
        return embedding_weights[idx]
    return None

In [109]:
# 7. Function to Find Most Similar Tokens
def most_similar(word, top_n=5):
    target_vector = get_embedding(word)
    if target_vector is None:
        return None

    similarities = np.dot(embedding_weights, target_vector)
    similarities /= np.linalg.norm(embedding_weights, axis=1)  # Normalize
    similarities /= np.linalg.norm(target_vector)  # Normalize

    # Get the top_n most similar words
    most_similar_indices = similarities.argsort()[-top_n-1:][::-1]
    most_similar_words = [(index2word[i], similarities[i]) for i in most_similar_indices if i != word2index[word]]
    return most_similar_words[:top_n]

In [111]:
# Example: Retrieve embedding for a word
print(get_embedding("COVID-19"))

[-0.3433594  -0.7585301  -0.31304765 -0.66896296  0.7507492  -0.32677683
 -0.60662997 -0.6782444  -0.1421033   0.71789044]


In [113]:
# Example: Find most similar tokens to a word
print(most_similar("COVID-19", top_n=5))

[('creating', 0.9896402), ('Many', 0.9776088), ('mutates', 0.9723464), ('Research', 0.9714599), ('recovered', 0.9679285)]
