## Skip-Gram Word Embeddings with a Word-Level Tokenizer (Keras `Tokenizer`, not WordPiece)

In [1]:
import tensorflow as tf
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
import numpy as np

2025-07-29 09:31:29.386466: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753781489.407962    3239 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753781489.414534    3239 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Read the whole corpus file into one string, then split it into sentences.
with open('/kaggle/input/random-text-corpus/random_text_corpus.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

corpus = sent_tokenize(text)

In [3]:
# Fit a word-level vocabulary on the sentence list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the number of entries in word_index.
vocab_size = 1 + len(tokenizer.word_index)

# Encode every sentence as a list of ids, then concatenate all sentences
# into one long id stream (windows therefore span sentence boundaries).
sequences = tokenizer.texts_to_sequences(corpus)
flat_seq = [token_id for sentence_ids in sequences for token_id in sentence_ids]

# skipgrams returns (pairs, labels); convert both to NumPy arrays.
pairs, labels = (
    np.array(part)
    for part in skipgrams(flat_seq, vocabulary_size=vocab_size, window_size=2)
)

print(f"Number of training pairs: {len(pairs)}")

Number of training pairs: 90028


In [4]:
# Size of each learned word vector.
embedding_dim = 20

# Each example supplies a single value per input: one for the target word
# and one for the context word.
input_target = tf.keras.layers.Input(shape=(1,))
input_context = tf.keras.layers.Input(shape=(1,))

# One embedding table, applied to both inputs below, so target and context
# words share the same vectors. The deprecated `input_length` argument is
# intentionally not passed: Keras 3 ignores it and only emits a warning.
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='embedding')

target_emb = embedding(input_target)
context_emb = embedding(input_context)

# Dot product of the two embeddings along the last axis, reshaped to one
# value per example and passed through a sigmoid.
dot_product = tf.keras.layers.Dot(axes=-1)([target_emb, context_emb])
dot_product = tf.keras.layers.Reshape((1,))(dot_product)
output = tf.keras.layers.Activation('sigmoid')(dot_product)

# Binary cross-entropy on the sigmoid output, optimized with Adam.
model = tf.keras.Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='adam')

model.summary()


I0000 00:00:1753781493.338300    3239 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


In [5]:
# Column 0 of `pairs` feeds the first model input, column 1 the second.
model.fit(x=[pairs[:, 0], pairs[:, 1]], y=labels, batch_size=256, epochs=500)

Epoch 1/500


I0000 00:00:1753781495.076605    3288 service.cc:148] XLA service 0x7e8f68006060 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753781495.076647    3288 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1753781495.161302    3288 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 86/352[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 2ms/step - loss: 0.6930

I0000 00:00:1753781495.398223    3288 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.6900
Epoch 2/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.6291
Epoch 3/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5979
Epoch 4/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5793
Epoch 5/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5636
Epoch 6/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5482
Epoch 7/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5281
Epoch 8/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5117
Epoch 9/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4960
Epoch 10/500
[1m352/352[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss

<keras.src.callbacks.history.History at 0x7e9011058650>

In [6]:
# First weight array of the layer named 'embedding'.
embeddings = model.get_layer(name='embedding').get_weights()[0]

def get_vector(word):
    """Return the embedding row for `word`, or None if it has no usable index.

    The index is looked up in `tokenizer.word_index`. A falsy index
    (missing word, or 0) is treated as "not found": a message is printed
    and None is returned.
    """
    idx = tokenizer.word_index.get(word)
    if not idx:
        print(f"'{word}' not found in vocabulary.")
        return None
    return embeddings[idx]

In [7]:
# Map every vocabulary word to its row in the embedding matrix.
word_embeddings = dict(
    (token, embeddings[token_id])
    for token, token_id in tokenizer.word_index.items()
)

In [25]:
# Euclidean distance between the "night" and "glyph" vectors.
np.linalg.norm(np.subtract(word_embeddings["night"], word_embeddings["glyph"]))

7.931625

In [19]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between `vec1` and `vec2`.

    Computed as dot(vec1, vec2) / (||vec1|| * ||vec2||).

    If either vector has zero norm the cosine is undefined; 0.0 is returned
    in that case instead of dividing by zero, so callers that sort by
    similarity never have to deal with NaN values.
    """
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # Undefined direction: report "no similarity" rather than NaN.
        return 0.0
    return np.dot(vec1, vec2) / denom

def most_similar(word, word_embeddings, top_n=5):
    """Return the `top_n` words whose vectors are most cosine-similar to `word`.

    Args:
        word: Query word; must be a key of `word_embeddings`.
        word_embeddings: Mapping from word to embedding vector.
        top_n: Maximum number of results to return.

    Returns:
        A list of (word, similarity) tuples sorted by descending similarity,
        excluding the query word itself. If `word` is not in the mapping a
        message is printed and an empty list is returned.
    """
    if word not in word_embeddings:
        print(f"'{word}' not found in vocabulary.")
        return []

    target_vec = word_embeddings[word]
    ranked = [
        (candidate, cosine_similarity(target_vec, vec))
        for candidate, vec in word_embeddings.items()
        if candidate != word
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

def analogy(word_a, word_b, word_c, word_embeddings, top_n=5):
    """Rank words by cosine similarity to vec(word_a) - vec(word_b) + vec(word_c).

    Args:
        word_a, word_b, word_c: Query words; all must be keys of
            `word_embeddings`.
        word_embeddings: Mapping from word to embedding vector.
        top_n: Maximum number of results to return.

    Returns:
        A list of (word, similarity) tuples sorted by descending similarity,
        with the three query words excluded. If any query word is missing a
        message is printed and an empty list is returned.
    """
    query_words = (word_a, word_b, word_c)
    if any(w not in word_embeddings for w in query_words):
        print("One or more words not in vocabulary.")
        return []

    target_vec = word_embeddings[word_a] - word_embeddings[word_b] + word_embeddings[word_c]
    # The input words are left out of the candidate list.
    ranked = [
        (candidate, cosine_similarity(target_vec, vec))
        for candidate, vec in word_embeddings.items()
        if candidate not in query_words
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Query: vec("night") - vec("moon") + vec("sun").
print(analogy(word_a="night", word_b="moon", word_c="sun", word_embeddings=word_embeddings))

[('glyphs', 0.74258643), ('density', 0.6069289), ('found', 0.606532), ('floor', 0.5930469), ('spark', 0.5900337)]
