In [4]:
import re
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from sklearn.metrics.pairwise import cosine_similarity
import random
import pandas as pd

In [5]:
document_path = "4.txt"
vector_dim    = 100
window_size   = 2
min_count     = 1
epochs        = 5
batch_size    = 256

In [6]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()

with open(document_path, encoding='utf-8') as f:
    text = f.read()

tokens = tokenize(text)
freqs = Counter(tokens)
vocab = [w for w, c in freqs.items() if c >= min_count]
idx2word = ['<unk>'] + sorted(vocab)
word2idx = {w: i for i, w in enumerate(idx2word)}
V = len(idx2word)
print(f"Vocab size: {V}, Total tokens: {len(tokens)}")

Vocab size: 533, Total tokens: 1546


In [7]:
cbow_contexts, cbow_targets = [], []
for i, w in enumerate(tokens):
    context = []
    for j in range(i-window_size, i+window_size+1):
        if j != i and 0 <= j < len(tokens):
            context.append(word2idx.get(tokens[j], 0))
    if len(context) == 2 * window_size:
        cbow_contexts.append(context)
        cbow_targets.append(word2idx.get(w, 0))
cbow_contexts = np.array(cbow_contexts, dtype=np.int32)
cbow_targets  = np.array(cbow_targets,  dtype=np.int32)

sg_centers, sg_contexts = [], []
for i, w in enumerate(tokens):
    center = word2idx.get(w, 0)
    for j in range(i-window_size, i+window_size+1):
        if j != i and 0 <= j < len(tokens):
            sg_centers.append(center)
            sg_contexts.append(word2idx.get(tokens[j], 0))
sg_centers = np.array(sg_centers, dtype=np.int32)
sg_contexts = np.array(sg_contexts, dtype=np.int32)

In [8]:
cbow_ds = (tf.data.Dataset
           .from_tensor_slices((cbow_contexts, cbow_targets))
           .shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE))

sg_ds   = (tf.data.Dataset
           .from_tensor_slices((sg_centers, sg_contexts))
           .shuffle(10000).batch(batch_size).prefetch(tf.data.AUTOTUNE))

print("CBOW examples:", cbow_contexts.shape,
      "Skip‑Gram examples:", sg_centers.shape)

CBOW examples: (1542, 4) Skip‑Gram examples: (6178,)


In [9]:
cbow_inputs = Input(shape=(2*window_size,), dtype='int32')
emb_layer_c = layers.Embedding(input_dim=V, output_dim=vector_dim)
x = emb_layer_c(cbow_inputs)                     
x = layers.GlobalAveragePooling1D()(x)           
cbow_outputs = layers.Dense(V)(x)                
cbow_model = Model(cbow_inputs, cbow_outputs, name="CBOW")
cbow_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy()
)

sg_inputs = Input(shape=(), dtype='int32')
emb_layer_s = layers.Embedding(input_dim=V, output_dim=vector_dim)
y = emb_layer_s(sg_inputs)                       
sg_outputs = layers.Dense(V)(y)                   
sg_model = Model(sg_inputs, sg_outputs, name="SkipGram")
sg_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy()
)

cbow_model.summary()
sg_model.summary()

In [33]:
history_cbow = cbow_model.fit(cbow_ds, epochs=epochs)
embeddings_cbow = emb_layer_c.get_weights()[0]

Epoch 1/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.1730
Epoch 2/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 4.1613 
Epoch 3/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.1106 
Epoch 4/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 4.1164 
Epoch 5/5
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 4.2561 


In [34]:
history_sg = sg_model.fit(sg_ds, epochs=epochs)
embeddings_sg = emb_layer_s.get_weights()[0]

Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.4941
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6.4292
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.3677
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.3901
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 6.4003


In [36]:
final_cbow_loss = history_cbow.history['loss'][-1]
final_sg_loss   = history_sg.history['loss'][-1]
print(f"Final CBOW Training Loss:    {final_cbow_loss:.4f}")
print(f"Final Skip‑Gram Training Loss: {final_sg_loss:.4f}")

Final CBOW Training Loss:    4.2168
Final Skip‑Gram Training Loss: 6.3944


In [37]:
def nearest_safe(word, embeddings, idx2word, word2idx, topn=5):
    if word not in word2idx:
        return []
    idx = word2idx[word]
    sims = cosine_similarity([embeddings[idx]], embeddings)[0]
    nn = np.argsort(-sims)[1:1+topn]
    return [idx2word[i] for i in nn]

probe = ["refund","public","small","medium","college"]
print("Word\tCBOW Neighbors\t\t\t\t\t\tSkipGram Neighbors")
for w in probe:
    print(f"{w}\t{nearest_safe(w, embeddings_cbow, idx2word, word2idx)}\t\t{nearest_safe(w, embeddings_sg, idx2word, word2idx)}")


Word	CBOW Neighbors						SkipGram Neighbors
refund	['inaugural', 'through', 'tax', 'warranty', 'accepts']		['four', 'named', 'defective', 'how', 'tells']
public	['exact', 'rerelease', 'caps', 'assassination', 'tax']		['days', 'cannot', 'displayed', 'works', 'plain']
small	['testing', 'asterisk', 'for', 'conversion', 'ground']		['contain', 'inaccurate', 'used', 'which', 'distribution']
medium	['years', 'thanks', 'dead', 'score', 'long']		['apply', 'wish', 'you', 'disk', 'donations']
college	['proposition', 'you', 'modify', 'birth', 'below']		['near', 'conversion', 'measure', 'named', 'deleted']


In [38]:
def analogy(a, b, c, embeddings, idx2word, word2idx):
    if any(x not in word2idx for x in (a,b,c)): return None
    va, vb, vc = embeddings[word2idx[a]], embeddings[word2idx[b]], embeddings[word2idx[c]]
    target = va - vb + vc
    sims = cosine_similarity([target], embeddings)[0]
    for idx in np.argsort(-sims):
        w = idx2word[idx]
        if w not in {a,b,c}: return w
    return None

tests = [
    ("case","college","project"),
    ("print","legal","public"),
    ("refund","domain","medium"),
]
print("\nAnalogy\t\t\tCBOW\t\t\tSkipGram")
for a,b,c in tests:
    print(f"{a}-{b}+{c}\t{analogy(a,b,c,embeddings_cbow,idx2word,word2idx)}\t\t{analogy(a,b,c,embeddings_sg,idx2word,word2idx)}")


Analogy			CBOW			SkipGram
case-college+project	proprietary		original
print-legal+public	a		continent
refund-domain+medium	dedicate		hold


In [47]:
pairs = [("public","domain"), ("legal","refund"), ("nation","new")]
print("\nPairwise Cosine Similarities:")
for a,b in pairs:
    if a in word2idx and b in word2idx:
        sim_cb = cosine_similarity([embeddings_cbow[word2idx[a]]],
                                   [embeddings_cbow[word2idx[b]]])[0,0]
        sim_sg = cosine_similarity([embeddings_sg[word2idx[a]]],
                                   [embeddings_sg[word2idx[b]]])[0,0]
        print(f"{a}-{b}: CBOW={sim_cb:.3f}, SkipGram={sim_sg:.3f}")
    else:
        print(f"{a}-{b}: word(s) not in vocab")


Pairwise Cosine Similarities:
public-domain: CBOW=0.263, SkipGram=0.222
legal-refund: CBOW=0.204, SkipGram=0.050
nation-new: CBOW=0.088, SkipGram=0.020
