<a href="https://colab.research.google.com/github/sreevanimtcs2502/sreevanimtcs2502/blob/nlp/neg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =======================
# Skip-gram with Negative Sampling vs Gensim (Single Cell)
# =======================

import os, zipfile, urllib.request, re, random
import numpy as np
from collections import Counter
from numpy.linalg import norm

# -----------------------
# 1. Download & Load Dataset (text8)
# -----------------------
url = "http://mattmahoney.net/dc/text8.zip"
if not os.path.exists("text8.zip"):
    urllib.request.urlretrieve(url, "text8.zip")

with zipfile.ZipFile("text8.zip") as z:
    text = z.read("text8").decode("utf-8")

tokens = text.split()[:2_000_000]  # medium subset for speed

# -----------------------
# 2. Build Vocabulary
# -----------------------
min_count = 5
counts = Counter(tokens)
vocab = {w:i for i,(w,c) in enumerate(counts.items()) if c>=min_count}
id2word = {i:w for w,i in vocab.items()}

tokens = [w for w in tokens if w in vocab]

VOCAB_SIZE = len(vocab)
print("Vocab size:", VOCAB_SIZE)

# -----------------------
# 3. Generate Training Pairs
# -----------------------
WINDOW = 5
pairs = []

for i,w in enumerate(tokens):
    target = vocab[w]
    for j in range(max(0,i-WINDOW), min(len(tokens),i+WINDOW)):
        if i!=j:
            pairs.append((target, vocab[tokens[j]]))

random.shuffle(pairs)
pairs = pairs[:200_000]  # subsample for speed

# -----------------------
# 4. Negative Sampling Distribution
# -----------------------
word_freq = np.array([counts[id2word[i]] for i in range(VOCAB_SIZE)])
noise_dist = word_freq**0.75
noise_dist /= noise_dist.sum()

# -----------------------
# 5. Skip-gram with Negative Sampling (NumPy)
# -----------------------
EMB = 100
NEG = 5
LR = 0.025
EPOCHS = 1

W_in = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))
W_out = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))

sigmoid = lambda x: 1/(1+np.exp(-x))

for epoch in range(EPOCHS):
    loss = 0
    for t,c in pairs:
        z = np.dot(W_in[t], W_out[c])
        grad = sigmoid(z)-1

        W_out[c] -= LR*grad*W_in[t]
        W_in[t] -= LR*grad*W_out[c]

        negs = np.random.choice(VOCAB_SIZE,NEG,p=noise_dist)
        for n in negs:
            zn = np.dot(W_in[t],W_out[n])
            gradn = sigmoid(zn)
            W_out[n] -= LR*gradn*W_in[t]
            W_in[t] -= LR*gradn*W_out[n]

        loss += -np.log(sigmoid(z))
    print("Custom SGNS loss:", round(loss,2))

# -----------------------
# 6. Train Gensim Word2Vec
# -----------------------
from gensim.models import Word2Vec

sentences = [tokens[i:i+1000] for i in range(0,len(tokens),1000)]

gensim_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    negative=5,
    sg=1,
    min_count=5,
    epochs=3
)

# -----------------------
# 7. Cosine Similarity Comparison
# -----------------------
def cos(a,b): return np.dot(a,b)/(norm(a)*norm(b))

word = "king"
if word in vocab:
    print("\nCosine similarity (king):",
          cos(W_in[vocab[word]], gensim_model.wv[word]))

# -----------------------
# 8. Word Analogy Task
# -----------------------
def analogy(a,b,c,emb):
    v = emb[vocab[a]] - emb[vocab[b]] + emb[vocab[c]]
    scores = {id2word[i]:cos(v,emb[i]) for i in range(VOCAB_SIZE)}
    return sorted(scores,key=scores.get,reverse=True)[:5]

print("\nCustom analogy king-man+woman:", analogy("king","man","woman",W_in))
print("Gensim analogy:",
      gensim_model.wv.most_similar(
          positive=["king","woman"], negative=["man"], topn=5))

# -----------------------
# 9. Bias Detection (Gender Bias)
# -----------------------
bias_custom = W_in[vocab["she"]] - W_in[vocab["he"]]
bias_gensim = gensim_model.wv["she"] - gensim_model.wv["he"]

def bias_score(word, emb, bias):
    return cos(emb[vocab[word]], bias)

test_words = ["doctor","nurse","engineer","teacher"]

print("\nBias scores (custom):")
for w in test_words:
    print(w, round(bias_score(w,W_in,bias_custom),3))

print("\nBias scores (gensim):")
for w in test_words:
    print(w, round(cos(gensim_model.wv[w],bias_gensim),3))


Vocab size: 21681


KeyError: 16

In [5]:
# ============================================================
# Skip-gram with Negative Sampling (30-min Safe Version)
# Dataset: text8 (Wikipedia)
# ============================================================

import os, zipfile, urllib.request
import numpy as np
from collections import Counter
from numpy.linalg import norm
import random

# -----------------------
# 1. Download Dataset
# -----------------------
url = "http://mattmahoney.net/dc/text8.zip"
if not os.path.exists("text8.zip"):
    urllib.request.urlretrieve(url, "text8.zip")

with zipfile.ZipFile("text8.zip") as z:
    text = z.read("text8").decode("utf-8")

tokens = text.split()[:5_000_000]   # 5M tokens → safe runtime

# -----------------------
# 2. Vocabulary + Subsampling
# -----------------------
min_count = 10
counts = Counter(tokens)
# Fix: Ensure vocabulary indices are contiguous after filtering by min_count
filtered_words = [w for w, c in counts.items() if c >= min_count]
vocab = {word: i for i, word in enumerate(filtered_words)}
id2word = {i: word for word, i in vocab.items()}

# Store original tokens (before subsampling) for Gensim to build its vocabulary
original_tokens_for_gensim = list(tokens)

# Subsampling frequent words (Mikolov) - this modifies 'tokens' for the custom model
t = 1e-5
tokens = [
    w for w in tokens if w in vocab
    and random.random() < np.sqrt(t / counts[w])
]

VOCAB_SIZE = len(vocab)
print("Vocab size:", VOCAB_SIZE)
print("Tokens after subsampling:", len(tokens))

# -----------------------
# 3. Training Pairs
# -----------------------
WINDOW = 5
pairs = []

for i,w in enumerate(tokens):
    target = vocab[w]
    start = max(0, i-WINDOW)
    end = min(len(tokens), i+WINDOW)
    for j in range(start,end):
        if i!=j:
            pairs.append((target, vocab[tokens[j]]))

random.shuffle(pairs)
pairs = pairs[:1_000_000]   # cap pairs → runtime control
print("Training pairs:", len(pairs))

# -----------------------
# 4. Negative Sampling Distribution
# -----------------------
freq = np.array([counts[id2word[i]] for i in range(VOCAB_SIZE)])
noise_dist = freq**0.75
noise_dist /= noise_dist.sum()

# -----------------------
# 5. SGNS Model (Vectorized)
# -----------------------
EMB = 100
NEG = 5
LR = 0.025
EPOCHS = 1

W_in = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))
W_out = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))

def sigmoid(x): return 1/(1+np.exp(-x))

for epoch in range(EPOCHS):
    loss = 0
    for t,c in pairs:
        v_in = W_in[t]
        v_out = W_out[c]

        # Positive sample
        score = sigmoid(np.dot(v_in,v_out))
        grad = score - 1

        W_out[c] -= LR*grad*v_in
        W_in[t] -= LR*grad*v_out

        # Negative samples (vectorized)
        negs = np.random.choice(VOCAB_SIZE,NEG,p=noise_dist)
        neg_vecs = W_out[negs]
        scores = sigmoid(np.dot(neg_vecs,v_in))

        W_out[negs] -= LR*(scores[:,None]*v_in)
        W_in[t] -= LR*np.sum(scores[:,None]*neg_vecs,axis=0)

        loss += -np.log(score+1e-9)

    print("Epoch loss:", round(loss,2))

# -----------------------
# 6. Train Gensim Word2Vec
# -----------------------
!pip install gensim
from gensim.models import Word2Vec

# Use the original_tokens_for_gensim for sentence creation
sentences = [original_tokens_for_gensim[i:i+1000] for i in range(0,len(original_tokens_for_gensim),1000)]

gensim_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    negative=5,
    sg=1,
    min_count=10,
    epochs=3,
    workers=4
)

# -----------------------
# 7. Cosine Similarity Comparison
# -----------------------
def cos(a,b): return np.dot(a,b)/(norm(a)*norm(b))

word = "king"
if word in vocab:
    print("\nCosine similarity (king):",
          round(cos(W_in[vocab[word]],gensim_model.wv[word]),3))

# -----------------------
# 8. Word Analogy
# -----------------------
def analogy(a,b,c):
    v = W_in[vocab[a]] - W_in[vocab[b]] + W_in[vocab[c]]
    scores = {id2word[i]:cos(v,W_in[i]) for i in range(VOCAB_SIZE)}
    return sorted(scores,key=scores.get,reverse=True)[:5]

print("\nCustom analogy king-man+woman:", analogy("king","man","woman"))
print("Gensim analogy:",
      gensim_model.wv.most_similar(
          positive=["king","woman"], negative=["man"], topn=5))

# -----------------------
# 9. Bias Detection
# -----------------------
bias_custom = W_in[vocab["she"]] - W_in[vocab["he"]]
bias_gensim = gensim_model.wv["she"] - gensim_model.wv["he"]

def bias_score(word, emb, bias):
    return cos(emb[vocab[word]], bias)

test_words = ["doctor","nurse","engineer","teacher"]

print("\nBias scores (custom):")
for w in test_words:
    print(w, round(bias_score(w,W_in,bias_custom),3))

print("\nBias scores (gensim):")
for w in test_words:
    print(w, round(cos(gensim_model.wv[w],bias_gensim),3))

Vocab size: 23599
Tokens after subsampling: 608
Training pairs: 5447
Epoch loss: 4190.41

Cosine similarity (king): -0.079

Custom analogy king-man+woman: ['woman', 'king', 'lowers', 'solids', 'orthography']
Gensim analogy: [('throne', 0.6682941317558289), ('amalric', 0.6681479215621948), ('hezekiah', 0.6666582226753235), ('judah', 0.6609217524528503), ('andronicus', 0.6596450805664062)]

Bias scores (custom):
doctor 0.098
nurse -0.143
engineer -0.061
teacher -0.098

Bias scores (gensim):
doctor 0.12
nurse 0.174
engineer -0.031
teacher 0.023
