<a href="https://colab.research.google.com/github/sreevanimtcs2502/sreevanimtcs2502/blob/nlp/neg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:


import os, zipfile, urllib.request
import numpy as np
from collections import Counter
from numpy.linalg import norm
import random


url = "http://mattmahoney.net/dc/text8.zip"
if not os.path.exists("text8.zip"):
    urllib.request.urlretrieve(url, "text8.zip")

with zipfile.ZipFile("text8.zip") as z:
    text = z.read("text8").decode("utf-8")

tokens = text.split()[:5_000_000]   # 5M tokens â†’ safe runtime


min_count = 10
counts = Counter(tokens)

filtered_words = [w for w, c in counts.items() if c >= min_count]
vocab = {word: i for i, word in enumerate(filtered_words)}
id2word = {i: word for word, i in vocab.items()}


original_tokens_for_gensim = list(tokens)


t = 1e-5
tokens = [
    w for w in tokens if w in vocab
    and random.random() < np.sqrt(t / counts[w])
]

VOCAB_SIZE = len(vocab)
print("Vocab size:", VOCAB_SIZE)
print("Tokens after subsampling:", len(tokens))


WINDOW = 5
pairs = []

for i,w in enumerate(tokens):
    target = vocab[w]
    start = max(0, i-WINDOW)
    end = min(len(tokens), i+WINDOW)
    for j in range(start,end):
        if i!=j:
            pairs.append((target, vocab[tokens[j]]))

random.shuffle(pairs)
pairs = pairs[:1_000_000]
print("Training pairs:", len(pairs))


freq = np.array([counts[id2word[i]] for i in range(VOCAB_SIZE)])
noise_dist = freq**0.75
noise_dist /= noise_dist.sum()


EMB = 100
NEG = 5
LR = 0.025
EPOCHS = 1

W_in = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))
W_out = np.random.uniform(-0.5,0.5,(VOCAB_SIZE,EMB))

def sigmoid(x): return 1/(1+np.exp(-x))

for epoch in range(EPOCHS):
    loss = 0
    for t,c in pairs:
        v_in = W_in[t]
        v_out = W_out[c]


        score = sigmoid(np.dot(v_in,v_out))
        grad = score - 1

        W_out[c] -= LR*grad*v_in
        W_in[t] -= LR*grad*v_out


        negs = np.random.choice(VOCAB_SIZE,NEG,p=noise_dist)
        neg_vecs = W_out[negs]
        scores = sigmoid(np.dot(neg_vecs,v_in))

        W_out[negs] -= LR*(scores[:,None]*v_in)
        W_in[t] -= LR*np.sum(scores[:,None]*neg_vecs,axis=0)

        loss += -np.log(score+1e-9)

    print("Epoch loss:", round(loss,2))


!pip install gensim
from gensim.models import Word2Vec


sentences = [original_tokens_for_gensim[i:i+1000] for i in range(0,len(original_tokens_for_gensim),1000)]

gensim_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    negative=5,
    sg=1,
    min_count=10,
    epochs=3,
    workers=4
)


def cos(a,b): return np.dot(a,b)/(norm(a)*norm(b))

word = "king"
if word in vocab:
    print("\nCosine similarity (king):",
          round(cos(W_in[vocab[word]],gensim_model.wv[word]),3))


def analogy(a,b,c):
    v = W_in[vocab[a]] - W_in[vocab[b]] + W_in[vocab[c]]
    scores = {id2word[i]:cos(v,W_in[i]) for i in range(VOCAB_SIZE)}
    return sorted(scores,key=scores.get,reverse=True)[:5]

print("\nCustom analogy king-man+woman:", analogy("king","man","woman"))
print("Gensim analogy:",
      gensim_model.wv.most_similar(
          positive=["king","woman"], negative=["man"], topn=5))


bias_custom = W_in[vocab["she"]] - W_in[vocab["he"]]
bias_gensim = gensim_model.wv["she"] - gensim_model.wv["he"]

def bias_score(word, emb, bias):
    return cos(emb[vocab[word]], bias)

test_words = ["doctor","nurse","engineer","teacher"]

print("\nBias scores (custom):")
for w in test_words:
    print(w, round(bias_score(w,W_in,bias_custom),3))

print("\nBias scores (gensim):")
for w in test_words:
    print(w, round(cos(gensim_model.wv[w],bias_gensim),3))

Vocab size: 23599
Tokens after subsampling: 608
Training pairs: 5447
Epoch loss: 4190.41

Cosine similarity (king): -0.079

Custom analogy king-man+woman: ['woman', 'king', 'lowers', 'solids', 'orthography']
Gensim analogy: [('throne', 0.6682941317558289), ('amalric', 0.6681479215621948), ('hezekiah', 0.6666582226753235), ('judah', 0.6609217524528503), ('andronicus', 0.6596450805664062)]

Bias scores (custom):
doctor 0.098
nurse -0.143
engineer -0.061
teacher -0.098

Bias scores (gensim):
doctor 0.12
nurse 0.174
engineer -0.031
teacher 0.023
