In [54]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import re
from collections import Counter
import math
import random

## TASK 1

In [55]:
import pandas as pd

df = pd.read_excel("reutersNLTK.xlsx")

In [56]:
df.head()

Unnamed: 0,ids,categories,text
0,test/14826,['trade'],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test/14828,['grain'],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test/14829,"['crude', 'nat-gas']",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test/14832,"['corn', 'grain', 'rice', 'rubber', 'sugar', '...",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test/14833,"['palm-oil', 'veg-oil']",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


In [57]:
# Using a subset for faster training demonstration, but you can increase this
corpus_raw = df['text'].astype(str).tolist()[:100] 

def preprocess_corpus(corpus):
    processed = []
    for text in corpus:
        # Lowercase, remove special characters and extra spaces
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        tokens = text.split()
        if len(tokens) > 5: # Only keep sentences with enough context
            processed.append(tokens)
    return processed

In [58]:
corpus = preprocess_corpus(corpus_raw)

# Vocabulary Building
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus)))
vocabs.append('<UNK>')
word2index = {v: idx for idx, v in enumerate(vocabs)}
index2word = {idx: v for v, idx in word2index.items()}
voc_size = len(vocabs)

In [59]:
# Utility for sequences
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [60]:
# --- TASK REQUIREMENT: DYNAMIC WINDOW SIZE ---
def get_skipgrams(corpus, window_size=2):
    skipgrams = []
    for doc in corpus:
        for i in range(window_size, len(doc) - window_size):
            center = word2index.get(doc[i], word2index['<UNK>'])
            # Extract context within window_size
            for j in range(-window_size, window_size + 1):
                if j == 0: continue # skip center word
                outside = word2index.get(doc[i+j], word2index['<UNK>'])
                skipgrams.append([center, outside])
    return skipgrams

In [61]:
def random_batch(batch_size, skipgrams):
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
    return np.array(inputs), np.array(labels)

In [None]:
# =========================================================================
# 1. Word2Vec (Without Negative Sampling)
# =========================================================================

class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center) 
        outside_embedding    = self.embedding_center(outside) 
        all_vocabs_embedding = self.embedding_center(all_vocabs) 
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        lower_term_sum = torch.sum(torch.exp(lower_term), 1).reshape(-1, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))
        return loss

# Training Setup
emb_size = 2
batch_size = 64
window_size = 2 # DYNAMIC WINDOW SIZE
skipgrams = get_skipgrams(corpus, window_size)
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)

model_sg = Skipgram(voc_size, emb_size)
optimizer_sg = optim.Adam(model_sg.parameters(), lr=0.001)

print("Starting Skipgram Without Negative Sampling")
for epoch in range(2000):
    input_batch, label_batch = random_batch(batch_size, skipgrams)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    loss = model_sg(input_tensor, label_tensor, all_vocabs)
    optimizer_sg.zero_grad()
    loss.backward()
    optimizer_sg.step()
    
    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1} | Loss: {loss.item():2.6f}")


Starting Skipgram Vanilla Training...
Epoch 500 | Loss: 8.692552
Epoch 1000 | Loss: 8.518382
Epoch 1500 | Loss: 8.141383
Epoch 2000 | Loss: 8.175533


In [40]:
# =========================================================================
# 2. Word2Vec (Negative Sampling)
# =========================================================================

# Unigram distribution for negative sampling
z = 0.001
word_count = Counter(flatten(corpus))
num_total_words = sum(word_count.values())
unigram_table = []
for v in vocabs:
    uw = word_count[v] / num_total_words if v in word_count else 1/num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):
        target_index = targets[i].item()
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index: continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
    return torch.cat(neg_samples)

class SkipgramNeg(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        center_embed   = self.embedding_center(center) 
        outside_embed  = self.embedding_outside(outside) 
        negative_embed = self.embedding_outside(negative) 
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) 
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        return -torch.mean(loss)

# Training Setup
model_neg = SkipgramNeg(voc_size, emb_size)
optimizer_neg = optim.Adam(model_neg.parameters(), lr=0.001)
k = 5

print("\nStarting Skipgram Negative Sampling Training...")
for epoch in range(2000):
    input_batch, label_batch = random_batch(batch_size, skipgrams)
    input_tensor = torch.LongTensor(input_batch)
    label_tensor = torch.LongTensor(label_batch)
    
    neg_samples = negative_sampling(label_tensor, unigram_table, k)
    loss = model_neg(input_tensor, label_tensor, neg_samples)
    
    optimizer_neg.zero_grad()
    loss.backward()
    optimizer_neg.step()
    
    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1} | Loss: {loss.item():2.6f}")


Starting Skipgram Negative Sampling Training...
Epoch 500 | Loss: 2.324990
Epoch 1000 | Loss: 2.142023
Epoch 1500 | Loss: 1.816248
Epoch 2000 | Loss: 1.850167


In [43]:
%pip install gensim


Collecting gensim
  Downloading gensim-4.4.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart_open>=1.8.1->gensim)
  Downloading wrapt-2.0.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Downloading gensim-4.4.0-cp313-cp313-macosx_11_0_arm64.whl (24.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.4/24.4 MB[0m [31m39.6 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading smart_open-7.5.0-py3-none-any.whl (63 kB)
Downloading wrapt-2.0.1-cp313-cp313-macosx_11_0_arm64.whl (61 kB)
Installing collected packages: wrapt, smart_open, gensim
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [gensim]2m2/3[0m [gensim]
[1A[2KSuccessfully installed gensim-4.4.0 smart_open-7.5.0 wrapt-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [44]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import pandas as pd
import re
import time
import math
import random
from collections import Counter
from scipy.spatial.distance import cosine
import urllib.request
import gensim.downloader as api

# =========================================================================
# 0. DATA PREPARATION (Shared for all models)
# =========================================================================
df = pd.read_excel("reutersNLTK.xlsx")

# Use a subset for training speed; increase this for better accuracy
corpus_raw = df['text'].astype(str).tolist()[:200] 

def preprocess_corpus(corpus):
    processed = []
    for text in corpus:
        text = text.lower()
        text = re.sub(r'[^a-z\s]', '', text)
        tokens = text.split()
        if len(tokens) > 5:
            processed.append(tokens)
    return processed

corpus = preprocess_corpus(corpus_raw)
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus)))
vocabs.append('<UNK>')
word2index = {v: idx for idx, v in enumerate(vocabs)}
index2word = {idx: v for v, idx in word2index.items()}
voc_size = len(vocabs)

def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

# --- DYNAMIC WINDOW SIZE LOGIC ---
def get_skipgrams(corpus, window_size=2):
    skipgrams = []
    for doc in corpus:
        for i in range(len(doc)):
            center = word2index.get(doc[i], word2index['<UNK>'])
            # Dynamic Window: look left and right
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            for j in range(start, end):
                if i == j: continue
                outside = word2index.get(doc[j], word2index['<UNK>'])
                skipgrams.append([center, outside])
    return skipgrams

def random_batch(batch_size, skipgrams):
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
    return np.array(inputs), np.array(labels)

# Global Config
window_size = 2 # Default window size as requested
batch_size = 64
emb_size = 2
skipgrams = get_skipgrams(corpus, window_size)

# =========================================================================
# 1. Word2Vec (Skipgram Vanilla)
# =========================================================================
class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center) 
        outside_embedding    = self.embedding_center(outside) 
        all_vocabs_embedding = self.embedding_center(all_vocabs) 
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        lower_term_sum = torch.sum(torch.exp(lower_term), 1).reshape(-1, 1)
        loss = -torch.mean(torch.log(top_term / lower_term_sum))
        return loss

model_sg = Skipgram(voc_size, emb_size)
optimizer_sg = optim.Adam(model_sg.parameters(), lr=0.001)
all_vocabs_tensor = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size)

print("Training Skipgram (Vanilla)...")
start_sg = time.time()
for epoch in range(1000):
    input_batch, label_batch = random_batch(batch_size, skipgrams)
    loss_sg = model_sg(torch.LongTensor(input_batch), torch.LongTensor(label_batch), all_vocabs_tensor)
    optimizer_sg.zero_grad(); loss_sg.backward(); optimizer_sg.step()
time_sg = time.time() - start_sg

# =========================================================================
# 2. Word2Vec (Negative Sampling)
# =========================================================================
z = 0.001
word_count = Counter(flatten(corpus))
num_total_words = sum(word_count.values())
unigram_table = []
for v in vocabs:
    uw = word_count[v] / num_total_words if v in word_count else 1/num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):
        target_index = targets[i].item()
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index: continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
    return torch.cat(neg_samples)

class SkipgramNeg(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        center_embed   = self.embedding_center(center) 
        outside_embed  = self.embedding_outside(outside) 
        negative_embed = self.embedding_outside(negative) 
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        loss           = self.logsigmoid(uovc) + self.logsigmoid(torch.sum(ukvc, 1).reshape(-1, 1))
        return -torch.mean(loss)

model_neg = SkipgramNeg(voc_size, emb_size)
optimizer_neg = optim.Adam(model_neg.parameters(), lr=0.001)

print("Training Skipgram (Negative Sampling)...")
start_neg = time.time()
for epoch in range(1000):
    input_batch, label_batch = random_batch(batch_size, skipgrams)
    label_tensor = torch.LongTensor(label_batch)
    neg_samples = negative_sampling(label_tensor, unigram_table, k=5)
    loss_neg = model_neg(torch.LongTensor(input_batch), label_tensor, neg_samples)
    optimizer_neg.zero_grad(); loss_neg.backward(); optimizer_neg.step()
time_neg = time.time() - start_neg

# =========================================================================
# 3. GloVe
# =========================================================================
def build_cooccurrence(corpus, window_size):
    cooc = Counter()
    for doc in corpus:
        for i in range(len(doc)):
            start = max(0, i - window_size)
            end = min(len(doc), i + window_size + 1)
            for j in range(start, end):
                if i == j: continue
                cooc[(doc[i], doc[j])] += 1
    return cooc

def glove_random_batch(batch_size, cooc_counts):
    pairs = list(cooc_counts.keys())
    indices = np.random.choice(len(pairs), batch_size)
    inputs, labels, coocs, weights = [], [], [], []
    for idx in indices:
        pair = pairs[idx]
        count = cooc_counts[pair]
        inputs.append([word2index[pair[0]]]); labels.append([word2index[pair[1]]])
        coocs.append([math.log(count)])
        weights.append([(count/100)**0.75 if count < 100 else 1.0])
    return np.array(inputs), np.array(labels), np.array(coocs), np.array(weights)

class Glove(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.center_bias = nn.Embedding(voc_size, 1)
        self.outside_bias = nn.Embedding(voc_size, 1)
    def forward(self, center, outside, coocs, weighting):
        c_e = self.center_embedding(center); o_e = self.outside_embedding(outside)
        c_b = self.center_bias(center).squeeze(1); o_b = self.outside_bias(outside).squeeze(1)
        inner = o_e.bmm(c_e.transpose(1, 2)).squeeze(2)
        loss = weighting * torch.pow(inner + c_b + o_b - coocs, 2)
        return torch.sum(loss)

cooc_counts = build_cooccurrence(corpus, window_size)
model_glove = Glove(voc_size, emb_size)
optimizer_glove = optim.Adam(model_glove.parameters(), lr=0.001)

print("Training GloVe...")
start_glove = time.time()
for epoch in range(1000):
    i_b, t_b, c_b, w_b = glove_random_batch(batch_size, cooc_counts)
    loss_gv = model_glove(torch.LongTensor(i_b), torch.LongTensor(t_b), torch.FloatTensor(c_b), torch.FloatTensor(w_b))
    optimizer_glove.zero_grad(); loss_gv.backward(); optimizer_glove.step()
time_glove = time.time() - start_glove

# =========================================================================
# TASK 2: EVALUATION
# =========================================================================
# 1. Loading Analogy Data
url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
urllib.request.urlretrieve(url, "analogy.txt")

semantic_tests, syntactic_tests = [], []
curr_cat = None
with open("analogy.txt", 'r') as f:
    for line in f:
        if line.startswith(':'):
            curr_cat = line.strip()
            continue
        words = line.lower().split()
        if curr_cat == ': capital-common-countries' and all(w in word2index for w in words):
            semantic_tests.append(words)
        elif curr_cat == ': past-tense' and all(w in word2index for w in words):
            syntactic_tests.append(words)

# 2. Solver Function
def solver(model, a, b, c, mode='sg'):
    def get_v(w):
        idx = torch.LongTensor([word2index[w]])
        if mode == 'glove': return (model.center_embedding(idx) + model.outside_embedding(idx)).detach().squeeze().numpy()
        return (model.embedding_center(idx) + model.embedding_outside(idx)).detach().squeeze().numpy()
    
    target = get_v(b) - get_v(a) + get_v(c)
    best_w, max_sim = None, -1
    for w in vocabs:
        if w in [a, b, c, '<UNK>']: continue
        sim = 1 - cosine(target, get_v(w))
        if sim > max_sim: max_sim = sim; best_w = w
    return best_w

def get_acc(model, tests, mode):
    if not tests: return 0.0
    correct = sum(1 for t in tests if solver(model, t[0], t[1], t[2], mode) == t[3])
    return (correct / len(tests)) * 100

# 3. Gensim Benchmark
print("Loading Gensim GloVe...")
g_model = api.load("glove-wiki-gigaword-100")
def get_gensim_acc(tests):
    if not tests: return 0.0
    c = 0
    for t in tests:
        try:
            if g_model.most_similar(positive=[t[1], t[2]], negative=[t[0]], topn=1)[0][0] == t[3]: c += 1
        except: continue
    return (c / len(tests)) * 100

# =========================================================================
# FINAL RESULTS TABLE
# =========================================================================
data = [
    ["Skipgram", window_size, f"{loss_sg.item():.4f}", f"{time_sg:.2f}s", get_acc(model_sg, syntactic_tests, 'sg'), get_acc(model_sg, semantic_tests, 'sg')],
    ["Skipgram (NEG)", window_size, f"{loss_neg.item():.4f}", f"{time_neg:.2f}s", get_acc(model_neg, syntactic_tests, 'neg'), get_acc(model_neg, semantic_tests, 'neg')],
    ["Glove", window_size, f"{loss_gv.item():.4f}", f"{time_glove:.2f}s", get_acc(model_glove, syntactic_tests, 'glove'), get_acc(model_glove, semantic_tests, 'glove')],
    ["Glove (Gensim)", "N/A", "N/A", "N/A", get_gensim_acc(syntactic_tests), get_gensim_acc(semantic_tests)]
]

print("\n" + "="*85)
print(f"{'Model':<18} {'Win':<5} {'Loss':<10} {'Time':<10} {'Syntactic %':<15} {'Semantic %'}")
print("-" * 85)
for r in data:
    print(f"{r[0]:<18} {r[1]:<5} {r[2]:<10} {r[3]:<10} {r[4]:<15.2f} {r[5]:.2f}")

Training Skipgram (Vanilla)...
Training Skipgram (Negative Sampling)...
Training GloVe...
Loading Gensim GloVe...

Model              Win   Loss       Time       Syntactic %     Semantic %
-------------------------------------------------------------------------------------
Skipgram           2     8.7263     7.86s      0.00            0.00
Skipgram (NEG)     2     1.9949     3.78s      0.00            0.00
Glove              2     7.6050     0.48s      0.00            0.00
Glove (Gensim)     N/A   N/A        N/A        0.00            80.00


In [52]:
import numpy as np
import torch
import pandas as pd
from scipy.stats import spearmanr
from gensim.test.utils import datapath

# =========================================================================
# 1. PREPARE LOOKUP DICTIONARIES (Extracting from Trained Models)
# =========================================================================

def create_lookup(model, model_type='sg'):
    # Get center and outside embeddings
    if model_type == 'glove':
        v = model.center_embedding.weight.detach()
        u = model.outside_embedding.weight.detach()
    else:
        v = model.embedding_center.weight.detach()
        u = model.embedding_outside.weight.detach()
    
    # Combined embedding (average)
    W = (v + u) / 2
    # Normalize for cosine similarity via dot product: Wn = W / ||W||
    norm = W.norm(p=2, dim=1, keepdim=True)
    Wn = W / norm
    
    return {"stoi": word2index, "Wn": Wn}

# Create lookups for our trained models
skipgram_lookup = create_lookup(model_sg, 'sg')
skipgram_neg_lookup = create_lookup(model_neg, 'sg')
glove_lookup = create_lookup(model_glove, 'glove')

# =========================================================================
# 2. LOAD WORDSIM353 DATASET (Corrected with Error Handling)
# =========================================================================
ws_path = datapath("wordsim353.tsv")
with open(ws_path, "r", encoding="utf-8", errors="ignore") as f:
    lines = [ln.strip() for ln in f if ln.strip()]

rows = []
for ln in lines:
    parts = ln.split()
    # Check if we have at least 3 parts (Word1, Word2, Score)
    if len(parts) < 3: 
        continue
    
    # Use a try-except block to skip metadata/header lines
    try:
        w1 = parts[0].lower()
        w2 = parts[1].lower()
        score = float(parts[2]) # This will fail on text like 'WordSimilarity-353'
        rows.append((w1, w2, score))
    except ValueError:
        # This skips the line if parts[2] is not a number
        continue

ws = pd.DataFrame(rows, columns=["Word 1", "Word 2", "Human (mean)"])
print(f"Successfully loaded {len(ws)} word pairs.")

# =========================================================================
# 3. SIMILARITY SCORE FUNCTIONS
# =========================================================================
def similarity_scores_torch(lookup, ws_df):
    stoi_local = lookup["stoi"]
    Wn = lookup["Wn"]
    sims, gold, skipped = [], [], 0

    for _, row in ws_df.iterrows():
        w1, w2, score = row["Word 1"], row["Word 2"], row["Human (mean)"]
        if w1 not in stoi_local or w2 not in stoi_local:
            skipped += 1
            continue
        v1, v2 = Wn[stoi_local[w1]], Wn[stoi_local[w2]]
        # Dot product of normalized vectors = Cosine Similarity
        sims.append(torch.dot(v1, v2).item())
        gold.append(score)
    return np.array(sims), np.array(gold), skipped

def similarity_scores_gensim(model, ws_df):
    sims, gold, skipped = [], [], 0
    for _, row in ws_df.iterrows():
        w1, w2, score = row["Word 1"], row["Word 2"], row["Human (mean)"]
        if w1 not in model or w2 not in model:
            skipped += 1
            continue
        sims.append(model.similarity(w1, w2))
        gold.append(score)
    return np.array(sims), np.array(gold), skipped

# =========================================================================
# 4. CALCULATE METRICS
# =========================================================================
results_similarity = []

# Eval loop for custom models
for name, lookup in [("Skipgram", skipgram_lookup), ("Skipgram (NEG)", skipgram_neg_lookup), ("GloVe", glove_lookup)]:
    sims, gold, skipped = similarity_scores_torch(lookup, ws)
    rho, _ = spearmanr(sims, gold)
    # Human scores are 0-10, sims are -1 to 1. To calculate MSE, we normalize human scores to 0-1
    mse = np.mean(((sims) - (gold/10)) ** 2) 
    results_similarity.append({"Model": name, "Spearman": rho, "MSE": mse, "Skipped": skipped})

# Eval for Gensim
sims, gold, skipped = similarity_scores_gensim(g_model, ws)
rho, _ = spearmanr(sims, gold)
mse = np.mean(((sims) - (gold/10)) ** 2)
results_similarity.append({"Model": "GloVe (Gensim)", "Spearman": rho, "MSE": mse, "Skipped": skipped})

sim_df = pd.DataFrame(results_similarity)

# =========================================================================
# 5. FINAL MERGED TABLE & TABLE 1 (SWAPPED)
# =========================================================================
# (Assuming analogy_df and training_df were created in Task 2 parts 1 & 2)
# Here we create a dummy summary for the final merge
final_table = sim_df.copy()

# ROUNDING
final_table["Spearman"] = final_table["Spearman"].round(3)
final_table["MSE"] = final_table["MSE"].round(3)

print("Standard Model Comparison Table:")
print(final_table)

# TABLE 1: Swapped Columns and Rows Table
table_1_swapped = final_table.set_index("Model").T

print("\nTable 1. Swapped Columns and Rows Table")
print(table_1_swapped)

# Assessment
print("\nAssessment:")
rho_val = final_table.loc[final_table['Model'] == 'GloVe', 'Spearman'].values[0]
if rho_val < 0.2:
    print(f"The GloVe model Spearman correlation is {rho_val}. It correlates weakly with human judgment due to data limitations.")
else:
    print(f"The GloVe model Spearman correlation is {rho_val}, showing some alignment with human judgment.")

Successfully loaded 354 word pairs.
Standard Model Comparison Table:
            Model  Spearman    MSE  Skipped
0        Skipgram    -0.081  0.920      267
1  Skipgram (NEG)     0.125  0.789      267
2           GloVe    -0.049  0.665      267
3  GloVe (Gensim)     0.536  0.053        0

Table 1. Swapped Columns and Rows Table
Model     Skipgram  Skipgram (NEG)    GloVe  GloVe (Gensim)
Spearman    -0.081           0.125   -0.049           0.536
MSE          0.920           0.789    0.665           0.053
Skipped    267.000         267.000  267.000           0.000

Assessment:
The GloVe model Spearman correlation is -0.049. It correlates weakly with human judgment due to data limitations.


In [53]:
import torch
import numpy as np

# 1. Function to convert any text into a vector (Average of word embeddings)
def get_text_vector(text, model, word2index):
    tokens = text.lower().split()
    vectors = []
    for token in tokens:
        if token in word2index:
            idx = torch.LongTensor([word2index[token]])
            # Using GloVe average of center and outside
            embed = (model.center_embedding(idx) + model.outside_embedding(idx)) / 2
            vectors.append(embed.detach().squeeze().numpy())
    
    if not vectors: # If no words in vocab, return zero vector
        return np.zeros(emb_size)
    
    return np.mean(vectors, axis=0)

# 2. Pre-calculate vectors for the entire corpus (Top 500 docs for speed)
reuters_docs = df['text'].astype(str).tolist()[:500]
corpus_vectors = np.array([get_text_vector(doc, model_glove, word2index) for doc in reuters_docs])

# 3. Search function
def search(query, top_n=10):
    query_vec = get_text_vector(query, model_glove, word2index)
    # Compute dot product against all documents
    scores = np.dot(corpus_vectors, query_vec)
    # Get indices of top 10 scores
    top_indices = np.argsort(scores)[::-1][:top_n]
    
    results = []
    for idx in top_indices:
        results.append({
            "text": reuters_docs[idx][:200] + "...", # Show snippet
            "score": round(float(scores[idx]), 4)
        })
    return results