# NLP Assignment 1: Thatâ€™s What I LIKE (Complete Pipeline)

This notebook contains the complete implementation for the NLP Assignment. All outputs and data are stored within the `git` folder.

## 0. Setup & Configuration
Ensure we are running in the correct directory.

In [1]:

import os
import sys

# 1. Set Working Directory to 'git' folder
if os.path.exists('git') and os.path.isdir('git'):
    os.chdir('git')
    print(f"Changed directory to: {os.getcwd()}")
else:
    print(f"Current directory: {os.getcwd()}")

# 2. Create necessary directories
os.makedirs('models', exist_ok=True)
os.makedirs('nltk_data', exist_ok=True)


Changed directory to: /Users/htutkoko/Library/CloudStorage/OneDrive-AsianInstituteofTechnology/AIT_Study/AT82.05_NLP/A1/git


## 1. Data Loader

Handles loading the Reuters corpus, building vocabulary, and numericalization.

In [2]:

import nltk
from nltk.corpus import reuters
import numpy as np
import torch
from collections import Counter

import os

nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
if not os.path.exists(nltk_data_dir):
    os.makedirs(nltk_data_dir)
nltk.data.path.append(nltk_data_dir)

try:
    nltk.data.find('corpora/reuters')
except LookupError:
    print("Downloading reuters corpus...")
    nltk.download('reuters', download_dir=nltk_data_dir, quiet=True)
    nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
    nltk.download('punkt_tab', download_dir=nltk_data_dir, quiet=True)

import zipfile
reuters_zip_path = os.path.join(nltk_data_dir, 'corpora', 'reuters.zip')
reuters_dir_path = os.path.join(nltk_data_dir, 'corpora', 'reuters')

if os.path.exists(reuters_zip_path) and not os.path.exists(reuters_dir_path):
    print(f"Unzipping {reuters_zip_path}...")
    with zipfile.ZipFile(reuters_zip_path, 'r') as zip_ref:
        zip_ref.extractall(os.path.join(nltk_data_dir, 'corpora'))
    print("Unzipping complete.")

MIN_FREQ = 5 # Minimum frequency for words to be included in vocab

class DataLoader:
    def __init__(self, min_freq=MIN_FREQ):
        self.min_freq = min_freq
        self.categories = None # Use full corpus
        print(f"Loading Reuters corpus (Full)...")
        
        self.sentences = reuters.sents()
        self.corpus = [[word.lower() for word in sent] for sent in self.sentences]
        
        print(f"Corpus size: {len(self.corpus)} sentences")
        
        self.build_vocab()
        
    def build_vocab(self):
        print("Building vocabulary...")
        flatten = lambda l: [item for sublist in l for item in sublist]
        self.all_words = flatten(self.corpus)
        self.word_count = Counter(self.all_words)
        
        self.vocab = [w for w, c in self.word_count.items() if c >= self.min_freq]
        self.vocab.append('<UNK>')
        
        self.word2index = {w: i for i, w in enumerate(self.vocab)}
        self.index2word = {i: w for w, i in self.word2index.items()}
        self.voc_size = len(self.vocab)
        
        print(f"Vocabulary size: {self.voc_size}")
        
    def get_numericalized_corpus(self):
        unk_idx = self.word2index['<UNK>']
        numericalized_corpus = []
        for sent in self.corpus:
            sent_indices = [self.word2index.get(w, unk_idx) for w in sent]
            numericalized_corpus.append(sent_indices)
        return numericalized_corpus

    def get_vocab(self):
        return self.vocab
        
    def get_word2index(self):
        return self.word2index


## 2. Model Definitions

PyTorch classes for Skipgram, Negative Sampling, and GloVe.

In [3]:

import torch
import torch.nn as nn

class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center) 
        outside_embedding    = self.embedding_outside(outside) 
        all_vocabs_embedding = self.embedding_outside(all_vocabs) 
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        lower_term_sum = torch.sum(torch.exp(lower_term), 1) 
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum)) 
        return loss

class SkipgramNeg(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        center_embed   = self.embedding_center(center) 
        outside_embed  = self.embedding_outside(outside) 
        negative_embed = self.embedding_outside(negative) 
        
        uovc = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        ukvc = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) 
        
        loss = self.logsigmoid(uovc) + torch.sum(self.logsigmoid(ukvc), 1).unsqueeze(1)
        return -torch.mean(loss)

class Glove(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) 
        outside_embeds = self.outside_embedding(outside) 
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        return torch.sum(loss)


## 3. Training Word2Vec (Skipgram)

**Note**: Training on full Reuters corpus might take time.

In [4]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import time

loader = DataLoader() # min_freq=5
corpus = loader.get_numericalized_corpus()
voc_size = loader.voc_size
word2index = loader.get_word2index()
index2word = {v:k for k, v in word2index.items()}

print(f"Vocab Size: {voc_size}")

def generate_skipgrams(corpus, window_size=2):
    skipgrams = []
    for doc in corpus:
        for i in range(window_size, len(doc)-window_size):
            center = doc[i]
            outside = []
            for w in range(1, window_size+1):
                outside.append(doc[i-w])
                outside.append(doc[i+w])
            
            for each_out in outside:
                skipgrams.append([center, each_out])
    return skipgrams

def get_batch(corpus, batch_size, window_size=2):
    
    inputs, labels = [], []
    
    while len(inputs) < batch_size:
        doc_idx = np.random.randint(0, len(corpus))
        doc = corpus[doc_idx]
        
        if len(doc) < 2 * window_size + 1:
            continue
            
        center_idx = np.random.randint(window_size, len(doc) - window_size)
        center = doc[center_idx]
        
        offsets = list(range(-window_size, 0)) + list(range(1, window_size + 1))
        offset = np.random.choice(offsets)
        outside = doc[center_idx + offset]
        
        inputs.append([center])
        labels.append([outside])
        
    return np.array(inputs), np.array(labels)

class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_outside(all_vocabs) #(batch_size, voc_size, emb_size) 
        
        all_vocabs_embedding = self.embedding_outside(all_vocabs)
        outside_embedding = self.embedding_outside(outside)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        return loss

device = torch.device('cpu') # Use CPU for now as default
device = torch.device('cpu') # Use CPU for now as default
batch_size = 512 # Increased for faster pure-python batching
emb_size   = 2 # Instruction says "Compare... models". Usually embedding size is larger but notebook used 2. 
EMB_SIZE = 10 

model      = Skipgram(voc_size, EMB_SIZE).to(device)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

all_vocabs = torch.LongTensor(list(range(voc_size))).unsqueeze(0).expand(batch_size, voc_size).to(device)

num_epochs = 5 # Reduced epochs because each epoch is now FULL pass (much longer)
print(f"Training Skipgram (Window Size=2, Emb Size={EMB_SIZE}, Epochs={num_epochs})...")

num_tokens = sum([len(doc) for doc in corpus])
print(f"Total Tokens: {num_tokens}")
fields_per_token = 2 # window size 2 -> 2 pairs left, 2 pairs right? No, dynamic.
approx_total_pairs = num_tokens * 2 * 2 
steps_per_epoch = max(1, approx_total_pairs // batch_size)
print(f"Steps per epoch: {steps_per_epoch}")

total_start = time.time()
for epoch in range(num_epochs):    
    start = time.time()
    total_loss = 0
    
    for step in range(steps_per_epoch):
        input_batch, label_batch = get_batch(corpus, batch_size, window_size=2)
        input_tensor = torch.LongTensor(input_batch).to(device)
        label_tensor = torch.LongTensor(label_batch).to(device)
        
        loss = model(input_tensor, label_tensor, all_vocabs)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if (step + 1) % 1000 == 0:
            print(f"  Step {step+1}/{steps_per_epoch} | Loss: {loss.item():.6f}")

    end = time.time()
    avg_loss = total_loss / steps_per_epoch
    print(f"Epoch {epoch+1:6.0f} | Avg Loss: {avg_loss:.6f} | Time: {end-start:.4f}s")

save_path = 'models/skipgram_model.pth'
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

import json
metrics_path = 'models/metrics.json'
if os.path.exists(metrics_path):
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
else:
    metrics = {}

metrics['Skipgram'] = {
    'window_size': 2,
    'training_loss': avg_loss,
    'training_time': time.time() - total_start
}
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=4)
print("Metrics saved.")


Loading Reuters corpus (Full)...
Corpus size: 54716 sentences
Building vocabulary...
Vocabulary size: 11678
Vocab Size: 11678
Training Skipgram (Window Size=2, Emb Size=10, Epochs=5)...
Total Tokens: 1720917
Steps per epoch: 13444
  Step 1000/13444 | Loss: 10.726004
  Step 2000/13444 | Loss: 9.221310
  Step 3000/13444 | Loss: 7.893904
  Step 4000/13444 | Loss: 7.371448
  Step 5000/13444 | Loss: 7.016697
  Step 6000/13444 | Loss: 6.769714
  Step 7000/13444 | Loss: 6.613394
  Step 8000/13444 | Loss: 6.561038
  Step 9000/13444 | Loss: 6.647599
  Step 10000/13444 | Loss: 6.428586
  Step 11000/13444 | Loss: 6.461717
  Step 12000/13444 | Loss: 6.460613
  Step 13000/13444 | Loss: 6.290025
Epoch      1 | Avg Loss: 7.491995 | Time: 7270.4656s
  Step 1000/13444 | Loss: 6.179290
  Step 2000/13444 | Loss: 6.227787
  Step 3000/13444 | Loss: 6.393227
  Step 4000/13444 | Loss: 6.253748
  Step 5000/13444 | Loss: 6.103871
  Step 6000/13444 | Loss: 6.436166
  Step 7000/13444 | Loss: 6.297138
  Step 8000

## 4. Training Word2Vec (Negative Sampling)

In [5]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import time

loader = DataLoader()
corpus = loader.get_numericalized_corpus()
voc_size = loader.voc_size
word2index = loader.get_word2index()

print("Preparing Unigram Table...")
word_count = loader.word_count
total_words = sum(word_count.values())
unigram_table = []
z = 0.001
for w in loader.vocab:
    if w == '<UNK>': continue
    idx = word2index[w]
    uw = word_count[w] / total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([idx] * uw_alpha)

if '<UNK>' in word2index:
    unk_idx = word2index['<UNK>']
    unigram_table.extend([unk_idx] * 10)

print(f"Unigram Table Size: {len(unigram_table)}")

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):
        target_index = targets[i].item()
        nsample = []
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            if neg == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(nsample)
    return torch.LongTensor(neg_samples)

def get_batch(corpus, batch_size, window_size=2):
    inputs, labels = [], []
    while len(inputs) < batch_size:
        doc_idx = np.random.randint(0, len(corpus))
        doc = corpus[doc_idx]
        if len(doc) < 2 * window_size + 1:
            continue
        center_idx = np.random.randint(window_size, len(doc) - window_size)
        center = doc[center_idx]
        offsets = list(range(-window_size, 0)) + list(range(1, window_size + 1))
        offset = np.random.choice(offsets)
        outside = doc[center_idx + offset]
        inputs.append([center])
        labels.append([outside])
    return np.array(inputs), np.array(labels)

class SkipgramNeg(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size) # Use outside embedding for negative samples
        
        uovc = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        
        ukvc = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        
        loss = self.logsigmoid(uovc) + torch.sum(self.logsigmoid(ukvc), 1).unsqueeze(1)
        
        return -torch.mean(loss)

device = torch.device('cpu')
batch_size = 512
emb_size   = 10
model      = SkipgramNeg(voc_size, emb_size).to(device)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
k = 5 # negative samples

print(f"Training SkipgramNEG (Window=2, Emb={emb_size}, Epochs={num_epochs}, k={k})...")

num_tokens = sum([len(doc) for doc in corpus])
print(f"Total Tokens: {num_tokens}")
approx_total_pairs = num_tokens * 2 * 2 
steps_per_epoch = max(1, approx_total_pairs // batch_size)
print(f"Steps per epoch: {steps_per_epoch}")

total_start = time.time()
for epoch in range(num_epochs):
    start = time.time()
    total_loss = 0
    
    for step in range(steps_per_epoch):
        input_batch, label_batch = get_batch(corpus, batch_size, window_size=2)
        input_tensor = torch.LongTensor(input_batch).to(device)
        label_tensor = torch.LongTensor(label_batch).to(device)
        
        neg_tensor = negative_sampling(label_tensor, unigram_table, k).to(device)
        
        loss = model(input_tensor, label_tensor, neg_tensor)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if (step + 1) % 1000 == 0:
             print(f"  Step {step+1}/{steps_per_epoch} | Loss: {loss.item():.6f}")
    
    end = time.time()
    avg_loss = total_loss / steps_per_epoch
    print(f"Epoch {epoch+1:6.0f} | Avg Loss: {avg_loss:.6f} | Time: {end-start:.4f}s")

save_path = 'models/skipgram_neg_model.pth'
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

import json
metrics_path = 'models/metrics.json'
if os.path.exists(metrics_path):
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
else:
    metrics = {}

metrics['SkipgramNeg'] = {
    'window_size': 2,
    'training_loss': avg_loss,
    'training_time': time.time() - total_start
}
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=4)
print("Metrics saved.")


Loading Reuters corpus (Full)...
Corpus size: 54716 sentences
Building vocabulary...
Vocabulary size: 11678
Preparing Unigram Table...
Unigram Table Size: 3393
Training SkipgramNEG (Window=2, Emb=10, Epochs=5, k=5)...
Total Tokens: 1720917
Steps per epoch: 13444
  Step 1000/13444 | Loss: 5.571229
  Step 2000/13444 | Loss: 4.086699
  Step 3000/13444 | Loss: 3.293912
  Step 4000/13444 | Loss: 2.950686
  Step 5000/13444 | Loss: 2.591558
  Step 6000/13444 | Loss: 2.452118
  Step 7000/13444 | Loss: 2.460414
  Step 8000/13444 | Loss: 2.381029
  Step 9000/13444 | Loss: 2.298933
  Step 10000/13444 | Loss: 2.291488
  Step 11000/13444 | Loss: 2.259181
  Step 12000/13444 | Loss: 2.243899
  Step 13000/13444 | Loss: 2.140116
Epoch      1 | Avg Loss: 3.038960 | Time: 322.2942s
  Step 1000/13444 | Loss: 2.129869
  Step 2000/13444 | Loss: 2.122855
  Step 3000/13444 | Loss: 2.151317
  Step 4000/13444 | Loss: 2.112882
  Step 5000/13444 | Loss: 2.069285
  Step 6000/13444 | Loss: 2.053159
  Step 7000/1344

## 5. Training GloVe

In [6]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from itertools import combinations_with_replacement
import time
import math
import random

loader = DataLoader() # min_freq=5
corpus = loader.get_numericalized_corpus()
voc_size = loader.voc_size
vocab = loader.vocab
word2index = loader.get_word2index()

print("Building Co-occurrence Matrix...")
X_ik = {}

WINDOW_SIZE = 2

skip_grams = []
for doc in corpus:
    for i in range(1, len(doc)-1):
        center = doc[i]
        start = max(0, i - WINDOW_SIZE)
        end = min(len(doc), i + WINDOW_SIZE + 1)
        outside = [doc[j] for j in range(start, end) if j != i]
        
        for each_out in outside:
            skip_grams.append((center, each_out))
            
X_ik_skipgrams = Counter(skip_grams)

weighting_dic = {}
X_ik = {}

def weighting(x_ij):
    x_max = 100
    alpha = 0.75
    if x_ij < x_max:
        return (x_ij / x_max)**alpha
    else:
        return 1

print("Calculating weights...")
for bigram, count in X_ik_skipgrams.items():
    X_ik[bigram] = count
    weighting_dic[bigram] = weighting(count)

skip_grams_keys = list(X_ik.keys())
print(f"Number of non-zero co-occurrences: {len(skip_grams_keys)}")

class Glove(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        return torch.sum(loss)

def get_batch(batch_size, skip_grams_keys, X_ik, weighting_dic):
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    indices = np.random.choice(len(skip_grams_keys), batch_size, replace=False)
    
    for i in indices:
        pair = skip_grams_keys[i]
        center, outside = pair
        random_inputs.append([center])
        random_labels.append([outside])
        
        cooc = X_ik[pair]
        random_coocs.append([math.log(cooc)])
        random_weightings.append([weighting_dic[pair]])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

device = torch.device('cpu')
batch_size = 512
emb_size   = 10
model      = Glove(voc_size, emb_size).to(device)
optimizer  = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10 # GloVe converges faster, but let's do 10 full passes
print(f"Training GloVe (Window=2, Emb={emb_size}, Epochs={num_epochs})...")
print(f"Non-zero pairs: {len(skip_grams_keys)}")

steps_per_epoch = max(1, len(skip_grams_keys) // batch_size)
print(f"Steps per epoch: {steps_per_epoch}")

total_start = time.time()
for epoch in range(num_epochs):
    start = time.time()
    total_loss = 0
    
    random.shuffle(skip_grams_keys)
    
    for step in range(steps_per_epoch):
        
        input_batch, target_batch, cooc_batch, weighting_batch = get_batch(batch_size, skip_grams_keys, X_ik, weighting_dic)
        
        input_batch  = torch.LongTensor(input_batch).to(device)
        target_batch = torch.LongTensor(target_batch).to(device)
        cooc_batch   = torch.FloatTensor(cooc_batch).to(device)
        weighting_batch = torch.FloatTensor(weighting_batch).to(device)
        
        optimizer.zero_grad()
        loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        if (step + 1) % 1000 == 0:
            print(f"  Step {step+1}/{steps_per_epoch} | Loss: {loss.item():.6f}")
    
    end = time.time()
    avg_loss = total_loss / steps_per_epoch
    print(f"Epoch {epoch+1:6.0f} | Avg Loss: {avg_loss:.6f} | Time: {end-start:.4f}s")

save_path = 'models/glove_model.pth'
torch.save(model.state_dict(), save_path)
print(f"Model saved to {save_path}")

import json
metrics_path = 'models/metrics.json'
if os.path.exists(metrics_path):
    with open(metrics_path, 'r') as f:
        metrics = json.load(f)
else:
    metrics = {}

metrics['GloVe'] = {
    'window_size': 2,
    'training_loss': avg_loss,
    'training_time': time.time() - total_start
}
with open(metrics_path, 'w') as f:
    json.dump(metrics, f, indent=4)
print("Metrics saved.")


Loading Reuters corpus (Full)...
Corpus size: 54716 sentences
Building vocabulary...
Vocabulary size: 11678
Building Co-occurrence Matrix...
Calculating weights...
Number of non-zero co-occurrences: 1133669
Training GloVe (Window=2, Emb=10, Epochs=10)...
Non-zero pairs: 1133669
Steps per epoch: 2214
  Step 1000/2214 | Loss: 417.342957
  Step 2000/2214 | Loss: 448.408142
Epoch      1 | Avg Loss: 532.013049 | Time: 137.6871s
  Step 1000/2214 | Loss: 333.404541
  Step 2000/2214 | Loss: 226.810791
Epoch      2 | Avg Loss: 307.359025 | Time: 134.1092s
  Step 1000/2214 | Loss: 247.821884
  Step 2000/2214 | Loss: 117.673737
Epoch      3 | Avg Loss: 190.457472 | Time: 129.2668s
  Step 1000/2214 | Loss: 107.190239
  Step 2000/2214 | Loss: 121.762245
Epoch      4 | Avg Loss: 126.837518 | Time: 105.4103s
  Step 1000/2214 | Loss: 86.170609
  Step 2000/2214 | Loss: 59.505257
Epoch      5 | Avg Loss: 90.837703 | Time: 104.9334s
  Step 1000/2214 | Loss: 67.582878
  Step 2000/2214 | Loss: 55.343586
Ep

## 6. Evaluation & Comparison

This section loads all trained models (plus Pre-trained GloVe) and generates:
1. **Comparison Table**: Window Size, Loss, Time, Syntactic/Semantic Accuracy, correlation.
2. **Detailed Analysis**: WordSim353 predictions vs Human Scores.

In [7]:

import torch
import torch.nn as nn
import numpy as np
import scipy.stats
import gensim.downloader as api
import os

loader = DataLoader() # min_freq=5
word2index = loader.get_word2index()
index2word = {v:k for k, v in word2index.items()}
vocab = loader.vocab

class Skipgram(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    def forward(self, center, outside, all_vocabs):
        pass # Not needed for inference

class SkipgramNeg(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(SkipgramNeg, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    def forward(self, center, outside, negative):
        pass 

class Glove(nn.Module):
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    def forward(self, center, outside, coocs, weighting):
        pass

device = torch.device('cpu')
EMB_SIZE = 10
VOC_SIZE = loader.voc_size

print("Loading models...")

model_sg = Skipgram(VOC_SIZE, EMB_SIZE)
try:
    model_path = 'models/skipgram_model.pth'
    model_sg.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    print("Skipgram loaded.")
except Exception as e:
    print(f"Skipgram load failed: {e}")

model_neg = SkipgramNeg(VOC_SIZE, EMB_SIZE)
try:
    model_path = 'models/skipgram_neg_model.pth'
    model_neg.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    print("SkipgramNEG loaded.")
except Exception as e:
    print(f"SkipgramNEG load failed: {e}")

model_glove = Glove(VOC_SIZE, EMB_SIZE)
try:
    model_path = 'models/glove_model.pth'
    model_glove.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    print("GloVe loaded.")
except Exception as e:
    print(f"GloVe load failed: {e}")

try:
    print("Loading Gensim GloVe (glove-twitter-25)...")
    model_gensim = api.load("glove-twitter-25")
    print("Gensim GloVe loaded.")
except Exception as e:
    print(f"Gensim GloVe failed: {e}")
    model_gensim = None

def get_vector(model, word, model_type='scratch'):
    if model_type == 'gensim':
        if word in model:
            return model[word]
        else:
            return None
    
    if word not in word2index:
        return None
    idx = torch.LongTensor([word2index[word]])
    
    if isinstance(model, Skipgram) or isinstance(model, SkipgramNeg):
        v = model.embedding_center(idx)
        vec = v # Usually we just use v_c. But averaging is also common. Notebook 1 used (c+o)/2.
        u = model.embedding_outside(idx)
        vec = (v + u) / 2
        return vec.detach().numpy()[0]
    
    if isinstance(model, Glove):
        v = model.center_embedding(idx)
        u = model.outside_embedding(idx)
        vec = (v + u) / 2
        return vec.detach().numpy()[0]
    
    return None

def cosine_similarity(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def evaluate_correlation(model, model_type='scratch'):
    pairs = []
    try:
        data_path = 'wordsim353/combined.csv'
        with open(data_path, 'r') as f:
            next(f) # skip header
            for line in f:
                parts = line.strip().split(',')
                if len(parts) >= 3:
                    pairs.append((parts[0].lower(), parts[1].lower(), float(parts[2])))
    except FileNotFoundError:
        print("wordsim353/combined.csv not found.")
        return 0.0

    preds = []
    humans = []
    
    for w1, w2, score in pairs:
        v1 = get_vector(model, w1, model_type)
        v2 = get_vector(model, w2, model_type)
        
        if v1 is not None and v2 is not None:
            sim = cosine_similarity(v1, v2)
            preds.append(sim)
            humans.append(score)
            
    if not preds:
        return 0.0
        
    corr, _ = scipy.stats.spearmanr(preds, humans)
    return corr

def evaluate_analogies(model, model_type='scratch'):
    
    sem_correct = 0
    sem_total = 0
    syn_correct = 0
    syn_total = 0
    
    current_section = None
    
    try:
        data_path = 'word-test.v1.txt'
        with open(data_path, 'r') as f:
            for line in f:
                if line.startswith(':'):
                    current_section = line.strip()
                    continue
                
                if current_section == ': capital-common-countries':
                    is_semantic = True
                elif current_section == ': past-tense':
                    is_semantic = False
                else:
                    continue
                    
                parts = line.lower().split()
                if len(parts) != 4: continue
                
                w_a, w_b, w_c, w_d = parts
                
                v_a = get_vector(model, w_a, model_type)
                v_b = get_vector(model, w_b, model_type)
                v_c = get_vector(model, w_c, model_type)
                
                if v_a is None or v_b is None or v_c is None:
                    continue
                
                target = v_b - v_a + v_c
                
                if model_type == 'gensim':
                    try:
                        res = model.most_similar(positive=[w_b, w_c], negative=[w_a], topn=1)
                        pred_word = res[0][0]
                    except:
                        pred_word = ""
                else:
                    
                    if not hasattr(model, 'embeddings_matrix'):
                        if isinstance(model, Skipgram) or isinstance(model, SkipgramNeg):
                            model.embeddings_matrix = (model.embedding_center.weight + model.embedding_outside.weight).detach().numpy() / 2
                        elif isinstance(model, Glove):
                            model.embeddings_matrix = (model.center_embedding.weight + model.outside_embedding.weight).detach().numpy() / 2
                            
                    target = target / np.linalg.norm(target)
                    
                    sims = np.dot(model.embeddings_matrix, target)
                    
                    if not hasattr(model, 'norm_embeddings_matrix'):
                        norm = np.linalg.norm(model.embeddings_matrix, axis=1, keepdims=True)
                        model.norm_embeddings_matrix = model.embeddings_matrix / (norm + 1e-9)
                        
                    sims = np.dot(model.norm_embeddings_matrix, target)
                    best_idx = np.argmax(sims)
                    pred_word = index2word[best_idx]
                
                if pred_word == w_d:
                    if is_semantic: sem_correct += 1
                    else: syn_correct += 1
                
                if is_semantic: sem_total += 1
                else: syn_total += 1
                
    except FileNotFoundError:
        print("word-test.v1.txt not found.")
        
    sem_acc = sem_correct / sem_total if sem_total > 0 else 0.0
    syn_acc = syn_correct / syn_total if syn_total > 0 else 0.0
    
    return sem_acc, syn_acc

results = {}

print("\n--- Evaluation Results ---")

import json
metrics_path = 'models/metrics.json'
if os.path.exists(metrics_path):
    with open(metrics_path, 'r') as f:
        training_metrics = json.load(f)
else:
    training_metrics = {}

models = {
    'Skipgram': (model_sg, 'scratch'),
    'SkipgramNeg': (model_neg, 'scratch'),
    'GloVe': (model_glove, 'scratch'),
    'Gensim': (model_gensim, 'gensim')
}

print(f"{'Model':<15} {'Window':<8} {'Loss':<10} {'Time(s)':<10} {'Sem Acc':<10} {'Syn Acc':<10} {'Spearman':<10}")
print("-" * 80)

for name, (m, m_type) in models.items():
    if m is None:
        print(f"{name:<15} {'N/A':<8} {'N/A':<10} {'N/A':<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}")
        continue
        
    t_metrics = training_metrics.get(name, {})
    window = t_metrics.get('window_size', 'N/A')
    loss = t_metrics.get('training_loss', 'N/A')
    time_taken = t_metrics.get('training_time', 'N/A')
    
    if m_type == 'gensim':
        window = 'N/A' # Pre-trained
        loss = 'N/A'
        time_taken = 'N/A'

    if isinstance(loss, float): loss = f"{loss:.4f}"
    if isinstance(time_taken, float): time_taken = f"{time_taken:.2f}"
    if isinstance(window, int): window = str(window)

    corr = evaluate_correlation(m, m_type)
    sem, syn = evaluate_analogies(m, m_type)
    
    print(f"{name:<15} {window:<8} {loss:<10} {time_taken:<10} {sem:<10.4f} {syn:<10.4f} {corr:<10.4f}")

print("\n\n--- WordSim353 Detailed Results (MSE/Y_true Analysis) ---")
def get_correlation_details(model, model_type='scratch'):
    pairs = []
    try:
        data_path = 'wordsim353/combined.csv'
        with open(data_path, 'r') as f:
            next(f) # skip header
            for line in f:
                parts = line.strip().split(',')
                if len(parts) >= 3:
                    pairs.append((parts[0].lower(), parts[1].lower(), float(parts[2])))
    except FileNotFoundError:
        return []

    details = []
    for w1, w2, score in pairs:
        v1 = get_vector(model, w1, model_type)
        v2 = get_vector(model, w2, model_type)
        
        if v1 is not None and v2 is not None:
            sim = cosine_similarity(v1, v2)
            details.append((w1, w2, score, sim))
    return details

for name, (m, m_type) in models.items():
    if m is None: continue
    print(f"\nModel: {name}")
    details = get_correlation_details(m, m_type)
    if not details:
        print("No data found.")
        continue
        
    print(f"{'Word 1':<15} {'Word 2':<15} {'Human (Y)':<10} {'Model (Pred)':<12} {'Sq.Err':<10}")
    print("-" * 65)
    
    mse_sum = 0
    count = 0
    for w1, w2, human, pred in details[:15]:
        
        print(f"{w1:<15} {w2:<15} {human:<10.2f} {pred:<12.4f} {'-':<10}")
    print(f"... (showing first 15 of {len(details)})")


Loading Reuters corpus (Full)...
Corpus size: 54716 sentences
Building vocabulary...
Vocabulary size: 11678
Loading models...
Skipgram loaded.
SkipgramNEG loaded.
GloVe loaded.
Loading Gensim GloVe (glove-twitter-25)...
Gensim GloVe loaded.

--- Evaluation Results ---
Model           Window   Loss       Time(s)    Sem Acc    Syn Acc    Spearman  
--------------------------------------------------------------------------------
Skipgram        2        6.1143     34385.60   0.0059     0.0000     0.2667    
SkipgramNeg     2        1.9767     1227.33    0.0000     0.0000     0.1818    
GloVe           2        38.9339    1147.96    0.0000     0.0000     0.2333    
Gensim          N/A      N/A        N/A        0.1502     0.0000     0.3595    


--- WordSim353 Detailed Results (MSE/Y_true Analysis) ---

Model: Skipgram
Word 1          Word 2          Human (Y)  Model (Pred) Sq.Err    
-----------------------------------------------------------------
book            paper           7.46    

## 7. Web Application

The web application allows to search for relevant context using the trained embeddings.

### How to Run:

1. Open terminal.
2. Make sure it is under the repositary root folder
3. Run the following command:
   ```bash
   python app/app.py
   ```
4. Open the browser and go to `http://127.0.0.1:5000`.

### Features:
- Type a query phrase.
- The app converts the query to an embedding vector.
- It finds the top 10 most semantically similar sentences from the Reuters corpus.