## Implementing the Skip-Gram Model

### Contributors:
Raphael Steinborn |
Lidia Makishti |
Vineeth Racharla |
Saqib Sarwar

### Import Required Libraries
Vineeth Racharla 349073 | Saqib Sarwar 351757

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import random
from tqdm import tqdm
import nltk

In [14]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Training on:', device)


Training on: cuda


### Prepare the Data
Raphael Steinborn 351421 | Lidia Makishti 346621 | Vineeth Racharla 349073 | Saqib Sarwar 351757

In [15]:
# Fetch the Reuters corpus and the Punkt tokenizer models through NLTK
from nltk.corpus import reuters
nltk.download('reuters')
nltk.download('punkt')

# Read the corpus tokens and join them into one space-separated string
words = list(reuters.words())
text = ' '.join(words)

# Lower-case the text and re-tokenize it with NLTK's word tokenizer
words = nltk.word_tokenize(text.lower())

# Count token occurrences; the distinct tokens form the vocabulary
vocab = Counter(words)
vocab_size = len(vocab)

# Index <-> word lookup tables, numbered in order of first occurrence
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: i for i, word in idx_to_word.items()}

# Subsampling of frequent words
def subsample_frequent_words(words, threshold=1e-5):
    """Randomly discard tokens, dropping frequent words more often.

    Every occurrence of a word with relative frequency ``f`` is dropped
    with probability ``1 - sqrt(threshold / f)``. Words rarer than the
    threshold get a negative drop probability and are always kept.

    Args:
        words: Sequence of tokens (iterated more than once).
        threshold: Relative frequency above which words start to be dropped.

    Returns:
        A new list holding the surviving tokens in their original order.
        The result is random: one ``random.random()`` draw is made per
        input token.
    """
    n_tokens = len(words)

    # Per-word probability of discarding a single occurrence.
    drop_probability = {}
    for token, occurrences in Counter(words).items():
        relative_freq = occurrences / n_tokens
        drop_probability[token] = 1 - np.sqrt(threshold / relative_freq)

    kept = []
    for token in words:
        # Keep the token when the draw falls below its keep probability.
        if random.random() < (1 - drop_probability[token]):
            kept.append(token)
    return kept

# Replace the token list with its randomly subsampled version (default
# threshold of 1e-5). The vocabulary and index tables were built from the
# full token list earlier, so every remaining token still has an index.
words = subsample_frequent_words(words)

# Generate training data
def generate_training_data(words, word_to_idx, window_size):
    """Build (target, context) index pairs for skip-gram training.

    For each position, the target word is paired with every other word at
    most ``window_size`` positions to its left or right; positions outside
    the sequence are ignored.

    Args:
        words: Sequence of tokens.
        word_to_idx: Mapping from token to integer index.
        window_size: Number of neighbours considered on each side.

    Returns:
        A list of ``(target_index, context_index)`` tuples, ordered by
        target position and then by context position.

    Raises:
        KeyError: If a token is missing from ``word_to_idx``.
    """
    indices = [word_to_idx[token] for token in words]
    n_tokens = len(indices)

    pairs = []
    for pos, center in enumerate(indices):
        # Clamp the window to the bounds of the sequence.
        first = max(0, pos - window_size)
        stop = min(n_tokens, pos + window_size + 1)
        pairs.extend(
            (center, indices[other])
            for other in range(first, stop)
            if other != pos
        )
    return pairs

window_size = 2 # Number of neighbouring tokens used as context on each side of the target
# Build the list of (target index, context index) pairs from the subsampled tokens
training_data = generate_training_data(words, word_to_idx, window_size)

[nltk_data] Downloading package reuters to C:\Users\Saqib
[nltk_data]     Sarwar\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Saqib
[nltk_data]     Sarwar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Define the Skip-gram Model
Raphael Steinborn 351421 | Vineeth Racharla 349073

In [16]:
class SkipGramModel(nn.Module):
    """Skip-gram model trained with negative sampling.

    Holds two embedding tables of identical shape: ``in_embeddings`` is used
    for center words, ``out_embeddings`` for context words and negative
    samples.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Length of every embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)  # Input (center) word embeddings
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)  # Output (context) word embeddings

    def forward(self, center_word, context_word, negative_samples):
        """Return the summed negative-sampling loss for a batch.

        Args:
            center_word: Long tensor of center-word indices, shape (B,).
            context_word: Long tensor of true context indices, shape (B,).
            negative_samples: Long tensor of negative indices, shape (B, K).

        Returns:
            Scalar tensor: ``-(sum log sigmoid(u_ctx . v) +
            sum log sigmoid(-u_neg . v))`` over the whole batch.
        """
        center_embed = self.in_embeddings(center_word).unsqueeze(2)      # (B, D, 1)
        context_embed = self.out_embeddings(context_word).unsqueeze(1)   # (B, 1, D)
        neg_embed = self.out_embeddings(negative_samples)                # (B, K, D)

        # Dot product of each center vector with its true context vector.
        pos_score = torch.bmm(context_embed, center_embed).squeeze(2).squeeze(1)  # (B,)

        # Dot product of each center vector with each of its negative samples.
        neg_score = torch.bmm(neg_embed, center_embed).squeeze(2)  # (B, K)

        # logsigmoid is numerically stable: log(sigmoid(x)) evaluated in two
        # steps underflows to -inf for strongly negative x, which would turn
        # the loss into inf and poison the gradients.
        pos_loss = nn.functional.logsigmoid(pos_score).sum()
        neg_loss = nn.functional.logsigmoid(-neg_score).sum()

        return -(pos_loss + neg_loss)  # Total loss to minimize


### Implement Negative Sampling
Raphael Steinborn 351421 | Lidia Makishti 346621

In [17]:
def get_negative_samples(target, vocab_size, k=5):
    """Draw ``k`` random vocabulary indices that differ from ``target``.

    Indices are drawn uniformly from ``[0, vocab_size)`` with replacement;
    draws equal to ``target`` are rejected and redrawn.

    Args:
        target: Index that must not appear among the samples.
        vocab_size: Size of the vocabulary to sample from.
        k: Number of negative samples to return.

    Returns:
        A 1-D ``torch.long`` tensor of length ``k`` (empty when ``k <= 0``).

    Raises:
        ValueError: If ``k > 0`` and the vocabulary contains no index other
            than ``target``, in which case rejection sampling could never
            finish.
    """
    if k <= 0:
        # Explicit dtype: an empty Python list would otherwise yield float32.
        return torch.empty(0, dtype=torch.long)

    # Number of indices in [0, vocab_size) that are allowed as samples.
    n_candidates = vocab_size - (1 if 0 <= target < vocab_size else 0)
    if n_candidates < 1:
        raise ValueError(
            f'cannot draw negative samples: a vocabulary of size {vocab_size} '
            f'has no index other than target {target}'
        )

    neg_samples = []
    while len(neg_samples) < k:
        sample = random.randint(0, vocab_size - 1)
        if sample != target:
            neg_samples.append(sample)
    return torch.tensor(neg_samples, dtype=torch.long)

### Train the Model
Raphael Steinborn 351421 | Saqib Sarwar 351757

In [18]:
# Hyperparameters
embedding_dim = 100  # length of each word vector
learning_rate = 0.01  # step size for SGD
epochs = 10  # number of full passes over the training pairs

# Build the model on the selected device and optimize it with plain SGD
model = SkipGramModel(vocab_size, embedding_dim)
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    total_loss = 0
    # Visit the (target, context) pairs in a new random order every epoch
    random.shuffle(training_data)
    for target, context in tqdm(training_data):
        # One pair per step: tensors are created directly on the model's device
        target_tensor = torch.tensor([target], dtype=torch.long, device=device)
        context_tensor = torch.tensor([context], dtype=torch.long, device=device)
        neg_samples_tensor = get_negative_samples(target, vocab_size, k=5)
        neg_samples_tensor = neg_samples_tensor.unsqueeze(0).to(device)  # add batch dimension

        # Clear old gradients, compute the loss, backpropagate, update weights
        model.zero_grad()
        loss = model(target_tensor, context_tensor, neg_samples_tensor)
        loss.backward()
        optimizer.step()

        # Accumulate the summed loss of the epoch as a Python number
        total_loss += loss.item()
    print(f'Epoch: {epoch}, Loss: {total_loss}')




100%|██████████| 1346322/1346322 [54:09<00:00, 414.35it/s] 


Epoch: 0, Loss: 24215613.40907107


100%|██████████| 1346322/1346322 [58:05<00:00, 386.23it/s] 


Epoch: 1, Loss: 15406641.4315007


100%|██████████| 1346322/1346322 [58:39<00:00, 382.51it/s] 


Epoch: 2, Loss: 9858149.241396632


100%|██████████| 1346322/1346322 [58:21<00:00, 384.47it/s] 


Epoch: 3, Loss: 6761020.5883722305


100%|██████████| 1346322/1346322 [58:37<00:00, 382.77it/s] 


Epoch: 4, Loss: 5369496.472452118


100%|██████████| 1346322/1346322 [1:01:24<00:00, 365.40it/s]


Epoch: 5, Loss: 4566359.622814983


100%|██████████| 1346322/1346322 [1:01:34<00:00, 364.44it/s]


Epoch: 6, Loss: 4035564.3589102156


100%|██████████| 1346322/1346322 [1:00:44<00:00, 369.42it/s]


Epoch: 7, Loss: 3641700.801302188


100%|██████████| 1346322/1346322 [55:30<00:00, 404.29it/s] 


Epoch: 8, Loss: 3342709.3068018933


100%|██████████| 1346322/1346322 [55:41<00:00, 402.94it/s] 

Epoch: 9, Loss: 3092290.2434660653





### Evaluate the Model
Raphael Steinborn 351421 | Lidia Makishti 346621

In [19]:
def get_word_embedding(word):
    """Return the trained input embedding of ``word`` as a 1-D NumPy array.

    Args:
        word: Token to look up in ``word_to_idx``.

    Returns:
        A flat NumPy array holding the word's row of ``model.in_embeddings``.

    Raises:
        KeyError: If ``word`` is not in ``word_to_idx``.
    """
    # Create the index tensor on the model's device
    index = torch.tensor([word_to_idx[word]], dtype=torch.long, device=device)
    vector = model.in_embeddings(index)
    # Detach from autograd and bring the data to the CPU before converting to NumPy
    return vector.detach().cpu().numpy().flatten()

# Example: look up the trained input embedding of a single word and print it
word = 'economy'
embedding = get_word_embedding(word)
print(f'Embedding for "{word}":\n{embedding}')

# Evaluate with cosine similarity on hand-picked word pairs (a standard word similarity benchmark, e.g., SimLex-999, is not used here)
from scipy.spatial.distance import cosine

def word_similarity(word1, word2):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``1 - cosine_distance`` of the two embedding vectors returned by
        ``get_word_embedding``.
    """
    vec1, vec2 = get_word_embedding(word1), get_word_embedding(word2)
    # scipy's ``cosine`` is a distance; convert it back to a similarity.
    distance = cosine(vec1, vec2)
    return 1 - distance

# Example word similarities: print the cosine similarity of "economy" to two
# other words (presumably picked as a related and an unrelated word)
print(f'Similarity between "economy" and "market": {word_similarity("economy", "market")}')
print(f'Similarity between "economy" and "banana": {word_similarity("economy", "banana")}')

Embedding for "economy":
[ 0.5192357  -0.12822421  0.01935717  0.08980225 -0.30135232  0.12298822
  0.27826655 -0.07729244 -0.1602444   0.12366064 -0.09298617  0.27902424
  0.13965435 -0.03480436 -0.35620016 -0.108164    0.11556055 -0.20788234
  0.11945654 -0.3874983  -0.3050467  -0.09299957  0.28377932 -0.11577014
  0.17916234  0.05045917 -0.3179455  -0.21785542  0.28134823  0.30142704
  0.24886775  0.17752081 -0.00576534 -0.13229336  0.09931571  0.00523115
  0.0993351  -0.04925102  0.13245218 -0.0794709   0.24092875 -0.28034765
 -0.20903786 -0.08777105 -0.04629131  0.14778823 -0.11245999  0.02697894
 -0.07635371 -0.01300771  0.00153163  0.0284095   0.28540838  0.43226308
 -0.03276398 -0.00552599  0.00658814 -0.22874175  0.10414997  0.42934507
 -0.06483742 -0.03806027  0.03490882  0.01837328  0.27242014  0.05900625
 -0.09647197 -0.04939486  0.04505193  0.10131337  0.13005303 -0.28051662
 -0.15064062 -0.23593275 -0.07970688 -0.14525747 -0.31812096 -0.12576556
 -0.38688025 -0.08109877  

### Limitations
The Reuters dataset is well suited for the task. However, because of our limited computational resources, it was not possible for us to try different hyperparameters (window size, embedding dimension, number of epochs). For the same reason, we have not implemented Phrase Learning or further analogical reasoning tasks.