In [54]:
# importing libs 
import torch as torch
import torch.nn.functional as F
import matplotlib.pyplot as plt 
%matplotlib inline

import time
import random

from collections import Counter

# Creating dataset 
Here we will create the X and Y training sets using the dataset the user enters

In [45]:
def create_vocab(words):
    # we are no longer going to have just 27 indices. our vocabulary size is a lot bigger since we are now working with word-level model not character level model .so each word is assigned an index 

    # # here i am getting the count for every word in the dataset
    word_counts = Counter(words)
    # vocab = [word for word,_ in Counter(words).most_common(vocab_size-1)]
    # this is done to replace any word which is in the dataste but not within the top vocab_size.
    pad_token = "."
    unk_token = "<UNK>" 
    # this token replaces words that have a freq of <= 3
    rare_token = "<RARE>"
    # adding words only once
    vocab=set()
    rare_words_exist = False
    # logic to add rare tokens. the following for loop will create a vocabulary for the model to use
    for word, count in word_counts.items():
        if count > 3:
            # vocab.append(word)
            vocab.add(word)
        else:
            # NOTE: we replace all words with <=3 freq with one token
            rare_words_exist = True


    vocab = [pad_token, unk_token] + list(vocab)

    # remeber, the first three items in the vocab are : . , <UNK>, <RARE>
    if rare_words_exist:
        vocab.insert(2, rare_token)
    
    return vocab, word_counts

def create_stoi_itos(vocab):
    # mapping words to integer/count
    stoi = {s: i + 1 for i, s in enumerate(vocab)}
    # invert the dictionary
    itos = {i: s for s, i in stoi.items()}
    return stoi, itos

def build_dataset(words, context_length, is_train=False, only_dict = False):
    """
    words: dataset that contains a continuous stream of words (so basically like text from books for example)
    context_length: how many words do you need to predict next one. we will be keeping this at 2
    """

    vocab, word_counts = create_vocab(words)
    stoi, itos = create_stoi_itos(vocab)
    X, Y =[], []

    # creating a context:target pair. this is basically input, output pairs. for example, if we are using a context of 2 and the next three words being iterated over are 'the quick fox' , X will be 'the quick' and Y (predicted) will be fox
    context = [0]*context_length
    for word in words:
        # take the index of the word from stoi
        # ix=stoi.get(word, stoi['<UNK>'])

        if word_counts[word] > 3:
             ix = stoi.get(word)
        elif is_train and word_counts[word] <= 3:
             ix = stoi.get("<RARE>")
        else:
            ix = stoi["<UNK>"]

        # can use the below print for debugging purposes
        # print(f"word: {word}, its currrent context: {itos[context[0]]} {itos[context[1]]}")

        # create a list of words. so the current/target word and the words before it 
        # store the current context
        X.append(context)
        Y.append(ix)
        
        context = context[1:] + [ix]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)

    return X, Y

# Splitting train test validation

In [46]:
def train_test_split(train_ratio, words, context_length):
    """
    train: percentage of dataset to be used for training the neural network
    words: dataset that contains a list of words

    returns training, testing and validation sets for the entered dataset 
    """

    test_split = 1 - ((1 - train_ratio)/2)
    random.seed(42)

    n1 = int(train_ratio*len(words))
    n2 = int(test_split*len(words))
    # print(words[:4])

    # NOTE TO SELF: from the initial char-level model, i improved/changed by using one set of words for creating the vocab for all three splits. initially i was using different word sets for each and the inconsistency here was causing major issues. its a good point to take note of for later on.
    
    train_words = words[:n1]
    dev_words = words[n1:n1+n2]
    test_words = words[n1+n2:]

    vocab, word_counts = create_vocab(train_words)
    stoi, itos = create_stoi_itos(vocab)

    data = build_dataset(words, context_length)

    Xtr, Ytr = build_dataset(train_words, context_length, is_train=True)
    Xdev, Ydev = build_dataset(dev_words, context_length, is_train=True)
    Xte, Yte = build_dataset(test_words, context_length, is_train=True)

    return [Xtr, Ytr, Xdev, Ydev, Xte, Yte, stoi, itos]

In [47]:
brown_words = open("data/brown.txt", "r").read().split(" ")
sets = train_test_split(0.8, brown_words, 2)

# Creating the neural network parameters

Ok before creating it i will briefly go through the model architecture that we are implementing from the paper. So the main model will consist of the input layer -> hidden layer -> output layer. 

however, (note this is optional as mentioned in the paper) the paper does go through another path. 

Basically, while the non-linear layer learns the complex relationship between words we can have another <b>linear</b> layer (which i refer to as a direct connection) that learns only the simple relationships. It connects the input layer's embeddings directly to the output layer. 

From the paper:

<img src='images/direct_layer.png' >

What this does is that it ensures that the model learns/captures both relationship types in order to perform well on new examples. the diagram below gives a better idea of the model design.

<img src='images/model_design.png' >

In [48]:
import math
def create_params(context_length, dims, hidden_layer_neurons, number_of_words):
    """
    context_length: how many words do you need to predict next one

    This function initializes paramters for a neural network with one hidden layer and the number of neurons within it are set by the user. 
    """
    # building lookup table C
    # we will initalize the lookup table with random numbers
    # generator for reproducibility of results
    g = torch.Generator().manual_seed(1234567)

    # here we squeeze the words within the specified dimensions
    C = torch.rand((number_of_words, dims), generator=g)

    # initialize hidden layer
    # this will be 100 neurons  

    # NOTE: the .uniform_ is a slight moving away from the paper and can be removed to maintain 100% alignment with the paper
    W1 = torch.randn((context_length*dims, hidden_layer_neurons), generator=g).uniform_(-0.1, 0.1)
    b1 = torch.randn(hidden_layer_neurons, generator=g)

    # this was something i didnt realise earlier, after discussing with claude as well this is a layer that is a direct conenction between the feature vector and the final layer. this is a linear layer which is used to map connections between the most common words. 
    W = torch.randn((context_length*dims, number_of_words), generator=g).uniform_(-0.1, 0.1)

    # creating output/final layer
    W2 = torch.randn((hidden_layer_neurons, number_of_words), generator=g).uniform_(-0.1, 0.1)
    b2 = torch.randn(number_of_words, generator=g)
    parameters = [C, W1, b1, W2, b2, W]

    return parameters

In [49]:
def train_model(train_test_sets, parameters, context_length, dims, vocab, epochs):
    C, W1, b1, W2, b2, W = parameters
    Xtr, Ytr, Xdev, Ydev, Xte, Yte, stoi, itos = train_test_sets

    # set grad calc to true for backprop
    for p in parameters:
        p.requires_grad = True
    # training the neural network
    lri = []
    lossi = []

    # Xtr, Ytr, Xdev, Ydev, Xte, Yte = train_test_sets

    # print("After Mini_batch \n")
    for epoch in range(epochs):

        # mini-batch construct. 
        batch = 32
        ix = torch.randint(0, Xtr.shape[0], (batch, ))
        start_time = time.time()
        # --------- forward pass --------- #
        # embed all the inputs at once 
        # remember that when we do C[X], it treats each elemnt in X as an index to select a row in C

        # print(f"Xtr shape for embedding is: {Xtr.shape}")

        embed = C[Xtr[ix]]
        # embed.view squashes 32, 3, 2 into 32, 6(3*2). using -1 avoids hardcoding the length and instead pytorch infers the length

        # hidden layer calc
        # h = torch.tanh(embed.view(-1, context_length*dims) @ W1 + b1)

        # # direct connection/linear layer calculation
        embed_flat = embed.view(batch, -1)

        # now combine the non-linear layer and direct connection calc. u can consider it a second hidden layer. this, as mentioned in the paper, is optional. 
        # why ? non-linear layer learns the complex relationships and the linear layer calc learns the simple relationships
        h = torch.tanh(embed_flat @ W1 + b1)
        
        # output layer calc
        # logits = h @ W2 + b2
        logits = (h @ W2) + (embed_flat @ W) + b2

        # loss calc
        loss = F.cross_entropy(logits, Ytr[ix])

        if epoch%10==0:
            print(f'Epoch: {epoch} | Loss: {loss.item()}')
        # print(loss.item())

        # ---------- backward pass ------- #

        # set gradient for each param to 0 
        for p in parameters:
            p.grad = None
        loss.backward()

        # update parameter. learning rate
        lr = 0.01
        for p in parameters:
            p.data += -lr * p.grad
        
        # update stats 
        lri.append(lr)
        lossi.append(loss.item())
        
        end_time = time.time()
        # print(f"Iteration {epoch+1} took {end_time - start_time} seconds\n")
    
    return parameters, stoi, itos, Xdev, Ydev

In [50]:
def test_model(Xdev, Ydev, parameters):
    C, W1, b1, W2, b2, W = parameters
    # NOTE: MY initial code was computing gradients as well as extrmely slow. claude helped me add torch.no_grad() and batch processing for the validation set
    total_loss = 0
    num_batches = 0
    batch_size = 32 

    # stop recalculation of gradients for the params
    with torch.no_grad():  
        for i in range(0, len(Xdev), batch_size):
            batch_X = Xdev[i:i+batch_size]
            batch_Y = Ydev[i:i+batch_size]

            # forward pass same as for training
            embedData = C[batch_X]
            embed_flat = embedData.view(embedData.size(0), -1)

            actual_size = embedData.size(0)

            h = torch.tanh(embed_flat @ W1 + b1)
            logits = (h @ W2) + (embed_flat @ W) + b2
            # print(logits)
            loss = F.cross_entropy(logits, batch_Y)
            
            total_loss += loss.item()
            num_batches += 1

    average_loss = total_loss / num_batches
    print("\nAverage Validation Loss:", average_loss)

    # Optional: Calculate perplexity
    perplexity = torch.exp(torch.tensor(average_loss))
    print("Validation Perplexity:", perplexity.item())

    return average_loss, perplexity


In [72]:
# 1. Data Preparation
brown_words = open("data/brown.txt", "r").read().split(" ")
context_length = 2  # As used in the paper
dims = 120  # Word feature vector dimensionality, as per paper

# Split data and create vocabulary
train_test_sets = train_test_split(0.8, brown_words, context_length)
Xtr, Ytr, Xdev, Ydev, Xte, Yte, stoi, itos = train_test_sets
vocab_size = len(stoi)

# 2. Model Initialization
hidden_layer_neurons = 200  # As per paper
parameters = create_params(context_length, dims, hidden_layer_neurons, vocab_size)

# 3. Training
epochs = 200  # Paper mentions 10-20 epochs
updated_params, trained_stoi, trained_itos, Xdev, Ydev = train_model(train_test_sets, parameters, context_length, dims, (stoi, itos), epochs)

# 4. Final Evaluation (if not done in train_model)
C, W1, b1, W2, b2, W = updated_params
test_loss, test_perplexity = test_model(Xdev, Ydev, updated_params)
print(f"Final Test Perplexity: {test_perplexity:.2f}")

# 5. Save the model (optional)
torch.save({
    'model_params': updated_params,
    'stoi': trained_stoi,
    'itos': trained_itos
}, 'neural_lm_model.pth')

Epoch: 0 | Loss: 10.196393013000488
Epoch: 10 | Loss: 10.427043914794922
Epoch: 20 | Loss: 9.749485969543457
Epoch: 30 | Loss: 7.904714584350586
Epoch: 40 | Loss: 9.089534759521484
Epoch: 50 | Loss: 9.225329399108887
Epoch: 60 | Loss: 7.807784557342529
Epoch: 70 | Loss: 8.79438591003418
Epoch: 80 | Loss: 8.731880187988281
Epoch: 90 | Loss: 8.786593437194824
Epoch: 100 | Loss: 8.412059783935547
Epoch: 110 | Loss: 9.00832748413086
Epoch: 120 | Loss: 9.48662281036377
Epoch: 130 | Loss: 8.198122024536133
Epoch: 140 | Loss: 8.810020446777344
Epoch: 150 | Loss: 7.967146873474121
Epoch: 160 | Loss: 7.54957914352417
Epoch: 170 | Loss: 7.411753177642822
Epoch: 180 | Loss: 8.96130084991455
Epoch: 190 | Loss: 7.531877517700195

Average Validation Loss: 9.216295255039043
Validation Perplexity: 10059.7265625
Final Test Perplexity: 10059.73


In [73]:
# predicting the next word 
def predict_next_word(word1, word2, model_comps):
    parameters, stoi, itos = model_comps
    C, W1, b1, W2, b2, W = parameters

    # converting the words to indices
    ix1 = stoi.get(word1, stoi['<UNK>'])
    ix2 = stoi.get(word2, stoi['<UNK>'])

    input_indices = [ix1, ix2]
    
    # Convert to tensor
    input_tensor = torch.tensor(input_indices).unsqueeze(0) 
    
    embedData = C[input_tensor]
    embed_flat = embedData.view(embedData.size(0), -1)
    h = torch.tanh(embed_flat @ W1 + b1)
    logits = (h @ W2) + (embed_flat @ W) + b2

    preds = torch.nn.functional.softmax(logits)

    # Get the top 5 predictions
    top_preds, top_indices = torch.topk(preds, 5)
    
    print("Top 5 predictions:")
    for prob, idx in zip(top_preds[0], top_indices[0]):
        print(f"{itos[idx.item()]}: {prob.item():.4f}")

    # print(preds)


In [74]:
loaded_data = torch.load('neural_lm_model.pth')
model_params = loaded_data['model_params']
stoi= loaded_data['stoi']
itos=loaded_data['itos']

predict_next_word(word1="are", word2='you', model_comps=[model_params, stoi, itos])

Top 5 predictions:
<RARE>: 0.1099
the: 0.0502
of: 0.0328
and: 0.0223
a: 0.0091


  preds = torch.nn.functional.softmax(logits)
