# **Building and Training a Feedforward Neural Network for Language Modeling**


### Importing The Required Libraries

In [1]:
%%capture
import warnings
from tqdm import tqdm

warnings.simplefilter('ignore')
import time
from collections import OrderedDict

import re

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import time
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize




### Defining Preprocessing Functions 

In [2]:
def preprocess_string(s):
    """
    Preprocesses a given string by performing the following steps:
    
    1. Removes all non-word characters (excluding letters and numbers).
    2. Removes all whitespace characters.
    3. Removes all numeric digits.

    Parameters:
    s (str): The input string to be cleaned.

    Returns:
    str: The processed string with only alphabetic characters, no spaces, and no digits.
    """

    # Remove all non-word characters (everything except letters and numbers)
    # \w matches any word character (letters, numbers, and underscores)
    # \s matches any whitespace characters
    # ^ inside [] negates the selection, so [^\w\s] matches anything that's NOT a word character or whitespace.
    s = re.sub(r"[^\w\s]", '', s)

    # Remove all whitespace characters (spaces, tabs, newlines)
    # \s+ matches one or more whitespace characters.
    s = re.sub(r"\s+", '', s)

    # Remove all digits (0-9)
    # \d matches any digit character.
    s = re.sub(r"\d", '', s)

    return s

In [3]:
def process(word):
    tokens=word_tokenize(word)
    tokens=[preprocess_string(token) for token in tokens]
    tokens=[token.lower() for token in tokens]
    return tokens

In [4]:
def convert_to_indices(tokens):
    vocab = build_vocab_from_iterator([tokens],specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab



### Giving Input 

In [5]:
song= """We are no strangers to love
You know the rules and so do I
A full commitments what Im thinking of
You wouldnt get this from any other guy
I just wanna tell you how Im feeling
Gotta make you understand
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Weve known each other for so long
Your hearts been aching but youre too shy to say it
Inside we both know whats been going on
We know the game and were gonna play it
And if you ask me how Im feeling
Dont tell me youre too blind to see
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Weve known each other for so long
Your hearts been aching but youre too shy to say it
Inside we both know whats been going on
We know the game and were gonna play it
I just wanna tell you how Im feeling
Gotta make you understand
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you"""

### Preprocessing The Input song

In [6]:
tokens=process(song)

In [7]:
vocab=convert_to_indices(tokens)

In [8]:
for token in tokens[0:10]:
    print(vocab[token]) 

22
60
72
76
26
71
3
21
32
74


In [9]:
def get_embedding(vocab):
    embedding_dim=20
    vocab_size=len(vocab)
    return nn.Embedding(vocab_size,embedding_dim)

In [10]:
embeddings=get_embedding(vocab)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\rajve\miniconda3\envs\nlp_env\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\rajve\miniconda3\envs\nlp_env\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\rajve\miniconda3\envs\nlp_env\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\rajve\miniconda3\envs\nlp_env\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    a

In [11]:
index_to_token=vocab.get_itos()
for n in range(2): 
    embedding=embeddings(torch.tensor(n))
    print("word",index_to_token[n])
    print("index",n)
    print( "embedding", embedding)
    print("embedding shape", embedding.shape)

word <unk>
index 0
embedding tensor([ 1.8901,  0.3956, -0.3977, -0.3144,  0.0081,  0.0214, -1.0338, -2.5969,
        -0.1704,  0.2626, -1.0628, -1.3763,  0.9188, -1.6850,  0.7507,  0.2583,
         1.3888,  0.6482, -0.9211,  0.5353], grad_fn=<EmbeddingBackward0>)
embedding shape torch.Size([20])
word na
index 1
embedding tensor([ 0.7926, -2.0075, -1.0527, -0.2150,  0.0782,  0.1578,  0.2381, -0.3948,
         1.4990,  0.5304, -0.6798,  0.1766,  1.8218, -1.3973,  0.8606, -0.0515,
         1.1187, -0.9006, -0.5457,  0.2750], grad_fn=<EmbeddingBackward0>)
embedding shape torch.Size([20])


### Defining The Model

In [67]:
class NGramLanguageModel(nn.Module):
    def __init__(self,vocab_size,embed_dim=20):
        super(NGramLanguageModel,self).__init__()
        self.embed=nn.Embedding(vocab_size,embed_dim)
        self.linear1=nn.Linear(Context_Size*embed_dim,128)
        self.linear2=nn.Linear(128,64)
        self.out=nn.Linear(64,vocab_size)
    def forward(self, inp_indx):
        
        inp_embed = self.embed(torch.tensor(inp_indx))
        
        inp_embed=inp_embed.view(1,-1)
        
        outp = torch.relu(self.linear1(inp_embed))
        outp = torch.relu(self.linear2(outp))
        
        return self.out(outp)
        

In [68]:
def give_final_indx(out):
    return torch.argmax(out)

In [69]:
def give_final_string(idx):
    return index_to_token(idx)

### Structuring The Input before passing it to the Model

In [70]:
Context_Size=2
input_structured=[
                    (
                      [vocab[tokens[i-j-1]] for j in range(Context_Size)],
                       vocab[tokens[i]])
                      for i in range(Context_Size,len(vocab))
                    ]

In [71]:
input_structured[0]

([60, 22], 72)

In [72]:
from torch.utils.data import DataLoader
dataloader=DataLoader(input_structured,batch_size=1,shuffle=False)

In [73]:
for context,target in dataloader:
    print("Context:",context)
    print("Target:",target)

Context: [tensor([60]), tensor([22])]
Target: tensor([72])
Context: [tensor([72]), tensor([60])]
Target: tensor([76])
Context: [tensor([76]), tensor([72])]
Target: tensor([26])
Context: [tensor([26]), tensor([76])]
Target: tensor([71])
Context: [tensor([71]), tensor([26])]
Target: tensor([3])
Context: [tensor([3]), tensor([71])]
Target: tensor([21])
Context: [tensor([21]), tensor([3])]
Target: tensor([32])
Context: [tensor([32]), tensor([21])]
Target: tensor([74])
Context: [tensor([74]), tensor([32])]
Target: tensor([5])
Context: [tensor([5]), tensor([74])]
Target: tensor([31])
Context: [tensor([31]), tensor([5])]
Target: tensor([64])
Context: [tensor([64]), tensor([31])]
Target: tensor([29])
Context: [tensor([29]), tensor([64])]
Target: tensor([9])
Context: [tensor([9]), tensor([29])]
Target: tensor([67])
Context: [tensor([67]), tensor([9])]
Target: tensor([63])
Context: [tensor([63]), tensor([67])]
Target: tensor([79])
Context: [tensor([79]), tensor([63])]
Target: tensor([24])
Contex

In [74]:
loss=nn.CrossEntropyLoss()

### Training Model

In [75]:
model=NGramLanguageModel(vocab_size=len(vocab),embed_dim=20)

In [76]:
optimizer=optim.SGD(model.parameters())

In [81]:
from tqdm import tqdm
def train(dataloader,model,epoch=100,show=10):
    model.train()
    loss_history=[]
    for epoch in tqdm(range(epoch)):
        total_loss=0
        for context,target in dataloader:
            optimizer.zero_grad()
            predicted=model(context)
            Loss=loss(predicted,target.reshape(-1))
            total_loss+=Loss.item()
            Loss.backward()
            optimizer.step()
        loss_history.append(total_loss)
        
        if (epoch + 1) % show == 0:
            print(f"Epoch {epoch+1}: Loss {total_loss:.4f}")
    return loss_history

In [85]:
loss_history=train(dataloader,model)

 11%|████████▉                                                                        | 11/100 [00:02<00:19,  4.56it/s]

Epoch 10: Loss 187.2063


 20%|████████████████▏                                                                | 20/100 [00:04<00:25,  3.15it/s]

Epoch 20: Loss 175.2827


 30%|████████████████████████▎                                                        | 30/100 [00:08<00:23,  2.92it/s]

Epoch 30: Loss 162.9539


 41%|█████████████████████████████████▏                                               | 41/100 [00:10<00:09,  6.18it/s]

Epoch 40: Loss 150.2508


 50%|████████████████████████████████████████▌                                        | 50/100 [00:12<00:10,  4.90it/s]

Epoch 50: Loss 137.2541


 60%|████████████████████████████████████████████████▌                                | 60/100 [00:14<00:11,  3.61it/s]

Epoch 60: Loss 124.0623


 70%|████████████████████████████████████████████████████████▋                        | 70/100 [00:16<00:05,  5.24it/s]

Epoch 70: Loss 110.9622


 81%|█████████████████████████████████████████████████████████████████▌               | 81/100 [00:19<00:03,  5.28it/s]

Epoch 80: Loss 98.2201


 91%|█████████████████████████████████████████████████████████████████████████▋       | 91/100 [00:21<00:01,  6.09it/s]

Epoch 90: Loss 86.1545


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.42it/s]

Epoch 100: Loss 75.0556





In [86]:
len(vocab)

81

### Calculating Perplexity

In [93]:
np.exp(75)

np.float64(3.7332419967990015e+32)

### Testing Model 

In [94]:

def generate_song_lines(model, vocab, seed_tokens, max_length=20, temperature=1.0):
    model.eval()
    stoi = vocab.get_stoi()
    itos = vocab.get_itos()
    
    generated_indices = [stoi[token] for token in seed_tokens]
    
    with torch.no_grad():
        for _ in range(max_length):
            input_window = torch.tensor([generated_indices[-Context_Size:]])
            output = model(input_window)
            probs = torch.softmax(output / temperature, dim=1).squeeze()
            next_idx = torch.multinomial(probs, num_samples=1).item()
            generated_indices.append(next_idx)

    return " ".join([itos[idx] for idx in generated_indices])

In [95]:
seed = ["i", "love"] 

lyrics = generate_song_lines(model, vocab, seed, max_length=20)

print(lyrics)

i love what of wan of na you i got you im let gon up na a around a and full around


### Saving The Model

In [96]:
torch.save(model, "complete_ngram_model.pt")