# Prakhar Jain
# 2022121008

In [1]:
# import nltk
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt 
from tqdm.auto import tqdm

import random

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# device = torch.device("cpu")

from preprocess import TextProcessor, Vocabulary, TextDataset, TextDataset


# 1. Implement the Language Model and report the Perplexity Scores. [40 marks]

In [2]:
file_path = "Auguste_Maquet.txt"
glove_path = "glove.6B.300d.txt"
# can use fasttext as well just by specifying the path

text_processor = TextProcessor(file_path)
sentences = text_processor.preprocess_text()

random.seed(42)
random.shuffle(sentences)

val_len = int(len(sentences) * 0.1)
test_len = int(len(sentences) * 0.2)

train_sentences = sentences[val_len + test_len:]
val_sentences = sentences[:val_len]
test_sentences = sentences[val_len:val_len + test_len]

train_ngrams = text_processor.generate_ngrams(train_sentences, 5 + 1) # 5 for context and 1 for target
val_ngrams = text_processor.generate_ngrams(val_sentences, 5 +1)
test_ngrams = text_processor.generate_ngrams(test_sentences, 5 + 1)

vocabulary = Vocabulary(glove_path)
vocabulary.build_vocab(train_sentences)
embeddings = vocabulary.get_glove_embeddings()

train_dataset = TextDataset(train_ngrams, vocabulary)
val_dataset = TextDataset(val_ngrams, vocabulary)
test_dataset = TextDataset(test_ngrams, vocabulary)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [3]:
print(f"Sentences: {len(sentences)}")

print(f"Train: {len(train_sentences)}")
print(f"Val: {len(val_sentences)}")
print(f"Test: {len(test_sentences)}")

Sentences: 34840
Train: 24388
Val: 3484
Test: 6968


In [4]:
## test functionality
print(train_ngrams[0])

for batch in train_loader:
    for x in batch[0][0]:
        print(vocabulary.index2word(x.item()))
        print(str(x.item()) +" "+str(vocabulary.word2index(vocabulary.index2word(x.item()))))

    # print the target word
    print(vocabulary.index2word(batch[1][0].item()))
    print(str(batch[1][0].item()) +" "+str(vocabulary.word2index(vocabulary.index2word(batch[1][0].item()))))
    break

<S> copyright laws in most countries
<S>
1 1
copyright
3 3
laws
4 4
in
5 5
most
6 6
countries
7 7


In [5]:
print(embeddings.shape)
print(type(embeddings))

torch.Size([18843, 300])
<class 'torch.Tensor'>


In [9]:
class NeuralLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding, dropout):
        super(NeuralLanguageModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding.from_pretrained(embedding)
        self.dropout = dropout

        self.model = nn.Sequential(
            nn.Linear(5 * embedding_dim, 300),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(300, vocab_size)
            # nn.LogSoftmax(dim=1) # CrossEntropyLoss already applies log softmax
        )

    def forward(self, x):
        # x is a tensor of shape (batch_size, 5)    
        embeds = self.embedding(x)
        embeds = embeds.view(-1, 5 * self.embedding_dim) # flatten the tensor
        output = self.model(embeds)
        return output

https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html

Check out reduction parameter in torch

In [10]:
model = NeuralLanguageModel(vocab_size=embeddings.shape[0], embedding_dim=embeddings.shape[1], hidden_dim=300, embedding=embeddings, dropout=0.5).to(device)
criterion = nn.CrossEntropyLoss(reduction='sum')

optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [11]:

def calculate_perplexity(loss):
    # perplexity = exp(cross_entropy) this is a direct relationship between cross entropy and perplexity
    return torch.exp(torch.tensor(loss))

def train_one_epoch(model, train_loader, criterion, optimizer):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader):
        context, target = batch
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader.dataset)
    # avg_loss = total_loss / len(train_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

def evaluate(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.inference_mode():
        for batch in tqdm(val_loader):
            context, target = batch
            context, target = context.to(device), target.to(device)
            output = model(context)
            loss = criterion(output, target)
            total_loss += loss.item()
    avg_loss = total_loss / len(val_loader.dataset)
    # avg_loss = total_loss / len(val_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

def test(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.inference_mode():
        for batch in tqdm(test_loader):
            context, target = batch
            context, target = context.to(device), target.to(device)
            output = model(context)
            loss = criterion(output, target)
            total_loss += loss.item()
    avg_loss = total_loss / len(test_loader.dataset)
    # avg_loss = total_loss / len(test_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

n_epochs = 10

for epoch in range(n_epochs):
    train_loss, train_perplexity = train_one_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_perplexity = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch + 1}/{n_epochs} Train Loss: {train_loss:.4f} Train Perplexity: {train_perplexity:.4f} Val Loss: {val_loss:.4f} Val Perplexity: {val_perplexity:.4f}")


test_loss, test_perplexity = test(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f} Test Perplexity: {test_perplexity:.4f}")

  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 1/10 Train Loss: 5.9379 Train Perplexity: 379.1531 Val Loss: 5.4780 Val Perplexity: 239.3634


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 2/10 Train Loss: 5.5988 Train Perplexity: 270.0974 Val Loss: 5.3377 Val Perplexity: 208.0315


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 3/10 Train Loss: 5.5187 Train Perplexity: 249.3163 Val Loss: 5.3027 Val Perplexity: 200.8800


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 4/10 Train Loss: 5.4741 Train Perplexity: 238.4338 Val Loss: 5.2610 Val Perplexity: 192.6802


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 5/10 Train Loss: 5.4496 Train Perplexity: 232.6667 Val Loss: 5.2837 Val Perplexity: 197.1030


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 6/10 Train Loss: 5.4233 Train Perplexity: 226.6206 Val Loss: 5.2608 Val Perplexity: 192.6284


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 7/10 Train Loss: 5.4052 Train Perplexity: 222.5682 Val Loss: 5.2773 Val Perplexity: 195.8392


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 8/10 Train Loss: 5.3913 Train Perplexity: 219.4792 Val Loss: 5.2571 Val Perplexity: 191.9179


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 9/10 Train Loss: 5.3805 Train Perplexity: 217.1264 Val Loss: 5.2537 Val Perplexity: 191.2733


  0%|          | 0/11037 [00:00<?, ?it/s]

  0%|          | 0/1579 [00:00<?, ?it/s]

Epoch 10/10 Train Loss: 5.3761 Train Perplexity: 216.1799 Val Loss: 5.2571 Val Perplexity: 191.9220


  0%|          | 0/3155 [00:00<?, ?it/s]

Test Loss: 5.2591 Test Perplexity: 192.3162


In [12]:
# save the model
torch.save(model.state_dict(), "model.pth")

# 2. **Bonus** Plot graphs showing the variation of average train/test perplexities with varying hyperparameters  like Dropout rate, changing the dimensions of the layers, changing the Optimizer, etc. Report the most optimal Hyperparameters found. [10 marks]

In [10]:
HYPERPARAMS = {
    'hidden_dim': ,
    'optimizer': ,
    'dropout': ,
}

SyntaxError: expression expected after dictionary key and ':' (686894790.py, line 2)