# Prakhar Jain
# 2022121008

In [24]:
# import nltk
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt 
from tqdm.auto import tqdm

import random

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# All tensors and the model are placed on the CPU; the CUDA auto-detection
# variant above is commented out.
device = torch.device("cpu")

from preprocess import TextProcessor, Vocabulary, TextDataset, TextDataset


# 1. Implement the Language Model and report the Perplexity Scores. [40 marks]

In [25]:
# Locations of the raw corpus and of the pretrained word vectors.
file_path = "Auguste_Maquet.txt"
glove_path = "glove.6B.300d.txt"
# can use fasttext as well just by specifying the path

text_processor = TextProcessor(file_path)
sentences = text_processor.preprocess_text()

# Seed before shuffling so the train/val/test split is the same on every run.
random.seed(42)
random.shuffle(sentences)

val_len = 10000
test_len = 20000
held_out = val_len + test_len

# Layout of the shuffled list: [ validation | test | train ... ]
val_sentences = sentences[:val_len]
test_sentences = sentences[val_len:held_out]
train_sentences = sentences[held_out:]

# Each n-gram holds 5 context tokens followed by 1 target token.
train_ngrams, val_ngrams, test_ngrams = (
    text_processor.generate_ngrams(split, 5 + 1)
    for split in (train_sentences, val_sentences, test_sentences)
)

# The vocabulary is built from the training sentences only.
vocabulary = Vocabulary(glove_path)
vocabulary.build_vocab(train_sentences)
embeddings = vocabulary.get_glove_embeddings()

train_dataset, val_dataset, test_dataset = (
    TextDataset(ngrams, vocabulary)
    for ngrams in (train_ngrams, val_ngrams, test_ngrams)
)

train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=64, shuffle=False)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

In [26]:
## test functionality
print(train_ngrams[0])

# Inspect only the first batch: decode the first sample back to words and
# check that index -> word -> index round-trips to the same index.
for batch in train_loader:
    contexts = batch[0]
    for token in contexts[0]:
        idx = token.item()
        word = vocabulary.index2word(idx)
        print(word)
        print(f"{idx} {vocabulary.word2index(word)}")

    # print the target word
    targets = batch[1]
    target_idx = targets[0].item()
    target_word = vocabulary.index2word(target_idx)
    print(target_word)
    print(f"{target_idx} {vocabulary.word2index(target_word)}")
    break

<S> the wounds that i make
<S>
1 1
the
3 3
wounds
4 4
that
5 5
i
6 6
make
7 7


In [27]:
# Show the embedding matrix shape and its Python type, one per line.
print(embeddings.shape, type(embeddings), sep="\n")

torch.Size([16377, 300])
<class 'torch.Tensor'>


In [28]:
class NeuralLanguageModel(nn.Module):
    """Feed-forward n-gram language model on top of pretrained embeddings.

    The embeddings of the context tokens are concatenated and passed through
    one hidden layer (Linear -> ReLU -> Dropout) followed by a linear
    projection to vocabulary logits. No softmax is applied: the logits are
    meant to be fed to ``nn.CrossEntropyLoss``, which applies log-softmax
    itself.

    Args:
        vocab_size: Number of output classes (size of the vocabulary).
        embedding_dim: Dimensionality of a single token embedding.
        hidden_dim: Width of the hidden layer.
        embedding: 2-D float tensor of pretrained embedding weights, passed to
            ``nn.Embedding.from_pretrained`` (which freezes the weights by
            default).
        dropout: Dropout probability applied after the hidden activation.
        context_size: Number of context tokens per sample. Defaults to 5.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, embedding, dropout, context_size=5):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.context_size = context_size
        self.embedding = nn.Embedding.from_pretrained(embedding)
        self.dropout = dropout

        # The hidden width follows ``hidden_dim`` instead of a hard-coded 300,
        # so the constructor argument actually controls the architecture.
        self.model = nn.Sequential(
            nn.Linear(context_size * embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, vocab_size),
            # nn.LogSoftmax(dim=1) # CrossEntropyLoss already applies log softmax
        )

    def forward(self, x):
        """Return vocabulary logits for a batch of contexts.

        Args:
            x: Integer tensor of token indices, shape (batch_size, context_size).

        Returns:
            Tensor of shape (batch_size, vocab_size) with unnormalised logits.
        """
        embeds = self.embedding(x)
        # Concatenate the context embeddings of each sample into one vector.
        embeds = embeds.view(-1, self.context_size * self.embedding_dim)
        return self.model(embeds)

In [29]:
# Model dimensions are taken from the embedding matrix: (vocab_size, embedding_dim).
vocab_size, embedding_dim = embeddings.shape
model = NeuralLanguageModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=300,
    embedding=embeddings,
    dropout=0.5,
)

# The model outputs raw logits, which is what CrossEntropyLoss expects.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.to(device)

NeuralLanguageModel(
  (embedding): Embedding(16377, 300)
  (model): Sequential(
    (0): Linear(in_features=1500, out_features=300, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=300, out_features=16377, bias=True)
  )
)

In [30]:

def calculate_perplexity(loss):
    """Convert an average cross-entropy loss (in nats) into perplexity.

    Perplexity is ``exp(cross_entropy)``.

    Args:
        loss: Average cross-entropy, either a plain Python number or a
            single-element tensor (anything exposing ``.item()``).

    Returns:
        The perplexity as a Python float; ``inf`` when the exponential
        overflows a float.
    """
    import math

    # Callers pass a plain float, which has no ``.item()``; tensors are
    # unwrapped so both kinds of input work.
    value = loss.item() if hasattr(loss, "item") else float(loss)
    try:
        return math.exp(value)
    except OverflowError:
        return float("inf")

def train_one_epoch(model, train_loader, criterion, optimizer):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module mapping a batch of contexts to logits.
        train_loader: Iterable yielding ``(context, target)`` tensor pairs.
        criterion: Loss callable; assumed to return the mean loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer updating the model parameters.

    Returns:
        Tuple ``(avg_loss, perplexity)`` where ``avg_loss`` is the per-sample
        average loss over the epoch.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.train()
    total_loss = 0.0
    total_samples = 0
    for context, target in tqdm(train_loader):
        context, target = context.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # ``loss`` is a per-batch mean, so weight it by the batch size to
        # recover a sum over samples; the last batch may be smaller.
        batch_size = target.size(0)
        total_loss += loss.item() * batch_size
        total_samples += batch_size
    if total_samples == 0:
        raise ValueError("train_loader yielded no samples")
    avg_loss = total_loss / total_samples
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

def evaluate(model, val_loader, criterion):
    """Compute the average loss and perplexity of ``model`` on ``val_loader``.

    The model is put in eval mode and no gradients are tracked.

    Args:
        model: Module mapping a batch of contexts to logits.
        val_loader: Iterable yielding ``(context, target)`` tensor pairs.
        criterion: Loss callable; assumed to return the mean loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(avg_loss, perplexity)`` where ``avg_loss`` is the per-sample
        average loss over the loader.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.inference_mode():
        for context, target in tqdm(val_loader):
            context, target = context.to(device), target.to(device)
            output = model(context)
            loss = criterion(output, target)
            # ``loss`` is a per-batch mean, so weight it by the batch size to
            # recover a sum over samples; the last batch may be smaller.
            batch_size = target.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size
    if total_samples == 0:
        raise ValueError("val_loader yielded no samples")
    avg_loss = total_loss / total_samples
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

def test(model, test_loader, criterion):
    """Compute the average loss and perplexity of ``model`` on ``test_loader``.

    The model is put in eval mode and no gradients are tracked.

    Args:
        model: Module mapping a batch of contexts to logits.
        test_loader: Iterable yielding ``(context, target)`` tensor pairs.
        criterion: Loss callable; assumed to return the mean loss over the
            batch (the default reduction of ``nn.CrossEntropyLoss``).

    Returns:
        Tuple ``(avg_loss, perplexity)`` where ``avg_loss`` is the per-sample
        average loss over the loader.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_samples = 0
    with torch.inference_mode():
        for context, target in tqdm(test_loader):
            context, target = context.to(device), target.to(device)
            output = model(context)
            loss = criterion(output, target)
            # ``loss`` is a per-batch mean, so weight it by the batch size to
            # recover a sum over samples; the last batch may be smaller.
            batch_size = target.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size
    if total_samples == 0:
        raise ValueError("test_loader yielded no samples")
    avg_loss = total_loss / total_samples
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity

n_epochs = 10

# One training pass followed by one validation pass per epoch.
for epoch in range(n_epochs):
    train_loss, train_perplexity = train_one_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_perplexity = evaluate(model, val_loader, criterion)
    epoch_summary = f"Epoch {epoch + 1}/{n_epochs} Train Loss: {train_loss:.4f} Train Perplexity: {train_perplexity:.4f} Val Loss: {val_loss:.4f} Val Perplexity: {val_perplexity:.4f}"
    print(epoch_summary)

# Final held-out evaluation, run once after all epochs.
test_loss, test_perplexity = test(model, test_loader, criterion)
test_summary = f"Test Loss: {test_loss:.4f} Test Perplexity: {test_perplexity:.4f}"
print(test_summary)

  0%|          | 0/7109 [00:00<?, ?it/s]

KeyboardInterrupt: 

# 2. **Bonus** Plot graphs showing how the average train/test perplexities vary with hyperparameters such as the dropout rate, the dimensions of the layers, the optimizer, etc. Report the best hyperparameters found. [10 marks]

In [None]:
# Baseline hyperparameter configuration. The original dict had keys without
# values, which is a syntax error; the values below mirror the settings used
# for the model and optimizer earlier in this file (hidden_dim=300, Adam,
# dropout=0.5).
# TODO: replace each value with the candidates to sweep over for the bonus task.
HYPERPARAMS = {
    'hidden_dim': 300,
    'optimizer': 'adam',
    'dropout': 0.5,
}