# Prakhar Jain
# 2022121008

Trained Models : https://drive.google.com/drive/folders/1E1pYtju_sBlSnodKjQa1b_uAq80xNbYN?usp=sharing

In [1]:
# import nltk
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
import unicodedata
import re
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import KeyedVectors
# import nltk
from tqdm.auto import tqdm

import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# from preprocess import TextProcessor, Vocabulary, TextDatasetLSTM

[nltk_data] Downloading package punkt to /home/prakhar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/prakhar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:


# Download necessary NLTK data files
# nltk.download('punkt')


class TextProcessor:
    def __init__(self, file_path) -> None:
        self.file_path = file_path
# you can also try stemming and lemmatizers to improve performance
# https://towardsdatascience.com/text-preprocessing-with-nltk-9de5de891658

    def clean_text(self, text):
        text = unicodedata.normalize("NFD", text)
        text = text.lower()
        text = re.sub(r"[^0-9a-zA-Z?.,!:;]+", r" ", text)
        text = re.sub(r"(.)\1{3,}", r"\1", text)
        text = text.strip()
        return text

    def preprocess_text(self):
        with open(self.file_path, 'r') as f:
            corpus = f.read()
        sentences = sent_tokenize(corpus)
        sentences = [self.clean_text(sent) for sent in sentences if not sent.lower(
        ).startswith('chapter') and not sent[0].isdigit()]
        return sentences

    # def generate_ngrams(self, sentences, n):
    #     ngrams = []
    #     for sentence in sentences:
    #         tokens = word_tokenize(sentence)
    #         tokens = ['<S>'] + tokens + ['</S>']
    #         # <s> helps in identifying the start of a sentence, allowing the model to learn the patterns and probabilities associated with words that commonly appear at the beginning of sentences
    #         # <\s> helps the model understand when the a sentence is complete and learn word sentences that commonly end sentences.
    #         sentence_ngrams = zip(*[tokens[i:] for i in range(n)])
    #         ngrams.extend([' '.join(ngram) for ngram in sentence_ngrams])
    #     return ngrams

    def get_max_len(self, sentences):
        return max([len(word_tokenize(sent)) for sent in sentences])

    # def get_max_len(self, sentences):
    #     max_len = 0
    #     max_len_sentence = ""
    #     for sent in sentences:
    #         tokenized_sent = word_tokenize(sent)
    #         if len(tokenized_sent) > max_len:
    #             max_len = len(tokenized_sent)
    #             max_len_sentence = sent
    #     return max_len, max_len_sentence

    def generate_lstm_sentences(self, sentences):
        padded_sentences = []
        # get the maximum length of the sentence
        for sentence in sentences:
            tokens = word_tokenize(sentence)
            padded_sentence = tokens
            padded_sentence = ['<S>'] + tokens + ['</S>']
            padded_sentence = ' '.join(padded_sentence)
            padded_sentences.append(padded_sentence)
        return padded_sentences

    # def generate_padded_sentences(self, sentences, max_len):
    #     padded_sentences = []
    #     # get the maximum length of the sentence
    #     for sentence in sentences:
    #         tokens = word_tokenize(sentence)
    #         padded_sentence = tokens
    #         padded_sentence = ['<S>'] + tokens + ['</S>']
    #         padded_sentence += ['<PAD>'] * (max_len - len(tokens))
    #         padded_sentence = ' '.join(padded_sentence)
    #         padded_sentences.append(padded_sentence)
    #     return padded_sentences


class Vocabulary:
    def __init__(self, glove_path):
        self.glove_model = KeyedVectors.load_word2vec_format(
            glove_path, binary=False, no_header=True)
        self.word2idx = {}
        self.idx2word = []
        self.special_tokens = ['<PAD>', '<UNK>', '<S>', '</S>']
        self._add_special_tokens()

    def _add_special_tokens(self):
        for token in self.special_tokens:
            self.add_word(token)

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)

    def build_vocab(self, sentences):
        for sentence in sentences:
            tokens = word_tokenize(sentence)
            for token in tokens:
                self.add_word(token)

    def get_glove_embeddings(self):
        embedding_dim = self.glove_model.vector_size
        embeddings = np.zeros((len(self.idx2word), embedding_dim))
        # embeddings in the shape of (vocab_size, embedding_dim)
        for idx, word in enumerate(self.idx2word):
            if word in self.glove_model:
                embeddings[idx] = self.glove_model[word]
            else:
                embeddings[idx] = np.random.normal(
                    scale=0.6, size=(embedding_dim,))
        return torch.tensor(embeddings, dtype=torch.float32)

    def index2word(self, idx):
        return self.idx2word[idx]

    def word2index(self, word):
        return self.word2idx.get(word, self.word2idx['<UNK>'])

    def __len__(self):
        return len(self.word2idx)


class TextDatasetLSTM(Dataset):
    def __init__(self, sentences, vocab):
        self.sentences = sentences
        self.vocab = vocab

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx].split()
        input_idxs = [self.vocab.word2index(word) for word in sentence[:-1]]  # All words except the last one
        target_idxs = [self.vocab.word2index(word) for word in sentence[1:]]  # All words except the first one
        return torch.tensor(input_idxs, dtype=torch.long), torch.tensor(target_idxs, dtype=torch.long)



def collate_fn(batch, padding_value):
    # Sort batch by sequence length in descending order
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    
    inputs, targets = zip(*batch)
    lengths = [len(seq) for seq in inputs]  # Extract original sequence lengths
    
    # Pad sequences to the max length in the batch
    padded_inputs = pad_sequence(inputs, batch_first=True, padding_value=padding_value)
    padded_targets = pad_sequence(targets, batch_first=True, padding_value=padding_value)
    
    return padded_inputs, padded_targets, lengths

In [3]:
file_path = "Auguste_Maquet.txt"
glove_path = "glove.6B.300d.txt"
# can use fasttext as well just by specifying the path

text_processor = TextProcessor(file_path)
sentences = text_processor.preprocess_text()

max_len = text_processor.get_max_len(sentences)

# print(max_len)

random.seed(42)
random.shuffle(sentences)

val_len = int(len(sentences) * 0.1)
test_len = int(len(sentences) * 0.2)

train_sentences = sentences[val_len + test_len:]
val_sentences = sentences[:val_len]
test_sentences = sentences[val_len:val_len + test_len]

vocabulary = Vocabulary(glove_path)
vocabulary.build_vocab(train_sentences)
embeddings = vocabulary.get_glove_embeddings()

train_sentences = text_processor.generate_lstm_sentences(train_sentences)
val_sentences = text_processor.generate_lstm_sentences(val_sentences)
test_sentences = text_processor.generate_lstm_sentences(test_sentences)

train_dataset = TextDatasetLSTM(train_sentences, vocabulary)
val_dataset = TextDatasetLSTM(val_sentences, vocabulary)
test_dataset = TextDatasetLSTM(test_sentences, vocabulary)

padding_value = vocabulary.word2index('<PAD>')



In [4]:
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, padding_value))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=lambda batch: collate_fn(batch, padding_value))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         collate_fn=lambda batch: collate_fn(batch, padding_value))

In [5]:
# for batch in train_loader:
#     # print(batch)
#     print(batch.shape)
#     print(batch[1].shape)

In [6]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, output_dim, padding_value, embeddings):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding.from_pretrained(
            embeddings, padding_idx=padding_value, freeze=True)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        # Embedding lookup
        embedded = self.embedding(x)

        packed_embedded = pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)

        packed_output, _ = self.lstm(packed_embedded)

        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        output = self.fc(output)

        return output

In [11]:
def train_one_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch_input, batch_target, lengths in train_loader:
        optimizer.zero_grad()
        batch_input, batch_target = batch_input.to(device), batch_target.to(device)
        
        # Forward pass
        output = model(batch_input, lengths)
        
        # Compute loss
        loss = criterion(output.view(-1, output.size(-1)), batch_target.view(-1))
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity


def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_input, batch_target, lengths in val_loader:
            batch_input, batch_target = batch_input.to(device), batch_target.to(device)
            
            # Forward pass
            output = model(batch_input, lengths)
            
            # Compute loss
            loss = criterion(output.view(-1, output.size(-1)), batch_target.view(-1))
            
            total_loss += loss.item()
    
    avg_loss = total_loss / len(val_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity


def test(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch_input, batch_target, lengths in test_loader:
            batch_input, batch_target = batch_input.to(device), batch_target.to(device)
            
            # Forward pass
            output = model(batch_input, lengths)
            
            # Compute loss
            loss = criterion(output.view(-1, output.size(-1)), batch_target.view(-1))
            
            total_loss += loss.item()
    
    avg_loss = total_loss / len(test_loader)
    perplexity = calculate_perplexity(avg_loss)
    return avg_loss, perplexity


def calculate_perplexity(loss):
    return torch.exp(torch.tensor(loss))

In [8]:
vocab_size = embeddings.shape[0]
embedding_dim = embeddings.shape[1]
hidden_dim = 512
num_layers = 2
output_dim = embeddings.shape[0]
padding_value = vocabulary.word2index('<PAD>')

model = LSTM(vocab_size=vocab_size, 
             embedding_dim=300, 
             hidden_dim=512, 
             num_layers=2, 
             output_dim=vocab_size,  # same as vocab_size
             padding_value=padding_value, 
             embeddings=embeddings).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=padding_value)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

In [9]:
%timeit

n_epochs = 10

for epoch in range(n_epochs):
    train_loss, train_perplexity = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_perplexity = evaluate(model, val_loader, criterion, device)
    
    print(f"Epoch: {epoch+1}, Train Loss: {train_loss:.4f}, Train Perplexity: {train_perplexity:.2f}, Val Loss: {val_loss:.4f}, Val Perplexity: {val_perplexity:.2f}")

test_loss, test_perplexity = test(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}, Test Perplexity: {test_perplexity:.2f}")

Epoch: 1, Train Loss: 5.8777, Train Perplexity: 356.99, Val Loss: 5.2042, Val Perplexity: 182.03
Epoch: 2, Train Loss: 4.9595, Train Perplexity: 142.53, Val Loss: 4.8455, Val Perplexity: 127.17
Epoch: 3, Train Loss: 4.6837, Train Perplexity: 108.17, Val Loss: 4.6918, Val Perplexity: 109.04
Epoch: 4, Train Loss: 4.5109, Train Perplexity: 91.00, Val Loss: 4.5819, Val Perplexity: 97.70
Epoch: 5, Train Loss: 4.3814, Train Perplexity: 79.95, Val Loss: 4.5218, Val Perplexity: 92.00
Epoch: 6, Train Loss: 4.2769, Train Perplexity: 72.02, Val Loss: 4.4757, Val Perplexity: 87.86
Epoch: 7, Train Loss: 4.1904, Train Perplexity: 66.05, Val Loss: 4.4441, Val Perplexity: 85.12
Epoch: 8, Train Loss: 4.1122, Train Perplexity: 61.08, Val Loss: 4.4060, Val Perplexity: 81.94
Epoch: 9, Train Loss: 4.0459, Train Perplexity: 57.16, Val Loss: 4.3802, Val Perplexity: 79.86
Epoch: 10, Train Loss: 3.9886, Train Perplexity: 53.98, Val Loss: 4.3656, Val Perplexity: 78.70
Test Loss: 4.3604, Test Perplexity: 78.29


In [10]:
torch.save(model.state_dict(), 'lstm_model.pth')

In [9]:
vocab_size = embeddings.shape[0]
padding_value = vocabulary.word2index('<PAD>')

model = LSTM(vocab_size=vocab_size, 
             embedding_dim=300, 
             hidden_dim=512, 
             num_layers=2, 
             output_dim=vocab_size,  # same as vocab_size
             padding_value=padding_value, 
             embeddings=embeddings).to(device)

model.load_state_dict(torch.load('lstm_model.pth'))

criterion = nn.CrossEntropyLoss(ignore_index=padding_value)

In [12]:
batch_size = 1

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=lambda batch: collate_fn(batch, padding_value))
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        collate_fn=lambda batch: collate_fn(batch, padding_value))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         collate_fn=lambda batch: collate_fn(batch, padding_value))

# write perplexity for each sentence in train_loader, val_loader and test_loader in a file LM-2-training.txt, LM-2-validation.txt and LM-2-testing.txt respectively
# write the perplexity of each sentence in a new line
def write_perplexity(model, data_loader, criterion, device, file_path):
    model.eval()
    with open(file_path, 'w') as f:
        for batch_input, batch_target, lengths in data_loader:
            batch_input, batch_target = batch_input.to(device), batch_target.to(device)
            

            output = model(batch_input, lengths)
            
            
            loss = criterion(output.view(-1, output.size(-1)), batch_target.view(-1))
            perplexity = calculate_perplexity(loss.item())

            #  convert batch_input to sentence
            sentence = ' '.join([vocabulary.index2word(idx.item()) for idx in batch_input[0]])
            f.write(f"{sentence}: {perplexity:.2f}\n")
    
write_perplexity(model, train_loader, criterion, device, 'LM-2-training.txt')
write_perplexity(model, val_loader, criterion, device, 'LM-2-validation.txt')
write_perplexity(model, test_loader, criterion, device, 'LM-2-testing.txt')
