In [1]:
%cd ..

d:\nlp\assignment1


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [8]:
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from Task1.task1 import WordPieceTokenizer
from Task2.task import Word2VecModel


# Neural LM Dataset class
class NeuralLMDataset(Dataset):
    """Fixed-window next-token dataset built from a pre-tokenized corpus.

    Each sample pairs the concatenated word2vec embeddings of ``context_size``
    consecutive tokens with the vocabulary index of the token that follows them.
    """

    # Path the corpus was originally hard-coded to; used when no corpus_path is given.
    DEFAULT_CORPUS_PATH = r'D:\nlp\assignment1\task2\tokenized_corpus.json'
    # Fallback vocabulary entry for tokens that are not in the tokenizer vocabulary.
    UNK_TOKEN = '[UNK]'

    def __init__(self, tokenizer, word2vec, context_size=3, corpus_path=None):
        """Build the dataset eagerly.

        Args:
            tokenizer: object exposing ``vocab`` (iterable of tokens) and ``tokenize(str)``.
            word2vec: object whose ``embeddings`` attribute maps a scalar LongTensor
                index to an embedding vector.
            context_size: number of preceding tokens used to predict the next one.
            corpus_path: JSON file mapping sentence ids to token lists. Defaults to
                ``DEFAULT_CORPUS_PATH`` so existing callers keep working.
        """
        self.tokenizer = tokenizer
        self.word2vec = word2vec
        self.context_size = context_size
        self.corpus_path = corpus_path if corpus_path is not None else self.DEFAULT_CORPUS_PATH
        self.data = self.preprocess_data()

    def tokenize_txt_file(self, input_file: str, output_file: str) -> None:
        """Tokenize every line of ``input_file`` and save the result as JSON.

        The output maps the 0-based line number (as a string) to that line's tokens.
        """
        results = {}
        with open(input_file, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                # Strip surrounding whitespace/newline before tokenizing.
                results[str(idx)] = self.tokenizer.tokenize(line.strip())
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

    def _token_index(self, token):
        """Return the vocabulary index of ``token``, falling back to ``[UNK]``.

        Raises:
            KeyError: if the token is unknown and the vocabulary has no ``[UNK]`` entry.
        """
        idx = self.word2idx.get(token)
        if idx is None:
            idx = self.word2idx[self.UNK_TOKEN]
        return idx

    def preprocess_data(self):
        """Load the tokenized corpus and build ``(context_embeddings, target_idx)`` pairs.

        Also populates ``tokenized_sentences``, ``word2idx`` and ``idx2word``.
        Sentences with ``context_size`` tokens or fewer contribute no samples.
        """
        # Same encoding that tokenize_txt_file uses when writing the file.
        with open(self.corpus_path, 'r', encoding='utf-8') as f:
            tokenized_corpus = json.load(f)

        # One token list per sentence, in file order.
        self.tokenized_sentences = list(tokenized_corpus.values())
        self.word2idx = {word: idx for idx, word in enumerate(self.tokenizer.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        # Each vocabulary entry is embedded once and shared by every window that
        # uses it, instead of re-running the embedding lookup per occurrence.
        embedding_cache = {}

        def embed(token_idx):
            if token_idx not in embedding_cache:
                with torch.no_grad():
                    vector = self.word2vec.embeddings(torch.tensor(token_idx, dtype=torch.long))
                embedding_cache[token_idx] = vector.detach().numpy()
            return embedding_cache[token_idx]

        data = []
        for sent in self.tokenized_sentences:
            # Context tokens use the same [UNK] fallback as targets, so an
            # out-of-vocabulary token no longer raises KeyError mid-corpus.
            indices = [self._token_index(token) for token in sent]
            for start in range(len(indices) - self.context_size):
                stop = start + self.context_size
                context = [embed(idx) for idx in indices[start:stop]]
                data.append((context, indices[stop]))
        return data

    def __len__(self):
        """Number of (context, target) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context_tensor, target_tensor)`` for sample ``idx``.

        ``context_tensor`` is the float32 concatenation of the context embeddings;
        ``target_tensor`` is the target's vocabulary index as a long scalar.
        """
        context, target = self.data[idx]
        context_tensor = torch.tensor(np.concatenate(context).flatten(), dtype=torch.float32)
        target_tensor = torch.tensor(target, dtype=torch.long)
        return context_tensor, target_tensor

# Define Neural Language Model Variations
class NeuralLM1(nn.Module):
    """Baseline feed-forward language model: Linear -> Tanh -> Linear.

    Maps a flattened context vector of size ``input_dim`` to ``vocab_size`` logits.
    """

    def __init__(self, input_dim, hidden_dim, vocab_size):
        super().__init__()
        # Layers are created in the same order they are applied.
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        activation = nn.Tanh()
        output_layer = nn.Linear(hidden_dim, vocab_size)
        self.architecture = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, x):
        """Return unnormalized vocabulary logits for the batch ``x``."""
        logits = self.architecture(x)
        return logits

class NeuralLM2(nn.Module):
    """Feed-forward language model with a linear skip connection.

    The logits are the sum of a ReLU MLP branch (fc1 -> relu -> fc2) and a direct
    linear projection of the input.
    """

    def __init__(self, input_dim, hidden_dim, vocab_size):
        super().__init__()
        # MLP branch
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        # Skip branch: projects the raw input straight to vocabulary size
        self.projection = nn.Linear(input_dim, vocab_size)

    def forward(self, x):
        """Return vocabulary logits: MLP branch plus projected input."""
        skip_logits = self.projection(x)
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden) + skip_logits

# class NeuralLM2(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_dim):
#         super(NeuralLM2, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.fc1 = nn.Linear(embedding_dim * 2, hidden_dim)
#         self.tanh = nn.Tanh()
#         self.fc2 = nn.Linear(hidden_dim, vocab_size)
    
#     def forward(self, x):
#         x = self.embedding(x).view(x.shape[0], -1)
#         x = self.tanh(self.fc1(x))
#         return self.fc2(x)

class NeuralLM3(nn.Module):
    """Regularized feed-forward language model.

    Pipeline: Linear -> BatchNorm1d -> ReLU -> Dropout(0.3) -> Linear.
    """

    def __init__(self, input_dim, hidden_dim, vocab_size):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.batch_norm = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for the batch ``x``."""
        normalized = self.batch_norm(self.fc1(x))
        activated = self.relu(normalized)
        regularized = self.dropout(activated)
        return self.fc2(regularized)

# Training function with loss tracking
def _run_epoch(model, dataloader, criterion, optimizer=None):
    """Run one pass over ``dataloader``; updates weights only when ``optimizer`` is given.

    Returns:
        (mean loss per batch, accuracy in percent)

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train mode with an optimizer, eval mode without
    running_loss, correct, total = 0.0, 0, 0
    num_batches = 0
    with torch.set_grad_enabled(is_training):
        for context, target in dataloader:
            if is_training:
                optimizer.zero_grad()
            output = model(context)
            loss = criterion(output, target)
            if is_training:
                loss.backward()
                optimizer.step()
            running_loss += loss.item()
            correct += (output.argmax(dim=1) == target).sum().item()
            total += target.size(0)
            num_batches += 1
    if num_batches == 0 or total == 0:
        raise ValueError("dataloader produced no samples; cannot compute loss/accuracy")
    return running_loss / num_batches, 100 * correct / total


# Training function with loss tracking
def train(model, train_dataloader, val_dataloader, epochs=100, lr=0.01):
    """Train ``model`` on CPU with Adam + cross-entropy and track per-epoch metrics.

    Args:
        model: module mapping a context batch to vocabulary logits.
        train_dataloader: iterable of ``(context, target)`` batches used for updates.
        val_dataloader: iterable of ``(context, target)`` batches used for evaluation.
        epochs: number of full passes over ``train_dataloader``.
        lr: Adam learning rate.

    Returns:
        ``(train_losses, train_acc, val_losses, val_acc)``: one entry per epoch.
        Losses are means over batches and accuracies are percentages. In a
        notebook, assign the result (or end the call with ``;``) so the lists
        are not echoed as the cell output.

    Raises:
        ValueError: if either dataloader is empty.
    """
    model.to("cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_losses, val_losses = [], []
    train_acc, val_acc = [], []

    for epoch in range(epochs):
        # training phase
        epoch_loss, epoch_acc = _run_epoch(model, train_dataloader, criterion, optimizer)
        train_losses.append(epoch_loss)
        train_acc.append(epoch_acc)

        # validation phase (no optimizer -> eval mode, gradients disabled)
        epoch_loss, epoch_acc = _run_epoch(model, val_dataloader, criterion)
        val_losses.append(epoch_loss)
        val_acc.append(epoch_acc)

        # print metrics for each epoch
        print(f"----- Epoch {epoch + 1}/{epochs} -----")
        print(f"Train Loss: {train_losses[-1]:.4f}, Train Accuracy: {train_acc[-1]:.2f}%")
        print(f"Val Loss: {val_losses[-1]:.4f}, Val Accuracy: {val_acc[-1]:.2f}%")

    return train_losses, train_acc, val_losses, val_acc

# Plot loss function
def plot_losses(train_losses, val_losses):
    """Plot training and validation loss per epoch on the current figure and show it."""
    curves = (
        (train_losses, 'Training Loss'),
        (val_losses, 'Validation Loss'),
    )
    for series, label in curves:
        plt.plot(series, label=label)
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Loss vs Epochs')
    plt.legend()
    plt.show()

# Accuracy and Perplexity computation
def compute_accuracy(model, dataloader):
    """Return the fraction (0..1) of samples whose arg-max prediction equals the target.

    The model is put in eval mode for the measurement, so Dropout and BatchNorm
    do not distort the metric, and its previous mode is restored afterwards.
    Returns 0.0 when the dataloader yields no samples.
    """
    was_training = model.training
    model.eval()
    correct, total = 0, 0
    try:
        with torch.no_grad():
            for context, target in dataloader:
                predictions = torch.argmax(model(context), dim=1)
                correct += (predictions == target).sum().item()
                total += target.size(0)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)
    return correct / total if total else 0.0

def compute_perplexity(loss):
    """Return ``exp(loss)``, i.e. the perplexity for a natural-log cross-entropy loss."""
    perplexity = np.exp(loss)
    return perplexity

In [3]:
%cd task3

d:\nlp\assignment1\task3


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [4]:
def load_model(final_model_dir, model_class):
    """Rebuild a trained model from its ``final_model.pt`` checkpoint.

    Args:
        final_model_dir: directory expected to contain ``final_model.pt``. If the
            file is not found there, the original hard-coded Task2 location is
            used, so existing calls keep working.
        model_class: class constructed as ``model_class(vocab_size=..., embedding_dim=...)``.

    Returns:
        ``(model, val_loss, val_accuracy)`` as stored in the checkpoint.
    """
    legacy_path = r'D:\nlp\assignment1\Task2\final_model\final_model.pt'
    model_path = legacy_path
    if final_model_dir:
        candidate = os.path.join(final_model_dir, 'final_model.pt')
        if os.path.isfile(candidate):
            model_path = candidate

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    # map_location keeps a checkpoint saved on GPU loadable on a CPU-only machine.
    checkpoint = torch.load(model_path, map_location="cpu")

    model = model_class(
        vocab_size=checkpoint['vocab_size'],
        embedding_dim=checkpoint['embedding_dim'],
    )
    model.load_state_dict(checkpoint['model_state_dict'])

    return model, checkpoint['val_loss'], checkpoint['val_accuracy']

def load_vocabulary(final_model_dir):
    """Load the saved vocabulary mappings from ``vocabulary.json``.

    Args:
        final_model_dir: directory expected to contain ``vocabulary.json``. If the
            file is not found there, the original hard-coded Task2 location is used.

    Returns:
        ``(word2idx, idx2word)`` exactly as stored in the file.
        NOTE(review): JSON object keys are always strings, so ``idx2word`` is
        presumably keyed by ``"0"``, ``"1"``, ... rather than ints; verify at the call site.
    """
    legacy_path = r'D:\nlp\assignment1\Task2\final_model\vocabulary.json'
    vocab_path = legacy_path
    if final_model_dir:
        candidate = os.path.join(final_model_dir, 'vocabulary.json')
        if os.path.isfile(candidate):
            vocab_path = candidate

    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab_data = json.load(f)
    return vocab_data['word2idx'], vocab_data['idx2word']


# Example usage: load the trained word2vec model and the pickled tokenizer, then
# build the language-model dataset from them.
word2vec, val_loss, val_accuracy = load_model('final_model', Word2VecModel)
# word2idx, idx2word = load_vocabulary('final_model')
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer pickle you created yourself.
# The context manager closes the file handle that the bare open() call used to leak.
with open(r'D:\nlp\assignment1\Task1\token.pkl', 'rb') as token_file:
    tokenizer = pickle.load(token_file)
dataset = NeuralLMDataset(tokenizer, word2vec)

In [5]:
# NOTE(review): WINDOW_SIZE is not read by any cell shown in this notebook; the
# dataset above is built with NeuralLMDataset's default context_size instead.
WINDOW_SIZE = 2
BATCH_SIZE = 1024
NUM_EPOCHS = 10
LEARNING_RATE = 0.02
TRAIN_SPLIT = 0.8
SPLIT_SEED = 42  # fixes the train/validation partition so re-runs are comparable

# Split dataset into training and validation
train_size = int(TRAIN_SPLIT * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create data loaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [6]:
# Variant 1: tanh MLP. Positional arguments are (input_dim, hidden_dim, vocab_size).
# input_dim=30 is presumably context_size * embedding_dim; TODO confirm against the dataset.
model1 = NeuralLM1(30, 256, 8500)
train(
    model=model1,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
)

----- Epoch 1/10 -----
Train Loss: 5.1207, Train Accuracy: 30.21%
Val Loss: 4.7340, Val Accuracy: 32.51%
----- Epoch 2/10 -----
Train Loss: 4.3979, Train Accuracy: 32.86%
Val Loss: 4.7418, Val Accuracy: 34.37%
----- Epoch 3/10 -----
Train Loss: 4.2387, Train Accuracy: 33.28%
Val Loss: 4.7891, Val Accuracy: 34.07%
----- Epoch 4/10 -----
Train Loss: 4.1774, Train Accuracy: 33.43%
Val Loss: 4.8414, Val Accuracy: 33.60%
----- Epoch 5/10 -----
Train Loss: 4.1237, Train Accuracy: 33.95%
Val Loss: 4.9385, Val Accuracy: 34.76%
----- Epoch 6/10 -----
Train Loss: 4.0841, Train Accuracy: 34.39%
Val Loss: 4.9639, Val Accuracy: 34.66%
----- Epoch 7/10 -----
Train Loss: 4.0385, Train Accuracy: 34.56%
Val Loss: 5.0309, Val Accuracy: 34.84%
----- Epoch 8/10 -----
Train Loss: 4.0318, Train Accuracy: 34.71%
Val Loss: 5.0200, Val Accuracy: 34.56%
----- Epoch 9/10 -----
Train Loss: 4.0157, Train Accuracy: 34.82%
Val Loss: 5.0905, Val Accuracy: 32.84%
----- Epoch 10/10 -----
Train Loss: 3.9818, Train Accur

In [9]:
# Variant 2: ReLU MLP with a linear skip connection. Positional arguments are
# (input_dim, hidden_dim, vocab_size).
model2 = NeuralLM2(30, 256, 8500)
train(
    model=model2,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
)

----- Epoch 1/10 -----
Train Loss: 4.8666, Train Accuracy: 30.38%
Val Loss: 4.4216, Val Accuracy: 33.91%
----- Epoch 2/10 -----
Train Loss: 3.9814, Train Accuracy: 33.80%
Val Loss: 4.4708, Val Accuracy: 35.10%
----- Epoch 3/10 -----
Train Loss: 3.7612, Train Accuracy: 34.47%
Val Loss: 4.5645, Val Accuracy: 35.03%
----- Epoch 4/10 -----
Train Loss: 3.6314, Train Accuracy: 35.21%
Val Loss: 4.6657, Val Accuracy: 35.17%
----- Epoch 5/10 -----
Train Loss: 3.5436, Train Accuracy: 35.71%
Val Loss: 4.7359, Val Accuracy: 35.28%
----- Epoch 6/10 -----
Train Loss: 3.4774, Train Accuracy: 36.16%
Val Loss: 4.8305, Val Accuracy: 35.65%
----- Epoch 7/10 -----
Train Loss: 3.4195, Train Accuracy: 36.88%
Val Loss: 4.9105, Val Accuracy: 35.15%
----- Epoch 8/10 -----
Train Loss: 3.3773, Train Accuracy: 37.08%
Val Loss: 4.9773, Val Accuracy: 35.74%
----- Epoch 9/10 -----
Train Loss: 3.3376, Train Accuracy: 37.52%
Val Loss: 5.0440, Val Accuracy: 34.87%
----- Epoch 10/10 -----
Train Loss: 3.3066, Train Accur

In [10]:
# Variant 3: ReLU MLP with batch normalization and dropout. Positional arguments are
# (input_dim, hidden_dim, vocab_size).
model3 = NeuralLM3(30, 256, 8500)
train(
    model=model3,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    epochs=NUM_EPOCHS,
    lr=LEARNING_RATE,
)

----- Epoch 1/10 -----
Train Loss: 4.7709, Train Accuracy: 31.32%
Val Loss: 4.2545, Val Accuracy: 34.16%
----- Epoch 2/10 -----
Train Loss: 4.0447, Train Accuracy: 33.99%
Val Loss: 4.2109, Val Accuracy: 34.89%
----- Epoch 3/10 -----
Train Loss: 3.8412, Train Accuracy: 34.51%
Val Loss: 4.2294, Val Accuracy: 35.99%
----- Epoch 4/10 -----
Train Loss: 3.6988, Train Accuracy: 34.83%
Val Loss: 4.2468, Val Accuracy: 35.67%
----- Epoch 5/10 -----
Train Loss: 3.5924, Train Accuracy: 35.17%
Val Loss: 4.2858, Val Accuracy: 35.79%
----- Epoch 6/10 -----
Train Loss: 3.5182, Train Accuracy: 35.45%
Val Loss: 4.3241, Val Accuracy: 36.22%
----- Epoch 7/10 -----
Train Loss: 3.4511, Train Accuracy: 35.89%
Val Loss: 4.3781, Val Accuracy: 36.27%
----- Epoch 8/10 -----
Train Loss: 3.3985, Train Accuracy: 36.17%
Val Loss: 4.4050, Val Accuracy: 36.57%
----- Epoch 9/10 -----
Train Loss: 3.3631, Train Accuracy: 36.31%
Val Loss: 4.4284, Val Accuracy: 36.33%
----- Epoch 10/10 -----
Train Loss: 3.3254, Train Accur

In [None]:
# Number of entries in the tokenizer vocabulary; useful to compare with the
# vocab_size=8500 passed to the NeuralLM models above.
len(tokenizer.vocab)

In [None]:
sentence = "The weather is"
# `model` is never bound by the cells above (only model1/model2/model3 are), so this
# cell failed on a fresh kernel. model3 is used because it logged the lowest
# validation loss in the training runs above.
model = model3
model.eval()
tokens = tokenizer.tokenize(sentence)
# Tokens missing from the vocabulary fall back to [UNK], as the dataset does for targets.
token_indices = [
    dataset.word2idx[token] if token in dataset.word2idx else dataset.word2idx['[UNK]']
    for token in tokens
]
sentence_embeds = [
    dataset.word2vec.embeddings(torch.tensor(token_idx, dtype=torch.long)).detach().numpy()
    for token_idx in token_indices
]
# Use the same window length the dataset was built with instead of a literal 3.
context = sentence_embeds[-dataset.context_size:]
context = torch.tensor(np.concatenate(context).flatten(), dtype=torch.float32).unsqueeze(0)  # Add batch dim
print(context.shape)

In [None]:
def predict_tokens(sentence: str, num_tokens: int, context_size: int, dataset: "NeuralLMDataset", model: nn.Module):
    """Greedily predict the next ``num_tokens`` tokens after ``sentence``.

    Args:
        sentence: raw input text; tokenized with ``dataset.tokenizer``.
        num_tokens: how many tokens to generate.
        context_size: number of trailing token embeddings fed to the model at each step.
        dataset: provides ``tokenizer``, ``word2vec``, ``word2idx`` and ``idx2word``.
        model: maps a ``(1, context_size * embedding_dim)`` float tensor to vocabulary logits.

    Returns:
        List of predicted token strings, in generation order.

    Notes:
        * Tokens missing from ``dataset.word2idx`` are embedded as ``[UNK]``.
        * If the sentence has fewer than ``context_size`` tokens, the context is
          left-padded with zero vectors so the model input keeps its expected width.
    """
    model.eval()  # evaluation mode for predicting tokens

    def embed(token_idx):
        return dataset.word2vec.embeddings(torch.tensor(token_idx, dtype=torch.long)).detach().numpy()

    def index_of(token):
        if token in dataset.word2idx:
            return dataset.word2idx[token]
        return dataset.word2idx['[UNK]']

    tokens = dataset.tokenizer.tokenize(sentence)  # tokenize the input sentence
    sentence_embeds = [embed(index_of(token)) for token in tokens]

    missing = context_size - len(sentence_embeds)
    if missing > 0:
        # Index 0 is embedded only to learn the embedding shape/dtype for the zero pad.
        pad_vector = np.zeros_like(embed(0))
        sentence_embeds = [pad_vector] * missing + sentence_embeds

    predicted_tokens = []
    with torch.no_grad():
        for _ in range(num_tokens):
            context = sentence_embeds[-context_size:]
            # The batch dimension is required by layers such as BatchNorm1d, which reject 1-D input.
            context_tensor = torch.tensor(
                np.concatenate(context).flatten(), dtype=torch.float32
            ).unsqueeze(0)
            output = model(context_tensor)
            predicted_idx = torch.argmax(output, dim=1).item()
            predicted_tokens.append(dataset.idx2word[predicted_idx])
            # Feed the prediction back in as the newest context token.
            sentence_embeds.append(embed(predicted_idx))

    return predicted_tokens

In [None]:
def prediciton_pipeline():
    """Placeholder for an end-to-end prediction pipeline; not implemented yet (returns None)."""
    return None

In [None]:
# Greedy continuation of the prompt. Expects `model`, `dataset` and `sentence_embeds`
# to exist in the kernel namespace.
num_new_tokens = 3
# Work on a copy so re-running this cell always starts from the same prompt
# instead of extending the shared list a little further on every run.
running_embeds = list(sentence_embeds)
predicted_words = []
model.eval()
with torch.no_grad():
    for _ in range(num_new_tokens):
        context = running_embeds[-dataset.context_size:]
        # The batch dimension is required by layers such as BatchNorm1d, which reject 1-D input.
        context = torch.tensor(np.concatenate(context).flatten(), dtype=torch.float32).unsqueeze(0)
        output = model(context)  # Predict next word
        next_word_idx = torch.argmax(output, dim=1).item()  # Get most probable word
        next_word = dataset.idx2word.get(next_word_idx, "<UNK>")  # Convert index to word
        predicted_words.append(next_word)
        # Feed the prediction back in as the newest context token.
        running_embeds.append(
            dataset.word2vec.embeddings(torch.tensor(next_word_idx, dtype=torch.long)).detach().numpy()
        )


print(" ".join(predicted_words))

In [None]:
def predict_next_words(model, sentence, tokenizer, word2vec, vocab, num_words=3, context_size=3):
    """Greedily predict ``num_words`` words following ``sentence``.

    Args:
        model: maps a flat ``(1, context_size * embedding_dim)`` float tensor to
            vocabulary logits (the input layout the NeuralLM models in this notebook use).
        sentence: raw input text.
        tokenizer: object with ``tokenize(str) -> list[str]``.
        word2vec: mapping-like object supporting ``word in word2vec`` and
            ``word2vec[word]`` (an embedding vector).
        vocab: sequence or mapping converting a predicted index to its word.
        num_words: number of words to generate.
        context_size: how many trailing known tokens form the model input.

    Returns:
        The predicted words joined by single spaces.

    Raises:
        ValueError: if no token of ``sentence`` has an embedding.

    Notes:
        Tokens without an embedding are skipped when building the context. When a
        predicted word has no embedding the context is left unchanged, so the
        same prediction repeats. With fewer than ``context_size`` known tokens
        the model input is correspondingly shorter.
    """
    model.eval()
    words = tokenizer.tokenize(sentence)

    def as_vector(word):
        # float32 matches the model parameters regardless of how the embedding is stored.
        return torch.as_tensor(word2vec[word], dtype=torch.float32).flatten()

    # Keep the last `context_size` tokens that have an embedding.
    window = [as_vector(word) for word in words if word in word2vec][-context_size:]
    if not window:
        raise ValueError("sentence contains no tokens with a word2vec embedding")

    predicted_words = []
    with torch.no_grad():
        for _ in range(num_words):
            # Concatenate into one flat vector and add the batch dimension.
            model_input = torch.cat(window).unsqueeze(0)
            output = model(model_input)  # Predict next word
            next_word_idx = torch.argmax(output, dim=1).item()  # Get most probable word
            next_word = vocab[next_word_idx]  # Convert index to word
            predicted_words.append(next_word)

            # Slide the window: drop the oldest embedding, append the new one.
            if next_word in word2vec:
                window = window[1:] + [as_vector(next_word)]

    return " ".join(predicted_words)

In [None]:
# Driver for predict_next_words.
# NOTE(review): `vocab` is not defined in any cell shown in this notebook.
# NOTE(review): predict_next_words indexes its word2vec argument (`word2vec[word]`,
# `word in word2vec`), while the `word2vec` bound above is the object returned by
# load_model; confirm it supports that mapping interface before running this cell.
sentence = "The weather is"
predicted = predict_next_words(model, sentence, tokenizer, word2vec, vocab)
print(f"Predicted words: {predicted}")

In [None]:
# NOTE(review): this cell appears to target an earlier version of the notebook's API and
# does not match the definitions above; reconcile or remove it before a Run All:
#   * `corpus` and `text_processor` are not defined in any cell shown here.
#   * NeuralLMDataset.__init__ takes (tokenizer, word2vec, context_size=...), not a corpus
#     and a word2idx mapping; the assignment also rebinds the `dataset` built earlier.
#   * The NeuralLM classes take (input_dim, hidden_dim, vocab_size), so the positional call
#     below passes the vocabulary length as input_dim.
#   * train() requires a val_dataloader argument.
#   * plot_losses() takes (train_losses, val_losses); here its second argument is a title string.
# Prepare data for training
dataset = NeuralLMDataset(corpus, text_processor.word2idx)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Train models
embedding_dim = 10
hidden_dim = 20

models = [NeuralLM1, NeuralLM2, NeuralLM3]
# Per-model results, keyed by 'NeuralLM1' .. 'NeuralLM3'
losses = {}
accuracies = {}
perplexities = {}

for i, model_class in enumerate(models):
    model = model_class(len(text_processor.vocab), embedding_dim, hidden_dim)
    loss = train(model, dataloader)
    losses[f'NeuralLM{i+1}'] = loss
    acc = compute_accuracy(model, dataloader)
    accuracies[f'NeuralLM{i+1}'] = acc
    # Perplexity is computed from the last element of the value train() returned.
    perplexities[f'NeuralLM{i+1}'] = compute_perplexity(loss[-1])
    plot_losses(loss, f'NeuralLM{i+1} Training Loss')
    # Persist only the weights (state_dict) to the current working directory.
    torch.save(model.state_dict(), f"neural_lm{i+1}.pth")
    print(f"NeuralLM{i+1} - Accuracy: {acc}, Perplexity: {perplexities[f'NeuralLM{i+1}']}")

# Predict next 3 tokens from test.txt
def predict_next_tokens(model, sentence, n=3):
    """Greedily predict ``n`` words after ``sentence`` using an index-based model.

    The last two whitespace-separated words are mapped to indices through the
    module-level ``text_processor`` and passed to ``model`` as a ``(1, 2)`` long
    tensor. Each predicted index is appended to the window while the oldest is dropped.

    Returns:
        List of the ``n`` predicted words.
    """
    trailing_words = sentence.split()[-2:]
    window = torch.tensor(
        [text_processor.word2idx[word] for word in trailing_words],
        dtype=torch.long,
    )
    generated = []
    step = 0
    while step < n:
        logits = model(window.unsqueeze(0))
        best_idx = torch.argmax(logits, dim=1).item()
        generated.append(text_processor.idx2word[best_idx])
        # Slide the window forward by one position.
        window = torch.cat((window[1:], torch.tensor([best_idx])))
        step += 1
    return generated

# Printed unconditionally once the cell reaches this line; it does not itself verify
# that any checkpoint file was written.
print("Model checkpoints saved!")