In [1]:
# Import necessary libraries for deep learning, data processing, and tokenization

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
import time

In [2]:
# Load a subset of the BookCorpus dataset using HuggingFace Datasets.

from datasets import load_dataset

# Request the 'train' split.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own loading
# script to run locally — confirm the source is trusted before re-running.
bookcorpus_dataset = load_dataset('bookcorpus', split='train', trust_remote_code=True)

# Keep only the rows at indices 0..9999.
subset = bookcorpus_dataset.select(range(10000))

# Join the 'text' field of the kept rows into one newline-separated string;
# a later cell splits on '\n' again to get the individual lines back.
document = "\n".join(subset['text'])

In [3]:
# Download the NLTK 'punkt' and 'punkt_tab' data packages.
# NOTE(review): presumably these are the resources `word_tokenize` relies on —
# confirm against the installed NLTK version.

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
# Tokenize the entire document into words.
# The text is lowercased first, so the vocabulary built from `tokens` does not
# distinguish between upper- and lower-case spellings.

tokens = word_tokenize(document.lower())

In [5]:
# Build the vocabulary: each distinct token receives the next free integer id,
# handed out in order of first appearance in `tokens`.

vocab = {'<UNK>': 0}     # id 0 is reserved for unknown tokens

for token in tokens:
    # setdefault leaves tokens that already have an id untouched
    vocab.setdefault(token, len(vocab))

len(vocab)  # Get the size of the vocabulary

6997

In [6]:
# Split the document back into one string per line for sequence modeling.
# `document` was built by joining dataset rows with '\n', so each element here
# corresponds to one row of the subset (assuming no row itself contains '\n').

input_sentences = document.split('\n')

In [7]:
# Function to convert a tokenized sentence into a list of vocabulary indices

def text_to_indices(sentence, vocab):
    """Map a sequence of tokens to their integer ids.

    Args:
        sentence: iterable of tokens (already tokenized).
        vocab: dict mapping token -> integer id; consulted for '<UNK>' only
            when a token is missing from it.

    Returns:
        A list with one id per input token, in the same order. Tokens that are
        not in `vocab` are replaced by the id stored under '<UNK>'.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in sentence
    ]


In [8]:
# Lowercase and tokenize every line, then translate its tokens to vocabulary ids.
# The result holds one list of ids per entry of `input_sentences`.

input_numerical_sentences = [
    text_to_indices(word_tokenize(line.lower()), vocab)
    for line in input_sentences
]

In [9]:
len(input_numerical_sentences)

10000

In [10]:
# Prepare training sequences for next-word prediction.
# Every sentence contributes all of its prefixes of length >= 2: the last id of a
# prefix is later used as the target and the ids before it as the context.
# Sentences with fewer than two tokens contribute nothing.

training_seq = [
    sentence[:end]
    for sentence in input_numerical_sentences
    for end in range(2, len(sentence) + 1)
]


In [11]:
len(training_seq)

123701

In [12]:
# Record the length of every training sequence; the longest one determines the
# width all sequences are padded to.

length_list = [len(seq) for seq in training_seq]

max_len = max(length_list)
print(max_len)  # Print the maximum sequence length

77


In [13]:
# Pad all sequences so they have the same length (left-padding with zeros).
# Note that 0 is also the id stored under '<UNK>' in `vocab`, so padding and
# unknown tokens share one id.

# Compute the target width once; evaluating max() inside the loop would rescan
# `length_list` for every single sequence.
pad_to = max(length_list)

padded_training_seq = [[0] * (pad_to - len(seq)) + seq for seq in training_seq]

In [14]:
len(padded_training_seq[0])

77

In [15]:
# Convert the padded sequences to a PyTorch tensor of integer (long) ids.
# Note: this rebinds `padded_training_seq` from a list of lists to a tensor.

padded_training_seq = torch.tensor(padded_training_seq, dtype = torch.long)

In [16]:
padded_training_seq.shape

torch.Size([123701, 77])

In [17]:
# Split each padded row into model input and target:
#   x — every column except the last (the context ids, left-padded with 0)
#   y — the last column (the id the model should predict)

x = padded_training_seq[:, :-1]
y = padded_training_seq[:, -1]

In [18]:
# Define a custom PyTorch Dataset for our training data

class CustomDataset(Dataset):
    """Pairs each row of `x` with the entry of `y` at the same position."""

    def __init__(self, x, y):
        """Store the inputs `x` and targets `y`; both are indexed by sample."""
        super().__init__()
        self.x, self.y = x, y

    def __len__(self):
        """Number of samples, taken from the first dimension of `x`."""
        sample_count = self.x.shape[0]
        return sample_count

    def __getitem__(self, index):
        """Return the `(input, target)` pair stored at `index`."""
        features = self.x[index]
        target = self.y[index]
        return features, target

In [19]:
dataset = CustomDataset(x,y)

In [20]:
len(dataset)

123701

In [21]:
# Wrap the existing `dataset` in a DataLoader that serves shuffled mini-batches
# of 32 samples.

dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)

In [22]:
# Define the LSTM-based language model

class LSTMModel(nn.Module):
    """Next-word predictor: embedding -> LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: number of distinct token ids; sets both the embedding table
            size and the width of the output logits.
        embedding_dim: size of each token embedding (default 100).
        hidden_dim: size of the LSTM hidden state (default 150).
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=150):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer for word vectors
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first = True)   # LSTM layer
        self.fc = nn.Linear(hidden_dim, vocab_size)    # Output layer to predict next word

    def forward(self, x):
        """Return one row of vocabulary logits per input sequence.

        Args:
            x: integer token ids, one sequence per row (batch first).

        Returns:
            Unnormalized scores with `vocab_size` entries per sequence.
        """
        embedded = self.embedding(x)
        _, (final_hidden_state, _) = self.lstm(embedded)

        # The final hidden state is stacked per LSTM layer along dim 0; take the
        # last layer explicitly rather than squeezing, so the selection stays
        # correct regardless of how many layers the LSTM has.
        output = self.fc(final_hidden_state[-1])

        return output

In [23]:
# Pick the compute device first (CUDA when available, otherwise CPU), then build
# the model with one output per vocabulary entry and move it onto that device.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMModel(len(vocab))
model.to(device)

LSTMModel(
  (embedding): Embedding(6997, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=6997, bias=True)
)

In [24]:
# Training configuration: number of passes over the data and the optimizer step size.

epochs = 50
learning_rate = 0.001

# Cross-entropy between the model's vocabulary logits and the target token id,
# minimized with Adam over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [25]:
# Training loop for the LSTM language model.
# Each epoch makes one full pass over `dataloader` and reports the mean batch
# loss, so progress is visible while the run is in flight.

model.train()  # make sure the model is in training mode before optimizing

for epoch in range(epochs):
    total_loss = 0.0

    for batch_x, batch_y in dataloader:

        # Move the batch to the same device as the model
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()   # clear gradients left over from the previous step

        output = model(batch_x)

        loss = criterion(output, batch_y)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    # Average over the number of batches; the max() guards against an empty loader.
    average_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch + 1}/{epochs} - loss: {average_loss:.4f}")


In [26]:
# Function to predict the next word given an input text prompt

def prediction(model, vocab, text):
    """Append the model's most likely next word to `text`.

    Relies on the module-level `max_len`, `device` and `text_to_indices`.

    Args:
        model: trained model returning one row of vocabulary logits per sequence.
        vocab: dict mapping token -> integer id, with ids assigned consecutively
            in insertion order (the reverse lookup below depends on that).
        text: prompt string; it is lowercased and tokenized before lookup.

    Returns:
        `text` followed by a space and the predicted token.
    """
    tokenized_text = word_tokenize(text.lower())

    numerical_text = text_to_indices(tokenized_text, vocab)

    # Training inputs are the padded sequences minus their last (target) column,
    # i.e. `max_len - 1` wide, so the prompt is left-padded to that same width.
    # Prompts that are already longer are passed through without padding.
    context_len = max_len - 1
    padding = [0] * max(context_len - len(numerical_text), 0)

    padded_text = torch.tensor(padding + numerical_text, dtype = torch.long).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(padded_text)

    _, index = torch.max(output, dim = 1)

    # Convert the one-element tensor to a plain int before indexing the key list.
    return text + " " + list(vocab.keys())[index.item()]


In [27]:
# Example: Predict the next word for a given prompt

prediction(model, vocab, "nice to meet you")

'nice to meet you back'

In [28]:
# Generate a sequence of words by repeatedly predicting the next word.
# Each step takes the single highest-scoring word from `prediction` and feeds the
# extended text back in as the next prompt.

num_tokens = 100    # number of words to append
input_text = "nice to meet you"

for i in range(num_tokens):

    output_text = prediction(model, vocab, input_text)
    print(output_text)
    input_text = output_text    # Use the new text as the next input
    time.sleep(0.5)    # pause half a second between steps

nice to meet you back
nice to meet you back ,
nice to meet you back , just
nice to meet you back , just for
nice to meet you back , just for a
nice to meet you back , just for a few
nice to meet you back , just for a few weeks
nice to meet you back , just for a few weeks or
nice to meet you back , just for a few weeks or at
nice to meet you back , just for a few weeks or at the
nice to meet you back , just for a few weeks or at the end
nice to meet you back , just for a few weeks or at the end of
nice to meet you back , just for a few weeks or at the end of the
nice to meet you back , just for a few weeks or at the end of the people
nice to meet you back , just for a few weeks or at the end of the people who
nice to meet you back , just for a few weeks or at the end of the people who 'd
nice to meet you back , just for a few weeks or at the end of the people who 'd be
nice to meet you back , just for a few weeks or at the end of the people who 'd be a
nice to meet you back , just for a