In [2]:
# Import necessary libraries for deep learning, data processing, and tokenization

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
import time

In [3]:
# Load a subset of the BookCorpus dataset using HuggingFace Datasets

from datasets import load_dataset

bookcorpus_dataset = load_dataset('bookcorpus', split='train', trust_remote_code=True)

subset = bookcorpus_dataset.select(range(10000))

document = "\n".join(subset['text'])

In [4]:
# Download NLTK tokenizers required for word tokenization

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\rishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Tokenize the entire document into words (lowercased)

tokens = word_tokenize(document.lower())

In [6]:
# Build a vocabulary dictionary mapping each unique token to a unique integer index

vocab = {'<UNK>':0}     # Reserve 0 for unknown tokens

for token in Counter(tokens).keys():
    if token not in vocab:
        vocab[token] = len(vocab)

len(vocab)  # Get the size of the vocabulary

6997

In [7]:
# Split the document into sentences for sequence modeling

input_sentences = document.split('\n')
input_sentences

['usually , he would be tearing around the living room , playing with his toys .',
 'but just one look at a minion sent him practically catatonic .',
 "that had been megan 's plan when she got him dressed earlier .",
 "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
 "`` are n't you being a good boy ? ''",
 'she said .',
 'mason barely acknowledged her .',
 'instead , his baby blues remained focused on the television .',
 'since the movie was almost over , megan knew she better slip into the bedroom and finish getting ready .',
 "each time she looked into mason 's face , she was grateful that he looked nothing like his father .",
 'his platinum blond hair and blue eyes were completely hers .',
 'it was only his 

In [8]:
# Function to convert a tokenized sentence into a list of vocabulary indices

def text_to_indices(sentence, vocab):

    numerical_sentence = []

    for token in sentence:
        if token in vocab:
            numerical_sentence.append(vocab[token])

        else:
            numerical_sentence.append(vocab['<UNK>'])    

    return numerical_sentence        


In [9]:
# Convert all sentences into lists of vocabulary indices

input_numerical_sentences = []

for sentence in input_sentences:
    input_numerical_sentences.append(text_to_indices(word_tokenize(sentence.lower()), vocab))

In [10]:
input_numerical_sentences

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11, 12, 13, 14, 15],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 15],
 [27, 28, 29, 30, 31, 32, 33, 34, 35, 24, 36, 37, 15],
 [3,
  38,
  39,
  8,
  40,
  41,
  42,
  43,
  2,
  44,
  3,
  45,
  21,
  46,
  47,
  48,
  8,
  49,
  50,
  2,
  16,
  12,
  51,
  52,
  2,
  53,
  12,
  54,
  55,
  2,
  56,
  45,
  57,
  58,
  59,
  60,
  27,
  61,
  51,
  15],
 [34,
  62,
  59,
  63,
  64,
  65,
  42,
  66,
  67,
  51,
  68,
  45,
  18,
  69,
  70,
  3,
  45,
  21,
  71,
  21,
  72,
  73,
  48,
  13,
  74,
  15],
 [75, 76, 77, 78, 64, 21, 72, 79, 80, 75],
 [34, 81, 15],
 [56, 82, 83, 54, 15],
 [84, 2, 13, 85, 86, 87, 88, 89, 8, 90, 15],
 [91,
  8,
  40,
  45,
  41,
  92,
  2,
  30,
  93,
  34,
  94,
  95,
  96,
  8,
  97,
  67,
  98,
  99,
  100,
  15],
 [101,
  102,
  34,
  103,
  96,
  56,
  31,
  104,
  2,
  34,
  45,
  105,
  27,
  3,
  103,
  106,
  107,
  13,
  108,
  15],
 [13, 109, 110, 111, 67, 112, 113, 61, 114, 115, 15],
 [116, 45, 117, 13, 1

In [11]:
len(input_numerical_sentences)

10000

In [12]:
# Prepare training sequences for next-word prediction

training_seq = []
for sentence in input_numerical_sentences:

    for i in range(1,len(sentence)):
        training_seq.append(sentence[:i+1])


In [13]:
len(training_seq)

123701

In [14]:
training_seq

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 2, 3, 4, 5, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11, 12],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11, 12, 13],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11, 12, 13, 14],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 2, 11, 12, 13, 14, 15],
 [16, 17],
 [16, 17, 18],
 [16, 17, 18, 19],
 [16, 17, 18, 19, 20],
 [16, 17, 18, 19, 20, 21],
 [16, 17, 18, 19, 20, 21, 22],
 [16, 17, 18, 19, 20, 21, 22, 23],
 [16, 17, 18, 19, 20, 21, 22, 23, 24],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
 [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 15],
 [27, 28],
 [27, 28, 29],
 [27, 28, 29, 30],
 [27, 28, 29, 30, 31],
 [27, 28, 29, 30, 31, 32],
 [27, 28, 29, 30, 31, 32, 33],
 [27, 28, 29, 30, 31, 32, 33, 34],
 [27, 28, 29, 30, 31, 32, 

In [15]:
# Find the maximum sequence length for padding

length_list = []
for seq in training_seq:
    length_list.append(len(seq))

max_len = max(length_list)
print(max_len)  # Print the maximum sequence length

77


In [16]:
# Pad all sequences so they have the same length (left-padding with zeros)

padded_training_seq = []
for seq in training_seq:
    padded_training_seq.append([0]*(max(length_list) - len(seq)) + seq)

In [17]:
len(padded_training_seq[0])

77

In [18]:
# Convert the padded sequences to a PyTorch tensor

padded_training_seq = torch.tensor(padded_training_seq, dtype = torch.long)

In [19]:
padded_training_seq.shape

torch.Size([123701, 77])

In [20]:
# Split the sequences into input (all but last token) and target (last token)

x = padded_training_seq[:, :-1]
y = padded_training_seq[:, -1]

In [21]:
x

tensor([[   0,    0,    0,  ...,    0,    0,    1],
        [   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        ...,
        [   0,    0,    0,  ..., 6996,  360,  595],
        [   0,    0,    0,  ...,  360,  595,   59],
        [   0,    0,    0,  ...,  595,   59,  307]])

In [22]:
y

tensor([  2,   3,   4,  ...,  59, 307,  15])

In [23]:
# Define a custom PyTorch Dataset for our training data

class CustomDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.x = x
        self.y = y

    def __len__(self):
        return self.x.shape[0]
    
    def __getitem__(self, index):
         return self.x[index], self.y[index]

In [24]:
dataset = CustomDataset(x,y)

In [25]:
len(dataset)

123701

In [26]:
dataset[0]

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1]),
 tensor(2))

In [27]:
# Create the dataset and dataloader for batching and shuffling

dataloader = DataLoader(dataset, batch_size = 32, shuffle = True)

In [28]:
# Define the LSTM-based language model

class LSTMModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, 100)  # Embedding layer for word vectors
        self.lstm = nn.LSTM(100, 150, batch_first = True)   # LSTM layer
        self.fc = nn.Linear(150, vocab_size)    # Output layer to predict next word

    def forward(self, x):
        embedded = self.embedding(x)
        intermediate_hidden_states, (final_hidden_state, final_cell_state) = self.lstm(embedded)
        output = self.fc(final_hidden_state.squeeze(0))    # Use the last hidden state for prediction

        return output

In [29]:
# Instantiate the model and move it to the appropriate device (GPU if available)

model = LSTMModel(len(vocab))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMModel(
  (embedding): Embedding(6997, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=6997, bias=True)
)

In [30]:
# Set training hyperparameters and loss/optimizer

epochs = 50
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)


In [31]:
# Training loop for the LSTM language model

for epoch in range(epochs):
    total_loss = 0

    for batch_x, batch_y in dataloader:

        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()

        output = model(batch_x)

        loss = criterion(output, batch_y)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()

        print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")


Epoch: 1, Loss: 8.8889
Epoch: 1, Loss: 17.7318
Epoch: 1, Loss: 26.5975
Epoch: 1, Loss: 35.4327
Epoch: 1, Loss: 44.2935
Epoch: 1, Loss: 53.1209
Epoch: 1, Loss: 61.9385
Epoch: 1, Loss: 70.7689
Epoch: 1, Loss: 79.5852
Epoch: 1, Loss: 88.3954
Epoch: 1, Loss: 97.1858
Epoch: 1, Loss: 105.9586
Epoch: 1, Loss: 114.7226
Epoch: 1, Loss: 123.4953
Epoch: 1, Loss: 132.2554
Epoch: 1, Loss: 141.0042
Epoch: 1, Loss: 149.6996
Epoch: 1, Loss: 158.3529
Epoch: 1, Loss: 166.9865
Epoch: 1, Loss: 175.5556
Epoch: 1, Loss: 184.1135
Epoch: 1, Loss: 192.6262
Epoch: 1, Loss: 201.0851
Epoch: 1, Loss: 209.2528
Epoch: 1, Loss: 217.4306
Epoch: 1, Loss: 225.5714
Epoch: 1, Loss: 233.2809
Epoch: 1, Loss: 240.7879
Epoch: 1, Loss: 248.1176
Epoch: 1, Loss: 254.8505
Epoch: 1, Loss: 261.9284
Epoch: 1, Loss: 268.5382
Epoch: 1, Loss: 275.6749
Epoch: 1, Loss: 282.1044
Epoch: 1, Loss: 288.9705
Epoch: 1, Loss: 295.3108
Epoch: 1, Loss: 301.7475
Epoch: 1, Loss: 308.2466
Epoch: 1, Loss: 314.5386
Epoch: 1, Loss: 321.5214
Epoch: 1, Lo

In [32]:
# Function to predict the next word given an input text prompt

def prediction(model, vocab, text):

    tokenized_text = word_tokenize(text.lower())

    numerical_text = text_to_indices(tokenized_text, vocab)
    
    padded_text = torch.tensor([0]*(max_len - len(numerical_text)) + numerical_text, dtype = torch.long).unsqueeze(0).to(device)

    output = model(padded_text)

    value, index =torch.max(output, dim = 1)

    return text + " " + list(vocab.keys())[index]


In [33]:
# Example: Predict the next word for a given prompt

prediction(model, vocab, "nice to meet you")

'nice to meet you .'

In [34]:
# Generate a sequence of words by repeatedly predicting the next word

num_tokens = 100
input_text = "nice to meet you"

for i in range(num_tokens):

    output_text = prediction(model, vocab, input_text)
    print(output_text)
    input_text = output_text    # Use the new text as the next input
    time.sleep(0.5)

nice to meet you .
nice to meet you . ``
nice to meet you . `` smiling
nice to meet you . `` smiling replied
nice to meet you . `` smiling replied .
nice to meet you . `` smiling replied . ``
nice to meet you . `` smiling replied . `` ``
nice to meet you . `` smiling replied . `` `` ``
nice to meet you . `` smiling replied . `` `` `` emma
nice to meet you . `` smiling replied . `` `` `` emma chastised
nice to meet you . `` smiling replied . `` `` `` emma chastised .
nice to meet you . `` smiling replied . `` `` `` emma chastised . ``
nice to meet you . `` smiling replied . `` `` `` emma chastised . `` ``
nice to meet you . `` smiling replied . `` `` `` emma chastised . `` `` ``
nice to meet you . `` smiling replied . `` `` `` emma chastised . `` `` `` there
nice to meet you . `` smiling replied . `` `` `` emma chastised . `` `` `` there are
nice to meet you . `` smiling replied . `` `` `` emma chastised . `` `` `` there are fine
nice to meet you . `` smiling replied . `` `` `` emma cha