## LSTM next-word predictor (PyTorch)
---------

In [2]:
# !pip install nltk

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [4]:
document = """
What is the Artificial Intelligence (AI) Mentorship Program?
This program is a structured mentorship designed to help learners understand, build, and deploy Artificial Intelligence systems from fundamentals to applied use cases.

What is the course fee for the AI Mentorship Program?
The course follows a monthly subscription model where you have to make monthly payments of Rs 999/month.

What is the total duration of the course?
The total duration of the course is 6 months. So the total course fee becomes 999*6 = Rs 6000 (approx.)

What is the syllabus of the mentorship program?
We will be covering the following modules:
Python for AI
Mathematics for AI (Linear Algebra, Probability, Statistics)
Data Handling and Preprocessing
Machine Learning Fundamentals
Supervised Learning Algorithms
Unsupervised Learning Algorithms
Introduction to Neural Networks
Deep Learning Basics
Natural Language Processing (NLP) Fundamentals
Computer Vision Basics
Model Evaluation and Optimization
AI Project Development
Model Deployment Basics
Ethics and Responsible AI
Case Studies and Real-world AI Applications

Will Deep Learning and NLP be a part of this program?
Yes, both Deep Learning and NLP are core parts of the curriculum.

What if I miss a live session? Will I get a recording of the session?
Yes, all sessions are recorded. You can watch the recordings anytime during your subscription period.

Where can I find the class schedule?
You will find the month-wise class schedule in your dashboard after registration.

What is the time duration of all the live sessions?
Each live session is approximately 2 hours long.

What is the language spoken by the instructor during the sessions?
Hinglish (Hindi + English)

How will I be informed about the upcoming class?
You will receive an email notification before every live session.

Can I do this course if I am from a non-tech background?
Yes. The program starts from fundamentals and is suitable for non-tech backgrounds with basic computer knowledge.

I am late, can I join the program in the middle?
Yes, you can join the program at any time.

If I join/pay in the middle, will I be able to see all the past lectures?
Yes, once you subscribe, all previous sessions will be available in your dashboard.

Where do I have to submit the tasks and assignments?
Assignments are self-evaluated. Solutions will be provided for learning and comparison.

Will we do real-world AI case studies in the program?
Yes, multiple real-world AI case studies will be discussed and implemented.

Where can we contact you for queries?
You can mail us at support.ai@mentorship.com

Payment/Registration related questions
Where do we have to make our payments?
All payments must be made through our official website.

Can we pay the entire amount at once?
No, the program follows a strict monthly subscription model.

What is the validity of the monthly subscription?
The validity is 30 days from the date of payment.

What if I don’t like the course after making the payment. What is the refund policy?
There is a 7-day refund policy from the date of payment.

I am living outside India and unable to make payments. What should I do?
Please contact us via email for alternative payment options.

Post registration queries
Till when can I view the paid videos on the website?
You can watch videos as long as your subscription is active. After completing all installments, you will retain access till the program’s end date.

Why lifetime validity is not provided?
Lifetime access is not provided due to the low course fee and continuous content updates.

Where can I ask doubts after the session?
You can submit your doubts using the doubt-clearing form available in your dashboard.

If I join the program late, can I still ask doubts from past sessions?
Yes, you can raise doubts related to past sessions as well.

Certificate and Placement Assistance related queries
What is the criteria to get the certificate?
There are two criteria:
1. Completion of all monthly payments
2. Attempting all assessments and projects

I am joining late. How can I pay for previous months?
You will get an option in your dashboard to clear pending months.

Is placement assistance included in this program?
Yes, placement assistance is included but placement is not guaranteed. The assistance includes:
Portfolio and project guidance
Resume reviews
Mock interviews
Career guidance sessions
Job search strategy discussions
"""

In [6]:
# Tokenization resources: word_tokenize relies on NLTK's "punkt" tokenizer
# data. Fetch each resource only when it is not already installed, so the
# notebook runs on a fresh environment and re-runs do not touch the network.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)

In [7]:
# Tokenize: lower-case the whole document, then split it into tokens with
# NLTK's word_tokenize (punctuation marks come out as separate tokens).
tokens = word_tokenize(document.lower())

In [8]:
# Build the vocabulary. Index 0 is reserved for "<unk>"; every other distinct
# token gets the next free index, in order of first appearance.
vocab = {"<unk>": 0}

for token in dict.fromkeys(tokens):
    # setdefault leaves an already-registered token untouched.
    vocab.setdefault(token, len(vocab))

vocab

{'<unk>': 0,
 'what': 1,
 'is': 2,
 'the': 3,
 'artificial': 4,
 'intelligence': 5,
 '(': 6,
 'ai': 7,
 ')': 8,
 'mentorship': 9,
 'program': 10,
 '?': 11,
 'this': 12,
 'a': 13,
 'structured': 14,
 'designed': 15,
 'to': 16,
 'help': 17,
 'learners': 18,
 'understand': 19,
 ',': 20,
 'build': 21,
 'and': 22,
 'deploy': 23,
 'systems': 24,
 'from': 25,
 'fundamentals': 26,
 'applied': 27,
 'use': 28,
 'cases': 29,
 '.': 30,
 'course': 31,
 'fee': 32,
 'for': 33,
 'follows': 34,
 'monthly': 35,
 'subscription': 36,
 'model': 37,
 'where': 38,
 'you': 39,
 'have': 40,
 'make': 41,
 'payments': 42,
 'of': 43,
 'rs': 44,
 '999/month': 45,
 'total': 46,
 'duration': 47,
 '6': 48,
 'months': 49,
 'so': 50,
 'becomes': 51,
 '999': 52,
 '*': 53,
 '=': 54,
 '6000': 55,
 'approx': 56,
 'syllabus': 57,
 'we': 58,
 'will': 59,
 'be': 60,
 'covering': 61,
 'following': 62,
 'modules': 63,
 ':': 64,
 'python': 65,
 'mathematics': 66,
 'linear': 67,
 'algebra': 68,
 'probability': 69,
 'statistics': 

In [9]:
len(vocab)

295

In [10]:
# Treat every line of the document as one "sentence". Blank lines become
# empty strings in this list.
input_sentences = document.split("\n")

In [11]:
def text_to_indices(sentence, vocab):
    """Map each token of ``sentence`` to its index in ``vocab``.

    Args:
        sentence: Iterable of tokens.
        vocab: Mapping from token to integer index; must contain "<unk>".

    Returns:
        A list with one index per token. Tokens missing from ``vocab`` are
        replaced by the index stored under "<unk>".
    """
    return [
        vocab[token] if token in vocab else vocab["<unk>"]
        for token in sentence
    ]

In [12]:
# Turn every line into its list of vocabulary indices (tokenized the same way
# as the full document: lower-cased, then word_tokenize).
input_numerical_sentences = [
    text_to_indices(word_tokenize(line.lower()), vocab)
    for line in input_sentences
]

In [13]:
len(input_numerical_sentences)

110

In [14]:
# Every prefix of length >= 2 of each sentence becomes one training example;
# sentences with fewer than two tokens contribute nothing.
training_sequence = [
    sentence[:end]
    for sentence in input_numerical_sentences
    for end in range(2, len(sentence) + 1)
]

In [15]:
len(training_sequence)

731

In [16]:
training_sequence[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [17]:
# Length of every training example; the maximum is the padded width.
len_list = list(map(len, training_sequence))

max(len_list)

29

In [18]:
training_sequence[0]

[1, 2]

In [19]:
# Left-pad every example with zeros up to the longest example's length.
# The maximum is computed once instead of once per sequence.
# Note: the pad value 0 is also the index assigned to "<unk>" in the vocab.
max_len = max(len_list)
padded_training_sequence = [
    [0] * (max_len - len(sequence)) + sequence
    for sequence in training_sequence
]

In [20]:
len(padded_training_sequence[10])

29

In [21]:
# Convert the padded nested list into a 2-D tensor of integer (long) indices.
# Note: this rebinds the same name from a Python list to a tensor.
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [22]:
padded_training_sequence

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   2,   3,   4],
        ...,
        [  0,   0,   0,  ...,   0, 291, 292],
        [  0,   0,   0,  ..., 291, 292, 293],
        [  0,   0,   0,  ..., 292, 293, 294]])

In [23]:
# Inputs are all columns except the last; the target is the last column,
# i.e. the token that follows each (padded) prefix.
X = padded_training_sequence[:, :-1]
y = padded_training_sequence[:, -1]

In [24]:
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        ...,
        [  0,   0,   0,  ...,   0,   0, 291],
        [  0,   0,   0,  ...,   0, 291, 292],
        [  0,   0,   0,  ..., 291, 292, 293]])

In [25]:
y

tensor([  2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  10,   2,  13,  14,
          9,  15,  16,  17,  18,  19,  20,  21,  20,  22,  23,   4,   5,  24,
         25,  26,  16,  27,  28,  29,  30,   2,   3,  31,  32,  33,   3,   7,
          9,  10,  11,  31,  34,  13,  35,  36,  37,  38,  39,  40,  16,  41,
         35,  42,  43,  44,  45,  30,   2,   3,  46,  47,  43,   3,  31,  11,
         46,  47,  43,   3,  31,   2,  48,  49,  30,  50,   3,  46,  31,  32,
         51,  52,  53,  48,  54,  44,  55,   6,  56,  30,   8,   2,   3,  57,
         43,   3,   9,  10,  11,  59,  60,  61,   3,  62,  63,  64,  33,   7,
         33,   7,   6,  67,  68,  20,  69,  20,  70,   8,  72,  22,  73,  75,
         26,  75,  77,  75,  77,  16,  80,  81,  75,  83,  85,  86,   6,  87,
          8,  26,  89,  83,  90,  22,  91,  92,  93,  94,  83,  22,  96,   7,
         98,  22,  99,   7, 100,  82,  75,  22,  87,  60,  13, 101,  43,  12,
         10,  11,  20, 103,  82,  75,  22,  87, 104, 105, 106,  

In [26]:
class CustomDataset(Dataset):
    """Dataset pairing each row of ``X`` with the matching entry of ``y``."""

    def __init__(self, X, y):
        """Keep references to the inputs ``X`` and the targets ``y``."""
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of examples (size of ``X`` along its first axis)."""
        n_examples = self.X.shape[0]
        return n_examples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [27]:
# Wrap the input/target tensors so they can be served through a DataLoader.
dataset = CustomDataset(X, y)

In [28]:
len(dataset)

731

In [29]:
# Training loader: mini-batches of 32 examples, reshuffled on every pass.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

----------

In [30]:
class LSTMModel(nn.Module):
    """Next-token classifier: embedding -> single-layer LSTM -> linear layer."""

    def __init__(self, vocab_size):
        """Create the layers.

        Args:
            vocab_size: Number of vocabulary entries; used both as the
                embedding table size and as the number of output scores.
        """
        super().__init__()
        embedding_dim = 100
        hidden_dim = 150
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return one row of vocabulary-sized scores per input sequence.

        Only the LSTM's final hidden state is used; the per-step outputs and
        the final cell state are discarded.
        """
        _, (last_hidden, _) = self.lstm(self.embedding(x))
        # Drop the leading axis of the final hidden state before projecting.
        return self.fc(last_hidden.squeeze(0))

In [None]:
# Instantiate the model with one output score per vocabulary entry.
model = LSTMModel(len(vocab))

In [32]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [33]:
# Move the model to the selected device. As the cell's last expression, the
# returned module is also displayed.
model.to(device)

LSTMModel(
  (embedding): Embedding(295, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=295, bias=True)
)

In [34]:
# Training hyperparameters.
epochs = 50
learning_rate = 0.001

# Cross-entropy between the model's vocabulary-sized scores and the index of
# the true next token.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [35]:
# training loop
#
# For each epoch: iterate over the shuffled mini-batches, take one optimizer
# step per batch, and print the loss accumulated over the epoch.

for epoch in range(epochs):
    # Sum of the per-batch losses for this epoch (a sum, not an average).
    total_loss = 0

    for batch_x, batch_y in dataloader:

        # Move the batch to the device the model was moved to.
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        # Clear gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass: one row of vocabulary-sized scores per sequence.
        output = model(batch_x)

        loss = criterion(output, batch_y)

        # Backward pass, then parameter update.
        loss.backward()

        optimizer.step()

        total_loss = total_loss + loss.item()

    print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 128.8088
Epoch: 2, Loss: 116.0142
Epoch: 3, Loss: 106.5071
Epoch: 4, Loss: 98.9402
Epoch: 5, Loss: 91.3713
Epoch: 6, Loss: 83.4184
Epoch: 7, Loss: 76.1104
Epoch: 8, Loss: 69.3036
Epoch: 9, Loss: 62.6884
Epoch: 10, Loss: 56.2215
Epoch: 11, Loss: 50.3249
Epoch: 12, Loss: 44.9324
Epoch: 13, Loss: 39.9002
Epoch: 14, Loss: 35.3306
Epoch: 15, Loss: 31.1960
Epoch: 16, Loss: 27.6105
Epoch: 17, Loss: 24.3909
Epoch: 18, Loss: 21.7311
Epoch: 19, Loss: 19.3209
Epoch: 20, Loss: 17.2693
Epoch: 21, Loss: 15.4651
Epoch: 22, Loss: 13.9973
Epoch: 23, Loss: 12.7344
Epoch: 24, Loss: 11.5818
Epoch: 25, Loss: 10.6119
Epoch: 26, Loss: 9.7769
Epoch: 27, Loss: 9.0229
Epoch: 28, Loss: 8.4683
Epoch: 29, Loss: 7.9159
Epoch: 30, Loss: 7.4164
Epoch: 31, Loss: 7.0127
Epoch: 32, Loss: 6.6495
Epoch: 33, Loss: 6.4333
Epoch: 34, Loss: 6.1102
Epoch: 35, Loss: 5.8471
Epoch: 36, Loss: 5.6839
Epoch: 37, Loss: 5.4117
Epoch: 38, Loss: 5.2538
Epoch: 39, Loss: 5.0740
Epoch: 40, Loss: 4.9562
Epoch: 41, Loss: 4.80

In [36]:
# prediction


def prediction(model, vocab, text, max_len=61):
    """Return ``text`` with the model's most likely next token appended.

    Args:
        model: Trained model mapping a ``(1, seq_len)`` tensor of token
            indices to a ``(1, vocab_size)`` tensor of scores.
        vocab: Mapping from token to integer index; must contain "<unk>".
        text: Prompt to extend.
        max_len: Minimum input length; shorter prompts are left-padded with
            zeros up to this many tokens. Longer prompts are passed through
            unchanged. The default keeps the value that used to be
            hard-coded here.
            NOTE(review): the training inputs in this notebook are 28 tokens
            wide (longest example 29, minus the target column), so the
            default pads more than the model saw during training. Consider
            passing ``max_len=X.shape[1]``.

    Returns:
        ``text`` followed by a space and the predicted token.
    """
    # tokenize, then text -> numerical indices (unknown tokens -> "<unk>")
    tokenized_text = word_tokenize(text.lower())
    numerical_text = text_to_indices(tokenized_text, vocab)

    # The input must live on the same device as the model's weights,
    # otherwise the forward pass fails once the model has been moved to CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # left-pad with zeros; a non-positive repeat count yields no padding
    padding = [0] * (max_len - len(numerical_text))
    padded_text = torch.tensor(
        padding + numerical_text, dtype=torch.long, device=device
    ).unsqueeze(0)

    # inference only: no autograd graph is needed
    with torch.no_grad():
        output = model(padded_text)

    # predicted index as a plain Python int
    index = int(torch.argmax(output, dim=1).item())

    # Look the token up by its index value instead of relying on the dict's
    # insertion order happening to match the indices.
    index_to_token = {idx: token for token, idx in vocab.items()}

    # merge with text
    return text + " " + index_to_token[index]

In [43]:
prediction(model, vocab, "Yes. The program starts from")

'Yes. The program starts from fundamentals'

In [44]:
import time

# Greedy generation: repeatedly feed the growing text back into prediction()
# so that each iteration appends one more token.
num_tokens = 10
input_text = "You will find the"

for i in range(num_tokens):
    output_text = prediction(model, vocab, input_text)
    print(output_text)
    # The extended text becomes the prompt for the next step.
    input_text = output_text
    # Pause between steps so the output appears token by token.
    time.sleep(0.5)

You will find the month-wise
You will find the month-wise class
You will find the month-wise class schedule
You will find the month-wise class schedule in
You will find the month-wise class schedule in your
You will find the month-wise class schedule in your dashboard
You will find the month-wise class schedule in your dashboard after
You will find the month-wise class schedule in your dashboard after registration
You will find the month-wise class schedule in your dashboard after registration .
You will find the month-wise class schedule in your dashboard after registration . after


In [45]:
# Second loader over the same dataset with shuffling disabled, so batches
# come in a fixed order.
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [46]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the model's top-1 accuracy over ``dataloader`` as a percentage.

    Args:
        model: Module mapping a batch of inputs to a ``(batch, n_classes)``
            tensor of scores. It is switched to evaluation mode and left in
            that mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Percentage (0-100) of examples whose highest-scoring class equals the
        label; ``0.0`` when the loader yields no examples.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate over the loader that was passed in, not a notebook global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Index of the highest score per row = predicted word index
            predicted = outputs.argmax(dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # Avoid dividing by zero when there was nothing to evaluate.
    if total == 0:
        return 0.0

    return correct / total * 100


# Compute accuracy on the same examples the model was trained on (the loader
# passed here is built from the training dataset), so this is a training
# accuracy rather than a held-out score.
accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 94.80%
