<a href="https://colab.research.google.com/github/pratikjagtapofficial/Next-Word-Prediction-LSTM/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [110]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [111]:
import requests

url = "https://raw.githubusercontent.com/pratikjagtapofficial/Next-Word-Prediction-LSTM/main/ai_ml_corpus_2000_vocab.txt"

response = requests.get(url)

document = response.text

In [112]:
document

'Artificial intelligence is a multidisciplinary field that integrates computer science, mathematics, statistics, and cognitive science.\nMachine learning is a subfield of artificial intelligence that focuses on algorithms that learn from data and generalize to unseen data.\nDeep learning is a subset of machine learning that uses neural networks with multiple hidden layers.\nNeural networks are computational models inspired by biological neurons.\nMachine learning models learn patterns from structured and unstructured data.\nDeep learning models can represent complex nonlinear relationships.\nNatural language processing enables machines to understand and generate human language.\nComputer vision allows machines to interpret and analyze visual information from images and videos.\nGenerative models such as transformers and diffusion models can generate synthetic text, images, and audio.\nOptimization algorithms such as stochastic gradient descent minimize differentiable loss functions dur

In [113]:
print(type(document))

<class 'str'>


In [114]:
len(document)

12741

In [115]:
# Tokenization
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [116]:
# tokenize
tokens = word_tokenize(document.lower())

In [117]:
# build Vocab

from collections import Counter

# build vocab
vocab = {'<unk>':0}
counter = Counter(tokens)
Counter(tokens) # this provides how many times unique words came in document

Counter({'artificial': 5,
         'intelligence': 4,
         'is': 33,
         'a': 33,
         'multidisciplinary': 1,
         'field': 2,
         'that': 15,
         'integrates': 1,
         'computer': 2,
         'science': 2,
         ',': 41,
         'mathematics': 1,
         'statistics': 3,
         'and': 52,
         'cognitive': 1,
         '.': 193,
         'machine': 16,
         'learning': 45,
         'subfield': 1,
         'of': 11,
         'focuses': 6,
         'on': 30,
         'algorithms': 8,
         'learn': 7,
         'from': 14,
         'data': 37,
         'generalize': 1,
         'to': 52,
         'unseen': 5,
         'deep': 6,
         'subset': 2,
         'uses': 3,
         'neural': 5,
         'networks': 6,
         'with': 14,
         'multiple': 4,
         'hidden': 2,
         'layers': 3,
         'are': 12,
         'computational': 3,
         'models': 25,
         'inspired': 1,
         'by': 10,
         'biological': 1

In [118]:
for word in counter.keys():
    if word not in vocab:
        vocab[word] = len(vocab)
vocab # Vocabulary built

{'<unk>': 0,
 'artificial': 1,
 'intelligence': 2,
 'is': 3,
 'a': 4,
 'multidisciplinary': 5,
 'field': 6,
 'that': 7,
 'integrates': 8,
 'computer': 9,
 'science': 10,
 ',': 11,
 'mathematics': 12,
 'statistics': 13,
 'and': 14,
 'cognitive': 15,
 '.': 16,
 'machine': 17,
 'learning': 18,
 'subfield': 19,
 'of': 20,
 'focuses': 21,
 'on': 22,
 'algorithms': 23,
 'learn': 24,
 'from': 25,
 'data': 26,
 'generalize': 27,
 'to': 28,
 'unseen': 29,
 'deep': 30,
 'subset': 31,
 'uses': 32,
 'neural': 33,
 'networks': 34,
 'with': 35,
 'multiple': 36,
 'hidden': 37,
 'layers': 38,
 'are': 39,
 'computational': 40,
 'models': 41,
 'inspired': 42,
 'by': 43,
 'biological': 44,
 'neurons': 45,
 'patterns': 46,
 'structured': 47,
 'unstructured': 48,
 'can': 49,
 'represent': 50,
 'complex': 51,
 'nonlinear': 52,
 'relationships': 53,
 'natural': 54,
 'language': 55,
 'processing': 56,
 'enables': 57,
 'machines': 58,
 'understand': 59,
 'generate': 60,
 'human': 61,
 'vision': 62,
 'allows': 

In [119]:
len(vocab)

706

In [120]:
from nltk.tokenize import sent_tokenize

input_sentences = sent_tokenize(document)
input_sentences

['Artificial intelligence is a multidisciplinary field that integrates computer science, mathematics, statistics, and cognitive science.',
 'Machine learning is a subfield of artificial intelligence that focuses on algorithms that learn from data and generalize to unseen data.',
 'Deep learning is a subset of machine learning that uses neural networks with multiple hidden layers.',
 'Neural networks are computational models inspired by biological neurons.',
 'Machine learning models learn patterns from structured and unstructured data.',
 'Deep learning models can represent complex nonlinear relationships.',
 'Natural language processing enables machines to understand and generate human language.',
 'Computer vision allows machines to interpret and analyze visual information from images and videos.',
 'Generative models such as transformers and diffusion models can generate synthetic text, images, and audio.',
 'Optimization algorithms such as stochastic gradient descent minimize diffe

In [121]:
# Convert words to numbers

def words_to_numbers(sentence, vocab):
  numerical_sentence = []

  for token in sentence:
    if token in vocab:
      numerical_sentence.append(vocab[token])
    else:
      numerical_sentence.append(vocab['<unk>']) # Corrected from '<UNK>'

  return numerical_sentence

In [122]:
# After converting words into numbers assign numbers to sentences

input_numerical_sentences = []

for sentence in input_sentences:
  input_numerical_sentences.append(words_to_numbers(word_tokenize(sentence.lower()), vocab))

In [123]:
len(input_numerical_sentences)

210

In [124]:
# Create training Sequence
training_sequences = []
for sentence in input_numerical_sentences:

  for i in range(1, len(sentence)):
    training_sequences.append(sentence[:i+1])
training_sequences

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 2, 3, 4, 5, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13, 11, 14],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13, 11, 14, 15],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13, 11, 14, 15, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 11, 13, 11, 14, 15, 10, 16],
 [17, 18],
 [17, 18, 3],
 [17, 18, 3, 4],
 [17, 18, 3, 4, 19],
 [17, 18, 3, 4, 19, 20],
 [17, 18, 3, 4, 19, 20, 1],
 [17, 18, 3, 4, 19, 20, 1, 2],
 [17, 18, 3, 4, 19, 20, 1, 2, 7],
 [17, 18, 3, 4, 19, 20, 1, 2, 7, 21],
 [17, 18, 3, 4, 19, 20, 1, 2, 7, 21, 22],
 [17, 18, 3, 4, 19, 20, 1, 2, 7, 21, 22, 23],
 [17, 18, 3, 4, 1

In [125]:
len(training_sequences)

1898

In [126]:
training_sequences[:5]

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5, 6]]

In [127]:
# we have 942 sequence but all are having different size we want same size of sequence for model training
# so we find max len of sentence and using padding make all sentences same len
len_list = []

for sequence in training_sequences:
  len_list.append(len(sequence))

max(len_list)

22

In [128]:
padded_training_sequence = []
for sequence in training_sequences:
  padded_training_sequence.append([0]*(max(len_list) - len(sequence)) + sequence)

In [129]:
len(padded_training_sequence[8]) # now any sequence is of same length

22

In [130]:
# Now for training LSTM model we need to convert 2d vector into 3d Tensor
# for that we will do

padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [131]:
padded_training_sequence # Now our data is ready for training

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   2,   3,   4],
        ...,
        [  0,   0,   0,  ..., 487,  35, 635],
        [  0,   0,   0,  ...,  35, 635, 705],
        [  0,   0,   0,  ..., 635, 705,  16]])

In [132]:
# Split data into x&y

x = padded_training_sequence[:, :-1]
y = padded_training_sequence[:, -1]
x.dtype

torch.int64

In [133]:
x

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        ...,
        [  0,   0,   0,  ..., 586, 487,  35],
        [  0,   0,   0,  ..., 487,  35, 635],
        [  0,   0,   0,  ...,  35, 635, 705]])

In [134]:
y[1:500]

tensor([  3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  11,  13,  11,  14,
         15,  10,  16,  18,   3,   4,  19,  20,   1,   2,   7,  21,  22,  23,
          7,  24,  25,  26,  14,  27,  28,  29,  26,  16,  18,   3,   4,  31,
         20,  17,  18,   7,  32,  33,  34,  35,  36,  37,  38,  16,  34,  39,
         40,  41,  42,  43,  44,  45,  16,  18,  41,  24,  46,  25,  47,  14,
         48,  26,  16,  18,  41,  49,  50,  51,  52,  53,  16,  55,  56,  57,
         58,  28,  59,  14,  60,  61,  55,  16,  62,  63,  58,  28,  64,  14,
         65,  66,  67,  25,  68,  14,  69,  16,  41,  71,  72,  73,  14,  74,
         41,  49,  60,  75,  76,  11,  68,  11,  14,  77,  16,  23,  71,  72,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  16,  89,  90,  91,  92,
         18,  93,  94,  22,  80,  13,  16,  18,  96,  24,  43,  97,  98,  99,
        100, 101,  35, 102, 103,  16,  18,  32, 105,  26,  28, 106,  41, 107,
        108,  14, 109, 110,  16,  18, 112,  46,  14, 113, 114, 1

In [135]:
# Now Create Dataset & DataLoader Class

class CustomDataset(Dataset):
  def __init__(self, x, y):
    self.x = x
    self.y = y

  def __len__(self):
    return self.x.shape[0]

  def __getitem__(self, idx):
    return self.x[idx], self.y[idx]

In [136]:
dataset = CustomDataset(x,y)

In [137]:
len(dataset)

1898

In [138]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [139]:
# Create LSTM Architecture

class LSTMModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, 100)
    self.lstm = nn.LSTM(100, 150, batch_first=True)
    self.fc = nn.Linear(150, vocab_size)

  def forward(self, x):
    embedded = self.embedding(x)
    intermediate_hidden_states, (final_hidden_state, final_cell_state) = self.lstm(embedded)
    output = self.fc(final_hidden_state.squeeze(0))
    return output

In [140]:
model = LSTMModel(len(vocab))

In [141]:
# Load Model on GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMModel(
  (embedding): Embedding(706, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=706, bias=True)
)

In [142]:
# Create loss & optimizer Function
epochs = 25
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [143]:
# Create training Loop

for epoch in range(epochs):
  total_loss = 0
  for batch_x, batch_y in dataloader:
    batch_x = batch_x.to(device)
    batch_y = batch_y.to(device)
    optimizer.zero_grad()
    output = model(batch_x)
    loss = loss_fn(output, batch_y)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f"Epoch: {epoch+1}/{epochs} Loss: {loss.item()}")

Epoch: 1/20 Loss: 6.128513336181641
Epoch: 2/20 Loss: 4.9514570236206055
Epoch: 3/20 Loss: 6.079002380371094
Epoch: 4/20 Loss: 4.184048652648926
Epoch: 5/20 Loss: 4.580319404602051
Epoch: 6/20 Loss: 3.232288360595703
Epoch: 7/20 Loss: 4.37563419342041
Epoch: 8/20 Loss: 3.012436628341675
Epoch: 9/20 Loss: 2.794795274734497
Epoch: 10/20 Loss: 2.5213074684143066
Epoch: 11/20 Loss: 2.0284438133239746
Epoch: 12/20 Loss: 0.9789384603500366
Epoch: 13/20 Loss: 1.1679296493530273
Epoch: 14/20 Loss: 0.6827265620231628
Epoch: 15/20 Loss: 1.0971235036849976
Epoch: 16/20 Loss: 1.2972614765167236
Epoch: 17/20 Loss: 0.8977915644645691
Epoch: 18/20 Loss: 0.7038439512252808
Epoch: 19/20 Loss: 0.49480271339416504
Epoch: 20/20 Loss: 0.8815481066703796


In [153]:
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [154]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        for batch_x, batch_y in dataloader1:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy
accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")

Model Accuracy: 93.10%


In [144]:
# prediction

def prediction(model, vocab, text, max_seq_len):

  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices
  numerical_text = words_to_numbers(tokenized_text, vocab)

  # padding
  padded_text = torch.tensor([0] * (max_seq_len - len(numerical_text)) + numerical_text, dtype=torch.long).unsqueeze(0)
  # Move the padded_text to the same device as the model
  padded_text = padded_text.to(device)

  # send to model
  output = model(padded_text)

  # predicted index
  value, index = torch.max(output, dim=1)

  # merge with text
  # Convert the index tensor to a Python integer using .item()
  return text + " " + list(vocab.keys())[index.item()]

In [145]:
prediction(model, vocab, "Artificial intelligence", x.shape[1]) # Single output

'Artificial intelligence is'

In [155]:
# Function for prediction
def generate_text(model, vocab, input_text, max_seq_len, num_tokens=20):
    text = input_text

    for _ in range(num_tokens):
        output_text = prediction(model, vocab, text, max_seq_len)
        next_word = output_text.split()[-1]
        text = output_text

    return text

In [158]:
generate_text(model, vocab, "Artificial intelligence", x.shape[1], num_tokens=20) # output according to given length

'Artificial intelligence is a field that focuses on building systems that can perform tasks requiring human intelligence . . . . .'

In [157]:
# Model is ready for Fast API
max_seq_len = x.shape[1]
torch.save({
    "model_state_dict": model.state_dict(),
    "vocab": vocab,
    "max_seq_len": max_seq_len
}, "model.pth")