In [44]:
pip install nltk




In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
from torch.utils.data import DataLoader, Dataset



In [46]:
# Load the data
document = """What is machine learning?
Machine learning is a branch of artificial intelligence that enables computers to learn from data and make decisions without being explicitly programmed.

What are the types of machine learning?
There are three main types of machine learning: supervised learning, unsupervised learning, and reinforcement learning.

What is supervised learning?
Supervised learning is a type of machine learning where the model is trained on labeled data, meaning each input has a corresponding correct output.

What is unsupervised learning?
Unsupervised learning is a type of machine learning where the model is trained on unlabeled data and finds patterns or relationships in the data.

What is reinforcement learning?
Reinforcement learning is a type of machine learning where an agent learns to make decisions by receiving rewards or penalties based on its actions.

What is deep learning?
Deep learning is a subset of machine learning that uses neural networks with multiple layers to process complex data.

What is a neural network?
A neural network is a computational model inspired by the human brain, consisting of interconnected layers of neurons that process and learn from data.

What is natural language processing?
Natural language processing (NLP) is a field of AI that focuses on enabling computers to understand, interpret, and generate human language.

What is a dataset?
A dataset is a collection of data used to train, validate, and test machine learning models.

What is overfitting in machine learning?
Overfitting occurs when a machine learning model learns too much from the training data, capturing noise instead of general patterns, leading to poor performance on new data.

What is underfitting in machine learning?
Underfitting occurs when a model is too simple and fails to learn meaningful patterns from the training data, resulting in poor performance on both training and test data.

What is an epoch in deep learning?
An epoch refers to one complete pass through the entire training dataset during model training.

What is gradient descent?
Gradient descent is an optimization algorithm used to minimize the loss function by adjusting model parameters iteratively.

What is backpropagation?
Backpropagation is a method used to train neural networks by propagating errors backward to update weights using gradient descent.

What is the activation function in a neural network?
An activation function introduces non-linearity to a neural network, helping it learn complex patterns. Common activation functions include ReLU, Sigmoid, and Tanh.

"""

In [47]:
# necceary imports for nltk
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\DC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [48]:
# tokenize the document
tokens = word_tokenize(document.lower())
tokens

['what',
 'is',
 'machine',
 'learning',
 '?',
 'machine',
 'learning',
 'is',
 'a',
 'branch',
 'of',
 'artificial',
 'intelligence',
 'that',
 'enables',
 'computers',
 'to',
 'learn',
 'from',
 'data',
 'and',
 'make',
 'decisions',
 'without',
 'being',
 'explicitly',
 'programmed',
 '.',
 'what',
 'are',
 'the',
 'types',
 'of',
 'machine',
 'learning',
 '?',
 'there',
 'are',
 'three',
 'main',
 'types',
 'of',
 'machine',
 'learning',
 ':',
 'supervised',
 'learning',
 ',',
 'unsupervised',
 'learning',
 ',',
 'and',
 'reinforcement',
 'learning',
 '.',
 'what',
 'is',
 'supervised',
 'learning',
 '?',
 'supervised',
 'learning',
 'is',
 'a',
 'type',
 'of',
 'machine',
 'learning',
 'where',
 'the',
 'model',
 'is',
 'trained',
 'on',
 'labeled',
 'data',
 ',',
 'meaning',
 'each',
 'input',
 'has',
 'a',
 'corresponding',
 'correct',
 'output',
 '.',
 'what',
 'is',
 'unsupervised',
 'learning',
 '?',
 'unsupervised',
 'learning',
 'is',
 'a',
 'type',
 'of',
 'machine',
 'lea

In [49]:
# building the vocabulary
vocab = {'<unk>':0} # unknown token

Counter(tokens) # count the frequency of each token

Counter({'is': 27,
         'learning': 24,
         'a': 18,
         '.': 16,
         'what': 15,
         '?': 15,
         ',': 14,
         'machine': 12,
         'of': 12,
         'to': 12,
         'data': 11,
         'the': 10,
         'and': 9,
         'model': 7,
         'on': 6,
         'in': 6,
         'neural': 6,
         'an': 5,
         'training': 5,
         'that': 4,
         'learn': 4,
         'from': 4,
         'patterns': 4,
         'by': 4,
         'network': 4,
         'supervised': 3,
         'unsupervised': 3,
         'reinforcement': 3,
         'type': 3,
         'where': 3,
         'deep': 3,
         'language': 3,
         'dataset': 3,
         'used': 3,
         'gradient': 3,
         'descent': 3,
         'function': 3,
         'activation': 3,
         'computers': 2,
         'make': 2,
         'decisions': 2,
         'are': 2,
         'types': 2,
         'trained': 2,
         'or': 2,
         'learns': 2,
         'net

In [50]:
Counter(tokens).keys() # get the unique tokens

dict_keys(['what', 'is', 'machine', 'learning', '?', 'a', 'branch', 'of', 'artificial', 'intelligence', 'that', 'enables', 'computers', 'to', 'learn', 'from', 'data', 'and', 'make', 'decisions', 'without', 'being', 'explicitly', 'programmed', '.', 'are', 'the', 'types', 'there', 'three', 'main', ':', 'supervised', ',', 'unsupervised', 'reinforcement', 'type', 'where', 'model', 'trained', 'on', 'labeled', 'meaning', 'each', 'input', 'has', 'corresponding', 'correct', 'output', 'unlabeled', 'finds', 'patterns', 'or', 'relationships', 'in', 'an', 'agent', 'learns', 'by', 'receiving', 'rewards', 'penalties', 'based', 'its', 'actions', 'deep', 'subset', 'uses', 'neural', 'networks', 'with', 'multiple', 'layers', 'process', 'complex', 'network', 'computational', 'inspired', 'human', 'brain', 'consisting', 'interconnected', 'neurons', 'natural', 'language', 'processing', '(', 'nlp', ')', 'field', 'ai', 'focuses', 'enabling', 'understand', 'interpret', 'generate', 'dataset', 'collection', 'use

In [51]:
# looping through the unique tokens and adding them to the vocab
for token in Counter(tokens).keys():
    if token not in vocab:
        vocab[token] = len(vocab)

vocab

{'<unk>': 0,
 'what': 1,
 'is': 2,
 'machine': 3,
 'learning': 4,
 '?': 5,
 'a': 6,
 'branch': 7,
 'of': 8,
 'artificial': 9,
 'intelligence': 10,
 'that': 11,
 'enables': 12,
 'computers': 13,
 'to': 14,
 'learn': 15,
 'from': 16,
 'data': 17,
 'and': 18,
 'make': 19,
 'decisions': 20,
 'without': 21,
 'being': 22,
 'explicitly': 23,
 'programmed': 24,
 '.': 25,
 'are': 26,
 'the': 27,
 'types': 28,
 'there': 29,
 'three': 30,
 'main': 31,
 ':': 32,
 'supervised': 33,
 ',': 34,
 'unsupervised': 35,
 'reinforcement': 36,
 'type': 37,
 'where': 38,
 'model': 39,
 'trained': 40,
 'on': 41,
 'labeled': 42,
 'meaning': 43,
 'each': 44,
 'input': 45,
 'has': 46,
 'corresponding': 47,
 'correct': 48,
 'output': 49,
 'unlabeled': 50,
 'finds': 51,
 'patterns': 52,
 'or': 53,
 'relationships': 54,
 'in': 55,
 'an': 56,
 'agent': 57,
 'learns': 58,
 'by': 59,
 'receiving': 60,
 'rewards': 61,
 'penalties': 62,
 'based': 63,
 'its': 64,
 'actions': 65,
 'deep': 66,
 'subset': 67,
 'uses': 68,
 '

In [52]:
len(vocab)

161

In [53]:
# extracting the sentences from the document
sentences = document.split('\n') # split the document by new line
sentences

['What is machine learning?',
 'Machine learning is a branch of artificial intelligence that enables computers to learn from data and make decisions without being explicitly programmed.',
 '',
 'What are the types of machine learning?',
 'There are three main types of machine learning: supervised learning, unsupervised learning, and reinforcement learning.',
 '',
 'What is supervised learning?',
 'Supervised learning is a type of machine learning where the model is trained on labeled data, meaning each input has a corresponding correct output.',
 '',
 'What is unsupervised learning?',
 'Unsupervised learning is a type of machine learning where the model is trained on unlabeled data and finds patterns or relationships in the data.',
 '',
 'What is reinforcement learning?',
 'Reinforcement learning is a type of machine learning where an agent learns to make decisions by receiving rewards or penalties based on its actions.',
 '',
 'What is deep learning?',
 'Deep learning is a subset of m

In [54]:
# replacing the words with their index in the vocab
def text_to_index(sentence, vocab):
  numerical_sentence = []
  
  for token in sentence:
    if token in vocab:
      numerical_sentence.append(vocab[token]) # if the token is in the vocab, replace it with its index
    else:
      numerical_sentence.append(vocab['<unk>']) # if the token is not in the vocab, replace it with the unknown token 

  return numerical_sentence

In [55]:
# converting the sentences to tokens
input_numerical_sentences = []

for sentence in sentences:
    input_numerical_sentences.append(text_to_index(word_tokenize(sentence.lower()), vocab))

In [56]:
len(input_numerical_sentences) # list of sentences with words replaced by their index in the vocab

46

In [57]:
# creating the training sequences
training_sequences = []
for sentence in input_numerical_sentences:
    
    for i in range(1, len(sentence)):
        training_sequences.append(sentence[:i+1])

In [58]:
len(training_sequences)

411

In [59]:
# finding out the longest sequence sentence
len_list = []

for sequence in training_sequences:
  len_list.append(len(sequence))

max(len_list)

30

In [60]:
padded_training_sequence = []
for sequence in training_sequences:

  padded_training_sequence.append([0]*(max(len_list) - len(sequence)) + sequence)

In [61]:
len(padded_training_sequence[9])

30

In [62]:
# converting list to to torch.tensor
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [63]:
padded_training_sequence

tensor([[  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        [  0,   0,   0,  ...,   2,   3,   4],
        ...,
        [  0,   0,   0,  ..., 159,  34,  18],
        [  0,   0,   0,  ...,  34,  18, 160],
        [  0,   0,   0,  ...,  18, 160,  25]])

In [64]:
padded_training_sequence.shape

torch.Size([411, 30])

In [65]:
# extraxting X
X = padded_training_sequence[:, :-1]
X

tensor([[  0,   0,   0,  ...,   0,   0,   1],
        [  0,   0,   0,  ...,   0,   1,   2],
        [  0,   0,   0,  ...,   1,   2,   3],
        ...,
        [  0,   0,   0,  ...,  34, 159,  34],
        [  0,   0,   0,  ..., 159,  34,  18],
        [  0,   0,   0,  ...,  34,  18, 160]])

In [66]:
# extraxting y
y = padded_training_sequence[:,-1]
y

tensor([  2,   3,   4,   5,   4,   2,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,   8,   3,   4,   5,  26,  30,  31,  28,   8,   3,   4,  32,  33,
          4,  34,  35,   4,  34,  18,  36,   4,  25,   2,  33,   4,   5,   4,
          2,   6,  37,   8,   3,   4,  38,  27,  39,   2,  40,  41,  42,  17,
         34,  43,  44,  45,  46,   6,  47,  48,  49,  25,   2,  35,   4,   5,
          4,   2,   6,  37,   8,   3,   4,  38,  27,  39,   2,  40,  41,  50,
         17,  18,  51,  52,  53,  54,  55,  27,  17,  25,   2,  36,   4,   5,
          4,   2,   6,  37,   8,   3,   4,  38,  56,  57,  58,  14,  19,  20,
         59,  60,  61,  53,  62,  63,  41,  64,  65,  25,   2,  66,   4,   5,
          4,   2,   6,  67,   8,   3,   4,  11,  68,  69,  70,  71,  72,  73,
         14,  74,  75,  17,  25,   2,   6,  69,  76,   5,  69,  76,   2,   6,
         77,  39,  78,  59,  27,  79,  80,  34,  81,   8,  82,  

In [67]:
# creating a dataset and data loader objects

class CustomDataset(Dataset):
  # defining the constructor
  def __init__(self, X, y):
    self.X = X
    self.y = y

  # defining the length method
  def __len__(self):
    return self.X.shape[0]
  
  # defining the getitem method
  def __getitem__(self, index):
    return self.X[index], self.y[index]

In [68]:
dataset = CustomDataset(X, y)

In [69]:
len(dataset)

411

In [70]:
dataset[0]

(tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1]),
 tensor(2))

In [71]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [72]:
# looping through the dataloader
for input, output in dataloader:
  print(input, output)

tensor([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 118, 105, 106,   6,
          39],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,  56, 150, 138, 151, 152,
          14],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,
           2],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,  33,   4,   2,   6,  37,
           8,   3,   4,  38,  27,  39,   2,  40,  41,  42,  17,  34,  43,  44,
          45],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          35,   4,   2,   6,  37,   8,   3,   4,  38,  27,  39,   2,  40,  41,
          50],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,  66,   4,   2,   6,  67,   8,   

In [73]:
# model building
class LSTMmodel(nn.Module):

  # defining the constructor
  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, 100) # vocab size, embedding size
    self.lstm = nn.LSTM(100, 150, batch_first=True) # input size, hidden size, batch_first
    self.fc = nn.Linear(150, vocab_size) # hidden size
  
  # defining the forward method
  def forward(self, x):
    embedded = self.embedding(x)
    intermediate_hidden_state, (final_hidden_state, final_cell_state) =  self.lstm(embedded) # batch size, seq length, hidden size
    output = self.fc(final_hidden_state.squeeze(0)) # batch size, hidden size
    return output
  

In [74]:
# creating the model object
model = LSTMmodel(len(vocab))

In [75]:
# defining the loss function and optimizer
epochs = 50
learning_rate = 0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [76]:
# training loop

for epoch in range(epochs):
  total_loss = 0

  for batch_x, batch_y in dataloader: # looping through the dataloader
    optimizer.zero_grad() # zero the gradients
    output = model(batch_x) # forward pass
    loss = criterion(output, batch_y) # calculate the loss
    loss.backward() # backward pass
    optimizer.step() # update the weights
    total_loss += loss.item() # store the total loss of the epoch

  print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss}')

Epoch 1/50, Loss: 64.96747827529907
Epoch 2/50, Loss: 59.75106859207153
Epoch 3/50, Loss: 54.08328294754028
Epoch 4/50, Loss: 49.805298805236816
Epoch 5/50, Loss: 45.6928346157074
Epoch 6/50, Loss: 41.70562744140625
Epoch 7/50, Loss: 37.71639919281006
Epoch 8/50, Loss: 33.869136571884155
Epoch 9/50, Loss: 30.368619322776794
Epoch 10/50, Loss: 27.013951182365417
Epoch 11/50, Loss: 23.96402084827423
Epoch 12/50, Loss: 21.15981936454773
Epoch 13/50, Loss: 18.680723667144775
Epoch 14/50, Loss: 16.52837038040161
Epoch 15/50, Loss: 14.5831458568573
Epoch 16/50, Loss: 12.938183188438416
Epoch 17/50, Loss: 11.559973120689392
Epoch 18/50, Loss: 10.451006829738617
Epoch 19/50, Loss: 9.377250164747238
Epoch 20/50, Loss: 8.412908494472504
Epoch 21/50, Loss: 7.6879207491874695
Epoch 22/50, Loss: 7.059604823589325
Epoch 23/50, Loss: 6.484353303909302
Epoch 24/50, Loss: 6.042887568473816
Epoch 25/50, Loss: 5.649902135133743
Epoch 26/50, Loss: 5.250669375061989
Epoch 27/50, Loss: 4.960945442318916
Epo

In [77]:
# predicting the next word

def predict_next_word(model, vocab, text):
  # tokenize the text
  tokenized_text = word_tokenize(text.lower())

  # convert the text to index
  numerical_text = text_to_index(tokenized_text, vocab)

  # padding the text
  padded_text = torch.tensor([0] * (61 - len(numerical_text)) + numerical_text, dtype=torch.long).unsqueeze(0)

  # sending the text to the model
  output = model(padded_text)

  # maximum value index
  max_value_index = torch.argmax(output).item() # get the index of the maximum value

  # merging the index to the vocab
  return text + ' ' + list(vocab.keys())[list(vocab.values()).index(max_value_index)]

In [87]:
import time
num_tokens = 20
input_text = 'A neural network is a'

for i in range(num_tokens):
  output_text = predict_next_word(model, vocab, input_text)
  print(output_text)
  input_text = output_text
  time.sleep(0.5) # sleep for 0.5 seconds

A neural network is a computational
A neural network is a computational model
A neural network is a computational model inspired
A neural network is a computational model inspired by
A neural network is a computational model inspired by the
A neural network is a computational model inspired by the human
A neural network is a computational model inspired by the human brain
A neural network is a computational model inspired by the human brain ,
A neural network is a computational model inspired by the human brain , consisting
A neural network is a computational model inspired by the human brain , consisting of
A neural network is a computational model inspired by the human brain , consisting of interconnected
A neural network is a computational model inspired by the human brain , consisting of interconnected layers
A neural network is a computational model inspired by the human brain , consisting of interconnected layers of
A neural network is a computational model inspired by the human 