<a href="https://colab.research.google.com/github/rupeshthapa123/NotebookProject/blob/main/RupeshThapa_Lab5NLPEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embeddings

Tokenization is the reading part of NLP; embedding is the understanding part.
To work with language, a machine-learning model encodes text numerically, based on the meaning of the text rather than its surface form. Each word is represented by a vector of numbers that describes its properties.


In [None]:
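# This notebook pins torchtext 0.6.0 (it uses the legacy `data.Field` API); uncomment the line below to install that version.
#!pip install torchtext==0.6.0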

## Preprocessing
A CUDA device (GPU) is used when one is available, because training needs a fast device; the cells below install and import everything required.

In [None]:
import torch.nn.functional as F  # Import PyTorch functional module for activation functions and other neural network operations
import torch.nn as nn  # Import PyTorch module for neural network classes
import torch  # Import PyTorch module
from torch import optim  # Import PyTorch module for optimization algorithms

In [None]:
import torchtext
# import the necessary classes from torchtext
from torchtext import *

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Tokenizing and vectorizing the data for preprocessing

In [None]:
# TEXT: lower-cased text field that also returns sequence lengths,
# with the batch dimension not placed first
TEXT = data.Field(
    lower=True,
    include_lengths=True,
    batch_first=False,
)
# LABEL: field holding the label of each example
LABEL = data.LabelField()

In [None]:
# Load the IMDB dataset as a (train, test) pair, using TEXT for the review text
# and LABEL for the labels
train, test = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:01<00:00, 49.5MB/s]


In [None]:
# Label vocabulary, built from the training split
LABEL.build_vocab(train)
# Text vocabulary, built from the training split with the 'glove.6B.100d'
# pretrained vectors attached
TEXT.build_vocab(train, vectors='glove.6B.100d')

.vector_cache/glove.6B.zip: 862MB [02:39, 5.42MB/s]                           
100%|█████████▉| 399999/400000 [00:20<00:00, 19886.15it/s]


In [None]:
# Bucketed iterators over both splits: batch size 128 for training and 1024 for
# testing, tensors placed on `dev`, examples sorted within each batch, repeat off
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_sizes=(128, 1024),
    device=dev,
    sort_within_batch=True,
    repeat=False,
)

## Model

The model consists of:

*   Pretrained embedding layer with GloVe Array lookup

*   Standard RNN module

*   Binary classification section with two fully connected layers


In [None]:
class RNN_classifier(nn.Module):
  """Binary classifier: pretrained embedding lookup -> RNN -> two linear layers.

  Args:
    embedding_size: Width of each embedding vector. It must match the width of
      ``vocab.vectors``, which is copied into the embedding layer.
    hidden_size: Number of hidden units in the RNN.
    num_layers: Number of stacked RNN layers.
    vocab: Object providing ``len(vocab)`` and a ``vectors`` tensor. When
      omitted, the notebook-level ``TEXT.vocab`` is used.
  """

  def __init__(self, embedding_size = 100, hidden_size = 512, num_layers=3, vocab=None):
    super().__init__()

    if vocab is None:
      vocab = TEXT.vocab
    # The embedding is created without a hard-coded .cuda() call so that the
    # model can also be built on a machine without a GPU; the caller places the
    # whole model with .to(device).
    self.embed = nn.Embedding(len(vocab), embedding_size)
    # Start from the pretrained vectors stored on the vocabulary
    self.embed.weight.data.copy_(vocab.vectors)
    # Recurrent layer(s) that read the embedded sequence
    self.rnn = nn.RNN(embedding_size, hidden_size, num_layers)

    # Classification head: hidden_size -> 10 -> 1 (no activation in between)
    self.classificationLayer1 = nn.Linear(hidden_size,10)
    self.classificationLayer2 = nn.Linear(10,1)

  def forward(self, input, text_lengths):
    """Return one logit per sequence in the batch.

    Args:
      input: Integer token ids laid out as (sequence length, batch), matching
        the ``batch_first=False`` packing below.
      text_lengths: Length of each sequence in the batch.

    Returns:
      A 1-D tensor of raw (pre-sigmoid) logits.
    """
    embed_input = self.embed(input)
    # pack_padded_sequence is given the lengths as an int64 tensor on the CPU
    text_lengths = text_lengths.cpu().long()
    packed_emb = nn.utils.rnn.pack_padded_sequence(embed_input, text_lengths, batch_first=False)
    _, hidden = self.rnn(packed_emb)
    # Final hidden state of the last RNN layer. The former squeeze(0) is gone:
    # it only acted on single-example batches, and view(-1) below yields the
    # same result for those.
    x = hidden[-1]
    x = self.classificationLayer1(x)
    x = self.classificationLayer2(x)

    # Flatten to a 1-D tensor of logits
    logits = x.view(-1)
    return logits

### Creating an instance of `RNN_classifier` with 256 hidden units and 1 layer

In [None]:
# Build the classifier (256 hidden units, one RNN layer) and place it on `dev`
model = RNN_classifier(hidden_size=256, num_layers=1).to(dev)
# Final expression of the cell, so the model's layer summary is displayed
model

RNN_classifier(
  (embed): Embedding(251639, 100)
  (rnn): RNN(100, 256)
  (classificationLayer1): Linear(in_features=256, out_features=10, bias=True)
  (classificationLayer2): Linear(in_features=10, out_features=1, bias=True)
)

### Running one batch of training data to test the model

In [None]:
# Smoke test: push only the first training batch through the model
for batch in train_iter:
  # batch.text is a (token ids, lengths) pair
  x, x_len = batch.text
  pred = model(x, x_len)
  # Only the shape of the output matters for this check
  print(pred.shape)
  # One batch is enough
  break

torch.Size([128])


## Training

Set up the training parameters, then define a function that runs through the test set and computes the model's accuracy.

In [None]:
# Number of passes over the training data
epochs = 6

# Loss: binary cross-entropy computed directly on logits
loss_func = F.binary_cross_entropy_with_logits

# Optimizer: Adam over all model parameters with a learning rate of 1e-4
opt = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def get_metrics(model, test_data):
  """Compute the accuracy of ``model`` over every batch in ``test_data``.

  Args:
    model: Module called as ``model(text, text_lengths)`` that returns one
      logit per example.
    test_data: Iterable of batches, each exposing ``.text`` as a
      ``(text, text_lengths)`` pair and ``.label`` as a tensor of labels.

  Returns:
    A scalar float tensor: correct predictions divided by total examples.

  Raises:
    ValueError: If ``test_data`` yields no examples.
  """
  # Put the model in evaluation mode
  model.eval()
  correct, total = 0, 0
  # No gradients are needed while measuring accuracy
  with torch.no_grad():
    for batch_data in test_data:
      text, text_lengths = batch_data.text
      logits = model(text, text_lengths)
      # A sigmoid output above 0.5 counts as label 1, otherwise label 0
      predicted_labels = (torch.sigmoid(logits) > 0.5).long()
      total += batch_data.label.size(0)
      # .item() keeps the running count a plain int, so the function no longer
      # depends on at least one batch having turned `correct` into a tensor
      correct += (predicted_labels == batch_data.label.long()).sum().item()
  if total == 0:
    raise ValueError("get_metrics received no examples to evaluate")
  # Returned as a tensor so existing callers can keep using .cpu().numpy()
  return torch.tensor(correct / total)

### For the training loop, run through the whole training dataset to update the model's parameters

In [None]:
# Progress bars for notebooks. `tqdm.notebook.tqdm` replaces the deprecated
# `tqdm.tqdm_notebook`, which emits a warning each time it is used.
from tqdm.notebook import tqdm

# One iteration per epoch
for epoch in tqdm(range(epochs)):
  # Set the model to training mode (get_metrics switches it to eval mode)
  model.train()
  # One optimization step per training batch
  for batch in tqdm(train_iter):
    # Get the input and input lengths from the batch
    (x,x_lengths)=batch.text
    # Make a prediction with the model
    pred = model(x,x_lengths)

    # The loss function needs float targets
    actual=batch.label.float()
    loss = loss_func(pred, actual)
    # Clear any gradients left over from earlier work before computing new ones
    opt.zero_grad()
    # Backpropagate the loss
    loss.backward()
    # Update the model parameters
    opt.step()

  # Set the learning rate to 3e-3 once the epoch with index 5 has finished.
  # NOTE(review): with epochs = 6 this is the final epoch, so the new rate is
  # never used for training in this run; it stays on `opt` and would apply if
  # this cell were run again. Confirm the intended schedule.
  if(epoch==5):
    for g in opt.param_groups:
      g['lr'] = 3e-3

  # Print the test accuracy after each epoch
  print("Accuracy: " + str(get_metrics(model, test_iter).cpu().numpy()))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for epoch in tqdm(range(epochs)):


  0%|          | 0/6 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(train_iter):


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.57584


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.6458


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.7488


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.76664


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.76304


  0%|          | 0/196 [00:00<?, ?it/s]

Accuracy: 0.7848


## Validation

Validation tests the model on new data. The function below predicts the sentiment of a single sentence.

In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline; its tokenizer is used by predict_sentiment
nlp =spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
  """Score a single sentence with the model.

  Args:
    model: Module called as ``model(tokens, lengths)`` that returns a logit.
    sentence: Raw text to score.

  Returns:
    The sigmoid of the model's logit as a Python float. The notebook reads this
    as the probability of a positive review (assumes label index 1 means
    positive; TODO confirm against LABEL.vocab).

  Raises:
    ValueError: If ``sentence`` produces no tokens.
  """
  # Set the model to evaluation mode
  model.eval()
  # TEXT was created with lower=True, so tokens are lower-cased here as well;
  # otherwise capitalized words would not match the vocabulary entries.
  # NOTE(review): TEXT was created without an explicit tokenizer while this
  # function tokenizes with spaCy; confirm both split text the same way.
  tokenized = [tok.text.lower() for tok in nlp.tokenizer(sentence)]
  if not tokenized:
    raise ValueError("sentence must contain at least one token")
  # Map each token to its integer index in the vocabulary
  indexed = [TEXT.vocab.stoi[t] for t in tokenized]
  length = [len(indexed)]

  # Shape (sequence length, 1): a batch holding this single sentence
  tensor = torch.LongTensor(indexed).to(dev).unsqueeze(1)
  length_tensor = torch.LongTensor(length)
  # Inference only, so gradient tracking is switched off
  with torch.no_grad():
    prediction = torch.sigmoid(model(tensor, length_tensor))
  # Return the prediction as a single value
  return prediction.item()

## Prediction

The following is a review of the movie Spider-Man: Far From Home that is not in the dataset. We predict the probability that it is positive, and do the same for a second review that I wrote myself, to compare the scores.

In [None]:
# First input for predict_sentiment: a Far From Home review (text kept verbatim)
review = """I like that Far From Home is trying something new and
that its humor feels mode real than the ironic cracks in most
superhero movies. I just wish its good pieces
all came together more satisfyingly. """

In [None]:
# Second input for predict_sentiment, used for comparison with `review`
review2 = """I don't think the movie had character growth as expected
and all the works were out of place in that movie."""

In [None]:
# Print a label for the value displayed below
print('Probability positive:')
# predict_sentiment takes a model and a review and returns the probability of a
# positive sentiment; as the cell's last expression, its value is displayed
predict_sentiment(model, review)



Probability positive:


0.938538134098053

In [None]:
# Print a label for the value displayed below
print('Probability positive:')
# Score the second review with the same model
predict_sentiment(model, review2)

Probability positive:


0.37646153569221497