In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [2]:
# Load necessary libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [3]:
import os

In [4]:
# Install NLTK library
!pip install nltk
import urllib.request
import tarfile

# Import necessary modules
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Define a function to lemmatize text
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# Download IMDB dataset
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, "aclImdb_v1.tar.gz")

# Extract IMDB dataset
with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as tar:
    tar.extractall()

# Load IMDB dataset
imdb_df = pd.DataFrame(columns=['review', 'sentiment'])
for dataset in ['train', 'test']:
    for sentiment in ['pos', 'neg']:
        path = f'aclImdb/{dataset}/{sentiment}'
        for filename in os.listdir(path):
            with open(f'{path}/{filename}', 'r') as file:
                review = file.read()
            lemmatized_review = lemmatize_text(review)
            sentiment_value = 1 if sentiment == 'pos' else 0
            imdb_df = pd.concat([imdb_df, pd.DataFrame({'review': lemmatized_review, 'sentiment': sentiment_value}, index=[0])], ignore_index=True)


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
# Count the number of instances
num_instances = len(imdb_df)

# Count the number of positive and negative instances
num_positive = imdb_df['sentiment'].value_counts()[1]
num_negative = imdb_df['sentiment'].value_counts()[0]

# Print the results
print(f'Total number of instances: {num_instances}')
print(f'Number of positive instances: {num_positive}')
print(f'Number of negative instances: {num_negative}')


Total number of instances: 50000
Number of positive instances: 25000
Number of negative instances: 25000


In [6]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97


In [7]:
# Convert sentiment values to numeric
import numpy as np
from sklearn.model_selection import train_test_split
imdb_df['sentiment'] = pd.to_numeric(imdb_df['sentiment'], errors='coerce')
imdb_df = imdb_df.dropna()

# Preprocess dataset
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-uncased', do_lower_case=True)
tokenizer = AutoTokenizer.from_pretrained('xlnet-base-cased')

input_ids = []
attention_masks = []

for review in imdb_df['review']:
    encoded_dict = tokenizer.encode_plus(
                        review,                     
                        add_special_tokens = True,
                        max_length = 256,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(imdb_df['sentiment'].values, dtype=torch.long)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [8]:
# Split dataset into training and validation sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, 
                                                            random_state=42, test_size=0.2)
train_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                                 random_state=42, test_size=0.2)

In [9]:
batch_size = 16
epochs = 3

In [10]:
# Define XLNet model for sequence classification
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)


Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'sequence_summary.summary.weight', 'logits_proj.bias', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
# Set the device to run the model on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [12]:
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)




In [14]:
# Create the DataLoader for training data
from torch.utils.data import TensorDataset, DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Create the DataLoader for validation data
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [15]:
# Define the training loop
for epoch in range(epochs):
    # Set the model to training mode
    model.train()

    # Track the training loss and accuracy
    total_train_loss = 0
    total_train_accuracy = 0

    # Iterate over the training data
    for step, batch in enumerate(train_dataloader):
        # Clear the gradients
        model.zero_grad()

        # Move the batch to the device
        batch_inputs = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch_inputs[0],
                  'attention_mask': batch_inputs[1],
                  'labels': batch_inputs[2]}

        # Perform the forward pass
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]

        # Perform the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Track the training loss and accuracy
        total_train_loss += loss.item()
        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == inputs['labels']).float().mean()
        total_train_accuracy += accuracy.item()

        # Print training progress
        if step % 50 == 0:
            print(f'Epoch: {epoch + 1}, Batch: {step}, Training Loss: {total_train_loss / (step + 1)}, Training Accuracy: {total_train_accuracy / (step + 1)}')


Epoch: 1, Batch: 0, Training Loss: 0.7425596117973328, Training Accuracy: 0.5
Epoch: 1, Batch: 50, Training Loss: 0.5009999469506974, Training Accuracy: 0.7279411764705882
Epoch: 1, Batch: 100, Training Loss: 0.37948283317065473, Training Accuracy: 0.8137376237623762
Epoch: 1, Batch: 150, Training Loss: 0.33963361668665676, Training Accuracy: 0.8418874172185431
Epoch: 1, Batch: 200, Training Loss: 0.318769411197794, Training Accuracy: 0.8575870646766169
Epoch: 1, Batch: 250, Training Loss: 0.31039878475179233, Training Accuracy: 0.8645418326693227
Epoch: 1, Batch: 300, Training Loss: 0.3044845464444438, Training Accuracy: 0.867109634551495
Epoch: 1, Batch: 350, Training Loss: 0.2966655122588503, Training Accuracy: 0.8730413105413105
Epoch: 1, Batch: 400, Training Loss: 0.29855483238053443, Training Accuracy: 0.8734413965087282
Epoch: 1, Batch: 450, Training Loss: 0.29252703707177863, Training Accuracy: 0.875970066518847
Epoch: 1, Batch: 500, Training Loss: 0.28934970396959375, Training

In [16]:
# Define the evaluation loop
model.eval()
total_val_loss = 0
total_val_accuracy = 0

# Iterate over the validation data
for batch in val_dataloader:
    # Move the batch to the device
    batch_inputs = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch_inputs[0],
              'attention_mask': batch_inputs[1],
              'labels': batch_inputs[2]}

    # Disable gradient calculation
    with torch.no_grad():
        # Perform the forward pass
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]

    # Track the validation loss and accuracy
    total_val_loss += loss.item()
    preds = torch.argmax(logits, dim=1).flatten()
    accuracy = (preds == inputs['labels']).float().mean()
    total_val_accuracy += accuracy.item()

# Calculate the average validation loss and accuracy
avg_val_loss = total_val_loss / len(val_dataloader)
avg_val_accuracy = total_val_accuracy / len(val_dataloader)
print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}')


Validation Loss: 0.21284754961207508, Validation Accuracy: 0.9293


Here we have used seqence length we have used is 256 and batch size is 16.