In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load necessary libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification, AdamW

In [None]:
import os

In [None]:
import pandas as pd
import urllib.request
import tarfile

# Download the IMDB sentiment archive into the working directory
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, "aclImdb_v1.tar.gz")

# Unpack the archive; it produces the `aclImdb/` directory read below
with tarfile.open("aclImdb_v1.tar.gz", "r:gz") as tar:
    tar.extractall()

# Collect every review as a plain record first and build the DataFrame once.
# Growing a DataFrame row by row is quadratic, and `DataFrame.append` no longer
# exists in pandas >= 2.0.
records = []
for dataset in ['train', 'test']:
    for sentiment in ['pos', 'neg']:
        path = f'aclImdb/{dataset}/{sentiment}'
        # Label encoding: 1 = positive review, 0 = negative review
        sentiment_value = 1 if sentiment == 'pos' else 0
        # Sort the file names: os.listdir() returns them in arbitrary order,
        # which would make the later seeded train/validation split differ
        # from machine to machine.
        for filename in sorted(os.listdir(path)):
            # Read with an explicit encoding instead of the platform default
            with open(f'{path}/{filename}', 'r', encoding='utf-8') as file:
                records.append({'review': file.read(), 'sentiment': sentiment_value})

imdb_df = pd.DataFrame(records, columns=['review', 'sentiment'])


In [None]:
# Total number of reviews in the dataset
num_instances = len(imdb_df)

# Count each class with a boolean mask. Unlike indexing `value_counts()` by
# label, this yields 0 (instead of raising KeyError) when a class is absent.
num_positive = (imdb_df['sentiment'] == 1).sum()
num_negative = (imdb_df['sentiment'] == 0).sum()

# Print the results
print(f'Total number of instances: {num_instances}')
print(f'Number of positive instances: {num_positive}')
print(f'Number of negative instances: {num_negative}')


Total number of instances: 50000
Number of positive instances: 25000
Number of negative instances: 25000


In [None]:
# Convert sentiment values to numeric; rows whose label cannot be parsed become
# NaN and are dropped together with any other incomplete row
import numpy as np
from sklearn.model_selection import train_test_split
imdb_df['sentiment'] = pd.to_numeric(imdb_df['sentiment'], errors='coerce')
imdb_df = imdb_df.dropna()

# Tokenizer matching the pretrained checkpoint that is fine-tuned below
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

# Encode all reviews in one batched call. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`, and `truncation=True` makes explicit the
# truncation that the tokenizer previously applied implicitly with a warning.
encoded_reviews = tokenizer(
    imdb_df['review'].tolist(),
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# One row per review, each row padded or truncated to 256 token ids
input_ids = encoded_reviews['input_ids']
attention_masks = encoded_reviews['attention_mask']
labels = torch.tensor(imdb_df['sentiment'].values, dtype=torch.long)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Split dataset into training and validation sets (80% / 20%).
# All three tensors go through a single call so that ids, masks and labels are
# guaranteed to be partitioned with the same shuffled indices, rather than
# relying on two separate calls happening to share a seed.
(train_inputs, val_inputs,
 train_masks, val_masks,
 train_labels, val_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=42, test_size=0.2)

In [None]:
# Training hyper-parameters
batch_size: int = 16  # examples per DataLoader batch
epochs: int = 3       # full passes over the training set

In [None]:
# Load the pretrained XLNet checkpoint with a two-label classification head
model = XLNetForSequenceClassification.from_pretrained(
    'xlnet-base-cased',
    num_labels=2,
)


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Assign the result rather than leaving the call as the cell's last expression,
# so the notebook does not print the entire module tree as cell output
model = model.to(device)


XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward

In [None]:
# Define the optimizer.
# Uses PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# `eps` and `weight_decay` are set explicitly to the defaults of the deprecated
# class, because torch's own defaults (1e-8 and 0.01) differ from them.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)


In [None]:
# `TensorDataset` and `DataLoader` come from `torch.utils.data`; they must be
# imported in the notebook's import cell or this cell fails on a fresh kernel.

# Training data: bundle ids, masks and labels, and reshuffle every epoch
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Validation data: same bundling, but kept in a fixed order
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

In [None]:
# Fine-tune the model for `epochs` passes over the training set
for epoch in range(epochs):
    # Set the model to training mode
    model.train()

    # Running totals for this epoch. Loss and accuracy are accumulated per
    # example (not per batch) so the reported averages stay exact even when
    # the last batch is smaller than `batch_size`.
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_examples = 0

    # Iterate over the training data
    for step, batch in enumerate(train_dataloader):
        # Clear the gradients left over from the previous step
        model.zero_grad()

        # Move the batch to the device; tensor order is (ids, masks, labels)
        batch_inputs = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch_inputs[0],
                  'attention_mask': batch_inputs[1],
                  'labels': batch_inputs[2]}

        # Forward pass; because labels are supplied, the first output is the
        # loss and the second is the logits
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]

        # Perform the backward pass and update the parameters
        loss.backward()
        optimizer.step()

        # Update the running totals. The loss is treated as a per-batch mean,
        # so it is scaled back up by the batch size before being summed.
        num_examples = inputs['labels'].size(0)
        total_train_loss += loss.item() * num_examples
        preds = torch.argmax(logits, dim=1)
        total_train_correct += (preds == inputs['labels']).sum().item()
        total_train_examples += num_examples

        # Print training progress (running averages over the epoch so far)
        if step % 50 == 0:
            print(f'Epoch: {epoch + 1}, Batch: {step}, Training Loss: {total_train_loss / total_train_examples}, Training Accuracy: {total_train_correct / total_train_examples}')


Epoch: 1, Batch: 0, Training Loss: 0.6707586050033569, Training Accuracy: 0.5625
Epoch: 1, Batch: 50, Training Loss: 0.46363530687841714, Training Accuracy: 0.7843137254901961
Epoch: 1, Batch: 100, Training Loss: 0.38383136689662933, Training Accuracy: 0.8316831683168316
Epoch: 1, Batch: 150, Training Loss: 0.3526092063980979, Training Accuracy: 0.8526490066225165
Epoch: 1, Batch: 200, Training Loss: 0.32465390178064507, Training Accuracy: 0.8650497512437811
Epoch: 1, Batch: 250, Training Loss: 0.304850274058749, Training Accuracy: 0.8742529880478087
Epoch: 1, Batch: 300, Training Loss: 0.2930980690776609, Training Accuracy: 0.8799833887043189
Epoch: 1, Batch: 350, Training Loss: 0.2789401008543211, Training Accuracy: 0.8856837606837606
Epoch: 1, Batch: 400, Training Loss: 0.27349867563499625, Training Accuracy: 0.8877805486284289
Epoch: 1, Batch: 450, Training Loss: 0.2654901701180541, Training Accuracy: 0.8910753880266076
Epoch: 1, Batch: 500, Training Loss: 0.2622512399726464, Train

In [None]:
# Evaluate the fine-tuned model on the validation set
model.eval()

# Totals are accumulated per example (not per batch) so the final averages are
# exact even when the last batch is smaller than `batch_size`.
total_val_loss = 0.0
total_val_correct = 0
total_val_examples = 0

# No gradients are needed for evaluation, so disable tracking for the whole loop
with torch.no_grad():
    for batch in val_dataloader:
        # Move the batch to the device; tensor order is (ids, masks, labels)
        batch_inputs = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch_inputs[0],
                  'attention_mask': batch_inputs[1],
                  'labels': batch_inputs[2]}

        # Forward pass; because labels are supplied, the first output is the
        # loss and the second is the logits
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]

        # The loss is treated as a per-batch mean, so it is scaled back up by
        # the batch size before being summed.
        num_examples = inputs['labels'].size(0)
        total_val_loss += loss.item() * num_examples
        preds = torch.argmax(logits, dim=1)
        total_val_correct += (preds == inputs['labels']).sum().item()
        total_val_examples += num_examples

# Calculate the average validation loss and accuracy over all examples
avg_val_loss = total_val_loss / total_val_examples
avg_val_accuracy = total_val_correct / total_val_examples
print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}')


Validation Loss: 0.23119196572685613, Validation Accuracy: 0.9324


Here, the sequence length used is 256 and the batch size is 16.

In [None]:
# Save the fine-tuned model
# model_save_path = './xlnet_sentiment_analysis_model/'
# if not os.path.exists(model_save_path):
#     os.makedirs(model_save_path)

# model.save_pretrained(model_save_path)
# tokenizer.save_pretrained(model_save_path)


('./xlnet_sentiment_analysis_model/tokenizer_config.json',
 './xlnet_sentiment_analysis_model/special_tokens_map.json',
 './xlnet_sentiment_analysis_model/spiece.model',
 './xlnet_sentiment_analysis_model/added_tokens.json')

In [None]:
# from google.colab import files
# import shutil

# # Zip the model directory
# shutil.make_archive('xlnet_sentiment_analysis_model', 'zip', 'xlnet_sentiment_analysis_model')

# # Download the zip file
# files.download('xlnet_sentiment_analysis_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# # Load the saved model and tokenizer
# model_save_path = './xlnet_sentiment_analysis_model/'
# model = XLNetForSequenceClassification.from_pretrained(model_save_path)
# tokenizer = XLNetTokenizer.from_pretrained(model_save_path)

In [None]:
# # Set the model to evaluation mode
# model.eval()

# # Define variables to track the validation loss and accuracy
# total_val_loss = 0
# total_val_accuracy = 0

# # Iterate over the validation data
# for batch in val_dataloader:
#     # Move the batch to the device
#     batch_inputs = tuple(t.to(device) for t in batch)
#     inputs = {'input_ids': batch_inputs[0],
#               'attention_mask': batch_inputs[1],
#               'labels': batch_inputs[2]}

#     # Load the model parameters onto the same device as the input tensors
#     model.to(device)

#     # Disable gradient calculation
#     with torch.no_grad():
#         # Perform the forward pass
#         outputs = model(**inputs)
#         loss = outputs[0]
#         logits = outputs[1]

#     # Track the validation loss and accuracy
#     total_val_loss += loss.item()
#     preds = torch.argmax(logits, dim=1).flatten()
#     accuracy = (preds == inputs['labels']).float().mean()
#     total_val_accuracy += accuracy.item()

# # Calculate the average validation loss and accuracy
# avg_val_loss = total_val_loss / len(val_dataloader)
# avg_val_accuracy = total_val_accuracy / len(val_dataloader)

# print(f'Validation Loss: {avg_val_loss}, Validation Accuracy: {avg_val_accuracy}')


Validation Loss: 0.23119196572685613, Validation Accuracy: 0.9324
