<a href="https://colab.research.google.com/github/shrish842/NLP_Sentiment_Analysis/blob/main/NLP_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install --upgrade fsspec



In [3]:
!pip install transformers datasets torch scikit-learn pandas -q

In [4]:
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score,f1_score
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [6]:
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)
print("\nSample Training Example:")
print(imdb_dataset['train'][0])
print("\nSample Testing Example:")
print(imdb_dataset['test'][0])

# Check label distribution
train_df = pd.DataFrame(imdb_dataset['train'])
test_df = pd.DataFrame(imdb_dataset['test'])
print("\nTraining label distribution:")
print(train_df['label'].value_counts())
# Label 0 is typically negative, Label 1 is positive

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Sample Training Example:
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In betw

In [7]:
# Split the training set into train and validation (e.g., 90% train, 10% validation)
train_val_split = imdb_dataset['train'].train_test_split(test_size=0.1, seed=42) # Use a seed for reproducibility

train_dataset = train_val_split['train']
val_dataset = train_val_split['test']
test_dataset = imdb_dataset['test']

print("\nDataset splits:")
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(val_dataset)}")
print(f"Test examples: {len(test_dataset)}")


Dataset splits:
Training examples: 22500
Validation examples: 2500
Test examples: 25000


In [8]:
model_checkpoint = "distilbert-base-uncased"

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
sample_text = "This is a test sentence for the tokenizer."
encoded_input = tokenizer(sample_text)
print("\nTokenized Sample:")
print(encoded_input)
print("Decoded tokens:", tokenizer.convert_ids_to_tokens(encoded_input['input_ids']))


Tokenized Sample:
{'input_ids': [101, 2023, 2003, 1037, 3231, 6251, 2005, 1996, 19204, 17629, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded tokens: ['[CLS]', 'this', 'is', 'a', 'test', 'sentence', 'for', 'the', 'token', '##izer', '.', '[SEP]']


In [11]:
num_labels = 2 # Positive or Negative
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# Check if GPU is available and move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"\nModel loaded on device: {device}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model loaded on device: cuda


In [12]:
def tokenize_function(examples):
    # Tokenize the text. The tokenizer handles padding and truncation.
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# Apply the tokenization function to all splits of the dataset
# Use batched=True for faster processing
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Remove the original 'text' column as it's no longer needed
# Keep 'input_ids', 'attention_mask', 'label'
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["text"])
tokenized_val_dataset = tokenized_val_dataset.remove_columns(["text"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["text"])

# Rename the 'label' column to 'labels' (expected by the Trainer)
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_val_dataset = tokenized_val_dataset.rename_column("label", "labels")
tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")

# Set the format to PyTorch tensors
tokenized_train_dataset.set_format("torch")
tokenized_val_dataset.set_format("torch")
tokenized_test_dataset.set_format("torch")

print("\nProcessed dataset sample (train):")
print(tokenized_train_dataset[0])

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


Processed dataset sample (train):
{'labels': tensor(0), 'input_ids': tensor([  101,  2007,  2122,  2111,  6904,  6834,  2061,  2116,  7171,  1010,
         2478,  2214,  8333,  1010,  1998,  3806,  7741,  4176,  2000,  2131,
         2068,  2041,  1010,  2025,  2000,  5254,  2008,  2070,  1997,  1996,
         5019,  2020,  6361,  2006,  1037,  2580,  2275,  2007,  5889,  1010,
         2054,  1005,  1055,  2000,  2903,  1029,  2214,  2143,  1997,  3032,
         2003,  3835,  1010,  2021,  1996,  4111,  6905,  1998, 16627,  1997,
        12493,  2003,  9145,  2000,  3422,  1999,  2122,  3152,  1012,  1045,
         2113,  1010, 14398,  2003,  7929,  1999,  2122,  2214,  3152,  1010,
         2021,  2045,  2003,  2062,  2000,  2008,  2000,  2191,  2023,  3232,
         4558, 21553,  1012,  6791,  2004, 13109, 10136,  1010,  2027,  2196,
         5520,  2037,  9738,  1010,  3235,  3779,  2001,  2019,  4654,  1011,
        12436, 12672, 26548,  2937,  1010,  2109,  2814,  2066,  2990,  

In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1) # Get the index of the highest logit
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='binary') # Use 'weighted' for multi-class
    return {"accuracy": accuracy, "f1": f1}

In [14]:
# Define the directory where model checkpoints will be saved
output_dir = "./sentiment_model_results"

training_args = TrainingArguments(
    output_dir=output_dir,                   # Directory to save model checkpoints
    num_train_epochs=3,                      # Total number of training epochs (start with 1-3)
    per_device_train_batch_size=16,          # Batch size per device during training
    per_device_eval_batch_size=32,           # Batch size for evaluation
    warmup_steps=500,                        # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # Strength of weight decay regularization
    logging_dir='./logs',                    # Directory for storing logs
    logging_steps=100,                       # Log metrics every N steps
    evaluation_strategy="epoch",             # Evaluate performance at the end of each epoch
    save_strategy="epoch",                   # Save a model checkpoint at the end of each epoch
    load_best_model_at_end=True,             # Load the best model (based on validation metric) at the end
    metric_for_best_model="f1",              # Metric to determine the best model (can be accuracy, f1, etc.)
    greater_is_better=True,                  # True if a higher metric value is better
    fp16=torch.cuda.is_available(),          # Use mixed precision training if GPU is available (faster, less memory)
    report_to="none"                         # Disable reporting to external services like W&B for this example
)



In [15]:
trainer = Trainer(
    model=model,                             # The instantiated Transformers model to be trained
    args=training_args,                      # Training arguments defined above
    train_dataset=tokenized_train_dataset,   # Training dataset
    eval_dataset=tokenized_val_dataset,      # Evaluation dataset
    tokenizer=tokenizer,                     # Tokenizer (needed for padding collation)
    compute_metrics=compute_metrics          # Function to compute evaluation metrics
)

  trainer = Trainer(


In [16]:
print("\nStarting training...")
train_result = trainer.train()
print("\nTraining finished.")

# You can print some training stats
print(f"Training Metrics: {train_result.metrics}")


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2406,0.243364,0.91,0.90887
2,0.1676,0.227725,0.9296,0.928803
3,0.054,0.330538,0.928,0.928685



Training finished.
Training Metrics: {'train_runtime': 942.5526, 'train_samples_per_second': 71.614, 'train_steps_per_second': 4.478, 'total_flos': 8941549409280000.0, 'train_loss': 0.1786880477913407, 'epoch': 3.0}


In [17]:
print("\nEvaluating on the test set...")
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

print("\nTest Set Evaluation Results:")
print(eval_results)
# Example output: {'eval_loss': 0.XXXX, 'eval_accuracy': 0.YYYY, 'eval_f1': 0.ZZZZ, ...}


Evaluating on the test set...



Test Set Evaluation Results:
{'eval_loss': 0.20836912095546722, 'eval_accuracy': 0.92944, 'eval_f1': 0.9285714285714286, 'eval_runtime': 96.5261, 'eval_samples_per_second': 258.997, 'eval_steps_per_second': 8.101, 'epoch': 3.0}
