In [None]:
pip install --upgrade fsspec

In [None]:
!pip install transformers datasets torch scikit-learn pandas -q

In [None]:
import inspect

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Load the IMDB dataset through the Hugging Face `datasets` library and show its structure.
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)

# Show one raw record from the train split and one from the test split.
print("\nSample Training Example:")
print(imdb_dataset['train'][0])
print("\nSample Testing Example:")
print(imdb_dataset['test'][0])

# Check label distribution.
# Dataset.to_pandas() is the library's own conversion to a DataFrame; it replaces
# handing the Dataset object to the generic pd.DataFrame constructor.
train_df = imdb_dataset['train'].to_pandas()
test_df = imdb_dataset['test'].to_pandas()
print("\nTraining label distribution:")
print(train_df['label'].value_counts())
# For IMDB, label 0 is negative and label 1 is positive
# (can be confirmed with imdb_dataset['train'].features['label']).

In [None]:
# Hold out 10% of the original training split for validation.
# The fixed seed makes the split reproducible across runs.
train_val_split = imdb_dataset['train'].train_test_split(seed=42, test_size=0.1)

# train_test_split names the held-out part 'test'; here it is used as the validation set.
train_dataset, val_dataset = train_val_split['train'], train_val_split['test']
# The dataset's own test split is used unchanged.
test_dataset = imdb_dataset['test']

print("\nDataset splits:")
for split_label, split_data in (
    ("Training", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} examples: {len(split_data)}")

In [None]:
# Name of the pretrained checkpoint; both the tokenizer and the classification
# model in this notebook are loaded from this identifier.
model_checkpoint = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Sanity-check the tokenizer on one sentence: print the raw encoding,
# then map the produced ids back to their token strings.
sample_text = "This is a test sentence for the tokenizer."
encoded_input = tokenizer(sample_text)
sample_tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'])

print("\nTokenized Sample:", encoded_input, sep="\n")
print("Decoded tokens:", sample_tokens)

In [None]:
num_labels = 2  # two sentiment classes: positive or negative
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# Use the GPU when PyTorch reports one as available, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print(f"\nModel loaded on device: {device}")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry. This notebook calls the
            function through ``Dataset.map(..., batched=True)``.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text with
        ``padding="max_length"`` and ``truncation=True``.
    """
    batch_texts = examples['text']
    # Padding and truncation are both delegated to the tokenizer.
    return tokenizer(batch_texts, truncation=True, padding="max_length")

def _prepare_split(split):
    """Tokenize one dataset split and shape it for the Trainer.

    Runs ``tokenize_function`` over the split in batches, drops the raw
    ``text`` column, renames ``label`` to ``labels`` and switches the output
    format to PyTorch tensors.
    """
    prepared = split.map(tokenize_function, batched=True)   # batched=True for faster processing
    prepared = prepared.remove_columns(["text"])             # raw text is no longer needed
    prepared = prepared.rename_column("label", "labels")     # column name expected by the Trainer
    prepared.set_format("torch")                             # applied to `prepared` itself, no reassignment
    return prepared


# Apply the same preparation to all splits of the dataset
tokenized_train_dataset = _prepare_split(train_dataset)
tokenized_val_dataset = _prepare_split(val_dataset)
tokenized_test_dataset = _prepare_split(test_dataset)

print("\nProcessed dataset sample (train):")
print(tokenized_train_dataset[0])

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and binary F1 for one evaluation pass.

    Args:
        eval_pred: Two-item object that unpacks as ``(logits, labels)``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        # 'binary' averaging fits the two-class task; use 'weighted' for multi-class.
        "f1": f1_score(labels, predicted_classes, average='binary'),
    }

In [None]:
# Define the directory where model checkpoints will be saved
output_dir = "./sentiment_model_results"

# The argument that schedules evaluation is named `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. The notebook installs transformers
# without a version pin, so pick whichever name the installed TrainingArguments accepts
# instead of failing with a TypeError on an unknown keyword.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=output_dir,                   # Directory to save model checkpoints
    num_train_epochs=3,                      # Total number of training epochs (start with 1-3)
    per_device_train_batch_size=16,          # Batch size per device during training
    per_device_eval_batch_size=32,           # Batch size for evaluation
    warmup_steps=500,                        # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                       # Strength of weight decay regularization
    logging_dir='./logs',                    # Directory for storing logs
    logging_steps=100,                       # Log metrics every N steps
    save_strategy="epoch",                   # Save a model checkpoint at the end of each epoch
    load_best_model_at_end=True,             # Load the best model (based on validation metric) at the end
    metric_for_best_model="f1",              # Metric to determine the best model (can be accuracy, f1, etc.)
    greater_is_better=True,                  # True if a higher metric value is better
    fp16=torch.cuda.is_available(),          # Use mixed precision training if GPU is available (faster, less memory)
    report_to="none",                        # Disable reporting to external services like W&B for this example
    **{_eval_strategy_arg: "epoch"},         # Evaluate performance at the end of each epoch
)

In [None]:
# Newer transformers releases take the tokenizer through `processing_class`; older ones
# only know `tokenizer`. Use whichever keyword the installed Trainer accepts so the cell
# works with the unpinned transformers install.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_arg = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,                             # The instantiated Transformers model to be trained
    args=training_args,                      # Training arguments defined above
    train_dataset=tokenized_train_dataset,   # Training dataset
    eval_dataset=tokenized_val_dataset,      # Evaluation dataset
    compute_metrics=compute_metrics,         # Function to compute evaluation metrics
    **{_tokenizer_arg: tokenizer},           # Tokenizer (needed for padding collation)
)

In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept for the report below.
print("\nStarting training...")
train_result = trainer.train()
print("\nTraining finished.")

# Print the metrics attached to the training result.
training_metrics = train_result.metrics
print(f"Training Metrics: {training_metrics}")

In [None]:
# Evaluate the trainer's model on the tokenized test split.
print("\nEvaluating on the test set...")
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)

# Header and results are printed on separate lines.
print("\nTest Set Evaluation Results:", eval_results, sep="\n")
# Example output: {'eval_loss': 0.XXXX, 'eval_accuracy': 0.YYYY, 'eval_f1': 0.ZZZZ, ...}