In [None]:
# --- DistilBERT Model for IMDB Sentiment Analysis ---

import os

import numpy as np
from datasets import DatasetDict, load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

# Load IMDB dataset
dataset = load_dataset("imdb")

# Model name
model_name = "distilbert-base-uncased"

# Human-readable class names. IMDB encodes 0 = negative review and
# 1 = positive review; storing the mapping in the model config means the
# saved checkpoint (and any pipeline built from it) reports these names
# instead of the generic "LABEL_0" / "LABEL_1".
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Load tokenizer and model (a fresh 2-way classification head is initialised
# on top of the pretrained encoder).
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Batch tokenizer used with `Dataset.map(..., batched=True)`
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch of examples.

    Args:
        examples: mapping holding a ``"text"`` entry with the raw strings
            of the batch.

    Returns:
        The tokenizer output for those strings, with truncation enabled and
        padding set to ``"max_length"``.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True, padding="max_length")
    return encoded

# Tokenize only the splits that are actually used below. IMDB also ships a
# 50,000-example "unsupervised" split; mapping over it roughly doubles the
# preprocessing time and its result is never read.
labeled_splits = DatasetDict(
    {split: dataset[split] for split in ("train", "test")}
)
tokenized_datasets = labeled_splits.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: the ``(logits, labels)`` pair that ``Trainer`` passes to
            its ``compute_metrics`` callback.

    Returns:
        dict with a single ``"accuracy"`` entry: the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float((predictions == labels).mean())}


# Training arguments (no evaluation_strategy: evaluation is run once,
# explicitly, after training finishes).
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    # Supported way to switch off every logging integration (W&B,
    # TensorBoard, ...); the WANDB_DISABLED environment variable is deprecated.
    report_to="none",
)

# Trainer
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` - confirm the minimum supported version
# before renaming the argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # adds eval_accuracy next to eval_loss
)

# Train, then evaluate on the test split; the metrics dict is the cell output.
trainer.train()
trainer.evaluate()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
500,0.3791
1000,0.3138
1500,0.3083
2000,0.2874
2500,0.2692
3000,0.2482
3500,0.207
4000,0.1691
4500,0.1454
5000,0.1655


{'eval_loss': 0.2853568494319916,
 'eval_runtime': 365.433,
 'eval_samples_per_second': 68.412,
 'eval_steps_per_second': 8.551,
 'epoch': 2.0}

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Write the fine-tuned weights/config and the tokenizer files into the same
# directory so the checkpoint can be reloaded from a single path.
model.save_pretrained(save_directory="./distilbert_results")
tokenizer.save_pretrained(save_directory="./distilbert_results")

('./distilbert_results/tokenizer_config.json',
 './distilbert_results/special_tokens_map.json',
 './distilbert_results/vocab.txt',
 './distilbert_results/added_tokens.json')

In [None]:

from transformers import pipeline

# Load the fine-tuned model together with the tokenizer that was saved next
# to it, so inference uses exactly the training-time vocabulary and does not
# need to re-download the base checkpoint's tokenizer.
finetuned_dir = "./distilbert_results"     # path to your fine-tuned model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=finetuned_dir,
    tokenizer=finetuned_dir,
)

# Test sentences
texts = [
    "I absolutely loved this movie! The acting was brilliant.",
    "This film was terrible and a complete waste of time.",
    "It was okay, not great but not bad either.",
    "The storyline was engaging and the visuals were stunning!",
    "The plot was confusing and the ending made no sense."
]

# Run predictions. Truncate like the training-time tokenization did, so a
# review longer than the model's maximum sequence length is clipped instead
# of failing inside the model.
results = sentiment_pipeline(texts, truncation=True)

# Display one block per sentence: predicted label and its confidence score.
for text, result in zip(texts, results):
    print(f"Text: {text}\n   --> Label: {result['label']} | --> Confidence: {result['score']:.4f}\n")


Device set to use cuda:0


Text: I absolutely loved this movie! The acting was brilliant.
   --> Label: LABEL_1 | --> Confidence: 0.9984

Text: This film was terrible and a complete waste of time.
   --> Label: LABEL_0 | --> Confidence: 0.9986

Text: It was okay, not great but not bad either.
   --> Label: LABEL_0 | --> Confidence: 0.8039

Text: The storyline was engaging and the visuals were stunning!
   --> Label: LABEL_1 | --> Confidence: 0.9982

Text: The plot was confusing and the ending made no sense.
   --> Label: LABEL_0 | --> Confidence: 0.9978

