# Fine-tuning DistilBERT for News Sentiment Classification

This Jupyter notebook fine-tunes the `distilbert-base-uncased-finetuned-sst-2-english` model on the `sara-nabhani/ML-news-sentiment` dataset, restricted to its negative and positive examples (binary sentiment).


In [1]:
# Install required libraries
!pip install datasets transformers evaluate torch scikit-learn 'accelerate>=0.26.0'

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m947.4 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting accelerate>=0.26.0
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3

In [63]:
# Load libraries
import importlib
import sys

import evaluate
import numpy as np
import torch
import torch.nn.functional as F
from datasets import concatenate_datasets, load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [64]:
# Load the news-sentiment dataset and keep only the rows whose label is 0 or 2,
# which turns the task into binary sentiment classification.
dataset = load_dataset('sara-nabhani/ML-news-sentiment').filter(
    lambda row: row['labels'] == 0 or row['labels'] == 2
)
def relabel(example):
    """Collapse the original label of one example into a binary label.

    The source labels are 0 (negative), 1 (neutral) and 2 (positive).
    A source label of 2 becomes 1 (positive); any other value becomes
    0 (negative).

    Args:
        example: A mapping with a 'labels' entry; it is updated in place.

    Returns:
        The same ``example`` object, so the function can be passed to ``.map``.
    """
    example['labels'] = int(example['labels'] == 2)
    return example
# Apply the binary relabelling to every example of the filtered dataset.
dataset = dataset.map(relabel)

In [65]:
# Create train/validation/test splits with a larger test set
TEST_FRACTION = 0.3  # share of all samples held out as the final test set
VAL_FRACTION = 0.1   # share of the remaining (non-test) samples used for validation
SPLIT_SEED = 42      # fixed seed so both shuffled splits are reproducible

# Pool the dataset's own train and test splits; they are re-split below.
combined_dataset = concatenate_datasets([dataset['train'], dataset['test']])
total_samples = len(combined_dataset)

print(f"Total available samples: {total_samples}")

# First split: separate test set
train_val_test_split = combined_dataset.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
test_dataset = train_val_test_split['test']
remaining_data = train_val_test_split['train']

# Second split: separate validation from remaining training data
train_val_split = remaining_data.train_test_split(test_size=VAL_FRACTION, seed=SPLIT_SEED)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

# The three splits must partition the pooled data: nothing lost, nothing duplicated.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == total_samples, \
    "train/validation/test sizes do not add up to the pooled dataset size"

print("\nActual splits:")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Total available samples: 2113

Actual splits:
Training samples: 1331
Validation samples: 148
Test samples: 634


In [80]:
# Load the pretrained checkpoint: a sequence-classification model with two
# output labels (matching the binary 0/1 labels) and its tokenizer.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [81]:
# Tokenize datasets
def tokenize_function(example):
    """Tokenize the 'text' field of a batch of examples.

    Used with ``.map(..., batched=True)``. Every sequence is truncated or
    padded to exactly 256 tokens.
    """
    encoding_kwargs = {'truncation': True, 'padding': 'max_length', 'max_length': 256}
    return tokenizer(example['text'], **encoding_kwargs)


# Tokenize the three splits (train, validation, test).
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs and the labels, as torch tensors.
for _split in (train_dataset, val_dataset, test_dataset):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

Map:   0%|          | 0/148 [00:00<?, ? examples/s]

Map:   0%|          | 0/634 [00:00<?, ? examples/s]

In [82]:
# Define metrics
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ('accuracy', 'f1'))


def compute_metrics(p):
    """Compute accuracy and F1 for a prediction output.

    Args:
        p: An object exposing ``predictions`` (per-class scores; the predicted
            class is the argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        A dict with the keys 'accuracy' and 'f1'.
    """
    predicted_labels = np.argmax(p.predictions, axis=1)
    accuracy_result = accuracy.compute(predictions=predicted_labels, references=p.label_ids)
    f1_result = f1.compute(predictions=predicted_labels, references=p.label_ids)
    return {'accuracy': accuracy_result['accuracy'], 'f1': f1_result['f1']}

In [83]:
# Evaluate the base model on the test set.
# The Trainer is built from the model alone (no training arguments or datasets)
# and is used here only to run prediction; the printed metrics are the baseline
# measured before any fine-tuning.
trainer = Trainer(model=model)
outputs = trainer.predict(test_dataset)
print("Baseline metrics:", compute_metrics(outputs))

Baseline metrics: {'accuracy': 0.7444794952681388, 'f1': 0.763157894736842}


In [84]:
# Set up the fine-tune training parameters
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, with a checkpoint limit of 1
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    # Optimisation hyper-parameters
    num_train_epochs=2,
    learning_rate=2e-6,
    weight_decay=0.075,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Trainer that fine-tunes on the training split and reports accuracy/F1 on the
# validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [85]:
# Fine-tune the model.
# The call is left as the cell's last expression so that the notebook displays
# the returned training summary.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3951,0.28014,0.905405,0.9375
2,0.3247,0.256785,0.918919,0.946429


TrainOutput(global_step=168, training_loss=0.3877804690883273, metrics={'train_runtime': 20.6691, 'train_samples_per_second': 128.791, 'train_steps_per_second': 8.128, 'total_flos': 176314107611136.0, 'train_loss': 0.3877804690883273, 'epoch': 2.0})

In [None]:
# Evaluate the fine-tuned model on the same held-out test set that was used for
# the baseline, so the two sets of metrics are directly comparable.
outputs = trainer.predict(test_dataset)
print("Post fine-tuning metrics:", compute_metrics(outputs))

Post fine-tuning metrics: {'accuracy': 0.8848580441640379, 'f1': 0.9099876695437731}


In [88]:
# Test the model on some example news articles
def predict_sentiment(text):
    """Classify one piece of text with the notebook's model and print the result.

    Args:
        text: A single string (for example a news headline) to classify.

    Returns:
        A tuple ``(sentiment, neg_prob, pos_prob)``: ``sentiment`` is
        ``"Positive"`` when class 1 has the highest probability and
        ``"Negative"`` otherwise; ``neg_prob`` and ``pos_prob`` are the softmax
        probabilities of class 0 and class 1.

    Note:
        The printed "Probabilities" line shows the positive probability first
        and the negative one second, i.e. the reverse of the returned order.
    """
    model.eval()  # switch the model to evaluation mode before inference
    device = next(model.parameters()).device

    # A single sequence needs no padding; truncation still caps the input at the
    # 256-token length used when tokenizing the training data.
    inputs = tokenizer(text, truncation=True, max_length=256, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Only the forward pass needs gradient tracking switched off.
    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = F.softmax(logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

    sentiment = "Positive" if predicted_class == 1 else "Negative"
    neg_prob = probabilities[0][0].item()
    pos_prob = probabilities[0][1].item()

    print(f"Text: {text}")
    print(f"Predicted label: {sentiment}")
    print(f"Probabilities: {pos_prob:.3f}/{neg_prob:.3f}\n")
    return sentiment, neg_prob, pos_prob

# Hand-picked example headlines used as a quick qualitative check of the model.
headlines = [
    "Tesla stock soars after record-breaking delivery numbers",
    "Banking sector faces regulatory scrutiny over lending practices",
    "Next BioShock Game Changes Leaders After Development Turmoil",
    "The eye-opening conversation that led to Carlos Correa’s stunning Astros reunion",
    "Central bank maintains interest rates at current levels"
]

# predict_sentiment prints its own report for each headline; the returned
# tuple is not needed here.
for headline in headlines:
    predict_sentiment(headline)


Text: Tesla stock soars after record-breaking delivery numbers
Predicted label: Positive
Probabilities: 0.999/0.001

Text: Banking sector faces regulatory scrutiny over lending practices
Predicted label: Negative
Probabilities: 0.051/0.949

Text: Next BioShock Game Changes Leaders After Development Turmoil
Predicted label: Positive
Probabilities: 0.985/0.015

Text: The eye-opening conversation that led to Carlos Correa’s stunning Astros reunion
Predicted label: Positive
Probabilities: 1.000/0.000

Text: Central bank maintains interest rates at current levels
Predicted label: Positive
Probabilities: 0.991/0.009



In [None]:
# Save the model and its tokenizer side by side in the same directory.
# The tokenizer call is the cell's last expression, so its return value (the
# paths of the files it wrote) is displayed as the cell output.
model.save_pretrained('./news-sentiment-model')
tokenizer.save_pretrained('./news-sentiment-model')

('./news-sentiment-model/tokenizer_config.json',
 './news-sentiment-model/special_tokens_map.json',
 './news-sentiment-model/vocab.txt',
 './news-sentiment-model/added_tokens.json',
 './news-sentiment-model/tokenizer.json')