In [1]:
# Install Libraries

!pip install transformers datasets --quiet
!pip install -U transformers datasets accelerate

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pyar

In [2]:
## Import all necessary libraries

import os
import pandas as pd
import numpy as np
import transformers
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from datasets import ClassLabel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix
# Switch off the Weights & Biases integration through its environment flag.
os.environ.update({"WANDB_DISABLED": "true"})
# Echo the installed transformers version so the run environment is visible in the output.
print(transformers.__version__)

4.57.3


In [3]:
## Upload the dataset

# Colab-only step: opens the browser file picker and saves the chosen CSV into the
# runtime's working directory, where the next cell reads it by file name.
# NOTE(review): this import sits outside the main import cell and is only available on
# Google Colab — confirm before running the notebook in another environment.
from google.colab import files
# presumably a mapping of uploaded file names to their contents; not used by the
# cells visible here — TODO confirm
uploaded = files.upload()

Saving bbc_news_text_complexity_summarization.csv to bbc_news_text_complexity_summarization.csv


In [4]:
## Load the dataset

# Read the uploaded BBC news CSV (comma-separated) into a DataFrame.
df = pd.read_csv("bbc_news_text_complexity_summarization.csv", sep=",")
# Show the first rows as the cell output.
df.head()

Unnamed: 0,text,labels,no_sentences,Flesch Reading Ease Score,Dale-Chall Readability Score,text_rank_summary,lsa_summary
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,26,62.17,9.72,It hopes to increase subscribers by offering t...,Its profits were buoyed by one-off gains which...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,17,65.56,9.09,The dollar has hit its highest level against t...,"""I think the chairman's taking a much more san..."
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,14,69.21,9.66,The owners of embattled Russian oil giant Yuko...,Yukos' owner Menatep Group says it will ask Ro...
3,High fuel prices hit BA's profits\n\nBritish A...,business,24,62.98,9.86,Looking ahead to its full year results to Marc...,"Rod Eddington, BA's chief executive, said the ..."
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,17,70.63,10.23,Reports in the Wall Street Journal and the Fin...,Shares in UK drinks and food firm Allied Domec...


In [5]:
## Convert to HuggingFace Dataset

# Wrap the DataFrame in a Hugging Face Dataset so the later map/split/format steps can run on it.
dataset = Dataset.from_pandas(df=df)
# Display the dataset summary (features and row count) as the cell output.
dataset

Dataset({
    features: ['text', 'labels', 'no_sentences', 'Flesch Reading Ease Score', 'Dale-Chall Readability Score', 'text_rank_summary', 'lsa_summary'],
    num_rows: 2127
})

In [6]:
## Encode label strings
# Build the label vocabulary from the distinct category strings found in the "labels" column.

class_label = ClassLabel(names=pd.unique(df["labels"]).tolist())

def encode_labels(example):
    """Swap the string category in ``example["labels"]`` for its integer id.

    The id is looked up through the module-level ``class_label``. The example
    is modified in place and the same object is returned.
    """
    label_name = example["labels"]
    label_id = class_label.str2int(label_name)
    example["labels"] = label_id
    return example

# Apply the label encoder to every row, one example at a time.
dataset = dataset.map(function=encode_labels)


Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

In [7]:
## Create the tokenizer

# Load the fast tokenizer that matches the uncased DistilBERT checkpoint used for the model.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# Tokenize text

# Tokenization covers lowercasing, WordPiece splitting, attention-mask creation, and truncation/padding to a fixed max length

def tokenize(batch, max_length=128):
    """Tokenize a batch of articles into fixed-length model inputs.

    Args:
        batch: Mapping with a "text" entry holding the raw article strings
            (the batch passed in by ``Dataset.map(..., batched=True)``).
        max_length: Number of tokens each sequence is truncated/padded to.
            Defaults to 128, the value that used to be hard-coded here, so
            existing calls behave exactly as before.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch["text"],
        truncation=True,        # cut sequences longer than max_length
        padding="max_length",   # pad shorter sequences up to max_length
        max_length=max_length,
    )

# Tokenize the whole dataset, handing the tokenizer batches of rows rather than single examples.
dataset = dataset.map(function=tokenize, batched=True)


Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

In [9]:
## Train-Test Split

# Hold out 20% of the rows as the test split; the fixed seed keeps the partition repeatable.
dataset = dataset.train_test_split(seed=42, test_size=0.2)
train_ds, test_ds = dataset["train"], dataset["test"]


In [10]:
## Set tensor format

# Expose only the model inputs and the labels, returned as torch tensors, on both splits.
for _split_ds in (train_ds, test_ds):
    _split_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [11]:
## Define DistilBERT classification model

# One output unit per news category.
num_classes = len(class_label.names)

# Store the human-readable class names in the model config so saved checkpoints
# and downstream inference report category names instead of generic LABEL_<n> ids.
id2label = dict(enumerate(class_label.names))
label2id = {name: idx for idx, name in id2label.items()}

# Pretrained DistilBERT encoder with a classification head; the head weights are
# newly initialized (see the warning in the cell output) and learned during fine-tuning.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
## Training

# Fine-tuning configuration: evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # Saving every epoch for 15 epochs would otherwise leave 15 full checkpoints
    # in output_dir; keep only the most recent ones to bound disk usage.
    save_total_limit=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    seed=42,                    # library default, stated explicitly for reproducibility
    report_to="none",           # disable W&B
)


In [13]:
# Compute accuracy, precision, recall, and F1 (macro) for model predictions

def compute_metrics(eval_pred):
    """Score a batch of predictions with accuracy and macro-averaged P/R/F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row is the index of its largest logit along axis 1.

    Returns:
        Dict with the keys ``accuracy``, ``precision_macro``, ``recall_macro``
        and ``f1_macro``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    overall_accuracy = accuracy_score(gold, predicted)
    # The fourth element (support) is not reported.
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )

    metric_names = ("accuracy", "precision_macro", "recall_macro", "f1_macro")
    metric_values = (overall_accuracy, macro_precision, macro_recall, macro_f1)
    return dict(zip(metric_names, metric_values))

In [14]:
# Trainer object

# Bundle model, data, metrics and hyperparameters into a Trainer.
# NOTE(review): eval_dataset is the held-out test split, so the per-epoch
# "validation" numbers and the final test numbers come from the same data —
# confirm this is intended or carve out a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    # `tokenizer=` is deprecated for Trainer.__init__ (it triggered the warning
    # recorded in this cell's output); `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [15]:
## Train

# Run fine-tuning with the settings in `training_args`; the value returned by
# train() is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
1,0.667,0.167307,0.955399,0.957372,0.951495,0.953612
2,0.0848,0.127813,0.967136,0.967381,0.963962,0.965451
3,0.0274,0.130653,0.971831,0.971777,0.970365,0.970817
4,0.0117,0.128584,0.969484,0.969998,0.967575,0.968347
5,0.0077,0.133039,0.967136,0.966873,0.965747,0.966137
6,0.0026,0.122418,0.976526,0.975575,0.976342,0.97588
7,0.0019,0.130518,0.971831,0.970629,0.970818,0.970637
8,0.0016,0.131918,0.971831,0.970629,0.970818,0.970637
9,0.0013,0.13488,0.971831,0.970629,0.970818,0.970637
10,0.0011,0.137567,0.971831,0.970629,0.970818,0.970637


TrainOutput(global_step=1605, training_loss=0.054107251480175325, metrics={'train_runtime': 821.2477, 'train_samples_per_second': 31.069, 'train_steps_per_second': 1.954, 'total_flos': 845021626041600.0, 'train_loss': 0.054107251480175325, 'epoch': 15.0})

In [16]:
# Evaluate on the Trainer's eval dataset (test_ds was passed as eval_dataset when the
# Trainer was built) and print the resulting metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.14343272149562836, 'eval_accuracy': 0.971830985915493, 'eval_precision_macro': 0.9706287008011147, 'eval_recall_macro': 0.9708180720942675, 'eval_f1_macro': 0.9706368677200624, 'eval_runtime': 1.5862, 'eval_samples_per_second': 268.567, 'eval_steps_per_second': 17.022, 'epoch': 15.0}


In [17]:
## Predict on test samples

preds_output = trainer.predict(test_ds)                # Predict on test set
y_true = preds_output.label_ids                        # True labels
y_pred = np.argmax(preds_output.predictions, axis=1)   # Predicted labels (highest logit)

# Reuse compute_metrics so the final report uses exactly the same metric
# definitions as the per-epoch evaluation, rather than a second hand-written copy
# that could drift from it.
test_metrics = compute_metrics((preds_output.predictions, y_true))
accuracy = test_metrics["accuracy"]
precision = test_metrics["precision_macro"]
recall = test_metrics["recall_macro"]
f1 = test_metrics["f1_macro"]

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1 (macro): {f1:.4f}")

Test Accuracy: 0.9718
Precision (macro): 0.9706
Recall (macro): 0.9708
F1 (macro): 0.9706
