### Training RoBERTa on the News Category Dataset for news classification

#### Loading libraries
Preparing the needed libraries. Because the dataset is large, training on a GPU is recommended, so the device is set to CUDA whenever one is available and falls back to the CPU otherwise.

In [3]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

  if not hasattr(np, "object"):





#### Loading the dataset
Similar to the Naive Bayes model, we load the dataset and keep only the same 12 categories. Similar categories are merged together.

In [4]:
# Load the News Category dataset (JSON Lines: one record per line).
df = pd.read_json("data/News_Category_Dataset_v3.json", lines=True)

# The classifier input is the headline followed by the short description.
df['text'] = df['headline'] + " " + df['short_description']

# Each of the 12 kept categories, with the raw dataset categories merged into it.
merged_categories = {
    'POLITICS': ['POLITICS'],
    'WELLNESS': ['WELLNESS', 'HEALTHY LIVING', 'FIFTY'],
    'ENTERTAINMENT': ['ENTERTAINMENT'],
    'TRAVEL': ['TRAVEL'],
    'STYLE & BEAUTY': ['STYLE & BEAUTY'],
    'FOOD & DRINK': ['FOOD & DRINK', 'TASTE'],
    'BUSINESS': ['BUSINESS', 'MONEY'],
    'COMEDY': ['COMEDY'],
    'SPORTS': ['SPORTS'],
    'WORLD NEWS': ['WORLD NEWS', 'WORLDPOST', 'THE WORLDPOST'],
    'TECH': ['TECH'],
    'SCIENCE': ['SCIENCE'],
}

# Flatten to a raw-category -> merged-category lookup.
categories = {
    raw_name: merged_name
    for merged_name, raw_names in merged_categories.items()
    for raw_name in raw_names
}

# Relabel every row; categories missing from the lookup become NaN and are dropped.
df['category'] = df['category'].map(categories)
df = df[df['category'].notna()].copy()

# Encode the category names as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(df['category'])
df['label'] = label_encoder.transform(df['category'])
num_labels = len(label_encoder.classes_)

# 80/20 train/validation split, stratified on the label and seeded for reproducibility.
train_df, val_df = train_test_split(
    df[['text', 'label']], test_size=0.2, stratify=df['label'], random_state=42
)

#### Preprocessing
Unlike with Naive Bayes, we do not have to remove stopwords, lemmatize, or vectorize the text, because RoBERTa is pretrained on raw text and comes with its own subword tokenizer. We only need to write a tokenization function (padding or truncating every example to 128 tokens) and map it over both the training and validation datasets.

In [26]:
# Checkpoint shared by the tokenizer here and the model loaded later.
model_checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is padded or truncated to exactly 128 tokens.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encoding_options)


# Wrap the pandas splits as Hugging Face datasets ...
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

# ... and tokenize each split in batches.
train_tokenized, val_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/111384 [00:00<?, ? examples/s]

Map:   0%|          | 0/27847 [00:00<?, ? examples/s]

#### Loading the model
It is time to finally load roberta-base and configure the model, by adding a mapping from the label encoding to the actual labels. This will be useful for translating the answer the model gives us.

In [27]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        "accuracy": accuracy_score(references, predicted),
        "f1": f1_score(references, predicted, average="macro"),
    }

# Derive the id <-> label mappings from the fitted encoder instead of
# hard-coding them, so they cannot drift from the ids used in the training
# data if the category mapping ever changes. Keys/values are cast to plain
# int/str so the config stays cleanly JSON-serializable.
id2label = {
    int(class_id): str(class_name)
    for class_id, class_name in enumerate(label_encoder.classes_)
}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Load the pretrained encoder with a freshly initialized classification head
# sized for our labels; the mappings are stored in the model config so that
# predictions can be translated back to category names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

#### Starting training
After loading the dataset, preprocessing it, and loading the model, we are ready to begin training. We specify the training arguments: a learning rate of 2e-5 (0.00002), 3 epochs (since RoBERTa is pretrained, 3 epochs are enough, and more would risk overfitting), a weight decay of 0.1, and 500 warmup steps. The model is evaluated and checkpointed after every epoch; once training is complete, the best checkpoint according to the macro F1-score is reloaded and saved as roberta_news_classifier.


In [28]:
# Mixed precision and pinned host memory only make sense on a CUDA device.
# The notebook falls back to the CPU when no GPU is present, and requesting
# fp16 there makes TrainingArguments fail, so both are gated on CUDA.
use_cuda = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./roberta-news-classifier",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.1,
    # Reload the checkpoint with the highest macro F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    warmup_steps=500,
    lr_scheduler_type="linear",
    # GPU-only optimizations (disabled automatically on CPU).
    fp16=use_cuda,
    dataloader_pin_memory=use_cuda,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    compute_metrics=compute_metrics,
)

trainer.train()

# Persist the (best) model together with its tokenizer so the directory can be
# loaded on its own for inference.
trainer.save_model("./roberta_news_classifier")
tokenizer.save_pretrained("./roberta_news_classifier")

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4803,0.452452,0.860057,0.811018
2,0.3614,0.430597,0.871369,0.823789
3,0.2547,0.453062,0.87399,0.827221


('./best_news_classifier_cased_v2\\tokenizer_config.json',
 './best_news_classifier_cased_v2\\special_tokens_map.json',
 './best_news_classifier_cased_v2\\vocab.json',
 './best_news_classifier_cased_v2\\merges.txt',
 './best_news_classifier_cased_v2\\added_tokens.json',
 './best_news_classifier_cased_v2\\tokenizer.json')

#### Saving compact dataset
We also save a compact version of the dataset, which contains only the headlines and categories of the 12 selected categories. This can be used to test the model and demonstrate it.

In [31]:
# Keep only the headline and its (merged) category for the demo dataset.
compact_columns = ['headline', 'category']
df_compact = df.loc[:, compact_columns]
df_compact.to_csv("data/News_Category_Dataset_v3_compact.csv", index=False)