In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import evaluate
import numpy as np
import torch

In [3]:
#News categorization dataset

ds = load_dataset("fancyzhx/ag_news")

In [9]:
#Exploring dataset
print(ds)
ds['train'].num_rows #total number of rows
print(ds['train'].data)



DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
MemoryMappedTable
text: string
label: int64
----
label: [[2,2,2,2,2,...,2,2,3,3,3],[3,1,1,0,0,...,3,1,0,0,2],...,[0,0,2,2,2,...,2,2,2,1,0],[0,0,0,0,3,...,0,1,1,1,1]]


In [4]:

#model_checkpoint = 'sileod/deberta-v3-base-tasksource-nli'
model_checkpoint = 'distilbert-base-uncased'

#labels = World 0, Sports 1 , Business 2, SciTech 3

# define label maps
idToLabel = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
labelToId = {"World":0, "Sports":1, "Business":2, "Sci/Tech":3}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4, id2label=idToLabel, label2id=labelToId)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.we

In [5]:


# Initialize DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

#tokenizer converts the words to tokens for the machine to understand them
#padding="max_length": This setting adds padding to the input so that every sequence has the same length
#truncation="True": This ensures that if any text exceeds the maximum input length the model can handle, it will be truncated to fit
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

# Apply tokenization to the dataset
# This means that instead of tokenizing one piece of text at a time,
# the function will receive a batch (or collection) of texts to process at once.
tokenized_dataset = ds.map(tokenize_function, batched=True)


Map: 100%|██████████| 120000/120000 [00:36<00:00, 3293.76 examples/s]
Map: 100%|██████████| 7600/7600 [00:02<00:00, 3138.42 examples/s]


In [6]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [7]:
def compute_metrics(p):
    predictions, labels = p # This line extracts the predictions and labels from the input tuple.
    predictions = np.argmax(predictions, axis=1) # converting the predicted probabilities into class labels.

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [8]:
text_list = [
    "Stock markets rally as investors gain confidence in the economy.",
    "New study reveals alarming rise in global temperatures.",
    "Political tensions escalate ahead of the upcoming election.",
    "Scientists make breakthrough in cancer research, offering hope for patients."
]

for text in text_list:
        # Tokenize text and get inputs
        inputs = tokenizer.encode(text, return_tensors="pt") #return_tensors="pt" specifies that the output should be a PyTorch tensor, which is the required format for inputs to the model.
        # Compute logits
        logits = model(inputs).logits  # Disable gradient calculation for inference (since it is not in training phase, there is no need for calculation or utilization of memory)
        # Get predicted class label
        predictions = torch.argmax(logits)  # Get index of the predicted label
        print(text + " - " + idToLabel[predictions.tolist()])

Stock markets rally as investors gain confidence in the economy. - Sci/Tech
New study reveals alarming rise in global temperatures. - Sports
Political tensions escalate ahead of the upcoming election. - Sports
Scientists make breakthrough in cancer research, offering hope for patients. - Sports


In [12]:
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # this will dynamically pad examples in each batch to be equal length


# creater trainer object
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

KeyError: 'validation'