In [30]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import evaluate
import numpy as np
import torch

In [None]:
# AG News: a news categorization dataset, fetched by its Hugging Face Hub id.
# The result is bound to `ds` and reused by the cells below.
ds = load_dataset(path="fancyzhx/ag_news")

In [9]:
# Exploring dataset
print(ds)  # available splits with their features and row counts
# A bare expression is only echoed when it is the last line of a cell, so the
# row count has to be printed explicitly to show up in the output.
print(ds['train'].num_rows)  # total number of rows in the training split
print(ds['train'].data)  # underlying table: column names/types and a preview of the values



DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
MemoryMappedTable
text: string
label: int64
----
label: [[2,2,2,2,2,...,2,2,3,3,3],[3,1,1,0,0,...,3,1,0,0,2],...,[0,0,2,2,2,...,2,2,2,1,0],[0,0,0,0,3,...,0,1,1,1,1]]


In [None]:

# Checkpoint the classifier is built from.
# Alternative checkpoint (currently disabled): 'sileod/deberta-v3-base-tasksource-nli'
model_checkpoint = 'distilbert-base-uncased'

# Class ids and their names: 0 = World, 1 = Sports, 2 = Business, 3 = Sci/Tech
idToLabel = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))
# Reverse lookup (name -> id), derived from idToLabel so the two maps cannot drift apart.
labelToId = {name: class_id for class_id, name in idToLabel.items()}

# Build a sequence-classification model from the checkpoint, with one output
# per class and the label maps stored on the model.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(idToLabel),
    id2label=idToLabel,
    label2id=labelToId,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:


# Load the tokenizer that belongs to model_checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(example):
    """Convert the "text" field of `example` into tokens the model can consume.

    Tokenizer settings:
      * padding="max_length" pads the input so that every sequence has the
        same length.
      * truncation=True truncates any text that exceeds the maximum input
        length the model can handle.

    Returns whatever the tokenizer call returns for example["text"].
    """
    encoded = tokenizer(example["text"], padding="max_length", truncation=True)
    return encoded

# Tokenize the whole dataset (all splits in `ds`).
# With batched=True, tokenize_function is handed a batch of texts per call
# instead of one piece of text at a time.
tokenized_dataset = ds.map(function=tokenize_function, batched=True)


Map: 100%|██████████| 120000/120000 [00:31<00:00, 3785.53 examples/s]
Map: 100%|██████████| 7600/7600 [00:04<00:00, 1738.09 examples/s]


In [27]:
# Load the accuracy evaluation metric; compute_metrics below calls accuracy.compute().
accuracy = evaluate.load(path="accuracy")

In [None]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds one row of
            per-class scores per example (argmax is taken along axis 1);
            ``labels`` holds the reference class ids.
            Presumably this is what the training loop passes in - confirm
            against the caller.

    Returns:
        The dict produced by ``accuracy.compute``, i.e. ``{"accuracy": value}``.
    """
    scores, labels = p  # unpack the (predictions, labels) pair
    # The highest-scoring class per row is the predicted label; argmax gives
    # the same answer whether the scores are raw logits or probabilities.
    predicted_ids = np.argmax(scores, axis=1)

    # accuracy.compute already returns {"accuracy": value}. Wrapping it in
    # another {"accuracy": ...} produced a nested dict instead of a number
    # under the metric key, so the result is returned as-is.
    return accuracy.compute(predictions=predicted_ids, references=labels)

In [None]:
# Sample headlines used as a quick smoke test of the classifier.
text_list = [
    "Stock markets rally as investors gain confidence in the economy.",
    "New study reveals alarming rise in global temperatures.",
    "Political tensions escalate ahead of the upcoming election.",
    "Scientists make breakthrough in cancer research, offering hope for patients."
]


def predict_sentiment(texts):
    """Classify each text with `model` and return (text, label) pairs.

    Despite the name, the label is one of the news categories stored in
    ``model.config.id2label``, not a sentiment.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        A list of ``(text, label_name)`` tuples, in input order. An empty
        input yields an empty list.
    """
    model.eval()  # inference mode: disables dropout so results are repeatable
    results = []
    with torch.no_grad():  # no gradients are needed for prediction
        for text in texts:
            # tokenize text (truncated so over-long inputs cannot exceed the model limit)
            inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
            # compute logits on whichever device the model lives on
            logits = model(inputs.to(model.device)).logits
            # convert logits to a class id, then to its label name
            predicted_id = torch.argmax(logits, dim=-1).item()
            results.append((text, model.config.id2label[predicted_id]))
    return results


# NOTE: until the model has been fine-tuned its classification head is
# randomly initialized, so the labels printed here are not meaningful yet.
predictions = predict_sentiment(text_list)
for text, prediction in predictions:
    print(f"{text} - {prediction}")
