In [16]:
!pip install transformers datasets evaluate accelerate




In [27]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

In [18]:
# Load the "ag_news" dataset; the result is a DatasetDict with a `train` and a
# `test` split, each exposing the `text` and `label` features.
dataset = load_dataset("ag_news")

# Bare expression so the notebook displays the split names, features and row counts.
dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [19]:
# Tokenizer for the same "distilbert-base-uncased" checkpoint the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [20]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry. When used through
            ``Dataset.map(..., batched=True)`` this is a batch, so the entry
            holds several texts at once.

    Returns:
        Whatever the tokenizer call returns for those texts, with truncation
        enabled. No padding is requested here.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply the tokenizer to every split; batched=True hands `tokenize_function`
# batches of examples instead of one example at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [21]:
# DistilBERT encoder with a sequence-classification head sized for 4 classes.
# The head weights (pre_classifier / classifier) are not in the checkpoint, so
# they are freshly initialized and only become useful after fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
)



Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [22]:
# Accuracy metric object; `compute_metrics` calls its `.compute(...)` method.
metric = evaluate.load("accuracy")


In [23]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` compared against ``labels``.
    """
    logits, labels = eval_pred
    # The highest-scoring entry along the last axis is the predicted class id.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(references=labels, predictions=predicted_ids)

In [24]:
# Hyperparameters for the fine-tuning run; anything not listed keeps the
# library default.
training_args = TrainingArguments(
    # Output location
    output_dir="./results",
    # Schedule
    num_train_epochs=2,
    # Batch sizes (per device)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer
    learning_rate=2e-5,
    weight_decay=0.01,
)

In [28]:
# Work on small, reproducibly shuffled subsets so the run stays short:
# 3000 training and 1000 evaluation examples, both shuffled with seed 42.
train_subset = tokenized_datasets["train"].shuffle(seed=42).select(range(3000))
eval_subset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# Tokenization did not pad, so padding is left to this collator at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [29]:
# Run fine-tuning with the configured Trainer; as the last expression of the
# cell, the returned TrainOutput (global step, training loss, runtime metrics)
# is displayed below.
trainer.train()

Step,Training Loss
500,0.454797


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=750, training_loss=0.3967182820638021, metrics={'train_runtime': 78.4604, 'train_samples_per_second': 76.472, 'train_steps_per_second': 9.559, 'total_flos': 127249823902656.0, 'train_loss': 0.3967182820638021, 'epoch': 2.0})

In [30]:
# Evaluate on the Trainer's eval_dataset; the displayed dict includes
# `eval_loss` and the `eval_accuracy` produced by `compute_metrics`.
trainer.evaluate()


{'eval_loss': 0.37376636266708374,
 'eval_accuracy': 0.896,
 'eval_runtime': 2.7023,
 'eval_samples_per_second': 370.059,
 'eval_steps_per_second': 46.257,
 'epoch': 2.0}

In [33]:
# Single-example inference with the fine-tuned model.
text = "NASA launches a new satellite for space research"

# Class names indexed by predicted label id.
# NOTE(review): order presumably mirrors the dataset's label ids — verify
# against dataset["train"].features["label"].names.
labels = ["World", "Sports", "Business", "Sci/Tech"]

# Truncate exactly as during training so an over-long text cannot exceed the
# model's maximum sequence length.
inputs = tokenizer(text, truncation=True, return_tensors="pt")
# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Put the model in eval mode explicitly (disables dropout) instead of relying
# on an earlier trainer.evaluate() call having done so, and skip autograd
# bookkeeping since no backward pass is needed.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Reduce over the class axis, then pull out the single batch element as a
# plain Python int.
prediction = outputs.logits.argmax(dim=-1).item()

print("Predicted Category:", labels[prediction])

Predicted Category: Sci/Tech
