In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import numpy as np

In [4]:
import torch, torch.nn as nn
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [5]:
import pandas as pd

data_frame = pd.read_csv("../dataset/texts_and_labels.csv")
print(data_frame.head())

                                                Text   Label
0  New update in Valorant introduces a brand-new ...  gaming
1  Fortnite's latest season brings back fan-favor...  gaming
2  Review: The immersive world of Elden Ring offe...  gaming
3  How to dominate in League of Legends: Tips and...  gaming
4  Global economic summit outlines strategies for...    news


split dataset

In [6]:

genre_dict = {'gaming': 0, 'news': 1, 'education': 2, 'sports': 3}

data_frame['Label'] = data_frame['Label'].apply(lambda x: genre_dict[x])
data_frame


Unnamed: 0,Text,Label
0,New update in Valorant introduces a brand-new ...,0
1,Fortnite's latest season brings back fan-favor...,0
2,Review: The immersive world of Elden Ring offe...,0
3,How to dominate in League of Legends: Tips and...,0
4,Global economic summit outlines strategies for...,1
...,...,...
65,Competitive gaming leagues are becoming as pop...,0
66,Youth sports programs are essential for teachi...,3
67,Athletes are speaking out more on social issue...,3
68,This indie game's innovative gameplay and art ...,0


In [7]:
import datasets
raw_datasets = datasets.Dataset.from_pandas(data_frame)
raw_datasets

Dataset({
    features: ['Text', 'Label'],
    num_rows: 70
})

In [8]:
def tokenize_function(examples):
    return {'input_ids': tokenizer(examples["Text"], truncation=True)["input_ids"], 'labels': examples["Label"]}

In [9]:
from transformers import DataCollatorWithPadding
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

tokenized_datasets

Map: 100%|██████████| 70/70 [00:00<00:00, 9450.28 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'input_ids', 'labels'],
        num_rows: 56
    })
    test: Dataset({
        features: ['Text', 'Label', 'input_ids', 'labels'],
        num_rows: 14
    })
})

In [10]:
tokenized_datasets["train"][0]

{'Text': 'Presidential elections see highest voter turnout in decades.\t',
 'Label': 1,
 'input_ids': [101,
  4883,
  3864,
  2156,
  3284,
  14303,
  15512,
  1999,
  5109,
  1012,
  102],
 'labels': 1}

In [11]:
def compute_metrics(eval_preds):
    metric = datasets.load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  )

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

100%|██████████| 70/70 [00:02<00:00, 27.41it/s]

{'train_runtime': 2.5527, 'train_samples_per_second': 219.372, 'train_steps_per_second': 27.421, 'train_loss': 0.8552901676722935, 'epoch': 10.0}





TrainOutput(global_step=70, training_loss=0.8552901676722935, metrics={'train_runtime': 2.5527, 'train_samples_per_second': 219.372, 'train_steps_per_second': 27.421, 'train_loss': 0.8552901676722935, 'epoch': 10.0})

In [14]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

metric = datasets.load_metric("accuracy")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

  metric = datasets.load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 2/2 [00:00<00:00,  2.82it/s]


(14, 4) (14,)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.8571428571428571}

In [15]:
trainer.evaluate(tokenized_datasets["test"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 2/2 [00:00<00:00,  3.11it/s]


{'eval_loss': 0.7343140244483948,
 'eval_accuracy': 0.8571428571428571,
 'eval_runtime': 0.6542,
 'eval_samples_per_second': 21.401,
 'eval_steps_per_second': 3.057,
 'epoch': 10.0}