In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 4)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import numpy as np

In [4]:
import torch, torch.nn as nn
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [5]:
import pandas as pd

data_frame = pd.read_csv("../dataset/texts_and_labels.csv")
print(data_frame.head())

                                                Text   Label
0  New update in Valorant introduces a brand-new ...  gaming
1  Fortnite's latest season brings back fan-favor...  gaming
2  Review: The immersive world of Elden Ring offe...  gaming
3  How to dominate in League of Legends: Tips and...  gaming
4  Global economic summit outlines strategies for...    news


split dataset

In [6]:

genre_dict = {'gaming': 0, 'news': 1, 'education': 2, 'sports': 3}

data_frame['Label'] = data_frame['Label'].apply(lambda x: genre_dict[x])
data_frame


Unnamed: 0,Text,Label
0,New update in Valorant introduces a brand-new ...,0
1,Fortnite's latest season brings back fan-favor...,0
2,Review: The immersive world of Elden Ring offe...,0
3,How to dominate in League of Legends: Tips and...,0
4,Global economic summit outlines strategies for...,1
...,...,...
365,Women's sports are gaining more media coverage...,3
366,Technological advancements are improving train...,3
367,Breakthrough in medical research promises new ...,1
368,Language immersion programs are gaining popula...,2


In [7]:
import datasets
raw_datasets = datasets.Dataset.from_pandas(data_frame)
raw_datasets

Dataset({
    features: ['Text', 'Label'],
    num_rows: 370
})

In [8]:
def tokenize_function(examples):
    return {'input_ids': tokenizer(examples["Text"], truncation=True)["input_ids"], 'labels': examples["Label"]}

In [9]:
from transformers import DataCollatorWithPadding
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer)

tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

tokenized_datasets

Map: 100%|██████████| 370/370 [00:00<00:00, 17614.32 examples/s]


DatasetDict({
    train: Dataset({
        features: ['Text', 'Label', 'input_ids', 'labels'],
        num_rows: 296
    })
    test: Dataset({
        features: ['Text', 'Label', 'input_ids', 'labels'],
        num_rows: 74
    })
})

In [10]:
tokenized_datasets["train"][0]

{'Text': 'Studies show that early childhood education is crucial for development. Education systems worldwide are reconsidering standard testing methods. Universities are increasingly offering scholarships for underrepresented groups. Education systems worldwide are reconsidering standard testing methods.',
 'Label': 2,
 'input_ids': [101,
  2913,
  2265,
  2008,
  2220,
  5593,
  2495,
  2003,
  10232,
  2005,
  2458,
  1012,
  2495,
  3001,
  4969,
  2024,
  28667,
  5644,
  18688,
  2075,
  3115,
  5604,
  4725,
  1012,
  5534,
  2024,
  6233,
  5378,
  15691,
  2005,
  2104,
  2890,
  28994,
  14088,
  2967,
  1012,
  2495,
  3001,
  4969,
  2024,
  28667,
  5644,
  18688,
  2075,
  3115,
  5604,
  4725,
  1012,
  102],
 'labels': 2}

In [11]:
def compute_metrics(eval_preds):
    metric = datasets.load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments("test-trainer",
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  num_train_epochs=10,
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  )

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [13]:
trainer.train()

100%|██████████| 370/370 [00:11<00:00, 33.28it/s]

{'train_runtime': 11.1181, 'train_samples_per_second': 266.233, 'train_steps_per_second': 33.279, 'train_loss': 0.14444166647421347, 'epoch': 10.0}





TrainOutput(global_step=370, training_loss=0.14444166647421347, metrics={'train_runtime': 11.1181, 'train_samples_per_second': 266.233, 'train_steps_per_second': 33.279, 'train_loss': 0.14444166647421347, 'epoch': 10.0})

In [14]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

metric = datasets.load_metric("accuracy")
preds = np.argmax(predictions.predictions, axis=-1)
metric.compute(predictions=preds, references=predictions.label_ids)

  0%|          | 0/10 [00:00<?, ?it/s]

  metric = datasets.load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 10/10 [00:00<00:00, 12.20it/s]


(74, 4) (74,)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'accuracy': 0.972972972972973}

In [15]:
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/10 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
100%|██████████| 10/10 [00:00<00:00, 14.16it/s]


{'eval_loss': 0.052659690380096436,
 'eval_accuracy': 0.972972972972973,
 'eval_runtime': 0.7141,
 'eval_samples_per_second': 103.629,
 'eval_steps_per_second': 14.004,
 'epoch': 10.0}

Save model to 'custom_model'

In [16]:
model_path = "../custom_models/genre_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('../custom_models/genre_model\\tokenizer_config.json',
 '../custom_models/genre_model\\special_tokens_map.json',
 '../custom_models/genre_model\\vocab.txt',
 '../custom_models/genre_model\\added_tokens.json',
 '../custom_models/genre_model\\tokenizer.json')