In [1]:
import os
import numpy as np
from datasets import Dataset, load_metric
from transformers import (  AutoModelForSequenceClassification, 
                            AutoTokenizer,
                            TrainingArguments,
                            Trainer)

os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

path = "./data/toutiao_cat_data.txt"

label2tag = {
    0 : "news_story",
    1 : "news_culture",
    2 : "news_entertainment",
    3 : "news_sports",
    4 : "news_finance",
    5 : "news_house",
    6 : "news_car",
    7 : "news_edu",
    8 : "news_tech",
    9 : "news_military",
    10 : "news_travel",
    11 : "news_world",
    12 : "stock",
    13 : "news_agriculture",
    14 : "news_game",
}
tag2label = {v: k for k, v in label2tag.items()}

n_labels = len(label2tag)

max_length = 30

In [2]:
n_labels

15

In [3]:
tokenizer = AutoTokenizer.from_pretrained("voidful/albert_chinese_tiny", model_max_length=max_length)
model = AutoModelForSequenceClassification.from_pretrained(
    "voidful/albert_chinese_tiny",
    num_labels = n_labels,
)

training_args = TrainingArguments(output_dir="test_exp", evaluation_strategy="epoch")
metric = load_metric("accuracy")

Some weights of the model checkpoint at voidful/albert_chinese_tiny were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.weight', 'predictions.bias', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at voidful/albert_chinese_tiny and are newly initialized: ['classifier.weight', 

In [6]:
texts = []
labels = []

with open(path) as f:
    for line in f.readlines():
        split = line.split("_!_")
        text = split[3]
        label = tag2label[split[2]]
        texts.append(text)
        labels.append(label)

dataset = Dataset.from_dict({"texts": texts, "labels": labels})

In [7]:
def tokenize_fn(examples):
    return tokenizer(
        examples["texts"],
        add_special_tokens=True, 
        max_length=max_length,
        padding="max_length", 
        truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
tokenized_dataset = dataset.map(tokenize_fn, batched=True)


  0%|          | 0/383 [00:00<?, ?ba/s]

In [9]:
print(tokenized_dataset[0].keys())
print(len(tokenized_dataset[0]["input_ids"]))
print(tokenized_dataset[0]["texts"])
print(tokenized_dataset[0]["labels"])
print(len(tokenized_dataset[0]["token_type_ids"]))
print(len(tokenized_dataset[0]["attention_mask"]))
print(tokenized_dataset[0]["input_ids"])

dict_keys(['texts', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'])
30
京城最值得你来场文化之旅的博物馆
1
30
30
[101, 776, 1814, 3297, 966, 2533, 872, 3341, 1767, 3152, 1265, 722, 3180, 4638, 1300, 4289, 7667, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: texts. If texts are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 382688
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 143508
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstu00608[0m ([33mtku-cilab[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5212,0.468612,0.868721
2,0.482,0.361893,0.901991
3,0.3989,0.303366,0.922381


Saving model checkpoint to test_exp/checkpoint-500
Configuration saved in test_exp/checkpoint-500/config.json
Model weights saved in test_exp/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test_exp/checkpoint-1000
Configuration saved in test_exp/checkpoint-1000/config.json
Model weights saved in test_exp/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test_exp/checkpoint-1500
Configuration saved in test_exp/checkpoint-1500/config.json
Model weights saved in test_exp/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test_exp/checkpoint-2000
Configuration saved in test_exp/checkpoint-2000/config.json
Model weights saved in test_exp/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test_exp/checkpoint-2500
Configuration saved in test_exp/checkpoint-2500/config.json
Model weights saved in test_exp/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test_exp/checkpoint-3000
Configuration saved in test_exp/checkpoint-3000/config.json
M

TrainOutput(global_step=143508, training_loss=0.49201746965039783, metrics={'train_runtime': 3470.8127, 'train_samples_per_second': 330.777, 'train_steps_per_second': 41.347, 'total_flos': 271754808209280.0, 'train_loss': 0.49201746965039783, 'epoch': 3.0})