In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset, DatasetDict
from transformers import Trainer, AutoTokenizer, set_seed
from classifier import TextClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
set_seed(101)

In [4]:
base_model_name = "Alibaba-NLP/gte-multilingual-base"
model_max_length = 64
create_new = False


In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, model_max_length=model_max_length)

In [6]:
if create_new:
    dataset = load_dataset("csv", data_files="anti-lgbt-cyberbullying.csv")
    # 80% train, 5% valid, 15% test
    train_valid = dataset["train"].train_test_split(test_size=0.2)
    valid_test = train_valid["test"].train_test_split(test_size=0.25)
    dataset = DatasetDict({
        "train": train_valid["train"],
        "validation": valid_test["test"],
        "test": valid_test["train"]
    })
    dataset = dataset.rename_column("anti_lgbt", "label")
    def preprocess_fn(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)
    dataset = dataset.map(preprocess_fn, batched=True)
else:
    dataset = load_dataset("jxie/anti-lgbt-cyberbullying")

In [7]:
len(dataset["train"]["input_ids"][0])

64

In [8]:
num_classes = len(set(dataset["train"]["label"]))

In [9]:
model = TextClassifier(base_model_name=base_model_name, num_classes=num_classes)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=f"./runs/{base_model_name.replace('/', '-')}",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.317122,0.916279
2,0.240500,0.455478,0.911628
3,0.075100,0.550454,0.902326


TrainOutput(global_step=1290, training_loss=0.1256810136543688, metrics={'train_runtime': 97.29, 'train_samples_per_second': 106.044, 'train_steps_per_second': 13.259, 'total_flos': 0.0, 'train_loss': 0.1256810136543688, 'epoch': 3.0})

In [14]:
import torch
torch.save(model.state_dict(), base_model_name.replace("/", "-") + ".pt")

In [15]:
#dataset.push_to_hub("jxie/anti-lgbt-cyberbullying")