In [1]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder, notebook_login

2024-08-18 15:41:43.065946: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-18 15:41:43.066009: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-18 15:41:43.067615: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
model_id = "roberta-base"
dataset_id = "FinanceInc/auditor_sentiment"
# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "qwdf8591/roberta-base_auditor_sentiment"

In [4]:
# Load dataset
dataset = load_dataset(dataset_id)

# Training and testing datasets
train_dataset = dataset['train']
test_dataset = dataset["test"].shard(num_shards=2, index=0)

# Validation dataset
val_dataset = dataset['test'].shard(num_shards=2, index=1)

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# This function tokenizes the input text using the RoBERTa tokenizer. 
# It applies padding and truncation to ensure that all sequences have the same length (256 tokens).
def tokenize(batch):
    return tokenizer(batch["sentence"], padding=True, truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=len(test_dataset))


Map:   0%|          | 0/485 [00:00<?, ? examples/s]

In [5]:
# Set dataset format
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [6]:
dataset['train'][0]

{'sentence': "Altia 's operating profit jumped to EUR 47 million from EUR 6.6 million .",
 'label': 2}

In [7]:
num_labels = 3
class_names = ["negative", "neutral", "positive"]

print(f"Number of labels: {num_labels}")
print(f"The labels: {class_names}")
# create id2label mapping
id2label = {i: label for i, label in enumerate(class_names)}
label2id = {label: i for i, label in enumerate(class_names)}

# 更新模型的配置
config = AutoConfig.from_pretrained(model_id)
config.num_labels = num_labels
config.id2label = id2label
config.label2id = label2id

# 打印配置以確認更新
print(config)

Number of labels: 3
The labels: ['negative', 'neutral', 'positive']
RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [9]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels, average="macro")["precision"],
        "recall": recall.compute(predictions=predictions, references=labels, average="macro")["recall"],
        "f1": f1.compute(predictions=predictions, references=labels, average="macro")["f1"],
    }
    return results

In [11]:
# Model
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)

# TrainingArguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4217,0.63576,0.822314,0.814171,0.813525,0.813436
2,0.6538,0.649072,0.838843,0.858391,0.802512,0.81924
3,0.3961,0.535569,0.855372,0.822397,0.872231,0.841444
4,0.1121,0.739307,0.85124,0.841409,0.847688,0.842825
5,0.0192,0.723283,0.869835,0.858104,0.874291,0.865719


TrainOutput(global_step=2425, training_loss=0.3738708070408284, metrics={'train_runtime': 348.2939, 'train_samples_per_second': 55.657, 'train_steps_per_second': 6.963, 'total_flos': 1324922517877770.0, 'train_loss': 0.3738708070408284, 'epoch': 5.0})

In [13]:
# Evaluate the model
trainer.evaluate()

{'eval_loss': 0.5355690121650696,
 'eval_accuracy': 0.8553719008264463,
 'eval_precision': 0.822396689042232,
 'eval_recall': 0.8722306360011278,
 'eval_f1': 0.8414437298443005,
 'eval_runtime': 1.3876,
 'eval_samples_per_second': 348.807,
 'eval_steps_per_second': 43.961,
 'epoch': 5.0}

In [14]:
# Save our tokenizer and create a model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1723995764.d2d87c5b52e4.195.0:   0%|          | 0.00/58.8k [00:00<?, ?B/s]

events.out.tfevents.1723996196.d2d87c5b52e4.195.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/qwdf8591/roberta-base_auditor_sentiment/commit/ab9e415ea9e5a7cd47234a916acabaf9377501ae', commit_message='End of training', commit_description='', oid='ab9e415ea9e5a7cd47234a916acabaf9377501ae', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
# TEST MODEL

from transformers import pipeline

classifier = pipeline('text-classification',repository_id, device=0)

text = "hi, I hate my life."
classifier(text)


[{'label': 'negative', 'score': 0.9809571504592896}]