In [1]:
!pip install transformers datasets peft pandas scikit-learn -q

In [2]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
    pipeline
)
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel
import peft
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
def print_trainable_parameters(model):
    """Print how many of ``model``'s parameters are trainable.

    Args:
        model: Any object whose ``named_parameters()`` yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag.

    Prints a single summary line with the trainable parameter count, the
    total parameter count and the trainable share as a percentage. A model
    without any parameters is reported as 0.00% instead of raising
    ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has zero parameters in total.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [4]:
# Notebook-wide settings.
# model_id: checkpoint name used to load the tokenizer, the config and the model.
model_id = "roberta-base"
# repository_id: location used for the Trainer output/logs and for the saved model.
repository_id = "roberta-base-fine-tuned"
# label_column: dataframe column that holds the classification target.
label_column = "ground_truth"

In [5]:

# Quick look at the raw data: available columns and the distinct target classes.
df = pd.read_parquet('automotive_failure_mode_comments_01.parquet')
print(df.columns)
print(df[label_column].unique())

Index(['comment', 'failure_mode', 'failed_component', 'ground_truth',
       'creator'],
      dtype='object')
['Engine and Transmission Failures' 'Electrical System Failures'
 'Brake System Failures' 'Suspension and Steering Failures'
 'Airbag and Safety System Failures' 'Fuel System Failures'
 'Exhaust System Failures' 'Cooling System Failures' 'Tire Failures'
 'Electronic Component Failures']


In [6]:
# Load dataset and keep only the model input ("text") and its target ("label").
df = pd.read_parquet('automotive_failure_mode_comments_01.parquet')
df_fine_tuning = pd.DataFrame({"text": df["comment"], "label": df[label_column]})

# Give every class name an integer id, numbered in order of first appearance,
# and keep the mapping in both directions.
labels = df_fine_tuning["label"].unique()
label2id = {name: idx for idx, name in enumerate(labels)}
id2label = {idx: name for idx, name in enumerate(labels)}
df_fine_tuning["label"] = df_fine_tuning["label"].map(label2id)

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable.
train_text, val_text, train_labels, val_labels = train_test_split(
    df_fine_tuning["text"].tolist(),
    df_fine_tuning["label"].tolist(),
    test_size=0.2,
    random_state=1909,
)

# Wrap both splits as datasets.Dataset objects that return torch tensors.
train_data = {"text": train_text, "label": train_labels}
val_data = {"text": val_text, "label": val_labels}
train_dataset = Dataset.from_dict(train_data).with_format("torch")
val_dataset = Dataset.from_dict(val_data).with_format("torch")

# Preprocessing
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)


def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the RoBERTa tokenizer.

    Sequences longer than 256 tokens are truncated. ``padding=True`` pads the
    remaining sequences to the longest one in the batch passed in, which is
    not necessarily 256 tokens.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=256, truncation=True, padding=True)


# Each split is handed to `tokenize` as one single batch, so every example
# within a split ends up padded to the same length.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
val_dataset = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
# Set dataset format: return only the model inputs and the target, as torch tensors.
tensor_columns = ["input_ids", "attention_mask", "label"]
train_dataset.set_format("torch", columns=list(tensor_columns))
val_dataset.set_format("torch", columns=list(tensor_columns))

In [8]:
# Update the model's configuration with the label mappings in BOTH directions.
# Setting only id2label would leave label2id at the placeholder mapping that
# comes with the checkpoint, so the two mappings would disagree.
config = AutoConfig.from_pretrained(model_id)
config.update({"id2label": id2label, "label2id": label2id})

In [9]:
# Model: load RoBERTa with a sequence-classification head, using the model
# configuration held in `config`.
model = RobertaForSequenceClassification.from_pretrained(model_id, config=config)
print_trainable_parameters(model)

# Create PEFT model for training using LoRA.
# The LoRA settings get their own name: rebinding `config` here would replace
# the model configuration used above, and re-running this cell would then pass
# a LoraConfig to from_pretrained.
lora_config = LoraConfig(
    task_type=peft.utils.TaskType.SEQ_CLS,
    r=16,
    inference_mode=False,
    lora_alpha=16,
    lora_dropout=0.1,
    # Train and save the classification head alongside the LoRA adapters.
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, lora_config)
print_trainable_parameters(lora_model)

# TrainingArguments, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    report_to="tensorboard",
    push_to_hub=False,
    # Optimisation schedule
    num_train_epochs=30,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Log the training loss every 10 steps
    logging_strategy="steps",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints and
    # restore the best one when training finishes.
    # NOTE(review): recent transformers releases spell this `eval_strategy` -
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer: ties the LoRA-wrapped model to the training arguments and to the
# training / validation splits.
trainer = Trainer(
    args=training_args,
    model=lora_model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 124653322 || all params: 124653322 || trainable%: 100.00
trainable params: 1786388 || all params: 125841428 || trainable%: 1.42


In [10]:
# Fine-tune the model; keep the returned summary under a name and show it as
# the cell result.
train_result = trainer.train()
train_result



Epoch,Training Loss,Validation Loss
1,2.2989,2.309881
2,2.297,2.306543
3,2.2456,2.244951
4,0.9409,0.705232
5,0.2956,0.288302
6,0.1852,0.156455
7,0.1196,0.11004
8,0.1001,0.109432
9,0.0597,0.099239
10,0.0223,0.115211


TrainOutput(global_step=3000, training_loss=0.34221142426785084, metrics={'train_runtime': 571.4522, 'train_samples_per_second': 41.998, 'train_steps_per_second': 5.25, 'total_flos': 525222762624000.0, 'train_loss': 0.34221142426785084, 'epoch': 30.0})

In [11]:
# Save our model: call save_pretrained on the LoRA-wrapped model, targeting the
# path named by `repository_id` (the same value used as the Trainer's output_dir).
lora_model.save_pretrained(repository_id)