In [1]:
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification,TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
import numpy as np
import evaluate
import torch


In [2]:
checkpoint = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

num_labels = 3 # 0: NEUTRAL, 1: POSITIVE, 2: NEGATIVE
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)
ds = load_dataset("MonoHime/ru_sentiment_dataset")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'sentiment'],
        num_rows: 189891
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text', 'sentiment'],
        num_rows: 21098
    })
})

In [4]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_ds = ds.map(tokenize_function, batched=True)

In [5]:
tokenized_ds = tokenized_ds.remove_columns(["text", "Unnamed: 0"])
tokenized_ds = tokenized_ds.rename_column("sentiment", "labels")

In [6]:
tokenized_ds.set_format("torch")
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 189891
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21098
    })
})

In [None]:
lora_config = LoraConfig(
    r=8,  # LoRA rank
    lora_alpha=16, # LoRA scaling
    lora_dropout=0.05, 
    bias="none", 
    task_type=TaskType.SEQ_CLS, # sequence classification for sentiment analysis
    target_modules=["query", "value"] 
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 297,219 || all params: 178,152,966 || trainable%: 0.1668


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="epoch",      
    save_strategy="epoch",      
    load_best_model_at_end=True,
    fp16=True,                          
    dataloader_num_workers=4,           
    dataloader_pin_memory=True,
)

print("TrainingArguments initialized with eval_strategy.")

TrainingArguments initialized with eval_strategy.


In [9]:
accuracy_metric = evaluate.load("accuracy") 

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [10]:
train_dataset = tokenized_ds["train"]
eval_dataset = tokenized_ds["validation"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics, 
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [13]:
print("start")
train_result = trainer.train()
print("Finish")

start


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5416,0.528342,0.756233


Finish


In [15]:
eval_results = trainer.evaluate()
for key, value in eval_results.items():
    print(f"  {key}: {value}")

  eval_loss: 0.5283420085906982
  eval_accuracy: 0.7562328182766139
  eval_runtime: 122.7277
  eval_samples_per_second: 171.909
  eval_steps_per_second: 10.747
  epoch: 1.0
