In [None]:
!pip install sacremoses datasets

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "allegro/herbert-base-cased"
NUM_LABELS = 3

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### ***wczytanie danych***

In [120]:
import pandas as pd
from datasets import Dataset
from google.colab import drive
drive.mount('/content/drive')

TRAIN_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/train.csv'
TEST_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/test.csv'



train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)
train_df["labels"] = train_df["label"]
test_df["labels"] = test_df["label"]

# delete the old column
train_df.drop(columns=["label"], inplace=True)
test_df.drop(columns=["label"], inplace=True)

train_split = Dataset.from_pandas(train_df)
test_split = Dataset.from_pandas(test_df)


def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512
    )
    # Only return the required columns
    return tokenized_inputs

train_dataset = train_split.map(tokenize_function, batched=True)
eval_dataset = test_split.map(tokenize_function, batched=True)

# distribution of labels
train_df['labels'].value_counts()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Unnamed: 0_level_0,count
labels,Unnamed: 1_level_1
1,42
0,37
2,31


In [121]:
train_dataset.set_format('torch')
eval_dataset.set_format('torch')

In [117]:
from torch.utils.data import DataLoader

train_dataset.set_format('torch')
eval_dataset.set_format('torch')

train_dataloader = DataLoader(train_dataset, batch_size=16)
eval_dataloader = DataLoader(eval_dataset, batch_size=16)

### ustawienie adapterów ***peft***

In [89]:
!pip install peft



In [138]:
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the adapter matrix
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    target_modules=["query", "value"],  # Target modules to apply LoRA (attention layers)
    task_type="SEQUENCE_CLASSIFICATION"
)

# Wrap the model with PEFT adapters
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()  # Verify trainable parameters


trainable params: 294,912 || all params: 124,740,099 || trainable%: 0.2364


### ***optymalizator i scheduler***

In [43]:
from torch.optim import AdamW
from transformers import get_scheduler

LEARNING_RATE = 5e-6
WARMUP_STEPS = 30
EPOCHS = 50


optimizer = AdamW(peft_model.parameters(), lr=LEARNING_RATE)

num_training_steps = len(train_dataloader) * EPOCHS  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=num_training_steps
)


In [44]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

### ***stroimy!***

In [45]:
import torch
import numpy as np
from tqdm.auto import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)


def evaluate_model(model, eval_dataloader, eval_dataset, device):
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Keep predictions and labels on GPU for comparison
            # This will work for any number of classes since we're just checking equality
            correct = (predictions == labels).sum().item()
            total_correct += correct
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy



for epoch in tqdm(range(EPOCHS)):
    peft_model.train()
    per_batch_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        per_batch_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} completed! Loss: {per_batch_loss}")
    if (epoch + 1) % 10 == 0:
        evaluate_model(peft_model, eval_dataloader, eval_dataset, device)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1 completed! Loss: 7.705826044082642
Epoch 2 completed! Loss: 7.58583676815033
Epoch 3 completed! Loss: 7.701919436454773
Epoch 4 completed! Loss: 7.608282566070557
Epoch 5 completed! Loss: 7.604868412017822
Epoch 6 completed! Loss: 7.73663055896759
Epoch 7 completed! Loss: 7.65780234336853
Epoch 8 completed! Loss: 7.670746445655823
Epoch 9 completed! Loss: 7.605456233024597
Epoch 10 completed! Loss: 7.643666863441467
Validation Accuracy: 0.2143
Epoch 11 completed! Loss: 7.683774709701538
Epoch 12 completed! Loss: 7.661701798439026
Epoch 13 completed! Loss: 7.734836578369141
Epoch 14 completed! Loss: 7.685187458992004
Epoch 15 completed! Loss: 7.7136390209198
Epoch 16 completed! Loss: 7.716847896575928
Epoch 17 completed! Loss: 7.672253251075745
Epoch 18 completed! Loss: 7.681026101112366
Epoch 19 completed! Loss: 7.687049984931946
Epoch 20 completed! Loss: 7.665805697441101
Validation Accuracy: 0.2143
Epoch 21 completed! Loss: 7.6906105279922485
Epoch 22 completed! Loss: 7.67286

In [47]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

In [78]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    """
    Calculates and returns the F1 score and accuracy.

    Args:
        eval_pred: A tuple containing predictions and labels.

    Returns:
        A dictionary containing the F1 score and accuracy.
    """
    predictions, labels = eval_pred
    # Assuming predictions need to be converted to class labels
    predicted_labels = predictions.argmax(-1)
    f1 = f1_score(labels, predicted_labels, average='weighted')  # Use weighted average for multi-class
    accuracy = accuracy_score(labels, predicted_labels)

    return {"f1": f1, "accuracy": accuracy}

In [154]:
seq_cls_model = peft_model.merge_and_unload()

for n, param in seq_cls_model.named_parameters():

    if 'classifier' in n:
        param.requires_grad = True
    else:
        param.requires_grad = False

NUM_EPOCHS = 25
BATCH_SIZE = 16

train_args = TrainingArguments(
    output_dir="bert-lora-seq",
    learning_rate=1e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    warmup_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch"
)


trainer = Trainer(
    model=seq_cls_model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [155]:
trainer.evaluate()

{'eval_loss': 1.0525239706039429,
 'eval_model_preparation_time': 0.0035,
 'eval_f1': 0.46411985846970366,
 'eval_accuracy': 0.4642857142857143,
 'eval_runtime': 0.868,
 'eval_samples_per_second': 32.257,
 'eval_steps_per_second': 2.304}

In [68]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Accuracy
1,0.7522,1.059091,0.004,0.500806,0.5
2,0.7551,1.074849,0.004,0.50105,0.5
3,0.7415,1.075377,0.004,0.542227,0.535714
4,0.7146,1.057971,0.004,0.454105,0.464286
5,0.7324,1.071362,0.004,0.576939,0.571429
6,0.7435,1.078119,0.004,0.578679,0.571429
7,0.7219,1.062758,0.004,0.471513,0.464286
8,0.7155,1.057691,0.004,0.5,0.5
9,0.6958,1.059551,0.004,0.614313,0.607143
10,0.6915,1.077119,0.004,0.542227,0.535714


TrainOutput(global_step=175, training_loss=0.7218437603541783, metrics={'train_runtime': 199.7271, 'train_samples_per_second': 13.769, 'train_steps_per_second': 0.876, 'total_flos': 723561898752000.0, 'train_loss': 0.7218437603541783, 'epoch': 25.0})