In [None]:
!pip install sacremoses datasets

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "allegro/herbert-base-cased"
NUM_LABELS = 3

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

### ***wczytanie danych***

In [None]:
import pandas as pd
from datasets import Dataset
try:
    from google.colab import drive
    drive.mount('/content/drive')
    TRAIN_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/train.csv'
    TEST_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/test.csv'
except (ImportError, ModuleNotFoundError):
    # If not running in Colab, use local paths
    TRAIN_DATA_PATH = '../data/processed/train.csv'
    TEST_DATA_PATH = '../data/processed/test.csv'



train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)
train_df["labels"] = train_df["label"]
test_df["labels"] = test_df["label"]

# delete the old column
train_df.drop(columns=["label"], inplace=True)
test_df.drop(columns=["label"], inplace=True)

train_split = Dataset.from_pandas(train_df)
test_split = Dataset.from_pandas(test_df)


def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=512
    )
    # Only return the required columns
    return tokenized_inputs

train_dataset = train_split.map(tokenize_function, batched=True)
eval_dataset = test_split.map(tokenize_function, batched=True)

# distribution of labels
train_df['labels'].value_counts()

In [None]:
from torch.utils.data import DataLoader

train_dataset.set_format('torch')
eval_dataset.set_format('torch')

train_dataloader = DataLoader(train_dataset, batch_size=16)
eval_dataloader = DataLoader(eval_dataset, batch_size=16)

### ustawienie adapterów ***peft***

In [None]:
# !pip install peft

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank of the adapter matrix
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate
    target_modules=["query", "value"],  # Target modules to apply LoRA (attention layers)
    task_type="SEQUENCE_CLASSIFICATION"
)

# Wrap the model with PEFT adapters
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()  # Verify trainable parameters


trainable params: 294,912 || all params: 124,740,099 || trainable%: 0.2364


### ***optymalizator i scheduler***

In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

LEARNING_RATE = 5e-6
WARMUP_STEPS = 20
EPOCHS = 25


optimizer = AdamW(peft_model.parameters(), lr=LEARNING_RATE)

num_training_steps = len(train_dataloader) * EPOCHS  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=num_training_steps
)


In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### ***stroimy!***

In [None]:
import torch
import numpy as np
from tqdm.auto import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
peft_model.to(device)


def evaluate_model(model, eval_dataloader, eval_dataset, device):
    model.eval()
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            predictions = torch.argmax(outputs.logits, dim=-1)

            # Keep predictions and labels on GPU for comparison
            # This will work for any number of classes since we're just checking equality
            correct = (predictions == labels).sum().item()
            total_correct += correct
            total_samples += labels.size(0)

    accuracy = total_correct / total_samples
    print(f"Validation Accuracy: {accuracy:.4f}")
    return accuracy



for epoch in tqdm(range(EPOCHS)):
    peft_model.train()
    per_batch_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        per_batch_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} completed! Loss: {per_batch_loss}")
    if (epoch + 1) % 10 == 0:
        evaluate_model(peft_model, eval_dataloader, eval_dataset, device)

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 1 completed! Loss: 7.705826044082642
Epoch 2 completed! Loss: 7.58583676815033
Epoch 3 completed! Loss: 7.701919436454773
Epoch 4 completed! Loss: 7.608282566070557
Epoch 5 completed! Loss: 7.604868412017822
Epoch 6 completed! Loss: 7.73663055896759
Epoch 7 completed! Loss: 7.65780234336853
Epoch 8 completed! Loss: 7.670746445655823
Epoch 9 completed! Loss: 7.605456233024597
Epoch 10 completed! Loss: 7.643666863441467
Validation Accuracy: 0.2143
Epoch 11 completed! Loss: 7.683774709701538
Epoch 12 completed! Loss: 7.661701798439026
Epoch 13 completed! Loss: 7.734836578369141
Epoch 14 completed! Loss: 7.685187458992004
Epoch 15 completed! Loss: 7.7136390209198
Epoch 16 completed! Loss: 7.716847896575928
Epoch 17 completed! Loss: 7.672253251075745
Epoch 18 completed! Loss: 7.681026101112366
Epoch 19 completed! Loss: 7.687049984931946
Epoch 20 completed! Loss: 7.665805697441101
Validation Accuracy: 0.2143
Epoch 21 completed! Loss: 7.6906105279922485
Epoch 22 completed! Loss: 7.67286

In [5]:
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]

In [25]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(eval_pred):
    """
    Calculates and returns the F1 score and accuracy.

    Args:
        eval_pred: A tuple containing predictions and labels.

    Returns:
        A dictionary containing the F1 score and accuracy.
    """
    predictions, labels = eval_pred
    # Assuming predictions need to be converted to class labels
    predicted_labels = predictions.argmax(-1)
    f1 = f1_score(labels, predicted_labels, average='weighted')  # Use weighted average for multi-class
    accuracy = accuracy_score(labels, predicted_labels)

    return {"f1": f1, "accuracy": accuracy}

In [None]:
seq_cls_model = peft_model.merge_and_unload()

for n, param in seq_cls_model.named_parameters():

    if 'classifier' in n:
        param.requires_grad = True
    else:
        param.requires_grad = False

NUM_EPOCHS = 25
BATCH_SIZE = 16

train_args = TrainingArguments(
    output_dir="bert-lora-seq",
    learning_rate=1e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    warmup_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch"
)


trainer = Trainer(
    model=seq_cls_model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
trainer.evaluate()

100%|██████████| 2/2 [00:00<00:00,  2.06it/s]


{'eval_loss': 1.1105560064315796,
 'eval_model_preparation_time': 0.0049,
 'eval_f1': 0.3202639751552795,
 'eval_accuracy': 0.39285714285714285,
 'eval_runtime': 3.222,
 'eval_samples_per_second': 8.69,
 'eval_steps_per_second': 0.621}

In [None]:
import numpy as np
print(np.__version__)

1.26.4


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Accuracy
1,0.7522,1.059091,0.004,0.500806,0.5
2,0.7551,1.074849,0.004,0.50105,0.5
3,0.7415,1.075377,0.004,0.542227,0.535714
4,0.7146,1.057971,0.004,0.454105,0.464286
5,0.7324,1.071362,0.004,0.576939,0.571429
6,0.7435,1.078119,0.004,0.578679,0.571429
7,0.7219,1.062758,0.004,0.471513,0.464286
8,0.7155,1.057691,0.004,0.5,0.5
9,0.6958,1.059551,0.004,0.614313,0.607143
10,0.6915,1.077119,0.004,0.542227,0.535714


TrainOutput(global_step=175, training_loss=0.7218437603541783, metrics={'train_runtime': 199.7271, 'train_samples_per_second': 13.769, 'train_steps_per_second': 0.876, 'total_flos': 723561898752000.0, 'train_loss': 0.7218437603541783, 'epoch': 25.0})

## **GPT-2**

In [2]:
import torch
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding)
from peft import LoraConfig, get_peft_model, TaskType


MODEL_NAME = "gpt2"
NUM_LABELS = 3

gpt_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token  # Set pad token to EOS token
gpt_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
gpt_model.config.pad_token_id = gpt_tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
import pandas as pd
from datasets import Dataset
try:
    from google.colab import drive
    drive.mount('/content/drive')
    TRAIN_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/train.csv'
    TEST_DATA_PATH = '/content/drive/MyDrive/PWr/NLP/data/test.csv'
except (ImportError, ModuleNotFoundError):
    # If not running in Colab, use local paths
    TRAIN_DATA_PATH = '../data/processed/train.csv'
    TEST_DATA_PATH = '../data/processed/test.csv'



train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)
train_df["labels"] = train_df["label"]
test_df["labels"] = test_df["label"]

# delete the old column
train_df.drop(columns=["label"], inplace=True)
test_df.drop(columns=["label"], inplace=True)

train_split = Dataset.from_pandas(train_df)
test_split = Dataset.from_pandas(test_df)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def tokenize_function(examples, max_length=1024):
    # GPT2 tokenizer needs special handling for padding token
    gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

    tokenized_inputs = gpt_tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors=None  # Return as lists rather than tensors
    )

    # Add labels to tokenized output
    tokenized_inputs['labels'] = examples['labels']

    return tokenized_inputs

# Apply tokenization to both datasets
train_dataset = train_split.map(tokenize_function, batched=True, remove_columns=train_split.column_names)
eval_dataset = test_split.map(tokenize_function, batched=True, remove_columns=test_split.column_names)

train_dataset.set_format('torch')
eval_dataset.set_format('torch')
train_dataset

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 110
})

In [5]:
gpt_model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=3, bias=False)
)

In [6]:
from sklearn.metrics import f1_score, accuracy_score

# Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_labels = np.argmax(predictions, axis=1)
    f1 = f1_score(labels, predicted_labels, average="weighted")
    accuracy = accuracy_score(labels, predicted_labels)
    return {"f1": f1, "accuracy": accuracy}

### ***classification-head only***

In [82]:
gpt_model.requires_grad_(False)

for name, param in gpt_model.named_parameters():
    if "lora" in name or "score" in name:  # LoRA and classification head
        param.requires_grad = True

# Confirm trainable parameters
for name, param in gpt_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.requires_grad}")

score.weight: True


In [92]:
NUM_EPOCHS = 30
BATCH_SIZE = 16  # Reduced batch size to 1 to avoid padding issues

# Training Arguments
train_args = TrainingArguments(
    output_dir="gpt2-cls-head-only",
    learning_rate=5e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    warmup_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to="none",
    # load_best_model_at_end=True,
    # metric_for_best_model="f1",
    # greater_is_better=True
)

# Trainer
trainer = Trainer(
    model=gpt_model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=gpt_tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

{'eval_loss': 1.050960659980774,
 'eval_model_preparation_time': 0.0045,
 'eval_f1': 0.38644688644688646,
 'eval_accuracy': 0.42857142857142855,
 'eval_runtime': 1.9942,
 'eval_samples_per_second': 14.04,
 'eval_steps_per_second': 1.003}

In [93]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Accuracy
1,0.9784,1.049715,0.0045,0.386447,0.428571
2,0.9753,1.055886,0.0045,0.391457,0.428571
3,0.9045,1.06494,0.0045,0.357993,0.392857
4,0.9886,1.053101,0.0045,0.391457,0.428571
5,0.9581,1.043286,0.0045,0.447317,0.464286
6,0.9517,1.044172,0.0045,0.386447,0.428571
7,0.942,1.032919,0.0045,0.465861,0.464286
8,0.9451,1.040114,0.0045,0.391457,0.428571
9,0.9331,1.053852,0.0045,0.357993,0.392857
10,0.9185,1.068314,0.0045,0.391457,0.428571


TrainOutput(global_step=210, training_loss=0.8777019591558547, metrics={'train_runtime': 742.8936, 'train_samples_per_second': 4.442, 'train_steps_per_second': 0.283, 'total_flos': 1724574125260800.0, 'train_loss': 0.8777019591558547, 'epoch': 30.0})

In [94]:
trainer.evaluate()

{'eval_loss': 0.959460437297821,
 'eval_model_preparation_time': 0.0045,
 'eval_f1': 0.5280612244897959,
 'eval_accuracy': 0.5357142857142857,
 'eval_runtime': 1.9774,
 'eval_samples_per_second': 14.16,
 'eval_steps_per_second': 1.011,
 'epoch': 30.0}

In [99]:
trainer.model.save_pretrained('content/drive/My Drive/PWr/NLP/models/gpt2-cls-head.pth')

### ***LoRA***

In [7]:
from peft import get_peft_model, LoraConfig, TaskType

# LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Task type: Sequence Classification
    r=8,                        # Rank of LoRA matrices
    lora_alpha=16,              # Scaling factor
    target_modules=["c_proj"],  # GPT-2 target modules for LoRA
    lora_dropout=0.1,           # Dropout rate
    bias="none",                # No additional biases
)

# Wrap model with LoRA
lora_model = get_peft_model(gpt_model, lora_config)

# Freeze all base model parameters
for param in gpt_model.parameters():
    param.requires_grad = False

# Train only LoRA layers and the classification head
for name, param in lora_model.named_parameters():
    if "lora" in name or "score" in name:  # LoRA and classification head
        param.requires_grad = True

# Confirm trainable parameters
for name, param in lora_model.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.requires_grad}")

base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.1.attn.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.1.attn.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.1.mlp.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.1.mlp.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.2.attn.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.2.attn.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.2.mlp.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.2.mlp.c_proj.lora_B.default.weight: True
base_model.model.transformer.h.3.attn.c_proj.lora_A.default.weight: True
base_model.model.transformer.h.3.attn.c_proj.lora_B.defau



In [8]:
def tokenize_function(examples, max_length=512):
    # GPT2 tokenizer needs special handling for padding token
    gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

    tokenized_inputs = gpt_tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors=None  # Return as lists rather than tensors
    )

    # Add labels to tokenized output
    tokenized_inputs['labels'] = examples['labels']

    return tokenized_inputs

train_dataset = train_split.map(tokenize_function, batched=True, remove_columns=train_split.column_names)
eval_dataset = test_split.map(tokenize_function, batched=True, remove_columns=test_split.column_names)

train_dataset.set_format('torch')
eval_dataset.set_format('torch')

Map:   0%|          | 0/110 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [11]:
import numpy as np
NUM_EPOCHS = 25
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 2

# Training Arguments
train_args = TrainingArguments(
    output_dir="'content/drive/MyDrive/PWr/NLP/models/",
    learning_rate=1e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    warmup_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS
)

# Trainer
trainer = Trainer(
    model=lora_model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=gpt_tokenizer),
    compute_metrics=compute_metrics,
)

trainer.evaluate()

{'eval_loss': 2.8520395755767822,
 'eval_model_preparation_time': 0.05,
 'eval_f1': 0.16071428571428573,
 'eval_accuracy': 0.32142857142857145,
 'eval_runtime': 1.1357,
 'eval_samples_per_second': 24.654,
 'eval_steps_per_second': 3.522}

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,F1,Accuracy
1,2.6245,2.828985,0.05,0.160714,0.321429
2,2.5409,2.755655,0.05,0.160714,0.321429
3,2.383,2.626208,0.05,0.160714,0.321429
