# SEG-PEFT

In [1]:

!git clone https://github.com/rossoc/SEG-PEFT
%cd SEG-PEFT
!pip install evaluate

Cloning into 'SEG-PEFT'...
remote: Enumerating objects: 162, done.[K
remote: Counting objects: 100% (162/162), done.[K
remote: Compressing objects: 100% (89/89), done.[K
remote: Total 162 (delta 75), reused 128 (delta 45), pack-reused 0 (from 0)[K
Receiving objects: 100% (162/162), 211.49 KiB | 7.05 MiB/s, done.
Resolving deltas: 100% (75/75), done.
/content/SEG-PEFT
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [2]:
import torch
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from src.segpeft import (
    kvasir_dataset,
    compute_metrics_fn,
    segformer,
    mask2former,
    set_seed,
    Metrics,
)
import time
import yaml
import pandas as pd
import os
import zipfile
from peft import get_peft_model, LoraConfig

set_seed(42)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

## Dataset
You can check out the dataset at the following link
[Kvasir-SEG](https://datasets.simula.no/kvasir-seg/).

In [3]:
# Directory the Kvasir-SEG archive is extracted into.
dataset_dir = "data"
os.makedirs(dataset_dir, exist_ok=True)
# NOTE(review): --no-check-certificate disables TLS verification for this
# download (the recorded wget output reports "Unable to locally verify the
# issuer's authority"). The archive is therefore not authenticated — consider
# checking it against a known checksum.
!wget --no-check-certificate https://datasets.simula.no/downloads/kvasir-seg.zip -O kvasir-seg.zip

# Unpack the complete archive under dataset_dir.
with zipfile.ZipFile("kvasir-seg.zip", "r") as zip_ref:
    zip_ref.extractall(dataset_dir)

--2025-11-05 18:25:27--  https://datasets.simula.no/downloads/kvasir-seg.zip
Resolving datasets.simula.no (datasets.simula.no)... 128.39.36.14
Connecting to datasets.simula.no (datasets.simula.no)|128.39.36.14|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 46227172 (44M) [application/zip]
Saving to: ‘kvasir-seg.zip’


2025-11-05 18:25:28 (74.9 MB/s) - ‘kvasir-seg.zip’ saved [46227172/46227172]



## Train [SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer) with full fine-tuning (FFT)

In [13]:
def train_mask2_former(epochs, lr, save_dir):
    """Train Mask2Former on Kvasir-SEG without PEFT adapters and store the run's metrics.

    The model returned by ``mask2former()`` is passed to the ``Trainer``
    unwrapped. It is evaluated once per epoch on the held-out split and the
    best checkpoint is reloaded at the end (``load_best_model_at_end=True``).

    Args:
        epochs: Upper bound on training epochs; ``EarlyStoppingCallback``
            (patience 10) may stop the run earlier.
        lr: Learning rate given to ``TrainingArguments`` (cosine schedule with
            a 10% warm-up ratio).
        save_dir: Sub-directory of ``./outputs/`` used for checkpoints, logs
            and the files written through ``Metrics``.

    Returns:
        The ``Trainer`` used for the run.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Using device: {device}")

    test_size = 0.2
    model, model_name, _ = mask2former()
    train_dataset, test_dataset = kvasir_dataset(model_name, test_size)
    batch_size = 64
    gradient_accumulation_steps = 4
    use_bf16 = True
    dataloader_num_workers = 8

    training_args = TrainingArguments(
        output_dir="./outputs/" + save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        bf16=use_bf16 and cuda_available,
        bf16_full_eval=use_bf16 and cuda_available,
        dataloader_num_workers=dataloader_num_workers,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,
        # Log the training loss once per epoch so it lines up with the
        # per-epoch evaluations. A step interval of len(train_dataset)
        # optimizer steps is far longer than a whole run at the epoch counts
        # used in this notebook, so no training loss would be recorded.
        logging_strategy="epoch",
        learning_rate=lr,
        save_total_limit=2,
        prediction_loss_only=False,
        remove_unused_columns=True,
        push_to_hub=False,
        report_to="none",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_dir=f"./outputs/{save_dir}/logs",
        optim="adamw_torch_fused" if cuda_available else "adamw_torch",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_fn(model_name),  # type: ignore
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    print("Starting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Final score on the held-out split. Evaluating the training split here
    # would report training performance as the "final_evaluation".
    final_test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    # Snapshot the history before the training-split evaluation below.
    log = trainer.state.log_history.copy()
    final_train_metrics = trainer.evaluate(eval_dataset=train_dataset)
    log.append({"epoch": epochs, "loss": final_train_metrics["eval_loss"]})

    all_metrics = {
        "training_history": log,
        "final_evaluation": final_test_metrics,
        "training_time": training_time,
    }
    metrics = Metrics(f"./outputs/{save_dir}/")
    metrics.store_metrics(all_metrics)
    metrics.store_history(log)
    metrics.plot_curves(log)
    return trainer

In [14]:
# Run configuration consumed by the training call in the following cell.
save_dir = "test_transformer_fft"  # sub-directory of ./outputs/ for this run
learning_rate = 5e-5
epochs = 30

In [15]:
# NOTE(review): `train_segformer_fft` is defined in a cell further down this
# notebook; that cell must have been executed before this one.
fft_trainer = train_segformer_fft(
    epochs=epochs, lr=learning_rate, save_dir=save_dir
)

Using device: cuda


Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([2]) in the model instantiated
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([2, 256, 1, 1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  image_processor = cls(**image_processor_dict)


Starting training...


Step,Training Loss,Validation Loss,Mean Iou,Mean Dice,Accuracy
12,0.5903,0.514415,0.462893,0.727658,0.925787
24,0.496,0.417608,0.453648,0.796216,0.907295
36,0.413,0.368511,0.474239,0.854142,0.948478
48,0.3466,0.31548,0.471174,0.872363,0.942349
60,0.309,0.276278,0.478645,0.887313,0.957291
72,0.2814,0.264738,0.480123,0.892837,0.960246
84,0.2683,0.261358,0.477326,0.892761,0.954653
96,0.2509,0.255295,0.479922,0.896757,0.959844
108,0.2563,0.25046,0.479544,0.896694,0.959088
120,0.25,0.25295,0.47927,0.896312,0.95854


  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label
  acc = total_area_intersect / total_area_label


  acc = total_area_intersect / total_area_label


In [None]:
# Display the raw log entries recorded on the trainer's state.
fft_trainer.state.log_history

: 

In [16]:
# Dice score of every whole-epoch evaluation found in `fft_trainer`'s log history.
Y = dict(
    Evaluation=[
        record["eval_mean_dice"]
        for record in fft_trainer.state.log_history
        if record["epoch"] % 1 == 0 and "eval_mean_dice" in record
    ]
)

## Train
[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer) with
LoRA.
Namely, we use [PEFT](https://github.com/huggingface/peft) to implement LoRA.

In [None]:
def train_segformer_lora(epochs, lr, r, lora_alpha, lora_dropout, save_dir):
    """Train SegFormer on Kvasir-SEG through LoRA adapters and store the run's metrics.

    The model returned by ``segformer()`` is wrapped with ``get_peft_model``
    using a ``LoraConfig`` that targets the module names ``segformer()``
    reports. The wrapped model is evaluated once per epoch on the held-out
    split and the best checkpoint is reloaded at the end.

    Args:
        epochs: Upper bound on training epochs; ``EarlyStoppingCallback``
            (patience 10) may stop the run earlier.
        lr: Learning rate given to ``TrainingArguments`` (cosine schedule with
            a 10% warm-up ratio).
        r: LoRA rank, forwarded to ``LoraConfig``.
        lora_alpha: LoRA alpha, forwarded to ``LoraConfig``.
        lora_dropout: LoRA dropout, forwarded to ``LoraConfig``.
        save_dir: Sub-directory of ``./outputs/`` used for checkpoints, logs
            and the files written through ``Metrics``.

    Returns:
        The ``Trainer`` used for the run.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Using device: {device}")

    test_size = 0.2
    model, model_name, modules = segformer()

    peft_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=modules,
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    train_dataset, test_dataset = kvasir_dataset(model_name, test_size)
    batch_size = 64
    gradient_accumulation_steps = 4
    use_bf16 = True
    dataloader_num_workers = 8

    training_args = TrainingArguments(
        output_dir="./outputs/" + save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        bf16=use_bf16 and cuda_available,
        bf16_full_eval=use_bf16 and cuda_available,
        dataloader_num_workers=dataloader_num_workers,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,
        # Log, evaluate and checkpoint once per epoch. A step interval of
        # len(train_dataset) / batch_size is generally not an integer, which
        # TrainingArguments rejects, and step-based saving with the default
        # save_steps conflicts with load_best_model_at_end.
        logging_strategy="epoch",
        learning_rate=lr,
        save_total_limit=2,
        prediction_loss_only=False,
        remove_unused_columns=True,
        push_to_hub=False,
        report_to="none",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_dir=f"./outputs/{save_dir}/logs",
        optim="adamw_torch_fused" if cuda_available else "adamw_torch",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_fn(model_name),  # type: ignore
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    print("Starting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Final score on the held-out split. Evaluating the training split here
    # would report training performance as the "final_evaluation".
    final_test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    # Snapshot the history before the training-split evaluation below.
    log = trainer.state.log_history.copy()
    final_train_metrics = trainer.evaluate(eval_dataset=train_dataset)
    log.append({"epoch": epochs, "loss": final_train_metrics["eval_loss"]})

    all_metrics = {
        "training_history": log,
        "final_evaluation": final_test_metrics,
        "training_time": training_time,
    }
    metrics = Metrics(f"./outputs/{save_dir}/")
    metrics.store_metrics(all_metrics)
    metrics.store_history(log)
    metrics.plot_curves(log)
    return trainer

In [None]:
def train_segformer_fft(epochs, lr, save_dir):
    """Train SegFormer on Kvasir-SEG without PEFT adapters and store the run's metrics.

    The model returned by ``segformer()`` is passed to the ``Trainer``
    unwrapped. It is evaluated once per epoch on the held-out split and the
    best checkpoint is reloaded at the end (``load_best_model_at_end=True``).

    Args:
        epochs: Upper bound on training epochs; ``EarlyStoppingCallback``
            (patience 10) may stop the run earlier.
        lr: Learning rate given to ``TrainingArguments`` (cosine schedule with
            a 10% warm-up ratio).
        save_dir: Sub-directory of ``./outputs/`` used for checkpoints, logs
            and the files written through ``Metrics``.

    Returns:
        The ``Trainer`` used for the run.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Using device: {device}")

    test_size = 0.2
    model, model_name, _ = segformer()
    train_dataset, test_dataset = kvasir_dataset(model_name, test_size)
    batch_size = 64
    gradient_accumulation_steps = 4
    use_bf16 = True
    dataloader_num_workers = 8

    training_args = TrainingArguments(
        output_dir="./outputs/" + save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        bf16=use_bf16 and cuda_available,
        bf16_full_eval=use_bf16 and cuda_available,
        dataloader_num_workers=dataloader_num_workers,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,
        # Log the training loss once per epoch so it lines up with the
        # per-epoch evaluations. A step interval of len(train_dataset)
        # optimizer steps is far longer than a whole run at the epoch counts
        # used in this notebook, so no training loss would be recorded.
        logging_strategy="epoch",
        learning_rate=lr,
        save_total_limit=2,
        prediction_loss_only=False,
        remove_unused_columns=True,
        push_to_hub=False,
        report_to="none",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_dir=f"./outputs/{save_dir}/logs",
        optim="adamw_torch_fused" if cuda_available else "adamw_torch",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_fn(model_name),  # type: ignore
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    print("Starting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Final score on the held-out split. Evaluating the training split here
    # would report training performance as the "final_evaluation".
    final_test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    # Snapshot the history before the training-split evaluation below.
    log = trainer.state.log_history.copy()
    final_train_metrics = trainer.evaluate(eval_dataset=train_dataset)
    log.append({"epoch": epochs, "loss": final_train_metrics["eval_loss"]})

    all_metrics = {
        "training_history": log,
        "final_evaluation": final_test_metrics,
        "training_time": training_time,
    }
    metrics = Metrics(f"./outputs/{save_dir}/")
    metrics.store_metrics(all_metrics)
    metrics.store_history(log)
    metrics.plot_curves(log)
    return trainer

In [None]:
def train_mask2former_lora(epochs, lr, r, lora_alpha, lora_dropout, save_dir):
    """Train Mask2Former on Kvasir-SEG through LoRA adapters and store the run's metrics.

    The model returned by ``mask2former()`` is wrapped with ``get_peft_model``
    using a ``LoraConfig`` that targets the module names ``mask2former()``
    reports. The wrapped model is evaluated once per epoch on the held-out
    split and the best checkpoint is reloaded at the end.

    Args:
        epochs: Upper bound on training epochs; ``EarlyStoppingCallback``
            (patience 10) may stop the run earlier.
        lr: Learning rate given to ``TrainingArguments`` (cosine schedule with
            a 10% warm-up ratio).
        r: LoRA rank, forwarded to ``LoraConfig``.
        lora_alpha: LoRA alpha, forwarded to ``LoraConfig``.
        lora_dropout: LoRA dropout, forwarded to ``LoraConfig``.
        save_dir: Sub-directory of ``./outputs/`` used for checkpoints, logs
            and the files written through ``Metrics``.

    Returns:
        The ``Trainer`` used for the run.
    """
    cuda_available = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_available else "cpu")
    print(f"Using device: {device}")

    test_size = 0.2
    model, model_name, modules = mask2former()

    peft_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=modules,
    )
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    train_dataset, test_dataset = kvasir_dataset(model_name, test_size)
    batch_size = 64
    gradient_accumulation_steps = 4
    use_bf16 = True
    dataloader_num_workers = 8

    training_args = TrainingArguments(
        output_dir="./outputs/" + save_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=gradient_accumulation_steps,
        bf16=use_bf16 and cuda_available,
        bf16_full_eval=use_bf16 and cuda_available,
        dataloader_num_workers=dataloader_num_workers,
        dataloader_pin_memory=True,
        dataloader_prefetch_factor=2,
        # Log, evaluate and checkpoint once per epoch. A step interval of
        # len(train_dataset) / batch_size is generally not an integer, which
        # TrainingArguments rejects, and step-based saving with the default
        # save_steps conflicts with load_best_model_at_end.
        logging_strategy="epoch",
        learning_rate=lr,
        save_total_limit=2,
        prediction_loss_only=False,
        remove_unused_columns=True,
        push_to_hub=False,
        report_to="none",
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        logging_dir=f"./outputs/{save_dir}/logs",
        optim="adamw_torch_fused" if cuda_available else "adamw_torch",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics_fn(model_name),  # type: ignore
        callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
    )

    print("Starting training...")
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Final score on the held-out split. Evaluating the training split here
    # would report training performance as the "final_evaluation".
    final_test_metrics = trainer.evaluate(eval_dataset=test_dataset)
    # Snapshot the history before the training-split evaluation below.
    log = trainer.state.log_history.copy()
    final_train_metrics = trainer.evaluate(eval_dataset=train_dataset)
    log.append({"epoch": epochs, "loss": final_train_metrics["eval_loss"]})

    all_metrics = {
        "training_history": log,
        "final_evaluation": final_test_metrics,
        "training_time": training_time,
    }
    metrics = Metrics(f"./outputs/{save_dir}/")
    metrics.store_metrics(all_metrics)
    metrics.store_history(log)
    metrics.plot_curves(log)
    return trainer

In [None]:
# Configuration for the Mask2Former LoRA run launched in the following cell.
save_dir = "test_mask2former"  # sub-directory of ./outputs/ for this run

# Adapter hyperparameters.
rank = 32
lora_alpha = 32
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# NOTE(review): the LoRA trainer is bound to the name `fft_trainer`, replacing
# whatever trainer that name referred to before this cell ran.
fft_trainer = train_mask2former_lora(
    epochs=epochs,
    lr=learning_rate,
    r=rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    save_dir=save_dir,
)

Using device: cpu


Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/segformer-b0-finetuned-ade-512-512 and are newly initialized because the shapes did not match:
- decode_head.classifier.bias: found shape torch.Size([150]) in the checkpoint and torch.Size([2]) in the model instantiated
- decode_head.classifier.weight: found shape torch.Size([150, 256, 1, 1]) in the checkpoint and torch.Size([2, 256, 1, 1]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 65,536 || all params: 3,780,194 || trainable%: 1.7337


  image_processor = cls(**image_processor_dict)


Starting training...




Step,Training Loss,Validation Loss,Mean Iou,Mean Dice,Per Class Dice,Mean Accuracy,Overall Accuracy,Per Class Iou
800,0.5339,0.424841,0.469631,0.853864,"[0.9503281164107954, 0.7574001067861984]",0.939262,0.939262,"[0.9392618230303632, 0.0]"


  acc = total_area_intersect / total_area_label


In [None]:

# LoRA sweep point r=32, alpha=32 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r32_alpha32"

# Adapter hyperparameters.
rank = 32
lora_alpha = 32
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# LoRA sweep point r=32, alpha=64 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r32_alpha64"

# Adapter hyperparameters.
rank = 32
lora_alpha = 64
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# LoRA sweep point r=32, alpha=128 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r32_alpha128"

# Adapter hyperparameters.
rank = 32
lora_alpha = 128
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# LoRA sweep point r=16, alpha=16 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r16_alpha16"

# Adapter hyperparameters.
rank = 16
lora_alpha = 16
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# LoRA sweep point r=16, alpha=32 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r16_alpha32"

# Adapter hyperparameters.
rank = 16
lora_alpha = 32
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30

In [None]:
# LoRA sweep point r=16, alpha=64 (mirrored in save_dir).
# NOTE(review): this cell only assigns the values; no training call follows it
# in the visible part of the notebook.
save_dir = "mask2former_r16_alpha64"

# Adapter hyperparameters.
rank = 16
lora_alpha = 64
lora_dropout = 0.05

# Optimisation schedule.
learning_rate = 5e-4
epochs = 30