# 1. Download resource

In [25]:
import kagglehub

## Download library

In [26]:
!pip install -q datasets
!pip install -q evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
# import pandas as pd
import datasets
import numpy as np
import re,string
import torch
import torch.nn as nn
import math
import nltk
import pandas as pd
device = torch.device("cuda")



## Load dataset

In [28]:
import numpy as np
import pandas as pd
from datasets import load_dataset

dataset_train = load_dataset("rajpurkar/squad", split="train")
dataset_val = load_dataset("rajpurkar/squad", split="validation")

train_df = pd.DataFrame(dataset_train)
val_df = pd.DataFrame(dataset_val)

train_df['answer_text'] = train_df['answers'].apply(lambda x: x['text'][0] if x['text'] else None)
train_df['answer_start'] = train_df['answers'].apply(lambda x: x['answer_start'][0] if x['answer_start'] else None)
train_df = train_df.drop(columns=['answers'])
train_df = train_df[['id', 'title', 'context', 'question', 'answer_text', 'answer_start']]



val_df['answer_text'] = val_df['answers'].apply(lambda x: x['text'][0] if x['text'] else None)
val_df['answer_start'] = val_df['answers'].apply(lambda x: x['answer_start'][0] if x['answer_start'] else None)
val_df = val_df.drop(columns=['answers'])
val_df = val_df[['id', 'title', 'context', 'question', 'answer_text', 'answer_start']]


train_df = train_df.head(200)  # For testing purposes
val_df = val_df.head(200)      # For testing purposes


## Prepare `Datasets` object

In [29]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import logging
logging.set_verbosity_error()
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model

class QADataset(Dataset):
    def __init__(self, df, tokenizer, max_length=384):
        # Tokenizer is the model's tokenizer, it should come with the model like the GPT2 one
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length # Maximum tokens in a sequence

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        question, context = row['question'], row['context']
        answer_text, answer_start = row['answer_text'], int(row['answer_start'])
        inputs = self.tokenizer(
            question, context, # Sentence A, Sentence B
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_offsets_mapping=True,
            return_tensors="pt"
        )

        #          Dataset
        #  field            | Semantic meaning
        #  answer_text      | Paris
        #  answer_start     | 32 (character index inside the context used for the answer)
        # Aftertokenization -> Loses the characters, only token remains
        # Offset mapping map the token to it's indexing in the context text
        offsets = inputs.pop("offset_mapping")[0]
        input_ids = inputs["input_ids"][0] # Token IDs
        attention_mask = inputs["attention_mask"][0] # Attention mask (1s and 0s, 0 = Ignore this token (padding))

        #  Some moded needs the id of each token to be a 0 for question, 1 for contex
        token_type_ids = inputs.get("token_type_ids")
        if token_type_ids is not None:
            token_type_ids = token_type_ids[0]

        # Find the start and end token indices that cover the  text used for asnwering the question.
        start_char, end_char = answer_start, answer_start + len(answer_text)
        start_token = 0
        end_token = 0
        for i, (o_start, o_end) in enumerate(offsets.tolist()):
            # "If the answer start character is inside this token → mark this token as start_token."
            if o_start <= start_char < o_end:
                start_token = i
            # "If the answer end character falls inside this token → mark this token as end_token."
            if o_start < end_char <= o_end:
                end_token = i

        item = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "start_positions": torch.tensor(start_token, dtype=torch.long),
            "end_positions": torch.tensor(end_token, dtype=torch.long),
            "answer_text":      answer_text,
        }
        if token_type_ids is not None:
            item["token_type_ids"] = token_type_ids
        return item


# 2. Build abstract function

## 2.1 Training function

In [30]:
from tqdm import tqdm
from math import ceil

BATCH_SIZE = 16
LEARNING_RATE = 2e-4
EPOCHS = 1

def _suffix(name: str):
    return name if name else 'full'

# Training and saving
def train_and_save(model, tokenizer, train_loader, strategy, model_name, val_df):
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LEARNING_RATE)
    best_loss = float('inf')
    suffix = _suffix(strategy)

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        batch_iterator = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", leave=False)

        for i, batch in enumerate(batch_iterator, 1):
            #inputs = {k: v.to(device) for k, v in batch.items()}
            inputs = {}
            for k, v in batch.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            # batch["answer_text"] remains a Python list of strings for debugging
            # batch is already the example we’re training on
            input_ids = batch["input_ids"][0]
            s, e      = batch["start_positions"][0].item(), batch["end_positions"][0].item()
            #print("Decoded span:  ", tokenizer.decode(input_ids[s:e+1], skip_special_tokens=True))
            #print("Ground truth:  ", batch["answer_text"][0])

            loss = model(**inputs).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            avg_loss_so_far = total_loss / i
            batch_iterator.set_postfix(loss=avg_loss_so_far)

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch} - Average loss: {avg_loss:.4f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            out_dir = f"best_{model_name.replace('/', '_')}_{suffix}"
            if strategy == 'prompt':
                # Save only the adapter for prompt tuning
                model.save_pretrained(out_dir)
            else:
                model.save_pretrained(out_dir)
                tokenizer.save_pretrained(out_dir)
            print(f"Saved best checkpoint to {out_dir}")

        val_results = evaluate_model(
            model_name, strategy, val_df, metric, device
        )
        print(
            f"Epoch {epoch} — Validation → "
            f"EM: {val_results['exact_match']:.2f}%, "
            f"F1: {val_results['f1']:.2f}%"
        )



def save_model(model, tokenizer, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
    print(f"Saved best model to {out_dir}")

## 2.2 Model preparation function

In [31]:
from peft import LoraConfig, PromptTuningConfig, get_peft_model,PrefixTuningConfig, AdaLoraConfig,PromptEncoderConfig
def prepare_model(model_name: str, strategy: str):
    # Huggging face will takes care of everything and add the Question and Answering head to the model
    # The head will output a tupple of 2 values
    # The score that this token is the start of the answer
    #The score that this token is the end of the answer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    if strategy == 'lora':

        if model_name == "distilbert-base-uncased":
            target_modules = ["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2"]
        else:
            target_modules = [
                "attention.self.query",
                "attention.self.key",
                "attention.self.value",
                "attention.output.dense",
                "intermediate.dense",
            ]

        cfg = LoraConfig(
            task_type="QUESTION_ANS",
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=target_modules,
            bias="none",
        )
        model = get_peft_model(model, cfg)
        for n,p in model.named_parameters():
            if n.startswith("base_model.model.qa_outputs"):
                p.requires_grad = True
    elif strategy == 'adalora':

        if model_name == "distilbert-base-uncased":
            target_modules = ["q_lin", "k_lin", "v_lin", "out_lin", "lin1", "lin2"]
        else:
            target_modules = [
                "attention.self.query",
                "attention.self.key",
                "attention.self.value",
                "attention.output.dense",
                "intermediate.dense",
            ]
        # ---- AdaLoRA ----
        ada_cfg = AdaLoraConfig(
            task_type="QUESTION_ANS",
            init_r=4,            # start LoRA adapters at rank=4
            target_r=8,          # grow/prune them toward rank=8
            total_step=train_steps,
            target_modules=target_modules,
        )
        model = get_peft_model(model, ada_cfg)
        # still allow QA head adaptation
        for n,p in model.named_parameters():
            if n.startswith("base_model.model.qa_outputs"):
                p.requires_grad = True

        # immediately report your new trainable‐param count
    elif strategy == 'bitfit':
        for n, p in model.named_parameters(): p.requires_grad=False
        for n, p in model.named_parameters():
            if 'bias' in n: p.requires_grad=True
        # For empty strategy string, we perform full fine-tuning of all parameters
    # else: strategy == '' -> full fine-tuning (no changes)
    elif strategy == 'prompt':
        cfg_m      = model.config
        specials   = tokenizer.num_special_tokens_to_add(pair=True)  # =3 for QA
        free_spots = cfg_m.max_position_embeddings - MAX_LEN - specials
        num_virtual = min(200, free_spots)
        print(num_virtual)
        if 'distilbert' in model_name:
            # DistilBERT needs explicit sizing
            prompt_cfg = PromptTuningConfig(
                task_type='QUESTION_ANS',
                num_virtual_tokens=num_virtual,
                num_layers=cfg_m.n_layers,
                token_dim=cfg_m.dim,
                num_attention_heads=cfg_m.n_heads,
            )
        else:
            # BERT / RoBERTa / MiniLM can auto-detect layers & dims
            prompt_cfg = PromptTuningConfig(
                task_type='QUESTION_ANS',
                num_virtual_tokens=num_virtual,
            )


        model = get_peft_model(model, prompt_cfg)

        #for n, p in model.named_parameters():
        #    if 'bias' in n: p.requires_grad=True

        for n,p in model.named_parameters() :
            if n.startswith("base_model.model.qa_outputs") or n.startswith("prompt_encoder"):
                p.requires_grad = True
    model.to(device)
    return model, tokenizer



## 2.3 Model evaluation function

In [32]:
MAX_LEN = 384
def evaluate_model(model_name, strategy, df, metric, device, max_len=MAX_LEN):
    # This used for evaluating the model on the validation set after each epoch
    # Evaluating a pretrain model will be discussed in anothersection below
    suffix = _suffix(strategy)
    out_dir = f"best_{model_name.replace('/', '_')}_{suffix}"

    if strategy == 'prompt':
        # Load base model and adapter separately for prompt tuning
        base_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        config = PromptTuningConfig.from_pretrained(out_dir)
        model = get_peft_model(base_model, config)
    else:
        model = AutoModelForQuestionAnswering.from_pretrained(out_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name if strategy == 'prompt' else out_dir)
    model.to(device).eval()

    preds, refs = [], []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Evaluating {suffix}"):
        row_id = row['id'] if 'id' in row.index else int(row.name)
        inputs = tokenizer(row['question'], row['context'], max_length=max_len, truncation=True,
                           padding='max_length', return_tensors='pt').to(device)
        with torch.no_grad(): out = model(**inputs)
        s, e = out.start_logits.argmax(dim=-1).item(), out.end_logits.argmax(dim=-1).item()
        pred = tokenizer.decode(inputs['input_ids'][0, s:e+1], skip_special_tokens=True)
        preds.append({'id': row_id, 'prediction_text': pred})
        refs.append({'id': row_id, 'answers': {'text': [row['answer_text']], 'answer_start': [row['answer_start']]}})

    results = metric.compute(predictions=preds, references=refs)
    print(f"{suffix} results: {results}")
    return results

import evaluate
metric = evaluate.load("squad")

# 3. `BERT` Model

## 3.1 Download model

In [33]:
model_name = 'bert-base-uncased'


## 3.2 LoRA tuning

In [34]:
# 1) Prepare model
model, tokenizer = prepare_model(model_name, 'lora')
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'lora', model_name, val_df)

                                                                     

Epoch 1 - Average loss: 5.7621
Saved best checkpoint to best_bert-base-uncased_lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 49.31it/s]

lora results: {'exact_match': 0.0, 'f1': 6.381351365989014}
Epoch 1 — Validation → EM: 0.00%, F1: 6.38%





## 3.3 BitFit Tuning

In [35]:
model, tokenizer = prepare_model(model_name, 'bitfit')
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'bitfit', model_name, val_df)

                                                                     

Epoch 1 - Average loss: 5.9259
Saved best checkpoint to best_bert-base-uncased_bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 59.10it/s]

bitfit results: {'exact_match': 0.0, 'f1': 2.1911436037154686}
Epoch 1 — Validation → EM: 0.00%, F1: 2.19%





## 3.4 Prompt Tuning

In [36]:
model, tokenizer = prepare_model(model_name, 'prompt')
# print_prompt_tunable_params(model)

# total training steps
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'prompt', model_name, val_df)

125


                                                                     

Epoch 1 - Average loss: 6.1288
Saved best checkpoint to best_bert-base-uncased_prompt


Evaluating prompt: 100%|██████████| 200/200 [00:04<00:00, 49.98it/s]


prompt results: {'exact_match': 0.0, 'f1': 2.6717809426750723}
Epoch 1 — Validation → EM: 0.00%, F1: 2.67%


In [37]:

# model, tokenizer = prepare_model(model_name, 'prompt')
# print_prompt_tunable_params(model)
# # total training steps
# loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
# LEARNING_RATE = 2e-4
# train_and_save(model, tokenizer, loader, 'prompt', model_name, val_df)
# LEARNING_RATE = 2e-4

## 3.5 Evaluation

In [38]:
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating strategy: {strat or 'full'}")
    evaluate_model(model_name, strat, val_df, metric, device)

Evaluating strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 49.08it/s]


lora results: {'exact_match': 0.0, 'f1': 6.381351365989014}
Evaluating strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 59.14it/s]


bitfit results: {'exact_match': 0.0, 'f1': 2.1911436037154686}
Evaluating strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:03<00:00, 50.27it/s]

prompt results: {'exact_match': 0.0, 'f1': 1.0006697614835673}





# 4. `RoBERTa` Model

 ## 4.1 Download model

In [39]:
model_name = 'roberta-base'

 ## 4.2 LoRA tuning

In [40]:
model, tokenizer = prepare_model(model_name, 'lora')
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'lora', model_name, val_df)


                                                                     

Epoch 1 - Average loss: 5.7536
Saved best checkpoint to best_roberta-base_lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 48.70it/s]

lora results: {'exact_match': 1.0, 'f1': 5.469690691201836}
Epoch 1 — Validation → EM: 1.00%, F1: 5.47%





## 4.3 BitFit Tuning

In [41]:
model, tokenizer = prepare_model(model_name, 'bitfit')
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'bitfit', model_name, val_df)

                                                                     

Epoch 1 - Average loss: 5.8349
Saved best checkpoint to best_roberta-base_bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 58.61it/s]

bitfit results: {'exact_match': 1.0, 'f1': 5.268477570432082}
Epoch 1 — Validation → EM: 1.00%, F1: 5.27%





## 4.4 Prompt Tuning

In [42]:
model, tokenizer = prepare_model(model_name, 'prompt')
# print_prompt_tunable_params(model)

# total training steps
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'prompt', model_name, val_df)

126


                                                                     

Epoch 1 - Average loss: 6.2733
Saved best checkpoint to best_roberta-base_prompt


Evaluating prompt: 100%|██████████| 200/200 [00:03<00:00, 50.21it/s]


prompt results: {'exact_match': 0.0, 'f1': 0.09090909090909091}
Epoch 1 — Validation → EM: 0.00%, F1: 0.09%


## 4.5 Evaluation

In [43]:
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating strategy: {strat or 'full'}")
    evaluate_model(model_name, strat, val_df, metric, device)

Evaluating strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 48.76it/s]


lora results: {'exact_match': 1.0, 'f1': 5.469690691201836}
Evaluating strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 58.63it/s]


bitfit results: {'exact_match': 1.0, 'f1': 5.268477570432082}
Evaluating strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:03<00:00, 50.57it/s]

prompt results: {'exact_match': 0.0, 'f1': 0.17669172932330826}





# 5. `distll-bert` Model

## 5.1 Download model (call prepare_model)

In [44]:
model_name = "distilbert-base-uncased"
tokenizer     = AutoTokenizer.from_pretrained(model_name)
train_ds      = QADataset(train_df, tokenizer, max_length=384)
train_loader  = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
train_steps = len(train_loader) * EPOCHS

 ## 5.2 LoRA tuning

In [45]:
model, tokenizer = prepare_model(model_name, 'lora')
loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
train_and_save(model, tokenizer, loader, 'lora', model_name, val_df)

                                                                     

Epoch 1 - Average loss: 5.7967
Saved best checkpoint to best_distilbert-base-uncased_lora


Evaluating lora: 100%|██████████| 200/200 [00:02<00:00, 85.41it/s]

lora results: {'exact_match': 0.0, 'f1': 4.40384368797905}
Epoch 1 — Validation → EM: 0.00%, F1: 4.40%





## 5.3 BitFit Tuning

In [46]:
  model, tokenizer = prepare_model(model_name, 'bitfit')
  loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
  train_and_save(model, tokenizer, loader, 'bitfit', model_name, val_df)

                                                                     

Epoch 1 - Average loss: 5.9480
Saved best checkpoint to best_distilbert-base-uncased_bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:01<00:00, 107.42it/s]


bitfit results: {'exact_match': 0.0, 'f1': 3.63547762844776}
Epoch 1 — Validation → EM: 0.00%, F1: 3.64%


## 5.4 Prompt Tuning

In [47]:
 model, tokenizer = prepare_model(model_name, 'prompt')
 loader = DataLoader(QADataset(train_df, tokenizer), batch_size=BATCH_SIZE, shuffle=True)
 train_and_save(model, tokenizer, loader, 'prompt', model_name, val_df)

125


                                                                     

Epoch 1 - Average loss: 6.2105
Saved best checkpoint to best_distilbert-base-uncased_prompt


Evaluating prompt: 100%|██████████| 200/200 [00:02<00:00, 91.43it/s]

prompt results: {'exact_match': 0.0, 'f1': 0.0}
Epoch 1 — Validation → EM: 0.00%, F1: 0.00%





## 5.5 Evaluation

In [48]:
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating MiniLM strategy: {strat or 'full'}")
    evaluate_model(model_name, strat, val_df, metric, device)

Evaluating MiniLM strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:02<00:00, 84.37it/s]


lora results: {'exact_match': 0.0, 'f1': 4.40384368797905}
Evaluating MiniLM strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:01<00:00, 108.29it/s]


bitfit results: {'exact_match': 0.0, 'f1': 3.63547762844776}
Evaluating MiniLM strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:02<00:00, 91.83it/s]

prompt results: {'exact_match': 0.0, 'f1': 0.44537120920661943}





# 6. Evaluating pretrain models

In [49]:

qa_parameters_path = kagglehub.dataset_download('kiwiresting/qa-parameters')
def evaluate_pretrained_model(model_name, strategy, df, metric, device, max_len=MAX_LEN):
    suffix = _suffix(strategy)
    out_dir = qa_parameters_path + f"/best_{model_name.replace('/', '_')}_{suffix}"


    if strategy == 'prompt':
        # Load base model and adapter separately for prompt tuning
        base_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        config = PromptTuningConfig.from_pretrained(out_dir)
        model = get_peft_model(base_model, config)
    else:
        # For full finetuning, bitfit, or LoRA
        model = AutoModelForQuestionAnswering.from_pretrained(out_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name if strategy == 'prompt' else out_dir)
    model.to(device).eval()

    preds, refs = [], []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Evaluating {suffix}"):
        row_id = row['id'] if 'id' in row.index else int(row.name)
        inputs = tokenizer(row['question'], row['context'], max_length=max_len, truncation=True,
                           padding='max_length', return_tensors='pt').to(device)
        with torch.no_grad(): out = model(**inputs)
        s, e = out.start_logits.argmax(dim=-1).item(), out.end_logits.argmax(dim=-1).item()
        pred = tokenizer.decode(inputs['input_ids'][0, s:e+1], skip_special_tokens=True)
        preds.append({'id': row_id, 'prediction_text': pred})
        refs.append({'id': row_id, 'answers': {'text': [row['answer_text']], 'answer_start': [row['answer_start']]}})

    results = metric.compute(predictions=preds, references=refs)
    print(f"{suffix} results: {results}")
    return results


qa_parameters_path

'/kaggle/input/qa-parameters'

## 6.1 BERT Model

In [53]:
model_name = 'bert-base-uncased'
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating strategy: {strat or 'full'}")
    evaluate_pretrained_model(model_name, strat, val_df, metric, device)

Evaluating strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 48.35it/s]


lora results: {'exact_match': 66.0, 'f1': 77.02199875796069}
Evaluating strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 58.38it/s]


bitfit results: {'exact_match': 53.0, 'f1': 64.57645462178297}
Evaluating strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:03<00:00, 50.13it/s]

prompt results: {'exact_match': 0.0, 'f1': 2.56462783448008}





## 6.2 RoBERTa Model

In [54]:
model_name = 'roberta-base'
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating strategy: {strat or 'full'}")
    evaluate_pretrained_model(model_name, strat, val_df, metric, device)

Evaluating strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:04<00:00, 48.61it/s]


lora results: {'exact_match': 86.0, 'f1': 89.30238095238093}
Evaluating strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:03<00:00, 58.63it/s]


bitfit results: {'exact_match': 80.0, 'f1': 84.73809523809523}
Evaluating strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:03<00:00, 50.24it/s]

prompt results: {'exact_match': 0.0, 'f1': 0.0}





## 6.3 `distll-bert` Model

In [56]:
model_name = "distilbert-base-uncased"
for strat in ['lora', 'bitfit', 'prompt']:
    print(f"Evaluating MiniLM strategy: {strat or 'full'}")
    evaluate_pretrained_model(model_name, strat, val_df, metric, device)

Evaluating MiniLM strategy: lora


Evaluating lora: 100%|██████████| 200/200 [00:02<00:00, 84.74it/s]


lora results: {'exact_match': 67.5, 'f1': 76.60000674968785}
Evaluating MiniLM strategy: bitfit


Evaluating bitfit: 100%|██████████| 200/200 [00:01<00:00, 107.52it/s]


bitfit results: {'exact_match': 48.0, 'f1': 59.92702455403676}
Evaluating MiniLM strategy: prompt


Evaluating prompt: 100%|██████████| 200/200 [00:02<00:00, 91.74it/s]

prompt results: {'exact_match': 0.0, 'f1': 0.0}



