<a href="https://colab.research.google.com/github/rohitptnk/llm-finetuning-and-quantization/blob/main/Colab_File.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ----- ALL MODEL EVALUATION -----

In [1]:
!pip install -q transformers datasets evaluate scikit-learn

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## QLORA

In [2]:
!pip install -q -U "bitsandbytes" "peft" "accelerate" "evaluate"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!git clone https://github.com/rohitptnk/llm-finetuning-and-quantization.git
!cd llm-finetuning-and-quantization

Cloning into 'llm-finetuning-and-quantization'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 44 (delta 8), reused 31 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (44/44), 325.64 KiB | 21.71 MiB/s, done.
Resolving deltas: 100% (8/8), done.
Filtering content: 100% (4/4), 1011.02 MiB | 12.72 MiB/s, done.


In [None]:
qlora_path = "/content/llm-finetuning-and-quantization/QLORA"

from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

# Loading model
from peft import PeftModel
dataset = load_dataset("dair-ai/emotion", "split")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
num_labels = len(set(dataset["train"]["label"]))
base_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    load_in_4bit=True,
    device_map="auto",
)


qlora_model = PeftModel.from_pretrained(base_model, qlora_path)
qlora_model.eval()

# Loading Data
ds = load_dataset("dair-ai/emotion", "split")
MAX_LEN = 256

def encode(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

encoded = ds.map(encode, batched=True)
encoded.set_format("torch", columns=["input_ids","attention_mask","label"])

# Running Evaluation
test_loader = DataLoader(encoded["test"], batch_size=32)
all_preds = []
all_labels = []
qlora_model.eval()

for batch in test_loader:
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    labels = batch["label"].numpy()

    with torch.no_grad():
        logits = qlora_model(input_ids=input_ids,
                       attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1).cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

# ----- Metrics -----
print("============================= QLORA Report =============================")
# Accuaracy
acc = accuracy_score(all_labels, all_preds)
print("Accuracy:", acc)

# Macro F1 + Per-class F1
macro_f1 = f1_score(all_labels, all_preds, average="macro")
print("Macro F1:", macro_f1)
print("\nPer-class F1:")
print(classification_report(all_labels, all_preds, digits=4))

#Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# Model Size
import os
model_path = qlora_path
size_mb = os.path.getsize("/content/llm-finetuning-and-quantization/QLORA/adapter_model.safetensors") / 1e6
print("Model size (MB):", size_mb)

# Latency
import time
example = encoded["test"][0]
inputs = {
    "input_ids": example["input_ids"].unsqueeze(0).to("cuda"),
    "attention_mask": example["attention_mask"].unsqueeze(0).to("cuda")
}
N = 100
start = time.time()
with torch.no_grad():
    for _ in range(N):
        _ = qlora_model(**inputs)
end = time.time()
lat_ms = (end - start) / N * 1000
print("Latency (ms/example):", lat_ms)

## Full Finetune Report

In [None]:
finetune_path = "/content/llm-finetuning-and-quantization/full-finetune"

from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Loading model

tokenizer = BertTokenizerFast.from_pretrained(finetune_path)
model = BertForSequenceClassification.from_pretrained(finetune_path)
model.eval().to("cuda")

# Loading Data
ds = load_dataset("dair-ai/emotion", "split")
MAX_LEN = 256

def encode(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

encoded = ds.map(encode, batched=True)
encoded.set_format("torch", columns=["input_ids","attention_mask","label"])

# Running Evaluation
test_loader = DataLoader(encoded["test"], batch_size=32)
all_preds = []
all_labels = []
model.eval()

for batch in test_loader:
    input_ids = batch["input_ids"].to("cuda")
    attention_mask = batch["attention_mask"].to("cuda")
    labels = batch["label"].numpy()

    with torch.no_grad():
        logits = model(input_ids=input_ids,
                       attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1).cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

# ----- Metrics -----
print("============================= Full Finetune Report =============================")
# Accuaracy
acc = accuracy_score(all_labels, all_preds)
print("Accuracy:", acc)

# Macro F1 + Per-class F1
macro_f1 = f1_score(all_labels, all_preds, average="macro")
print("Macro F1:", macro_f1)
print("\nPer-class F1:")
print(classification_report(all_labels, all_preds, digits=4))

#Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# Model Size
import os
size_mb = os.path.getsize("/content/llm-finetuning-and-quantization/full-finetune/model.safetensors") / 1e6
print("Model size (MB):", size_mb)

# Latency
import time
example = encoded["test"][0]
inputs = {
    "input_ids": example["input_ids"].unsqueeze(0).to("cuda"),
    "attention_mask": example["attention_mask"].unsqueeze(0).to("cuda")
}
N = 100
start = time.time()
with torch.no_grad():
    for _ in range(N):
        _ = model(**inputs)
end = time.time()
lat_ms = (end - start) / N * 1000
print("Latency (ms/example):", lat_ms)

## PTQ Report

In [7]:
ptq_path = "/content/llm-finetuning-and-quantization/PTQ/bert_ptq.pth"

from transformers import BertForSequenceClassification, BertTokenizerFast
import torch

tokenizer = BertTokenizerFast.from_pretrained(finetune_path)
model_fp32 = BertForSequenceClassification.from_pretrained(finetune_path)

import torch.quantization as tq
model_int8 = tq.quantize_dynamic(
    model_fp32,
    {torch.nn.Linear},
    dtype=torch.qint8
)

state_dict = torch.load(ptq_path, map_location="cpu")
model_int8.load_state_dict(state_dict)
model_int8.eval()


from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix, classification_report
)
import numpy as np
ds = load_dataset("dair-ai/emotion", "split")
test_texts = ds["test"]["text"]
test_labels = ds["test"]["label"]
preds = []
for text in test_texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model_int8(**inputs).logits
        preds.append(logits.argmax(dim=-1).item())

print("========================= PTQ REPORT ==================================")
print("Accuracy:", accuracy_score(test_labels, preds))
print("Macro F1:", f1_score(test_labels, preds, average="macro"))
print("\nPer-class F1:\n", classification_report(test_labels, preds))
print("\nConfusion Matrix:\n", confusion_matrix(test_labels, preds))

import os
size_mb = os.path.getsize(ptq_path) / 1e6
print("PTQ model size (MB):", size_mb)

import time
example = encoded["test"][0]
inputs = {
    "input_ids": example["input_ids"].unsqueeze(0).to("cpu"),
    "attention_mask": example["attention_mask"].unsqueeze(0).to("cpu")
}
N = 100
start = time.time()
with torch.no_grad():
    for _ in range(N):
        _ = model_int8(**inputs)
end = time.time()
print("PTQ latency (ms/example):", (end - start)/N * 1000)

For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_int8 = tq.quantize_dynamic(
  device=storage.device,


Accuracy: 0.926
Macro F1: 0.8733927330584225

Per-class F1:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       581
           1       0.91      0.98      0.94       695
           2       0.90      0.72      0.80       159
           3       0.98      0.88      0.93       275
           4       0.87      0.95      0.91       224
           5       0.93      0.56      0.70        66

    accuracy                           0.93      2000
   macro avg       0.92      0.84      0.87      2000
weighted avg       0.93      0.93      0.92      2000


Confusion Matrix:
 [[566   9   0   1   5   0]
 [  2 680  12   0   0   1]
 [  0  45 114   0   0   0]
 [ 18   5   0 243   9   0]
 [  6   0   0   4 212   2]
 [  3   9   0   0  17  37]]
PTQ model size (MB): 181.497448
PTQ latency (ms/example): 395.4185175895691


## QAT Report

In [None]:
qat_model_path = "/content/llm-finetuning-and-quantization/QAT"

from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score, f1_score, confusion_matrix, classification_report
)
import numpy as np



qat_model = BertForSequenceClassification.from_pretrained(qat_model_path)
tokenizer = BertTokenizerFast.from_pretrained(qat_model_path)
qat_model.eval()

import torch.quantization as tq

model_int8 = tq.quantize_dynamic(
    qat_model,
    {torch.nn.Linear},       # quantize only Linear layers
    dtype=torch.qint8
)


device = torch.device("cpu")
model_int8.to(device)
model_int8.eval()

# Loading Data
ds = load_dataset("dair-ai/emotion", "split")
MAX_LEN = 128

def encode(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN
    )

encoded = ds.map(encode, batched=True)
encoded.set_format("torch", columns=["input_ids","attention_mask","label"])
encoded["test"] = encoded["test"].select(range(100))

# Running Evaluation
test_loader = DataLoader(encoded["test"], batch_size=32)
all_preds = []
all_labels = []

for batch in test_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["label"].numpy()

    with torch.no_grad():
        logits = model_int8(input_ids=input_ids,
                       attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1).cpu().numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

# ----- Metrics -----
print("============================= QAT Report ============================")
# Accuaracy
acc = accuracy_score(all_labels, all_preds)
print("Accuracy:", acc)

# Macro F1 + Per-class F1
macro_f1 = f1_score(all_labels, all_preds, average="macro")
print("Macro F1:", macro_f1)
print("\nPer-class F1:")
print(classification_report(all_labels, all_preds, digits=4))

#Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# Model Size
import os
# Path to the saved QAT model directory

size_mb = sum(os.path.getsize(os.path.join(qat_model_path, f))
              for f in os.listdir(qat_model_path)) / 1e6
print("Model size (MB):", size_mb)

# Latency
import time
example = encoded["test"][0]
inputs = {
    "input_ids": example["input_ids"].unsqueeze(0).to(device),
    "attention_mask": example["attention_mask"].unsqueeze(0).to(device)
}
N = 100
start = time.time()
with torch.no_grad():
    for _ in range(N):
        _ = model_int8(**inputs)
end = time.time()
lat_ms = (end - start) / N * 1000
print("Latency (ms/example):", lat_ms)


# Training Code

## 1. Baseline fine-tuning (FP32/FP16)

In [None]:
# %pip install torch transformers datasets evaluate -q

In [None]:
# from datasets import load_dataset

# ds = load_dataset("dair-ai/emotion", "split")

In [None]:
# ds

In [None]:
# ds['train'][0]

In [None]:
# from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
# import evaluate
# import numpy as np

# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# # Tokenize
# def tokenize_function(examples):
#     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

# tokenized_ds = ds.map(tokenize_function, batched=True)

# tokenized_ds = tokenized_ds.rename_column("label", "labels")
# # Convert to PyTorch
# tokenized_ds.set_format("torch")

# num_labels = len(set(ds['train']['label']))
# # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

In [None]:
# import transformers
# print(transformers.__version__)

In [None]:
# accuracy = evaluate.load("accuracy")

# def compute_metrics(p):
#   preds = np.argmax(p.predictions, axis=1)
#   return accuracy.compute(predictions=preds, references=p.label_ids)

# # training_args = TrainingArguments(
# #     output_dir="/content/drive/MyDrive/Colab Notebooks/NLP Asg 2/bert-emotion",
# #     report_to="none",
# #     eval_strategy="epoch",
# #     save_strategy="epoch",
# #     learning_rate=2e-5,
# #     per_device_train_batch_size=16,
# #     per_device_eval_batch_size=16,
# #     num_train_epochs=3,
# #     weight_decay=0.01,
# #     load_best_model_at_end=True,
# #     logging_dir="/content/drive/MyDrive/Colab Notebooks/NLP Asg 2/logs"
# # )

# # trainer = Trainer(
# #     model=model,
# #     args=training_args,
# #     train_dataset=tokenized_ds["train"],
# #     eval_dataset=tokenized_ds["validation"],
# #     tokenizer=tokenizer,
# #     compute_metrics=compute_metrics,
# # )

In [None]:
# trainer.train()

In [None]:
# trainer.evaluate()

In [None]:
# trainer.save_model("/content/drive/MyDrive/Colab Notebooks/NLP Asg 2/bert-emotion-finetuned")
# tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/NLP Asg 2/bert-emotion-finetuned")

In [None]:
# preds = trainer.predict(tokenized_ds["test"])
# print(preds.metrics)

## 2. Quantization

Loading Previous Model

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/NLP Asg 2/bert-emotion-finetuned")
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model.eval()

Preparing for PTQ

In [None]:
# import torch
# from torch.quantization import quantize_dynamic

# quantized_model = quantize_dynamic(
#     model,
#     {torch.nn.Linear},
#     dtype=torch.qint8
# )

Saving Quantized Models

In [None]:
# torch.save(quantized_model.state_dict(), "/content/drive/MyDrive/NLP Asg 2/PTQ/bert_ptq.pth")
# import os
# print("PTQ Model Size (MB):", os.path.getsize("/content/drive/MyDrive/NLP Asg 2/PTQ/bert_ptq.pth") / 1e6)

Evaluate

In [None]:
# from datasets import load_dataset
# from sklearn.metrics import f1_score, accuracy_score

# ds = load_dataset("dair-ai/emotion", "split")

# test_texts = ds["test"]["text"]
# test_labels = ds["test"]["label"]

# preds = []
# for text in test_texts:
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
#     with torch.no_grad():
#         logits = quantized_model(**inputs).logits
#         preds.append(logits.argmax(dim=-1).item())

# macro_f1 = f1_score(test_labels, preds, average="macro")
# acc = accuracy_score(test_labels, preds)

# print("PTQ Macro F1:", macro_f1)
# print("PTQ Accuracy:", acc)


Measuring Latency

In [None]:
# import time

# inputs = tokenizer("hello world", return_tensors="pt")

# start = time.time()
# for _ in range(100):
#     quantized_model(**inputs)
# end = time.time()

# print("Latency (ms per inference):", (end - start)/100 * 1000)


Confusion Matrix + Per-Class F1

In [None]:
# from sklearn.metrics import classification_report, confusion_matrix
# import numpy as np

# print(classification_report(test_labels, preds, digits=4))
# print(confusion_matrix(test_labels, preds))


## 3. QAT

Load Baseline Model

In [None]:
# from transformers import BertForSequenceClassification, BertTokenizer

# model = BertForSequenceClassification.from_pretrained(
#     "/content/drive/MyDrive/NLP Asg 2/bert-emotion-finetuned",
#     local_files_only=True
# )

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Freeze Lower Layers

In [None]:
# for name, param in model.bert.named_parameters():
#     if "layer." in name:
#         layer_num = int(name.split("layer.")[1].split(".")[0])
#         if layer_num < 9:
#             param.requires_grad = False

Prepare Model for QAT

In [None]:
# import torch
# import torch.ao.quantization as tq
# model.train()

# qat_config = tq.get_default_qat_qconfig("fbgemm")
# def apply_qconfig_to_linear_only(module):
#     for name, child in module.named_children():
#         if isinstance(child, torch.nn.Linear):
#             child.qconfig = qat_config
#         else:
#             child.qconfig = None
#         apply_qconfig_to_linear_only(child)

# apply_qconfig_to_linear_only(model)


# model_prepared = tq.prepare_qat(model)


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_prepared = tq.prepare_qat(model)


Fine-Tune for QAT

In [None]:
# from datasets import load_dataset
# from torch.utils.data import DataLoader

# ds = load_dataset("dair-ai/emotion", "split")

# def encode(batch):
#     return tokenizer(batch["text"],
#                      truncation=True,
#                      padding="max_length",
#                      max_length=128)

# encoded = ds.map(encode, batched=True)
# encoded.set_format("torch", columns=["input_ids","attention_mask","label"])

# train_loader = DataLoader(encoded["train"], batch_size=16, shuffle=True)


In [None]:
# optimizer = torch.optim.AdamW(model_prepared.parameters(), lr=1e-5)

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_prepared.to(device)

# for epoch in range(2):
#     total_loss = 0
#     for batch in train_loader:
#         batch = {k: v.to(device) for k, v in batch.items()}

#         outputs = model_prepared(
#             input_ids=batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             labels=batch["label"]
#         )

#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         optimizer.zero_grad()

#         total_loss += loss.item()

#     print("Epoch:", epoch, "Loss:", total_loss)


Epoch: 0 Loss: 178.73156312759966
Epoch: 1 Loss: 130.9577564052306


Convert QAT model to int8

In [None]:
# After training QAT
# model_prepared.cpu()
# model_prepared.eval()

# save_dir = "/content/drive/MyDrive/NLP Asg 2/QAT"
# model_prepared.save_pretrained(save_dir)
# tokenizer.save_pretrained(save_dir)


('/content/drive/MyDrive/NLP Asg 2/QAT/tokenizer_config.json',
 '/content/drive/MyDrive/NLP Asg 2/QAT/special_tokens_map.json',
 '/content/drive/MyDrive/NLP Asg 2/QAT/vocab.txt',
 '/content/drive/MyDrive/NLP Asg 2/QAT/added_tokens.json')

## 4. QLORA

In [None]:
# !pip install -q -U "bitsandbytes" "peft" "accelerate" "evaluate"

dataset and tokenizer

In [None]:
# from datasets import load_dataset
# from transformers import AutoTokenizer

# dataset = load_dataset("dair-ai/emotion", "split")

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# MAX_LEN = 128

# def tokenize_fn(batch):
#     return tokenizer(
#         batch["text"],
#         truncation=True,
#         padding="max_length",
#         max_length=MAX_LEN,
#     )

# tokenized = dataset.map(tokenize_fn, batched=True)
# tokenized = tokenized.rename_column("label", "labels")
# tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


BERT in 4-bit and wrap with QLoRA

In [None]:
# import torch
# from transformers import AutoModelForSequenceClassification
# from peft import LoraConfig, get_peft_model

# num_labels = len(set(dataset["train"]["label"]))

# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "bert-base-uncased",
#     num_labels=num_labels,
#     load_in_4bit=True,          # 4-bit base weights
#     device_map="auto",
# )

# lora_config = LoraConfig(
#     r=8,                        # rank
#     lora_alpha=16,
#     lora_dropout=0.1,
#     bias="none",
#     task_type="SEQ_CLS",
# )

# model = get_peft_model(base_model, lora_config)
# model.print_trainable_parameters()



Trainer (QLoRA fine-tuning)

In [None]:
# from transformers import TrainingArguments, Trainer
# import evaluate
# import numpy as np

# accuracy = evaluate.load("accuracy")
# f1 = evaluate.load("f1")

# def compute_metrics(p):
#     preds = np.argmax(p.predictions, axis=1)
#     acc = accuracy.compute(predictions=preds, references=p.label_ids)
#     f1_macro = f1.compute(predictions=preds, references=p.label_ids, average="macro")
#     return {"accuracy": acc["accuracy"], "macro_f1": f1_macro["f1"]}

# # training_args = TrainingArguments(
# #     output_dir="/content/drive/MyDrive/NLP Asg 2/QLORA/",
# #     report_to="none",
# #     eval_strategy="epoch",
# #     save_strategy="epoch",
# #     learning_rate=2e-4,
# #     per_device_train_batch_size=16,
# #     per_device_eval_batch_size=16,
# #     num_train_epochs=2,
# #     weight_decay=0.01,
# #     load_best_model_at_end=True,
# #     logging_steps=50,
# # )

# # trainer = Trainer(
# #     model=model,
# #     args=training_args,
# #     train_dataset=tokenized["train"],
# #     eval_dataset=tokenized["validation"],
# #     compute_metrics=compute_metrics,
# # )

# # trainer.train()


Save

In [None]:
# save_dir = "/content/drive/MyDrive/NLP Asg 2/QLORA FINAL/"

# # model.save_pretrained(save_dir)         # saves only LoRA adapter weights
# # tokenizer.save_pretrained(save_dir)

QLoRA model for evaluation on test set

In [None]:
# save_dir = "/content/drive/MyDrive/NLP Asg 2/QLORA FINAL/"
# from peft import PeftModel

# base_model = AutoModelForSequenceClassification.from_pretrained(
#     "bert-base-uncased",
#     num_labels=num_labels,
#     load_in_4bit=True,
#     device_map="auto",
# )

# qlora_model = PeftModel.from_pretrained(base_model, save_dir)
# qlora_model.eval()


evaluate

In [None]:
# from torch.utils.data import DataLoader
# from sklearn.metrics import classification_report, confusion_matrix

# test_loader = DataLoader(tokenized["test"], batch_size=32)

# all_preds, all_labels = [], []

# for batch in test_loader:
#     with torch.no_grad():
#         outputs = qlora_model(
#             input_ids=batch["input_ids"].to(qlora_model.device),
#             attention_mask=batch["attention_mask"].to(qlora_model.device),
#         )
#     preds = outputs.logits.argmax(dim=-1).cpu().numpy()
#     labels = batch["labels"].numpy()
#     all_preds.extend(preds.tolist())
#     all_labels.extend(labels.tolist())

# print(classification_report(all_labels, all_preds, digits=4))
# print(confusion_matrix(all_labels, all_preds))


latency

In [None]:
# import time

# example = tokenized["test"][0]
# inputs = {
#     "input_ids": example["input_ids"].unsqueeze(0).to(qlora_model.device),
#     "attention_mask": example["attention_mask"].unsqueeze(0).to(qlora_model.device),
# }

# N = 100
# start = time.time()
# with torch.no_grad():
#     for _ in range(N):
#         _ = qlora_model(**inputs)
# end = time.time()
# print("QLoRA latency (ms/example):", (end - start) / N * 1000)
