<a href="https://colab.research.google.com/github/siyinggu/LLM_Finetuning/blob/main/QLoRA_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install adapters datasets transformers
!pip install accelerate
!pip install wandb
!pip install -i https://pypi.org/simple/ bitsandbytes

Looking in indexes: https://pypi.org/simple/


In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
import pandas as pd
import wandb
###Step1: Model Initialization
# Everything in this step runs at module level and in order: the wandb run is
# opened first, then the quantized base model is loaded, then the LoRA adapter
# is attached to it, and finally the 1-D parameters are upcast.

#1.1 Initialize a new wandb run
wandb.init(project="QLoRA_New", entity="siyinggu-nyu")

#1.2 Checkpoint name of the base model (DistilBERT, uncased)
modelpath = "distilbert-base-uncased"

# 4-bit quantization settings: NF4 quant type, double quantization enabled,
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
#1.3 Load the 4-bit quantized model with a 3-way classification head
model = AutoModelForSequenceClassification.from_pretrained(
    modelpath,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    num_labels=3,  # Update this if you have a different number of labels
    low_cpu_mem_usage=True
)
# NOTE(review): use_cache is normally a decoder/generation setting; it is
# probably a no-op for DistilBERT -- confirm before relying on it.
model.config.use_cache = False

#1.4 Initialize the LoRA config and attach the adapter
import adapters
from adapters import LoRAConfig

# Enable adapter support on the already-loaded model.
adapters.init(model)

# LoRA on the self-attention q/k/v matrices plus the intermediate and output
# layers, with rank r=64, alpha=16 and dropout 0.1.
config = LoRAConfig(
    selfattn_lora=True, intermediate_lora=True, output_lora=True,
    attn_matrices=["q", "k", "v"],
    alpha=16, r=64, dropout=0.1
)
model.add_adapter("assistant_adapter", config=config)
# Select "assistant_adapter" for training (presumably this freezes the base
# weights -- see the adapters documentation).
model.train_adapter("assistant_adapter")

print(model.adapter_summary())

#1.5 Upcast every 1-D parameter to float32
for param in model.parameters():
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)


class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose forward output is converted to float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then cast the result so callers
        # always receive a float32 tensor.
        result = super().forward(x)
        return result.to(torch.float32)
# Wrap the classification head so its output is always float32.
model.classifier = CastOutputToFloat(model.classifier)

print(model)

# Verifying the datatypes: count parameter elements per dtype, then print
# each dtype with its element count and its share of the total.
dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    dtypes[dtype] = dtypes.get(dtype, 0) + p.numel()
total = sum(dtypes.values())
for k, v in dtypes.items():
    print(k, v, v / total)

import os

###Step2: Data preprocessing and tokenize
# NOTE(review): load_metric is not referenced anywhere in the visible code and
# may no longer exist in newer `datasets` releases -- confirm and drop it if so.
from datasets import load_dataset, load_metric
# Load the dataset by its hub identifier. The printed summary of the cleaned
# copy shows train/validation/test splits with id, text, label and sentiment
# columns.
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

##2.1: Clean dataset
#: Function to clean text
import re
def clean_text(text):
    """Normalize one raw text value.

    Returns an empty string for ``None``. Otherwise the text has URLs
    removed, every non-word character replaced by a space, whitespace runs
    collapsed to a single space, and is then lowercased and stripped.
    """
    if text is None:
        return ""
    # Applied in order; each entry is (regex pattern, replacement).
    substitutions = (
        (r'http\S+', ''),  # remove URLs
        (r'\W', ' '),      # special characters become spaces
        (r'\s+', ' '),     # collapse runs of whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower().strip()

# Function to clean the entire dataset
def clean_dataset(dataset):
    """Return ``dataset`` with its ``text`` column passed through clean_text.

    ``dataset`` must provide a ``map`` method that accepts a per-example
    function; the result of that ``map`` call is returned.
    """
    def _with_clean_text(example):
        # Only the 'text' field is replaced.
        return {'text': clean_text(example['text'])}

    return dataset.map(_with_clean_text)

# Apply the text cleaning to the dataset and show the resulting structure.
cleaned_dataset = clean_dataset(dataset)
print(cleaned_dataset)


##2.2: Tokenize dataset
# NOTE(review): `device` is not referenced again in the visible code.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer that matches the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(modelpath)
# Ensure the tokenizer has a pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize function
def tokenize_func(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: padding every row to 512 tokens makes all rows the same length,
    which defeats ``group_by_length=True`` in the training arguments and
    adds padding work for short texts. Padding is left to the
    ``DataCollatorWithPadding`` passed to the trainer.

    Args:
        examples: A batch mapping whose ``'text'`` entry holds the strings
            to tokenize.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        max_length=512,
        truncation=True
    )

# Tokenize the training, validation and test splits (in that order) with the
# same batched tokenization function.
train_dataset, val_dataset, test_dataset = (
    cleaned_dataset[split_name].map(tokenize_func, batched=True)
    for split_name in ('train', 'validation', 'test')
)

# Copy each example's `label` value into a `labels` field
def format_labels(examples):
    """Add a ``labels`` entry holding the same value as ``label``.

    The mapping is modified in place and also returned.
    """
    label_values = examples['label']
    examples['labels'] = label_values
    return examples

# Add the `labels` column to all three tokenized splits.
train_dataset, val_dataset, test_dataset = (
    split.map(format_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


###Step3: Model Training and Evaluation
# Training configuration. max_steps bounds the run; each optimizer step
# accumulates 16 micro-batches of size 1.
# NOTE(review): warmup_ratio is set together with a "constant" scheduler;
# confirm whether warmup is actually applied with this scheduler type.
args = TrainingArguments(
    output_dir="output/distilbert_qlora",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    logging_steps=10,
    save_steps=500,
    eval_steps=187,
    save_total_limit=3,
    gradient_accumulation_steps=16,
    max_steps=1875,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    bf16=True,
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    report_to="wandb"  # Enable logging to wandb
)

# Record the hyperparameters on the active wandb run.
# `wandb.config = {...}` would only rebind the module attribute to a plain
# dict, so nothing would reach the run; `update` writes into the run's config.
# Values are read from `args` so the logged config matches what is actually
# trained, and the config objects are converted to plain dicts for logging.
wandb.config.update(
    {
        "learning_rate": args.learning_rate,
        "max_steps": args.max_steps,
        # Effective batch size per optimizer step on one device.
        "batch_size": (
            args.per_device_train_batch_size * args.gradient_accumulation_steps
        ),
        "model_name": modelpath,
        "quantization_config": bnb_config.to_dict(),
        "lora_config": config.to_dict(),
    },
    allow_val_change=True,
)

from adapters import AdapterTrainer
from transformers import DataCollatorWithPadding
from sklearn.metrics import roc_auc_score
import time
import psutil

# Collator handed to the trainer; it is built from the same tokenizer used to
# encode the datasets.
data_collator = DataCollatorWithPadding(tokenizer)

# Custom function to log time and memory usage
def log_time_memory():
    """Log the current timestamp and used virtual memory (in MB) to wandb."""
    timestamp = time.time()
    # psutil reports bytes; divide by 1024**2 to convert to MB.
    used_mb = psutil.virtual_memory().used / (1024 ** 2)
    wandb.log({"time": timestamp, "memory_usage": used_mb})

def metrics(eval_prediction):
    """Compute evaluation metrics from the trainer's predictions.

    Args:
        eval_prediction: A ``(logits, labels)`` pair. The logits are indexed
            as (sample, class), since softmax and argmax are taken over
            dimension 1.

    Returns:
        A dict with:
          * ``"accuracy"``: fraction of samples whose highest-scoring class
            equals the label.
          * ``"roc_auc"``: one-vs-rest ROC AUC computed from the softmax
            probabilities. This value was previously reported under the
            ``"accuracy"`` key, which mislabelled it.
    """
    logits, labels = eval_prediction
    logits_tensor = torch.tensor(logits)

    # True classification accuracy from the arg-max prediction.
    predictions = logits_tensor.argmax(dim=1)
    accuracy = (predictions == torch.as_tensor(labels)).float().mean().item()

    # Convert logits to probabilities for the AUC computation.
    probs = torch.nn.functional.softmax(logits_tensor, dim=1).numpy()
    auc_score = roc_auc_score(labels, probs, multi_class='ovr')

    return {"accuracy": accuracy, "roc_auc": auc_score}


# Build the trainer: training uses the train split, periodic evaluation uses
# the validation split, and `metrics` computes the evaluation metrics.
# NOTE(review): test_dataset is prepared earlier but is not evaluated in the
# visible code.
trainer = AdapterTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=args,
    compute_metrics=metrics
)

# Time and memory are logged once, before training starts.
log_time_memory()
trainer.train()
# Save to the trainer's output directory, then close the wandb run.
trainer.save_model()
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
memory_usage,▁
time,▁

0,1
memory_usage,2720.79688
time,1721475444.21123


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
assistant_adapter        lora              4,718,592      10.456       1       1
--------------------------------------------------------------------------------
Full model                                45,129,216     100.000               0
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinear4bit(
 

max_steps is given, it will override any value given in num_train_epochs


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})


Step,Training Loss,Validation Loss,Accuracy
187,0.7194,0.705315,0.8625
374,0.6868,0.678827,0.872419
561,0.7352,0.665283,0.876258
748,0.719,0.64589,0.88449
935,0.6198,0.653885,0.887733
1122,0.641,0.642718,0.88597
1309,0.6872,0.626735,0.890733
1496,0.6913,0.616361,0.893146
1683,0.6823,0.618754,0.894416


Step,Training Loss,Validation Loss,Accuracy
187,0.7194,0.705315,0.8625
374,0.6868,0.678827,0.872419
561,0.7352,0.665283,0.876258
748,0.719,0.64589,0.88449
935,0.6198,0.653885,0.887733
1122,0.641,0.642718,0.88597
1309,0.6872,0.626735,0.890733
1496,0.6913,0.616361,0.893146
1683,0.6823,0.618754,0.894416
1870,0.6345,0.618558,0.893388


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▃▄▆▇▆▇███
eval/loss,█▆▅▃▄▃▂▁▁▁
eval/runtime,▁█▃▄▄▄█▃▇▄
eval/samples_per_second,█▁▆▅▅▅▁▆▂▅
eval/steps_per_second,█▁▆▅▅▅▁▆▂▅
memory_usage,▁
time,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▁▃▆▄▁▅▇▅▃▃▂▄▅▆█▆▃▄▄▆▃▃▃▆▄▄▇▆▆▄▁▄▃▄▅▅▃▄▅▅

0,1
eval/accuracy,0.89339
eval/loss,0.61856
eval/runtime,206.8541
eval/samples_per_second,25.163
eval/steps_per_second,25.163
memory_usage,3408.83984
time,1721475488.81647
total_flos,4408958269440000.0
train/epoch,0.96055
train/global_step,1875.0


'\nfrom transformers import logging\nlogging.set_verbosity(logging.CRITICAL)\n\ndef prompt_model(model, text: str):\n    batch = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)\n    batch = batch.to(model.device)\n\n    model.eval()\n    with torch.inference_mode(), torch.cuda.amp.autocast():\n        outputs = model(**batch)\n\n    return outputs\n\nprint(prompt_model(model, "Explain Calculus to a primary school student"))\n\nmodel.merge_adapter("assistant_adapter")\nprint(prompt_model(model, "Explain NLP in simple terms"))\n'