<a href="https://colab.research.google.com/github/siyinggu/LLM_Finetuning/blob/main/LoRA_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Set Up Environment and Dependencies

In [1]:
!pip install peft
!pip install datasets
!pip install wandb

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.13.0->peft)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.13.0->peft)
  Using cached nvidia_cufft_cu12-11.

In [2]:
# Run this to enable peft library
import sys
sys.path.append("/kaggle/input/peft-main/src")
# Import all required library
import os
import time
import math
import numpy as np
import pandas as pd
import tqdm
import warnings
warnings.filterwarnings("ignore")
import torch
from sklearn.metrics import roc_auc_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM
)

from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)


# Step 2: Load and Prepare Dataset
## 2.1 Dataset preprocessing

In [3]:
from datasets import load_dataset, load_metric
dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")
# Function to clean text
import re
def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    return text.strip()

# Function to clean the entire dataset
def clean_dataset(dataset):
    dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})
    return dataset

cleaned_dataset = clean_dataset(dataset)
print(cleaned_dataset)

Downloading readme:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/601k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/586k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})


## 2.2 Train, Validation and Test dataset split

In [4]:
X_train = [example['text'] for example in cleaned_dataset['train']]
y_train = [example['label'] for example in cleaned_dataset['train']]
X_val = [example['text'] for example in cleaned_dataset['validation']]
y_val = [example['label'] for example in cleaned_dataset['validation']]
X_test = [example['text'] for example in cleaned_dataset['test']]
y_test = [example['label'] for example in cleaned_dataset['test']]

## 2.3 Tokenize dataset

In [5]:
import datasets
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Tokenize function
def tokenize_func(data):
    return tokenizer(
            data['texts'],
            max_length=512,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )
# Tokenize the Training Data
train_dataset = datasets.Dataset.from_pandas(pd.DataFrame({"texts":X_train,"labels":y_train}))
train_dataset = train_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["texts"]
)
train_dataset

# Tokenize the Validation Data
val_dataset = datasets.Dataset.from_pandas(pd.DataFrame({"texts":X_val,"labels":y_val}))
val_dataset = val_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["texts"]
)

val_dataset

# Tokenize the Test Data
test_dataset = datasets.Dataset.from_pandas(pd.DataFrame({"texts":X_test,"labels":y_test}))
test_dataset = test_dataset.map(
    tokenize_func,
    batched=True,
    remove_columns=["texts"]
)

test_dataset

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5206
})

# Step 3: Load Base Model (DistilBert)

In [6]:
# Define a function that can print the trainable parameters
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [7]:
# Model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


# Number of trainable parameters
# print(print_number_of_trainable_model_parameters(model))
# Load tokenizer and model
# tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Step 4: Apply and Fine-Tune LoRA
## 4.1 LoRA

In [8]:
from sklearn.metrics import roc_auc_score
import numpy as np
import wandb

# Initialize a new wandb run
wandb.init(project="LoRA_Evaluation", entity="siyinggu-nyu")
model_name = "distilbert-base-uncased"

# Define the LoRA Configuration
lora_config = LoraConfig(
    r=8, # Rank Number
    lora_alpha=32, # Alpha (Scaling Factor)
    lora_dropout=0.05, # Dropout Prob for Lora
    target_modules=["q_lin", "k_lin","v_lin"], # Which layer to apply LoRA, usually only apply on MultiHead Attention Layer
    bias='none',
    task_type=TaskType.SEQ_CLS # Seqence to Classification Task
)

# Set up wandb config
wandb.config = {
    "learning_rate": 5e-5,
    "epochs": 5,
    "batch_size": 16,
    "model_name": model_name,
    "prefix_config": lora_config
}

# Get our LoRA-enabled model
peft_model = get_peft_model(model, lora_config)

# Reduced trainble parameters
print(print_number_of_trainable_model_parameters(peft_model))

# Define helper functions to evaluate memory and parameter counts
def get_model_size(model):
    return sum(p.numel() for p in model.parameters())

def get_memory_usage():
    return torch.cuda.memory_allocated()

# Define Eval Metric
def metrics(eval_prediction):
    logits, labels = eval_prediction
    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=1).numpy()
    auc_score = roc_auc_score(labels, probs, multi_class='ovr')
    return {"Val-AUC": auc_score}

train_batch_size = 32
eval_batch_size = 32


def train_and_evaluate(model, tokenizer, train_dataset, val_dataset):
  # Define training Args
  peft_training_args = TrainingArguments(
      output_dir='./result-distilbert-lora',
      logging_dir='./logs-distilbert-lora',
      learning_rate=1e-4,
      per_device_train_batch_size=train_batch_size, # You can adjust this value based on your available GPU
      per_device_eval_batch_size=eval_batch_size, # You can adjust this value based on your available GPU
      num_train_epochs=5,
      logging_steps=10,
      evaluation_strategy='steps',
      eval_steps=10,
      weight_decay=0.01,
      seed=42,
      fp16=True, # Only use with GPU
      report_to="wandb"
  )


  # Define Optimizer
  optimizer = torch.optim.AdamW(peft_model.parameters(), lr=1e-4)

  # Define Scheduler
  n_epochs = peft_training_args.num_train_epochs
  total_steps = n_epochs * (len(train_dataset) // train_batch_size)
  lr_scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=total_steps
  )

  # Data Collator
  collator = DataCollatorWithPadding(
      tokenizer=tokenizer,
      padding="longest"
  )


  # Define Trainer
  peft_trainer = Trainer(
      model=peft_model,
      args=peft_training_args,
      train_dataset=train_dataset, # Training Data
      eval_dataset=val_dataset, # Evaluation Data
      tokenizer=tokenizer,
      compute_metrics=metrics,
      optimizers=(optimizer, lr_scheduler),
      data_collator=collator
  )
  print(f"Total Steps: {total_steps}")


  start_time = time.time()
  peft_trainer.train()
  train_time = time.time() - start_time

  eval_results = peft_trainer.evaluate()
  model_size = get_model_size(model)
  memory_usage = get_memory_usage()

  # Path to save the fine-tuned model
  peft_model_path = "/drive/MyDrive/Capstone"
  # Save the fine-tuned model
  peft_trainer.model.save_pretrained(peft_model_path)
  tokenizer.save_pretrained(peft_model_path)
  return eval_results, model_size, memory_usage, train_time, peft_trainer



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


trainable model parameters: 814083
all model parameters: 67769862
percentage of trainable model parameters: 1.20%


In [9]:
eval_results_lora, model_size_lora, memory_usage_lora, train_time_lora, trained_model = train_and_evaluate(peft_model, tokenizer, train_dataset, val_dataset)


Total Steps: 4880




Step,Training Loss,Validation Loss,Val-auc
10,1.1046,1.081469,0.670455
20,1.0837,1.071241,0.7358
30,1.0691,1.043978,0.761957
40,1.0359,0.99902,0.776136
50,0.9869,0.932136,0.798946
60,0.8729,0.848706,0.817761
70,0.8797,0.797354,0.817834
80,0.731,0.779169,0.831788
90,0.7975,0.757877,0.838016
100,0.7415,0.744539,0.84368


In [14]:
print(f"LoRA - Accuracy: {eval_results_lora['eval_Val-AUC']}, Model Size: {model_size_lora}, Memory Usage: {memory_usage_lora}, Training Time: {train_time_lora}")
print(eval_results_lora)
# Finish the wandb run
wandb.finish()

LoRA - Accuracy: 0.8971641327260769, Model Size: 67769862, Memory Usage: 295771136, Training Time: 19281.034651994705
{'eval_loss': 0.6054931282997131, 'eval_Val-AUC': 0.8971641327260769, 'eval_runtime': 33.6084, 'eval_samples_per_second': 154.872, 'eval_steps_per_second': 4.85, 'epoch': 5.0}


VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/Val-AUC,▁▆▆▆▇▇▇▇▇▇▇▇████████████████████████████
eval/loss,█▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▂▁▂▂▁▂▂▂▂▃▂▃▃▃▃▃▄▃▃▄▄▄▄▄▅▅▅▅▆▅▅▇▆▆▆▇▇▇█
eval/samples_per_second,█▇█▇▇█▇▇▇▆▆▇▆▆▆▆▆▅▆▆▅▅▅▅▅▄▄▄▄▃▄▄▂▃▃▃▂▂▂▁
eval/steps_per_second,█▇█▇▇█▇▇▇▆▆▇▆▆▆▆▆▅▆▆▅▅▅▅▅▄▄▄▄▃▄▄▂▃▃▃▂▂▂▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▁█▃▆▅▂▃▄▆▆▃▃▄▅▄▄▂▄▃▃▃▅▅▅▄▂▆▂▅▃▃▇▄▅▄▆▆▃▄▇
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▄▃▃▄▃▃▃▂▃▂▂▃▃▂▃▂▁▃▂▃▃▁▂▃▁▁▃▁▂▁▂▂▁▁▁▃▂▂▂

0,1
eval/Val-AUC,0.89716
eval/loss,0.60549
eval/runtime,33.6084
eval/samples_per_second,154.872
eval/steps_per_second,4.85
total_flos,2.107701264384e+16
train/epoch,5.0
train/global_step,4880.0
train/grad_norm,3.43978
train/learning_rate,0.0


In [12]:
eval_results_lora

{'eval_loss': 0.6054931282997131,
 'eval_Val-AUC': 0.8971641327260769,
 'eval_runtime': 33.6084,
 'eval_samples_per_second': 154.872,
 'eval_steps_per_second': 4.85,
 'epoch': 5.0}

In [15]:
print('Done')


Done


In [17]:
!pip install huggingface.huk > /dev/null 2>&1
from huggingface_hub import notebook_login
notebook_login()
trained_model.push_to_hub("LoRA")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/271M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Siyinggu/LoRA/commit/4ddf00b46806f72d9f8dcfaa9704bc37ad014b11', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='4ddf00b46806f72d9f8dcfaa9704bc37ad014b11', pr_url=None, pr_revision=None, pr_num=None)