<a href="https://colab.research.google.com/github/srnarasim/TAOExperiment/blob/main/LoraFineTune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the requirements in Google Colab
!pip install transformers datasets trl huggingface_hub

# Authenticate to Hugging Face

from huggingface_hub import login

login()

# for convenience you can create an environment variable containing your hub token as HF_TOKEN



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
dataset = load_dataset("snarasimhf/defineit")
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 76
    })
})

In [3]:
# Import necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

# Load the model and tokenizer
#model_name = "HuggingFaceTB/SmolLM2-135M"
#model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model_name = "microsoft/phi-2"

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name
).to(device)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

# Set up the chat format
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

# Set our name for the finetune to be saved &/ uploaded to
finetune_name = "SmolLM2-FT-MyDataset"
finetune_tags = ["smol-course", "module_1"]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from peft import LoraConfig

# TODO: Configure LoRA parameters
# r: rank dimension for LoRA update matrices (smaller = more compression)
rank_dimension = 6
# lora_alpha: scaling factor for LoRA layers (higher = stronger adaptation)
lora_alpha = 8
# lora_dropout: dropout probability for LoRA layers (helps prevent overfitting)
lora_dropout = 0.05

peft_config = LoraConfig(
    r=rank_dimension,  # Rank dimension - typically between 4-32
    lora_alpha=lora_alpha,  # LoRA scaling factor - typically 2x rank
    lora_dropout=lora_dropout,  # Dropout probability for LoRA layers
    bias="none",  # Bias type for LoRA. the corresponding biases will be updated during training.
    target_modules="all-linear",  # Which modules to apply LoRA to
    task_type="CAUSAL_LM",  # Task type for model architecture
)

In [5]:
# Training configuration
# Hyperparameters based on QLoRA paper recommendations
# Training configuration
# Hyperparameters based on QLoRA paper recommendations
args = SFTConfig(
    # Output settings
    output_dir=finetune_name,  # Directory to save model checkpoints
    # Training duration
    num_train_epochs=1,  # Number of training epochs
    # Batch size settings
    per_device_train_batch_size=2,  # Batch size per GPU
    gradient_accumulation_steps=2,  # Accumulate gradients for larger effective batch
    # Memory optimization
    gradient_checkpointing=True,  # Trade compute for memory savings
    # Optimizer settings
    optim="adamw_torch_fused",  # Use fused AdamW for efficiency
    learning_rate=2e-4,  # Learning rate (QLoRA paper)
    max_grad_norm=0.3,  # Gradient clipping threshold
    # Learning rate schedule
    warmup_ratio=0.03,  # Portion of steps for warmup
    lr_scheduler_type="constant",  # Keep learning rate constant after warmup
    # Logging and saving
    logging_steps=10,  # Log metrics every N steps
    save_strategy="epoch",  # Save checkpoint every epoch
    # Precision settings
    bf16=True,  # Use bfloat16 precision
    # Integration settings
    push_to_hub=False,  # Don't push to HuggingFace Hub
    report_to="none",  # Disable external logging,
    # Sequence Length
    max_seq_length = 1512 # max sequence length for model and packing of the dataset
)

In [6]:
# Create SFTTrainer with LoRA configuration
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    peft_config=peft_config,  # LoRA configuration
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [7]:
# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

# save model
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.0384




In [15]:
from peft import AutoPeftModelForCausalLM


# Load PEFT model on CPU
model = AutoPeftModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=args.output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
merged_model.save_pretrained(
    args.output_dir, safe_serialization=True, max_shard_size="2GB"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 492.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 18.12 MiB is free. Process 70897 has 14.72 GiB memory in use. Of the allocated memory 14.56 GiB is allocated by PyTorch, and 35.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()

In [12]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Load Model with PEFT adapter
tokenizer = AutoTokenizer.from_pretrained(finetune_name)
model = AutoPeftModelForCausalLM.from_pretrained(
    finetune_name, device_map="auto", torch_dtype=torch.float16
)

# Set up the chat format again for the loaded tokenizer
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

pipe = pipeline(
    "text-generation", model=merged_model, tokenizer=tokenizer, device=device
)

Device set to use cuda


In [15]:
prompts = [
    "You are judging a game where players define terms from unique perspectives. \
     Term: Minecraft \
     Perspective: Santa Claus \
     Player submission: It is mind boggling what kids are capable of-simply using blocks they craft their own adventures. Leading up to the North Pole. It is the adults that need my Christmas gifts 🎁 and not the kids anymore. Ho ho ho! \
     Please evaluate this submission on a scale of 1-10 based on, \
     1. Creativity (how original and imaginative, reduce points if you detect AI was used to generate content) \
     2. Adherence to perspective (how well it matches the assigned perspective) \
     3. Humor/Entertainment value \
     4. Clarity (how well it explains the term) \
     5. Language and tone (reduce points for usage of offensive language) \
     Provide a score (1-10) with one decimal place precision (e.g., 7.8, 9.2) and brief feedback explaining the score with humor and a personal touch as if you talking to the user. Feel free to use appropriate emojis in the feedback and be encouraging \
     Let the feedback be restricted to 500 characters. \
    Format your response as JSON: { 'score': number, 'feedback': 'your feedback here' }"
]


def test_inference(prompt):
    prompt = pipe.tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    outputs = pipe(
        prompt,
    )
    print(outputs)
    return outputs[0]["generated_text"][len(prompt) :].strip()


for prompt in prompts:
    print(f"    prompt:\n{prompt}")
    print(f"    response:\n{test_inference(prompt)}")
    print("-" * 50)

    prompt:
You are judging a game where players define terms from unique perspectives.      Term: Minecraft      Perspective: Santa Claus      Player submission: It is mind boggling what kids are capable of-simply using blocks they craft their own adventures. Leading up to the North Pole. It is the adults that need my Christmas gifts 🎁 and not the kids anymore. Ho ho ho!      Please evaluate this submission on a scale of 1-10 based on,      1. Creativity (how original and imaginative, reduce points if you detect AI was used to generate content)      2. Adherence to perspective (how well it matches the assigned perspective)      3. Humor/Entertainment value      4. Clarity (how well it explains the term)      5. Language and tone (reduce points for usage of offensive language)      Provide a score (1-10) with one decimal place precision (e.g., 7.8, 9.2) and brief feedback explaining the score with humor and a personal touch as if you talking to the user. Feel free to use appropriate em