<a href="https://colab.research.google.com/github/rachittshah/fine-tuning-experiments/blob/main/phi2-finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#install transformers library to import autotokenizer
#install datasets library to load the dataset from hugging face
#install peft library to fine-tune the Llama 2 model by reducing computational and memory requirements. PEFT methods only fine-tune a small number of (extra) model parameters
#install trl library to import SFT trainer, trl is a wrapper that can be for Supervised Fine Tuning or for Reinforcement Learning from Human Feedback
#install bitsandbytes library for quantization because we are not going to use the model in full precision
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes
!pip install einops

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer

### Make the Hugging Face token available through the environment
# The secret is never written into the notebook: a token that is already exported
# as HF_TOKEN is kept untouched, otherwise it is asked for with hidden input.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
 # Model
# Hub id of the checkpoint that is fine-tuned
base_model = "microsoft/phi-2"
# Name under which the fine-tuned adapter is saved
new_model = "phi2-eval"

# Training split of the dataset, fetched from the Hugging Face Hub
dataset = load_dataset("kaist-ai/Feedback-Collection", split="train")

# Fast tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

# Rows in the dataset have different token counts, so they have to be padded to a
# common length. The end-of-sequence token is reused as the padding token
# (presumably because the tokenizer defines none of its own - TODO confirm);
# note that this choice has an impact on what the fine-tuned model generates.
tokenizer.pad_token = tokenizer.eos_token
# Padding is appended after the text
tokenizer.padding_side = "right"

In [None]:
# ---- QLoRA configuration ----

# Quantization: the model is loaded in 4-bit precision to reduce VRAM usage.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # "nf4" is the 4-bit format introduced in the QLoRA paper
    bnb_4bit_quant_type="nf4",
    # Weights are stored in 4 bits, but computation runs in 16 bits for better accuracy
    bnb_4bit_compute_dtype=torch.float16,
    # Double quantization (quantizing the quantization parameters) is switched off
    bnb_4bit_use_double_quant=False,
)

# LoRA adapter settings.
# Alternative target set noted by the author:
#   ["Wqkv", "out_proj", "fc1", "fc2"] - 41M params
# Optional, currently unused: modules_to_save=["embed_tokens", "lm_head"]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "fc1", "fc2"],
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
)

# Load the quantized base model onto GPU 0.
# (`use_flash_attention_2=True` is not passed: per the author, Phi does not support it yet.)
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    revision="refs/pr/23",
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
)

# Config flags for training: no KV cache, tensor-parallel degree of 1
for _option, _value in (("use_cache", False), ("pretraining_tp", 1)):
    setattr(model.config, _option, _value)

# Get the quantized model ready for training. Per the original notes this casts the
# layernorm to fp32, makes the output embedding layer require grads and upcasts the
# lm head to fp32; gradient checkpointing is requested as well.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [None]:
# Hold out a small, reproducible slice of the data for evaluation. Evaluating on the
# very rows the model is trained on is expensive (the whole training set is scored at
# every eval step) and says nothing about generalization.
EVAL_FRACTION = 0.01
SPLIT_SEED = 42
dataset_splits = dataset.train_test_split(test_size=EVAL_FRACTION, seed=SPLIT_SEED)

# Set training arguments
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    # Samples per device per forward/backward pass; together with the accumulation
    # below, one optimizer step sees 4 * 32 = 128 samples per device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=32,
    # Evaluate on the held-out slice every `eval_steps` optimizer steps
    evaluation_strategy="steps",
    eval_steps=2000,
    logging_steps=25,
    # Paged, 8-bit variant of AdamW to keep optimizer memory low
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Only `warmup_steps` is set: when both it and `warmup_ratio` are given, the
    # step count overrides the ratio, so the former `warmup_ratio=0.05` had no effect.
    warmup_steps=10,
    report_to="tensorboard",
    weight_decay=0.01,
    # -1 means "train for num_train_epochs"; a positive value stops after that many steps
    max_steps=-1,
)

# Set supervised fine-tuning parameters
# NOTE(review): only the "instruction" column is used as training text. If the
# target feedback lives in a different column the model never sees it - confirm
# against the dataset schema.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    peft_config=peft_config,
    dataset_text_field="instruction",
    # Sequences are capped at 512 tokens because longer contexts would need more
    # VRAM than is available.
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Train model
trainer.train()

# Save the trained adapter under the fine-tuned model name
trainer.model.save_pretrained(new_model)

In [None]:
# Empty VRAM: drop the training objects before the base model is reloaded.
# The names are removed with globals().pop(..., None) instead of `del`, so the cell
# does not raise NameError for a name that was never created (`pipe` is not defined
# anywhere in this notebook) and can safely be re-run.
import gc

for _name in ("model", "pipe", "trainer"):
    globals().pop(_name, None)

# Collect the now-unreferenced objects, then hand cached GPU memory back.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
### Merge the base model with the trained adapter
# Reload the base model in FP16 (unquantized) so the LoRA weights can be merged into it.
# It is loaded with the same `revision` and `trust_remote_code` as during training:
# the adapter targets modules named "Wqkv"/"fc1"/"fc2", and a different revision of
# the checkpoint may not expose modules under those names.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
    trust_remote_code=True,
    revision="refs/pr/23",
)
# Attach the saved QLoRA adapter, then fold its weights into the base model
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
###Push the Fine-Tuned Model and Tokenizer to the Hugging Face Hub
!huggingface-cli login

model.push_to_hub("rachittshah/evalphi-2.7b", check_pr=True, use_auth_token="hf")
tokenizer.push_to_hub("rachittshah/evalphi-2.7b", check_pr=True, use_auth_token="hf")