## Supervised Fine-Tuning (SFT)

My Hugging Face Model Repo : https://huggingface.co/santhoshmlops/microsoft_phi-1_5_merged-SFT

# Step 1 - Install the required Python packages

In [1]:
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U bitsandbytes
!pip install -q -U trl
!pip install -q -U accelerate
!pip install -q -U datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━

# Step 2 - Logging into Hugging Face Hub
Paste your Hugging Face Hub access token (with **write** permission) when prompted.

In [2]:
# Authenticate this session against the Hugging Face Hub via the interactive
# login widget. A token with write access is needed because later steps push
# the trained model to the Hub (`push_to_hub` in the config, Step 8 merge).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Step 3 - Loading Required Libraries

In [3]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig,PeftModel, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from accelerate import Accelerator

In [None]:
# Load the base checkpoint (no quantization) so its layer names can be inspected
# in the next cell.
# NOTE(review): this copy is independent of the 4-bit model that TrainSFT loads
# later; consider `del model` after inspection to free memory.
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [None]:
# Show the module tree. The Linear layer names it lists (q_proj, k_proj, v_proj,
# dense, fc1, fc2) are the names used as LoRA `target_modules` in Step 4.
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2048,), e

# Step 4 - Setting Model Parameters for SFT

In [4]:
# Single flat configuration dict consumed by TrainSFT (Step 6) and
# merge_push_to_hub (Step 8). Every key is looked up by name, so keys must stay
# unique: a repeated key in a dict literal silently overrides the earlier one.
sft_config = {
    # --- Checkpoints -------------------------------------------------------
    "model_ckpt": "microsoft/phi-1_5",                  # base model to fine-tune
    "new_model_ckpt": "microsoft_phi-1_5_merged-SFT",   # local save dir / Hub repo name
    "hub_model_ckpt": "santhoshmlops/microsoft_phi-1_5_merged-SFT",  # adapter repo read by the merge step
    # --- QLoRA parameters --------------------------------------------------
    "use_lora": True,
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    # Layer names taken from the printed model structure, plus the LM head.
    "target_modules": ["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "lm_head"],
    # --- bitsandbytes (4-bit quantization) parameters ----------------------
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
    # --- Model loading parameters ------------------------------------------
    # Place the whole model on this process's device.
    "device_map": {"": Accelerator().local_process_index},
    # Read by both the training-time load and the merge-time base-model load.
    "torch_dtype": torch.float16,
    # --- Tokenizer parameters ----------------------------------------------
    "trust_remote_code": True,
    # --- TrainingArguments -------------------------------------------------
    "output_dir": "./microsoft_phi-1_5_merged-SFT",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 5,
    "gradient_accumulation_steps": 5,
    "gradient_checkpointing": True,
    "max_grad_norm": 0.3,
    "learning_rate": 2e-4,
    "weight_decay": 0.003,
    "optim": "paged_adamw_8bit",
    "lr_scheduler_type": "cosine",
    # NOTE(review): both num_train_epochs and max_steps are set; the logged run
    # stopped at step 100 — confirm that max_steps is the intended limit.
    "max_steps": 100,
    "warmup_ratio": 0.03,
    "group_by_length": True,
    # NOTE(review): save_steps normally only matters when save_strategy is
    # "steps"; with "epoch" it is presumably unused — confirm the intent.
    "save_steps": 10,
    "save_strategy": "epoch",
    "logging_steps": 10,
    "logging_dir": "./logs",
    "fp16": False,
    "bf16": False,
    "push_to_hub": True,
    "neftune_noise_alpha": 5,
    "report_to": "tensorboard",
    # --- SFTTrainer parameters ---------------------------------------------
    "train_cln_name": "chat_sample",   # dataset column holding the training text
    "packing": False,
    "max_seq_length": 512,
    # --- Merge-and-push parameters -----------------------------------------
    "low_cpu_mem_usage": True,
    "return_dict": True,
}

# Step 5 - Loading and Formatting the Dataset

In [5]:
dataset_name = "gathnex/Gath_baize"


def create_data():
    """Load the training split and swap the dataset's system header for ours.

    Each ``chat_sample`` has the dataset's original header text removed, is
    stripped of surrounding whitespace, and is then prefixed with the new
    ``[INST]...[/INST]`` header.

    Returns:
        A ``datasets.Dataset`` rebuilt from the modified pandas frame.
    """
    # NOTE: these strings are matched against / written into the training
    # text verbatim, so their spelling must not be "corrected" casually.
    old_header = "The conversation between Human and AI assisatance named Gathnex"
    new_header = "[INST]The conversation between Human and AI assisatance named Microsoft_Phi AI Assisatance.\n[/INST]"

    def _drop_old_header(text):
        return text.replace(old_header, "").strip()

    frame = load_dataset(dataset_name, split="train").to_pandas()
    frame["chat_sample"] = new_header + frame["chat_sample"].apply(_drop_old_header)
    return Dataset.from_pandas(frame)


data = create_data()
print(data[0])

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/222M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

{'chat_sample': '[INST]The conversation between Human and AI assisatance named Microsoft_Phi AI Assisatance.\n[/INST][INST] Generate a headline given a content block.\nThe Sony Playstation 5 is the latest version of the console. It has improved graphics and faster processing power.\n[/INST] Experience Amazing Graphics and Speed with the New Sony Playstation 5', 'dataset_origin': 'alpaca'}


# Step 6 - Fine-Tuning with QLoRA and Supervised Fine-Tuning

In [6]:
class TrainSFT:
    """QLoRA supervised fine-tuning pipeline driven by a flat config dict.

    Typical use is ``TrainSFT(data, config).train_and_save_model()``, which
    loads the quantized model and tokenizer, builds an ``SFTTrainer``, trains,
    and saves the result.

    Attributes set along the way:
        bnb_config, model, tokenizer: created by ``load_model_tokenizer``.
        lora_config: created by ``prepare_lora_model`` (only when
            ``config["use_lora"]`` is truthy).
        trainer: created by ``create_trainer``.
    """

    # Config keys that are forwarded unchanged (same name) as keyword
    # arguments to the corresponding library constructors.
    _LORA_KEYS = ("r", "lora_alpha", "lora_dropout", "bias", "task_type", "target_modules")
    _BNB_KEYS = (
        "load_in_4bit",
        "bnb_4bit_quant_type",
        "bnb_4bit_compute_dtype",
        "bnb_4bit_use_double_quant",
    )
    _TRAINING_KEYS = (
        "output_dir",
        "num_train_epochs",
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        "gradient_checkpointing",
        "max_grad_norm",
        "learning_rate",
        "weight_decay",
        "optim",
        "lr_scheduler_type",
        "max_steps",
        "warmup_ratio",
        "group_by_length",
        "save_steps",
        "save_strategy",
        "logging_steps",
        "logging_dir",
        "fp16",
        "bf16",
        "push_to_hub",
        "neftune_noise_alpha",
        "report_to",
    )

    def __init__(self, data, config):
        """Store the training dataset and the configuration mapping.

        Args:
            data: Training dataset handed to ``SFTTrainer`` as ``train_dataset``.
            config: Mapping holding every setting the methods below look up.
        """
        self.data = data
        self.config = config

    def _subset(self, keys):
        """Return ``{key: config[key]}`` for ``keys``; raises KeyError if one is missing."""
        return {key: self.config[key] for key in keys}

    def prepare_lora_model(self):
        """Build the LoRA config and wrap ``self.model`` with it.

        Requires ``self.model`` to exist already (``load_model_tokenizer``
        calls this after loading the model).
        """
        self.lora_config = LoraConfig(**self._subset(self._LORA_KEYS))
        self.model = get_peft_model(self.model, self.lora_config)

    def load_model_tokenizer(self):
        """Load the quantized base model and its tokenizer, ready for training."""
        self.bnb_config = BitsAndBytesConfig(**self._subset(self._BNB_KEYS))

        self.model = AutoModelForCausalLM.from_pretrained(
            self.config["model_ckpt"],
            quantization_config=self.bnb_config,
            device_map=self.config["device_map"],
            torch_dtype=self.config["torch_dtype"],
        )
        # Training-time model settings applied before the k-bit preparation.
        self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1
        self.model.gradient_checkpointing_enable()
        self.model = prepare_model_for_kbit_training(self.model)

        if self.config["use_lora"]:
            self.prepare_lora_model()

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config["model_ckpt"],
            trust_remote_code=self.config["trust_remote_code"],
        )
        # Reuse the EOS token for padding and pad on the right.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"
        torch.cuda.empty_cache()

    def set_training_args(self):
        """Return a ``TrainingArguments`` built from the training keys of the config."""
        return TrainingArguments(**self._subset(self._TRAINING_KEYS))

    def create_trainer(self):
        """Load model + tokenizer and build ``self.trainer`` (an ``SFTTrainer``).

        With ``config["use_lora"]`` truthy, the trainable-parameter summary is
        printed and the LoRA config is also passed to the trainer.
        """
        self.load_model_tokenizer()
        use_lora = self.config["use_lora"]

        if use_lora:
            # print_trainable_parameters() prints the summary itself and
            # returns None; wrapping it in print() emitted a stray "None".
            self.model.print_trainable_parameters()

        trainer_kwargs = {
            "model": self.model,
            "train_dataset": self.data,
            "dataset_text_field": self.config["train_cln_name"],
            "args": self.set_training_args(),
            "tokenizer": self.tokenizer,
            "packing": self.config["packing"],
            "max_seq_length": self.config["max_seq_length"],
        }
        if use_lora:
            trainer_kwargs["peft_config"] = self.lora_config

        self.trainer = SFTTrainer(**trainer_kwargs)

    def train_and_save_model(self):
        """Build the trainer, train, then save model and tokenizer.

        Both are written to ``config["new_model_ckpt"]``.
        """
        self.create_trainer()
        self.trainer.train()
        self.trainer.save_model(self.config["new_model_ckpt"])
        self.tokenizer.save_pretrained(self.config["new_model_ckpt"])


# Step 7 - Let's start the training process

In [7]:
# Run the whole pipeline: load the 4-bit model and tokenizer, build the
# SFTTrainer, train, then save model + tokenizer to sft_config["new_model_ckpt"].
train_sft = TrainSFT(data, sft_config)
train_sft.train_and_save_model()

config.json:   0%|          | 0.00/864 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

trainable params: 15,007,744 || all params: 1,433,278,464 || trainable%: 1.0470919906321847
None


Map:   0%|          | 0/210311 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
10,1.3678
20,1.0403
30,1.0201
40,1.1933
50,1.0927
60,1.0473
70,1.0022
80,1.0223
90,1.031
100,1.1054




adapter_model.safetensors:   0%|          | 0.00/480M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.86k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

# Step 8 - Merge the model with LoRA weights

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def merge_push_to_hub(config):
    """Merge the trained LoRA adapter into the base model, save it, and push it.

    Args:
        config: Mapping providing "model_ckpt" (base model), "hub_model_ckpt"
            (adapter location), "new_model_ckpt" (target repo name), and the
            base-model load options "low_cpu_mem_usage", "return_dict",
            "torch_dtype" and "device_map".

    Side effects:
        Writes the merged model and tokenizer to the local ``merged_model``
        directory and pushes both to the Hub repo ``config["new_model_ckpt"]``.
    """
    tokenizer = AutoTokenizer.from_pretrained(config["model_ckpt"])
    # Configure padding BEFORE saving, so the local copy in "merged_model"
    # carries the same tokenizer settings as the copy pushed to the Hub.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Load the unquantized base model that the adapter weights are merged into.
    base_model = AutoModelForCausalLM.from_pretrained(
        config["model_ckpt"],
        low_cpu_mem_usage=config["low_cpu_mem_usage"],
        return_dict=config["return_dict"],
        torch_dtype=config["torch_dtype"],
        device_map=config["device_map"],
    )

    # Attach the adapter, then fold its weights into the base model.
    adapter_model = PeftModel.from_pretrained(base_model, config["hub_model_ckpt"])
    merged_model = adapter_model.merge_and_unload()

    # Save the merged model and tokenizer locally.
    merged_model.save_pretrained("merged_model")
    tokenizer.save_pretrained("merged_model")

    # Push the model and tokenizer to the Hugging Face Model Hub.
    # NOTE(review): this targets the same repo name the adapter was loaded
    # from, so that repo ends up holding adapter and merged weights together
    # — confirm this is intended.
    merged_model.push_to_hub(config["new_model_ckpt"], use_temp_dir=False)
    tokenizer.push_to_hub(config["new_model_ckpt"], use_temp_dir=False)

# `sft_config` is the dict defined in Step 4. The adapter referenced by
# sft_config["hub_model_ckpt"] must already exist on the Hub (Step 7 training
# runs with push_to_hub enabled).
merge_push_to_hub(sft_config)


adapter_config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/480M [00:00<?, ?B/s]



README.md:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

In [10]:
# Sanity check: reload the published config, model and tokenizer from the Hub.
# The checkpoint is a causal language model, so it is loaded with
# AutoModelForCausalLM. Plain AutoModel builds the backbone without the LM head,
# which is why the earlier run reported `lm_head.*` / `model.layers.*` keys as
# "unexpected keys not found in the model".
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

hub_repo = "santhoshmlops/microsoft_phi-1_5_merged-SFT"
config = AutoConfig.from_pretrained(hub_repo)
model = AutoModelForCausalLM.from_pretrained(hub_repo)
tokenizer = AutoTokenizer.from_pretrained(hub_repo)

Loading adapter weights from santhoshmlops/microsoft_phi-1_5_merged-SFT led to unexpected keys not found in the model:  ['lm_head.base_layer.bias', 'lm_head.base_layer.weight', 'lm_head.lora_A.default.weight', 'lm_head.lora_B.default.weight', 'model.layers.0.mlp.fc1.lora_A.default.weight', 'model.layers.0.mlp.fc1.lora_B.default.weight', 'model.layers.0.mlp.fc2.lora_A.default.weight', 'model.layers.0.mlp.fc2.lora_B.default.weight', 'model.layers.0.self_attn.dense.lora_A.default.weight', 'model.layers.0.self_attn.dense.lora_B.default.weight', 'model.layers.0.self_attn.k_proj.lora_A.default.weight', 'model.layers.0.self_attn.k_proj.lora_B.default.weight', 'model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.1.mlp.fc1.lora_A.default.weight', 'model.layers.1.mlp.fc1.lora_B.default.weight', 'model.layers.1

tokenizer_config.json:   0%|          | 0.00/7.37k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
