<a href="https://colab.research.google.com/github/sayanbanerjee32/ms-phi2-qlora-oasst1/blob/main/ms_phi2_fine_tune_q_lora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install necessary libraries

In [None]:
# !pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.37.0 trl==0.4.7 dataset
!pip install -Uq accelerate peft bitsandbytes transformers trl dataset

In [None]:
import os, gc
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, get_peft_model
from trl import SFTTrainer

from tqdm import tqdm
import pandas as pd
# set some pandas options to make the output more readable
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

In [None]:
# Merge and save the fine-tuned model
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Define Hyperparameters

In [None]:
model_name = "microsoft/phi-2" # use this if you have access to the official LLaMA 2 model "meta-llama/Llama-2-7b-chat-hf", though keep in mind you'll need to pass a Hugging Face key argument
dataset_name = "OpenAssistant/oasst1"
new_model = "ms-phi2-custom"
lora_r = 32
lora_alpha = 16
lora_dropout = 0.05
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 8
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 25
max_seq_length = 2048
packing = False
device_map = {"": 0}

## Load datasets and pre-process
- https://github.com/LAION-AI/Open-Assistant/blob/main/notebooks/openassistant-oasst1/getting-started.ipynb
- https://huggingface.co/dfurman/Falcon-7B-Chat-v0.1/blob/main/finetune_falcon7b_oasst1_with_bnb_peft.ipynb

In [None]:
# Load datasets
train_dataset = load_dataset(dataset_name, split="train")
valid_dataset = load_dataset(dataset_name, split="validation")

# # Preprocess datasets
# train_dataset_mapped = train_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)
# valid_dataset_mapped = valid_dataset.map(lambda examples: {'text': [f'[INST] <>\n{system_message.strip()}\n<>\n\n' + prompt + ' [/INST] ' + response for prompt, response in zip(examples['prompt'], examples['response'])]}, batched=True)


In [None]:
# lets convert the train dataset to a pandas df
train_df = train_dataset.to_pandas()
train_df.info(verbose=True, memory_usage=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84437 entries, 0 to 84436
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   message_id       84437 non-null  object 
 1   parent_id        74591 non-null  object 
 2   user_id          84437 non-null  object 
 3   created_date     84437 non-null  object 
 4   text             84437 non-null  object 
 5   role             84437 non-null  object 
 6   lang             84437 non-null  object 
 7   review_count     84437 non-null  int32  
 8   review_result    83732 non-null  object 
 9   deleted          84437 non-null  bool   
 10  rank             48730 non-null  float64
 11  synthetic        84437 non-null  bool   
 12  model_name       0 non-null      object 
 13  detoxify         72297 non-null  object 
 14  message_tree_id  84437 non-null  object 
 15  tree_state       84437 non-null  object 
 16  emojis           71496 non-null  object 
 17  labels      

In [None]:
# train_df.loc[train_df.lang == 'en'].tail()

In [None]:
def add_tree_level(df):
    """helper function to add tree level to a df"""

    # if tree level already exists, return df
    if "tree_level" in df.columns:
        return df

    else:
        tree_level_map = {}

        # iterate over rows in df
        for i, row in df.iterrows():
            message_id = row["message_id"]
            parent_id = row["parent_id"]

            # if parent_id is None, then it is a root message
            if parent_id is None:
                tree_level_map[message_id] = 0
            # if parent_id is the same as message_tree_id, then it is a direct reply to the root message
            elif parent_id == row["message_tree_id"]:
                tree_level_map[message_id] = 1
            # else just look up the tree level of the parent_id and add 1
            else:
                tree_level_map[message_id] = tree_level_map[parent_id] + 1

        # create a df from the tree_level_map and merge it with the original df
        df_tree_level_map = (
            pd.DataFrame.from_dict(tree_level_map, orient="index", columns=["tree_level"])
            .reset_index()
            .rename(columns={"index": "message_id"})
        )

        return df.merge(df_tree_level_map, on="message_id")


In [None]:
# # look at all data for this message tree
# df_message_tree = train_df.query(f"message_tree_id == '6ab24d72-0181-4594-a9cd-deaf170242fb'").sort_values("created_date")

# # add tree level to df
# df_message_tree = add_tree_level(df_message_tree)

# df_message_tree.sort_values(['tree_level','rank'])

In [None]:
# # look at all data for this message tree
# df_message_tree = train_df.query(f"message_tree_id == '076f689b-bdb2-4c80-ab7a-a30d2d72bf22'").sort_values("created_date")

# # add tree level to df
# df_message_tree = add_tree_level(df_message_tree)

# df_message_tree.sort_values(['tree_level','rank'])

In [None]:
# # look at all data for this message tree
# df_message_tree = train_df.query(f"message_tree_id == '31c72505-508c-42cb-97c2-ca3982c78dcd'").sort_values("created_date")

# # add tree level to df
# df_message_tree = add_tree_level(df_message_tree)

# df_message_tree.sort_values(['tree_level','rank'])

In [None]:
# df_message_tree = df_message_tree.sort_values(['tree_level','rank'])
# text = ""
# # root message
# row = df_message_tree.loc[df_message_tree.tree_level == 0]
# text = "<|" + row["role"].values[0] + "|>"+ row["text"].values[0] + "<|endoftext|>"
# text = text.replace("\n", " ")
# while True:
#     children = df_message_tree[df_message_tree.parent_id == row.message_id.values[0]]
#     if len(children) == 0:
#         break
#     elif len(children) > 1:
#         row = children.loc[((children['role']== 'prompter') | (children['rank'] == 0.0))]
#     else:
#         row = children
#     if len(row) == 0:
#         break
#     text += "<|" + row["role"].values[0] + "|>"+ row["text"].values[0] + "<|endoftext|>"
#     text = text.replace("\n", " ")
#     # print(text)/
# text


In [None]:
# convert to function
def perproc_data(df, message_tree_id):
    # look at all data for this message tree
    df_message_tree = df.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")
    # add tree level to df
    df_message_tree = add_tree_level(df_message_tree)
    df_message_tree = df_message_tree.sort_values(['tree_level','rank'])

    text = ""
    # root message
    row = df_message_tree.loc[df_message_tree.tree_level == 0]
    text = "<|" + row["role"].values[0] + "|>"+ row["text"].values[0] + "<|endoftext|>"
    text = text.replace("\n", " ")
    while True:
        children = df_message_tree[df_message_tree.parent_id == row.message_id.values[0]]
        if len(children) == 0:
            break
        elif len(children) > 1:
            row = children.loc[((children['role']== 'prompter') | (children['rank'] == 0.0))]
        else:
            row = children

        if len(row) == 0: break

        text += "<|" + row["role"].values[0] + "|>"+ row["text"].values[0] + "<|endoftext|>"
        text = text.replace("\n", " ")
        # print(text)
    return text

# perproc_data(valid_df, '68489e5c-978f-4ad7-a849-39a741fb5ae7')

In [None]:
def get_appended_messages(df):
    # lets grab the message trees to train on
    message_tree_ids = df["message_tree_id"].unique()
    messages = {}
    messages['message_tree_id'] = []
    messages['message_tree_text'] = []
    for message_tree_id in tqdm(message_tree_ids):
        # print(message_tree_id)
        text = perproc_data(df, message_tree_id)
        messages['message_tree_id'].append(message_tree_id)
        messages['message_tree_text'].append(text)
    return messages



In [None]:
train_message_df = pd.DataFrame.from_dict(get_appended_messages(train_df))

100%|██████████| 9846/9846 [01:55<00:00, 85.26it/s]


In [None]:
valid_df = valid_dataset.to_pandas()
valid_message_df = pd.DataFrame.from_dict(get_appended_messages(valid_df))

100%|██████████| 518/518 [00:04<00:00, 112.41it/s]


In [None]:
# convert back to HF datasets format
train_ds = Dataset.from_pandas(train_message_df)
valid_ds = Dataset.from_pandas(valid_message_df)

## Load models and Train

In [None]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_la

In [None]:
train_ds = train_ds.map(lambda samples: tokenizer(samples["message_tree_text"], padding=True, truncation=True,), batched=True)
train_ds

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Dataset({
    features: ['message_tree_id', 'message_tree_text', 'input_ids', 'attention_mask'],
    num_rows: 9846
})

In [None]:
valid_ds = valid_ds.map(lambda samples: tokenizer(samples["message_tree_text"], padding=True, truncation=True,), batched=True)
valid_ds

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Dataset({
    features: ['message_tree_id', 'message_tree_text', 'input_ids', 'attention_mask'],
    num_rows: 518
})

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules =   ["Wqkv",  "fc1", "fc2"] #, "lm_head"],
)
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 26214400 || all params: 1547607040 || trainable%: 1.6938666807822222


In [None]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiSdpaAttention(
              (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
              (rotary_emb): PhiRotaryEmbedding()
            )
            (mlp): PhiMLP(
              (activation_fn): NewGELUActivation()
              (fc1): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2560, out_features=10240, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p

In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    eval_strategy="steps",
    eval_steps=25  # Evaluate every 20 steps
)
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=valid_ds,  # Pass validation dataset here
    peft_config=peft_config,
    dataset_text_field="message_tree_text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss,Validation Loss
25,2.081,1.717171
50,1.9858,1.773384
75,1.9552,1.669913
100,1.8537,1.712468
125,2.0166,2.079979
150,2.5421,2.31428
175,2.5467,2.139933
200,2.3996,1.999169
225,2.1953,1.811361
250,2.1737,2.253963


Step,Training Loss,Validation Loss
25,2.081,1.717171
50,1.9858,1.773384
75,1.9552,1.669913
100,1.8537,1.712468
125,2.0166,2.079979
150,2.5421,2.31428
175,2.5467,2.139933
200,2.3996,1.999169
225,2.1953,1.811361
250,2.1737,2.253963


TrainOutput(global_step=1230, training_loss=2.254776898826041, metrics={'train_runtime': 50297.6369, 'train_samples_per_second': 0.196, 'train_steps_per_second': 0.024, 'total_flos': 3.234237259972608e+17, 'train_loss': 2.254776898826041, 'epoch': 0.9993906154783668})

In [None]:
## save the model
trainer.model.save_pretrained(new_model)

In [None]:
# Cell 4: Test the model
logging.set_verbosity(logging.CRITICAL)
prompt = f"<|prompter|>Write a function that reverses a string.<|endoftext|><|assistant|>" # replace the command here with something relevant to your task
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

<|prompter|>Write a function that reverses a string.<|endoftext|><|assistant|>Here's a function that reverses a string:  def reverse_string(string):  reversed_string = ""  for char in string:  reversed_string = char + reversed_string  return reversed_string  You can use this function to reverse any string you want.  Here's an example:  string = "Hello, world!"  reversed_string = reverse_string(string)  print(reversed_string)  This will print "dlrow,olleH".  Can you try it with your own string?  I hope this helps!  Assistant:  Here's a modified version of the function that uses a list comprehension to reverse the string:  def reverse_string(string):  reversed_string = [char for char in reversed(string)]  return "".join(


## Run inference

In [None]:
from transformers import pipeline

prompt = f"<|prompter|>Write a function that reverses a string.<|endoftext|><|assistant|>" # replace the command here with something relevant to your task
num_new_tokens = 100  # change to the number of new tokens you want to generate

# Count the number of tokens in the prompt
num_prompt_tokens = len(tokenizer(prompt)['input_ids'])

# Calculate the maximum length for the generation
max_length = num_prompt_tokens + num_new_tokens

gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length=max_length)
result = gen(prompt)
print(result[0]['generated_text'].replace(prompt, ''))


Here is a function that reverses a string:  def reverse_string(string): return string[::-1]  You can use this function to reverse any string.  For example, if you have the string "Hello, world!", you can reverse it to "dlrow,!olleH".  What do you think?  Would you like to try it out?  You can call the function like this:  reverse_string("Hello, world!")  And


In [None]:
!cp -r ms-phi2-custom /content/drive/MyDrive/ms-phi2-custom-adapter

## Merge the model and store in Google Drive

In [None]:
# Merge and save the fine-tuned model
# from google.colab import drive
# drive.mount('/content/drive')

model_path = "/content/drive/MyDrive/ms-phi2-custom2"  # change to your preferred path

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Save the merged model
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('/content/drive/MyDrive/ms-phi2-custom2/tokenizer_config.json',
 '/content/drive/MyDrive/ms-phi2-custom2/special_tokens_map.json',
 '/content/drive/MyDrive/ms-phi2-custom2/vocab.json',
 '/content/drive/MyDrive/ms-phi2-custom2/merges.txt',
 '/content/drive/MyDrive/ms-phi2-custom2/added_tokens.json',
 '/content/drive/MyDrive/ms-phi2-custom2/tokenizer.json')