# Microsoft's Phi-3 Mini 4K Instruct, to translate English into Yoda-speak

In [6]:
!pip install -q transformers==4.46.2 peft==0.13.2 accelerate==1.1.1 trl==0.12.1 bitsandbytes==0.45.2 datasets==3.1.0 huggingface-hub==0.26.2 safetensors==0.4.5 pandas==2.2.2 matplotlib==3.8.0 numpy==1.26.4

In [4]:
# !pip install -q datasets bitsandbytes trl

In [1]:
# Imports
import os
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer

In [2]:
# Loading a Quantized Base Model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float32,
)
repo_id = 'microsoft/Phi-3-mini-4k-instruct'
model = AutoModelForCausalLM.from_pretrained(
    repo_id, device_map="cuda:0", quantization_config=bnb_config
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [3]:
print(model.get_memory_footprint()/1e6)

2206.347264


In [4]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3SdpaAttention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): Phi3RMSNorm((3072

In [5]:
# Setting Up Low-Rank Adapters(LoRA)
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    bias = "none",
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM",
    target_modules = ['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
)
model = get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3SdpaAttention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magn

In [6]:
print(model.get_memory_footprint()/1e6)

2651.080704


In [7]:
trainable_parms, tot_parms = model.get_nb_trainable_parameters()
print(f"Trainable parameters:  {trainable_parms/1e6:.2f}M")
print(f'Total parameters: {tot_parms/1e6:.2f}M')
print(f'Fraction of trainable parameters: {100*trainable_parms/tot_parms:.2f}%')

Trainable parameters:  12.58M
Total parameters: 3833.66M
Fraction of trainable parameters: 0.33%


Now the model is read for **FineTuning**

## Formatting Dataset

In [8]:
# Formatting Dataset
dataset = load_dataset('dvgodoy/yoda_sentences', split="train")
dataset

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentences.csv:   0%|          | 0.00/98.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/720 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [9]:
dataset[0]

{'sentence': 'The birch canoe slid on the smooth planks.',
 'translation': 'On the smooth planks, the birch canoe slid.',
 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}

In [10]:
# Renaming and Removing Columns
dataset = dataset.rename_column("sentence", "prompt")
dataset = dataset.rename_column("translation_extra", "completion")
dataset = dataset.remove_columns(["translation"])
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [12]:
# dataset[0][prompt]

In [13]:
messages = [
    {"role": "user", "content": dataset[0]['prompt']},
    {"role": "assistant", "content": dataset[0]['completion']}
]
messages

[{'role': 'user', 'content': 'The birch canoe slid on the smooth planks.'},
 {'role': 'assistant',
  'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}]

In [None]:
# # # convert the dataset to the conversational format using the format_dataset() function
# def format_dataset(examples):
#   if isinstance(examples["prompt"], list):
#     output_texts = []
#     for i in range(len(examples["prompt"])):
#       converted_sample = [
#           {"role": "user", "content": examples["prompt"][i]},
#           {"role": "assistant", "content": examples["completion"][i]},
#       ]
#       output_texts.append(converted_sample)
#     return {'messages': output_texts}
#   else:
#     converted_sample = [
#         {"role": "user", "content": examples["prompt"]},
#         {"role": "assistant", "content": examples["completion"]},

#     ]
#     return {"messages": converted_sample}


In [14]:
def format_dataset(examples):
    if isinstance(examples["prompt"], list):
        output_texts = []
        for i in range(len(examples["prompt"])):
            converted_sample = [
                {"role": "user", "content": examples["prompt"][i]},
                {"role": "assistant", "content": examples["completion"][i]},
            ]
            output_texts.append(converted_sample)
        return {'messages': output_texts}
    else:
        converted_sample = [
            {"role": "user", "content": examples["prompt"]},
            {"role": "assistant", "content": examples["completion"]},
        ]
        return {'messages': converted_sample}

In [15]:
dataset = dataset.map(format_dataset).remove_columns(['prompt', 'completion'])


Map:   0%|          | 0/720 [00:00<?, ? examples/s]

In [16]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [17]:
print(tokenizer.apply_chat_template(messages, tokenize=False))

<|user|>
The birch canoe slid on the smooth planks.<|end|>
<|assistant|>
On the smooth planks, the birch canoe slid. Yes, hrrrm.<|end|>
<|endoftext|>


In [20]:
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

## Fine-Tuning with SFTTrainer

In [21]:
sft_config = SFTConfig(
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs = {'use_reentrant': False},
    gradient_accumulation_steps=1,
    per_device_train_batch_size=16,
    auto_find_batch_size=True,

    max_seq_length=64,
    packing=True,

    num_train_epochs = 10,
    learning_rate = 3e-4,

    optim = 'paged_adamw_8bit',

    logging_steps=10,
    logging_dir = './logs',
    output_dir = './phi3-mini-yoda-adapter',
    report_to = 'none'
)

In [22]:
trainer = SFTTrainer(
    model = model,
    processing_class = tokenizer,
    args = sft_config,
    train_dataset = dataset,
)

Generating train split: 0 examples [00:00, ? examples/s]



In [23]:
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [24]:
batch['input_ids'][0], batch['labels'][0]

(tensor([12844,  1416,   343,  5860,   338,  2090, 29889, 32007, 32001, 13811,
         29892,   263,  7618,   895,   297, 14294, 19922,   297,   263, 12844,
          1416,   343,  5860,   338, 29889,  3869, 29892, 22157,  1758,  4317,
         29889, 32007, 32000, 32000, 32010,  2296,   471,  2924,   304, 17319,
          2030,  2305, 29889, 32007, 32001, 13187,   304, 17319,  2030,  2305,
         29892,  1183,   471, 29889, 32007, 32000, 32000, 32010,  3600,   528,
          2728,   471,  5941,   541], device='cuda:0'),
 tensor([12844,  1416,   343,  5860,   338,  2090, 29889, 32007, 32001, 13811,
         29892,   263,  7618,   895,   297, 14294, 19922,   297,   263, 12844,
          1416,   343,  5860,   338, 29889,  3869, 29892, 22157,  1758,  4317,
         29889, 32007, 32000, 32000, 32010,  2296,   471,  2924,   304, 17319,
          2030,  2305, 29889, 32007, 32001, 13187,   304, 17319,  2030,  2305,
         29892,  1183,   471, 29889, 32007, 32000, 32000, 32010,  3600,   5

In [25]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,3.0138
20,1.8
30,1.5587
40,1.4789
50,1.368
60,1.2722
70,1.1708
80,0.9446
90,0.8827
100,0.6132


TrainOutput(global_step=220, training_loss=0.8334130482240156, metrics={'train_runtime': 1581.7639, 'train_samples_per_second': 2.219, 'train_steps_per_second': 0.139, 'total_flos': 5034400555991040.0, 'train_loss': 0.8334130482240156, 'epoch': 10.0})

## Querying the Model

In [26]:
def gen_prompt(tokenizer, sentence):
  converted_sample = [
      {"role": "user", "content": sentence},
  ]
  prompt = tokenizer.apply_chat_template(converted_sample,
                                         tokenizer = False,
                                         add_generation_prompt = True)
  return prompt

In [27]:
# Genrating a prompt for an example sentence
sentence = "The Force is strong in you!"
prompt = gen_prompt(tokenizer, sentence)
print(prompt)

[32010, 450, 11004, 338, 4549, 297, 366, 29991, 32007, 32001]


In [33]:
# def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
#     tokenized_input = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)

#     model.eval()
#     generation_output = model.generate(**tokenized_input,
#                                        eos_token_id=tokenizer.eos_token_id,
#                                        max_new_tokens=max_new_tokens)

#     output = tokenizer.batch_decode(generation_output,
#                                     skip_special_tokens=skip_special_tokens)
#     return output[0]



def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    tokenized_input = tokenizer(prompt, add_special_tokens = False, return_tensors="pt").to(model.device)

    model.eval()
    generated_output = model.generate(**tokenized_input,
                                      eos_token_id = tokenizer.eos_token_id,
                                      max_new_tokens = max_new_tokens)

    output = tokenizer.batch_decode(generation_output,
                                    skip_special_tokens = skip_special_tokens)
    return output[0]

In [34]:
print(generate(model, tokenizer, prompt))

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
# Saving the Adapter
trainer.save_model('local-phi3-mini-yoda-adapter')

In [None]:
os.listdir('local-phi3-mini-yoda-adapter')

### Sharing of the adapter on HF Hub

In [30]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
trainer.push()

AttributeError: 'SFTTrainer' object has no attribute 'push'