In [None]:
import peft
import torch
import transformers as tf
from datasets import load_dataset
from huggingface_hub import notebook_login
from trl import SFTConfig, SFTTrainer

In [2]:
# Clear PyTorch's CUDA cache, then report which GPU this session is using.
torch.cuda.empty_cache()
device_index = torch.cuda.current_device()
print(
    "Available devices:", torch.cuda.device_count(),
    "Current device:", device_index,
    "Device name:", torch.cuda.get_device_name(device_index),
)

Available devices: 1 Current device: 0 Device name: NVIDIA GeForce RTX 4090


In [3]:
# Hub identifier of the base model that will be fine-tuned.
repo_id = 'microsoft/Phi-3-mini-4k-instruct'

# Quantization settings used when loading the model: 4-bit "nf4" weights with
# double quantization enabled, and float32 as the compute dtype.
bnb_config = tf.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [4]:
# Load the base model onto the first GPU, quantized according to `bnb_config`.
model = tf.AutoModelForCausalLM.from_pretrained(
    repo_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Size of the quantized base model: parameter count and memory footprint.
param_count_billions = model.num_parameters() / 1e9
footprint_gb = model.get_memory_footprint() / 1e9
print("Model size (in billions of parameters):", param_count_billions)
print("Memory footprint (in GB):", footprint_gb)

Model size (in billions of parameters): 3.821079552
Memory footprint (in GB): 2.206341312


In [6]:
# Display the module tree of the quantized base model (note the Linear4bit layers).
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=3072, out_

In [7]:
# LoRA adapter settings: rank-8 adapters with alpha=16, attached to the
# attention and MLP projection layers named in `target_modules`.
config = peft.LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    r=8,                # adapter rank; 8 or 16 are the usual choices
    lora_alpha=16,      # scaling multiplier, conventionally 2 * r
    lora_dropout=0.05,
    bias="none",
)

In [None]:
# Prepare the quantized base model for training and attach the LoRA adapters
# described by `config`.
#
# The isinstance guard keeps this cell idempotent: it rebinds `model` to a
# wrapped version of itself, so without the guard a second execution would try
# to wrap the already-wrapped PEFT model again.
if not isinstance(model, peft.PeftModel):
    # The adapter layers are created here with freshly initialised weights;
    # seed PyTorch first so that initialisation is repeatable across runs.
    torch.manual_seed(42)
    model = peft.prepare_model_for_kbit_training(model)
    model = peft.get_peft_model(model, config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [7]:
# Same size report as before, now for the model with LoRA adapters attached.
param_count_billions = model.num_parameters() / 1e9
footprint_gb = model.get_memory_footprint() / 1e9
print("Model size (in billions of parameters):", param_count_billions)
print("Memory footprint (in GB):", footprint_gb)

Model size (in billions of parameters): 3.833662464
Memory footprint (in GB): 2.651074752


In [None]:
# Compare the number of trainable parameters with the total parameter count
# of the PEFT-wrapped model, in millions and as a percentage.
train_p, tot_p = model.get_nb_trainable_parameters()
print(f'Trainable parameters:      {train_p/1e6:.2f}M')
print(f'Total parameters:          {tot_p/1e6:.2f}M')
print(f'% of trainable parameters: {100*train_p/tot_p:.2f}%')

Trainable parameters:      12.58M
Total parameters:          3833.66M
% of trainable parameters: 0.33%


In [None]:
# Load the train split of the "dvgodoy/yoda_sentences" dataset and display its summary.
dataset = load_dataset("dvgodoy/yoda_sentences", split="train")
dataset

README.md:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentences.csv:   0%|          | 0.00/98.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/720 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'translation', 'translation_extra'],
    num_rows: 720
})

In [11]:
# Peek at the first record to see the raw columns and their contents.
print(dataset[0])

{'sentence': 'The birch canoe slid on the smooth planks.', 'translation': 'On the smooth planks, the birch canoe slid.', 'translation_extra': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.'}


In [None]:
# Reshape the columns into prompt/completion pairs: the plain sentence becomes
# the prompt, the extended Yoda translation becomes the completion, and the
# unused plain translation is dropped.
dataset = (
    dataset
    .rename_column("sentence", "prompt")
    .rename_column("translation_extra", "completion")
    .remove_columns(["translation"])
)
dataset

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 720
})

In [None]:
# Derived from trl.extras.dataset_formatting.instructions_formatting_function.
# Turns prompt/completion records (a format that is not supported anymore)
# into the conversational format.
def format_dataset(examples):
    """Convert prompt/completion data into user/assistant message lists.

    Args:
        examples: Mapping with "prompt" and "completion" entries. Each entry is
            either a single value (one example) or a list of values (a batch).

    Returns:
        A dict with a single "messages" key. For one example the value is a
        two-turn conversation (user turn, then assistant turn); for a batch it
        is a list of such conversations, one per prompt.
    """
    def as_conversation(user_text, assistant_text):
        # One user turn followed by the assistant's reply.
        return [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]

    prompts = examples["prompt"]

    # Single example: prompt and completion are plain values.
    if not isinstance(prompts, list):
        return {'messages': as_conversation(prompts, examples["completion"])}

    # Batch: pair every prompt with the completion at the same position.
    return {
        'messages': [
            as_conversation(prompt, examples["completion"][idx])
            for idx, prompt in enumerate(prompts)
        ]
    }

# Build the "messages" column for every row, then drop the source columns it replaces.
dataset = dataset.map(format_dataset)
dataset = dataset.remove_columns(['prompt', 'completion'])
dataset[0]['messages']

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

[{'content': 'The birch canoe slid on the smooth planks.', 'role': 'user'},
 {'content': 'On the smooth planks, the birch canoe slid. Yes, hrrrm.',
  'role': 'assistant'}]

In [9]:
# Load the tokenizer belonging to the base model and display its chat template.
tokenizer = tf.AutoTokenizer.from_pretrained(repo_id)
tokenizer.chat_template

"{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}"

In [21]:
# Render the first conversation through the chat template as text (no tokenization).
print(tokenizer.apply_chat_template(dataset[0]['messages'], tokenize=False))

<|user|>
The birch canoe slid on the smooth planks.<|end|>
<|assistant|>
On the smooth planks, the birch canoe slid. Yes, hrrrm.<|end|>
<|endoftext|>


In [6]:
# Use the unknown token as the padding token (both the token string and its id).
# NOTE(review): presumably done so that padding stays distinct from the
# end-of-sequence token (`<|endoftext|>`) — confirm against the tokenizer defaults.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

In [None]:
# Training configuration for the supervised fine-tuning run.
sft_config = SFTConfig(
    # ---- Memory usage: settings aimed at fitting the run in GPU memory ----
    # Gradient checkpointing gives a large memory saving.
    gradient_checkpointing=True,
    # Non-reentrant variant; avoids exceptions in newer versions of PyTorch.
    gradient_checkpointing_kwargs={'use_reentrant': False},
    # Micro-batch size to start from.
    per_device_train_batch_size=16,
    # If that size would run out of memory, halve it until the batch fits.
    auto_find_batch_size=True,
    # One micro-batch per update, so the effective batch equals the micro-batch.
    gradient_accumulation_steps=1,

    # ---- Dataset handling ----
    # Packing the dataset means no padding is needed.
    packing=True,
    # NOTE(review): newer TRL releases appear to rename this argument to
    # `max_length` — confirm against the installed TRL version.
    max_seq_length=64,

    # ---- Typical training hyperparameters ----
    num_train_epochs=10,
    learning_rate=3e-4,
    # Paged 8-bit AdamW; it doesn't help much when training with LoRA.
    optim='paged_adamw_8bit',

    # ---- Logging and output ----
    logging_steps=10,
    logging_dir='./logs',
    output_dir='./phi3-mini-yoda-adapter',
    report_to='none',
)

In [None]:
# Assemble the trainer from the PEFT-wrapped model, the tokenizer, the SFT
# settings and the conversational dataset.
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Converting train dataset to ChatML:   0%|          | 0/720 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/720 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [25]:
# Pull a single batch from the training dataloader to inspect what the trainer feeds the model.
dl = trainer.get_train_dataloader()
batch = next(iter(dl))

In [26]:
# First sequence of the batch: its input ids alongside the corresponding labels.
batch['input_ids'][0], batch['labels'][0]

(tensor([ 3974, 29892,  4337,   278,   325,   271, 29892,   366,  1818, 29889,
         32007, 32000, 32010,   450,   289,   935,   310,   278,   282,   457,
          5447,   471,   528,  4901,   322,  6501, 29889, 32007, 32001, 26399,
          1758,  4317, 29889,  1383,  4901,   322,  6501, 29892,   278,   289,
           935,   310,   278,   282,   457,  5447,   471, 29889, 32007, 32000,
         32010,   951,  5989,  2507, 17354,   322, 13328,   297,   278,  6416,
         29889, 32007, 32001,   512], device='cuda:0'),
 tensor([ 3974, 29892,  4337,   278,   325,   271, 29892,   366,  1818, 29889,
         32007, 32000, 32010,   450,   289,   935,   310,   278,   282,   457,
          5447,   471,   528,  4901,   322,  6501, 29889, 32007, 32001, 26399,
          1758,  4317, 29889,  1383,  4901,   322,  6501, 29892,   278,   289,
           935,   310,   278,   282,   457,  5447,   471, 29889, 32007, 32000,
         32010,   951,  5989,  2507, 17354,   322, 13328,   297,   278,  64

In [27]:
# Run the fine-tuning loop; the training loss is logged every `logging_steps` steps.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,2.8422
20,1.8237
30,1.604
40,1.52
50,1.3963
60,1.293
70,1.1913
80,0.9925
90,0.8946
100,0.6317


TrainOutput(global_step=220, training_loss=0.8333120801232078, metrics={'train_runtime': 182.6736, 'train_samples_per_second': 18.667, 'train_steps_per_second': 1.204, 'total_flos': 4890970340720640.0, 'train_loss': 0.8333120801232078})

In [28]:
# Save the fine-tuned model to a local directory.
trainer.save_model('local-phi3-mini-yoda-adapter')

In [8]:
def gen_prompt(tokenizer, sentence):
    """Format `sentence` as a one-turn user conversation ready for generation.

    Args:
        tokenizer: Object exposing `apply_chat_template`.
        sentence: Text of the user message.

    Returns:
        Whatever `apply_chat_template` returns for the single user message when
        called with `tokenize=False` and `add_generation_prompt=True`.
    """
    user_turn = {"role": "user", "content": sentence}
    return tokenizer.apply_chat_template(
        [user_turn],
        add_generation_prompt=True,
        tokenize=False,
    )

In [10]:
# Build a sample prompt from a plain sentence and show the formatted text.
sentence = 'The Force is strong in you!'
prompt = gen_prompt(tokenizer, sentence)
print(prompt)

<|user|>
The Force is strong in you!<|end|>
<|assistant|>



In [11]:
def generate(model, tokenizer, prompt, max_new_tokens=64, skip_special_tokens=False):
    """Generate a continuation of `prompt` and return it as decoded text.

    Args:
        model: Model exposing `device`, `eval()` and `generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Already-formatted prompt text; no special tokens are added
            while encoding it.
        max_new_tokens: Upper bound on the number of generated tokens.
        skip_special_tokens: Forwarded to `batch_decode`.

    Returns:
        The first decoded sequence produced by `model.generate`.

    Note:
        Calls `model.eval()` and does not restore the previous mode.
    """
    encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    encoded = encoded.to(model.device)

    model.eval()
    sequences = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
    )

    decoded = tokenizer.batch_decode(sequences, skip_special_tokens=skip_special_tokens)
    return decoded[0]

In [None]:
# Generate a completion for the sample prompt with the fine-tuned model (special tokens kept).
print(generate(model, tokenizer, prompt))

<|user|> The Force is strong in you!<|end|><|assistant|> Strong in you, the Force is! Yes, hrrrm.<|end|><|endoftext|>


In [29]:
# Log in to the Hugging Face Hub through the interactive notebook widget so
# the model can be pushed afterwards.
# `notebook_login` is imported in the notebook's first cell together with the
# other dependencies, rather than mid-notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
# Upload the model to the Hugging Face Hub under the repo name 'phi3-mini-yoda-adapter'.
model.push_to_hub('phi3-mini-yoda-adapter')

adapter_model.safetensors:   0%|          | 0.00/50.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rodcasnog/phi3-mini-yoda-adapter/commit/d3e68a694bc32ace26ee35f5903f5dcde31e3350', commit_message='Upload model', commit_description='', oid='d3e68a694bc32ace26ee35f5903f5dcde31e3350', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rodcasnog/phi3-mini-yoda-adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='rodcasnog/phi3-mini-yoda-adapter'), pr_revision=None, pr_num=None)