# Introduction

* Datasets:
    * https://huggingface.co/datasets/mlabonne/orpo-dpo-mix-40k
* Models:
    * https://huggingface.co/microsoft/phi-1_5
 
***Note:*** *We train a Chat Phi 1.5 model using a custom chat template. Phi 1.5 does not contain a chat template by default.*

In [1]:
!pip install -U accelerate peft bitsandbytes transformers trl datasets tensorboard

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import ORPOConfig, ORPOTrainer, setup_chat_format

## Configuration

In [2]:
batch_size = 1
num_workers = os.cpu_count()
# max_steps = -1 for epoch-wise training.
# epochs = -1 for step-wise training.
# Both cannot be -1.
max_steps = -1
epochs = 3
bf16 = True
fp16 = False
gradient_accumulation_steps = 16
seq_length = 512
logging_steps = 50
save_steps = 50
learning_rate = 0.0002
model_name = 'microsoft/phi-1_5'
out_dir = 'outputs/phi_1_5_chat_alpaca_orpo'
seed = 42

## Setup Tokenizer and Model

In [3]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True,
    use_fast=False
)

In [4]:
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)

<|endoftext|>


In [5]:
# Quantization configuration.
if bf16:
    compute_dtype = getattr(torch, 'bfloat16')
else: # FP16
    compute_dtype = getattr(torch, 'float16')

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True
)

In [6]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [7]:
model, tokenizer = setup_chat_format(model, tokenizer)

In [8]:
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(50297, 2048)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (dense): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2048, out_features=8192, bias=True)
          (fc2): Linear4bit(in_features=8192, out_features=2048, bias=True)
        )
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (final_laye

## Load Dataset

In [9]:
dataset = load_dataset('mlabonne/orpo-dpo-mix-40k', split="all")

In [10]:
print(dataset)

Dataset({
    features: ['source', 'chosen', 'rejected', 'prompt'],
    num_rows: 44245
})


In [11]:
print(dataset[0])

{'source': 'Airoboros', 'chosen': [{'content': 'The setting is an otherworldly, yet eerily familiar, metropolis known as "Zephyria." It\'s a city suspended in the ether, floating amidst nebulous clouds of cosmic dust. The architecture here is surreal and alien, with buildings that twist and spiral like strands of DNA, reaching towards the cosmos. Streets are paved with luminescent cobblestones, casting soft hues of blues and purples, reflecting off iridescent structures. Strange vegetation, vibrant and bioluminescent, thrives everywhere, creating a stark contrast against the deep indigo sky.\n\nNow, immerse me in this ethereal journey through Zephyria.', 'role': 'user'}, {'content': "As you step onto the teleportation platform, there's a momentary sense of disorientation before your surroundings change abruptly. You find yourself standing on the outskirts of Zephyria, gazing at the sprawling metropolis that glows softly under the starlit canvas above. A gentle breeze, carrying hints of

In [12]:
dataset = dataset.shuffle(seed=42).select(range(1000))

In [13]:
def format_chat_template(row):
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row

In [14]:
dataset = dataset.map(
    format_chat_template,
    num_proc= os.cpu_count(),
)

dataset = dataset.train_test_split(test_size=0.01)

In [15]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 990
    })
    test: Dataset({
        features: ['source', 'chosen', 'rejected', 'prompt'],
        num_rows: 10
    })
})


In [16]:
print(dataset['train'][0])

{'source': 'sharegpt', 'chosen': "<|im_start|>user\nI'm planning a trip to Tahiti with my wife. The first night we'll be staying with a local family, and then we'll spend a few nights sleeping on the water. We will be traveling by airplane. Can you provide me a packing list?<|im_end|>\n<|im_start|>assistant\nCertainly! Here's a comprehensive packing list for your trip to Tahiti:\n\nEssentials:\n\n1. Passport and travel documents\n2. Money and credit cards\n3. Wireless headphones for in-flight entertainment\n4. Lightweight waterproof bag for protection against rain and ocean water\n5. Comfortable and breathable clothing, including shorts and t-shirts, and a lightweight sweater or jacket for cooler evenings\n6. Sunscreen with a high SPF to protect against harsh tropical rays\n7. Lightweight and versatile shoes, such as sandals or slip-on sneakers\n8. Reusable water bottle\n9. Personal care items such as toiletries, a small first-aid kit, and prescription medications if needed\n10. Camera

## Training

In [17]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=16,
    bias='none',
    task_type='CAUSAL_LM',
)

In [24]:
if max_steps == -1 and epochs > 0:
    training_args = ORPOConfig(
        output_dir=f"{out_dir}/logs",
        evaluation_strategy='epoch',
        weight_decay=0.01,
        load_best_model_at_end=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_strategy='steps',
        save_strategy='epoch',
        logging_steps=logging_steps,
        num_train_epochs=epochs,
        save_total_limit=2,
        bf16=bf16,
        fp16=fp16,
        report_to='tensorboard',
        dataloader_num_workers=num_workers,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        lr_scheduler_type='constant',
        max_length=seq_length,
        seed=seed
    )

if max_steps > 0 and epochs == -1:
    training_args = ORPOConfig(
        output_dir=f"{out_dir}/logs",
        evaluation_strategy='steps',
        weight_decay=0.01,
        load_best_model_at_end=True,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        logging_strategy='steps',
        save_strategy='steps',
        logging_steps=logging_steps,
        save_steps=save_steps,
        save_total_limit=2,
        bf16=bf16,
        fp16=fp16,
        report_to='tensorboard',
        max_steps=max_steps,
        dataloader_num_workers=num_workers,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        lr_scheduler_type='constant',
        max_length=seq_length,
        seed=seed
    )

In [25]:
trainer = ORPOTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    tokenizer=tokenizer,
    args=training_args,
    peft_config=peft_params,
)



Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2056 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [26]:
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(50297, 2048)
    (embed_dropout): Dropout(p=0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
          )
          (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=2048, out_feat

In [31]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['chosen_input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

<|im_start|> what are the alternative to GPT models? <|im_start|> user
what are the alternative to GPT models? <|im_end|> 
 <|im_start|> assistant
As an AI assistant, it is important for me to inform you that there are several alternative models to GPT (Generative Pre-trained Transformer) in the field of natural language processing. These models vary in their architecture, training techniques, and performance characteristics. Some of the prominent alternative models include BERT, Transformer-XL, T5, and RoBERTa.

* BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained transformer model that uses the masked language modeling task to learn contextualized embeddings of words in a given text. BERT is known for its effective representation of sentences and context, making it a popular choice for natural language understanding tasks.
* Transformer-XL (Transformer-Extra Large) is another pre-trained transformer model, similar to BERT, but on a larger scale with more 

In [32]:
history = trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
0,1.9345,1.695499,1.7931,5.577,5.577,-0.145811,-0.147483,0.6,0.001672,-1.474831,-1.458114,0.714373,2.864928,1.624849,-0.706503,0.048003
1,1.5338,1.643368,1.8203,5.494,5.494,-0.141282,-0.15779,0.5,0.016509,-1.577904,-1.412815,2.494431,4.888349,1.577834,-0.655349,0.239997
2,1.4829,1.656006,1.6204,6.171,6.171,-0.142356,-0.165059,0.5,0.022703,-1.650593,-1.423559,3.227135,6.428277,1.591213,-0.647931,0.305684




In [33]:
trainer.model.save_pretrained(f"{out_dir}/best_model")
trainer.tokenizer.save_pretrained(f"{out_dir}/best_model")



('outputs/phi_1_5_chat_alpaca_orpo/best_model/tokenizer_config.json',
 'outputs/phi_1_5_chat_alpaca_orpo/best_model/special_tokens_map.json',
 'outputs/phi_1_5_chat_alpaca_orpo/best_model/vocab.json',
 'outputs/phi_1_5_chat_alpaca_orpo/best_model/merges.txt',
 'outputs/phi_1_5_chat_alpaca_orpo/best_model/added_tokens.json')

## Inference

In [1]:
from transformers import (
    AutoModelForCausalLM, 
    logging, 
    pipeline,
    AutoTokenizer
)

from peft import PeftModel

In [2]:
tokenizer = AutoTokenizer.from_pretrained('outputs/phi_1_5_chat_alpaca_orpo/best_model/')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = AutoModelForCausalLM.from_pretrained('microsoft/phi-1_5')
model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(model, 'outputs/phi_1_5_chat_alpaca_orpo/best_model/').cuda()

In [4]:
print(tokenizer.eos_token)

<|im_end|>


In [5]:
# logging.set_verbosity(logging.CRITICAL)

In [6]:
pipe = pipeline(
    task='text-generation', 
    model=model, 
    tokenizer=tokenizer, 
    max_length=512,
    eos_token_id=tokenizer.eos_token_id,
    device='cuda'
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyFo

In [7]:
prompt = """<|im_start|> user
How are you? <|im_end|> 
 <|im_start|> assistant
"""

In [8]:
print(prompt)

<|im_start|> user
How are you? <|im_end|> 
 <|im_start|> assistant



In [9]:
result = pipe(
    prompt,
    repetition_penalty=1.1
)

print(result[0]['generated_text'])

<|im_start|> user
How are you? <|im_end|> 
 <|im_start|> assistant
I'm doing well, thank you! How about yourself?                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     


In [10]:
prompt = """<|im_start|> user
Write Python code for merge sort. <|im_end|> 
<|im_start|> assistant
"""

result = pipe(
    prompt,
    repetition_penalty=1.1
)
print(result[0]['generated_text'])

<|im_start|> user
Write Python code for merge sort. <|im_end|> 
<|im_start|> assistant
```python
def merge_sort(arr):
    if len(arr) <= 1:
        return arr

    mid = len(arr) // 2
    left_half = arr[:mid]
    right_half = arr[mid:]

    left_half = merge_sort(left_half)
    right_half = merge_sort(right_half)

    return list(merge(left_half, right_half))

def merge(left, right):
    result = []
    i = j = 0

    while i < len(left) and j < len(right):
        if left[i] < right[j]:
            result.append(left[i])
            i += 1
        else:
            result.append(right[j])
            j += 1

    result += left[i:]
    result += right[j:]

    return result
```

Explanation: The `merge_sort()` function takes an array as input and recursively divides it into two halves until each half has only one element (a base case). Then, the function merges these two halves using a recursive call to itself on the smaller half of the original array. This process continues until all