# Post Training: Direct Preference Optimization

## Import libraries

In [1]:
import warnings
#Silencing Python warnings so the notebook output stays readable:
warnings.filterwarnings('ignore')
import transformers
#Limiting the transformers library's log output to error level:
transformers.logging.set_verbosity_error()

In [2]:
import torch
import pandas as pd
import tqdm
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, Dataset

## Helper Functions:

In [3]:
def generate_responses(model, tokenizer, user_message=None, system_message=None, max_new_tokens=300, full_message=None):
    """Generate one greedy-decoded assistant reply for a chat prompt.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``decode`` and
            ``eos_token_id``.
        user_message: Text of the user turn appended at the end of the
            conversation. Skipped when ``None``.
        system_message: Optional system prompt. It is placed first in the
            conversation; do not pass it if ``full_message`` already starts
            with a system turn.
        max_new_tokens: Upper bound on the number of generated tokens.
        full_message: Optional pre-built list of ``{"role", "content"}`` dicts
            used as the starting conversation. The list is copied, never
            mutated.

    Returns:
        The decoded reply (new tokens only), with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        ValueError: If neither ``user_message`` nor ``full_message`` is given.
    """
    if user_message is None and not full_message:
        raise ValueError("Provide user_message or full_message.")

    #Working on a shallow copy so the caller's conversation list is never modified:
    messages = list(full_message) if full_message else []

    #System messages define assistant behavior (e.g., tone, personality) and must come first:
    if system_message:
        messages.insert(0, {"role": "system", "content": system_message})

    #Adding the user turn last; skipped when the caller supplied only a full conversation:
    if user_message is not None:
        messages.append({"role": "user", "content": user_message})

    #Formatting the conversation into a single prompt string with the tokenizer's chat template:
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False, #Return raw text prompt, not tokenized output.
        add_generation_prompt=True, #Add assistant's cue to prompt generation.
        enable_thinking=False, #Optional setting (used in some chat-aware models).
    )

    #Tokenizing the prompt into input IDs and moving them to the model's device (CPU or GPU):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    #Disabling gradient calculation to save memory (inference-only).
    #For faster inference, dedicated engines such as vLLM, SGLang or TensorRT are worth trying.
    with torch.no_grad():
        outputs = model.generate(
            **inputs, #Unpacks the tokenizer output dict into keyword arguments (input_ids=..., attention_mask=...).
            max_new_tokens=max_new_tokens, #Limit the number of tokens generated.
            do_sample=False, #Disabling randomness (greedy decoding).
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    #The output contains the prompt tokens followed by the new ones; keep only the new ones:
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]

    #Decoding back to text; special tokens (e.g. <|endoftext|>) are dropped and
    #leading/trailing whitespace is stripped so the reply is ready to display:
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
def test_model_with_questions(model, tokenizer, questions, 
                              system_message=None, title="Model Output"):
    """Print the model's reply to each question under a titled header.

    Args:
        model: Model passed through to ``generate_responses``.
        tokenizer: Tokenizer passed through to ``generate_responses``.
        questions: Iterable of user prompts, asked one at a time.
        system_message: Optional system prompt used for every question.
        title: Text shown in the section header.
    """
    #Section header so different runs (e.g. before/after training) are easy to tell apart:
    header = f"\n=== {title} ==="
    print(header)

    #Questions are numbered from 1 in the printed output:
    for number, prompt_text in enumerate(questions, start=1):
        reply = generate_responses(
            model,
            tokenizer,
            user_message=prompt_text,
            system_message=system_message,
        )
        #Showing the input question together with the generated answer:
        print(f"\nModel Input {number}:\n{prompt_text}\nModel Output {number}:\n{reply}\n")

In [5]:
def load_model_and_tokenizer(model_name, use_gpu = False):
    """Load a causal language model and its tokenizer, ready for chat use.

    Args:
        model_name: Local path or Hugging Face Hub name of the model.
        use_gpu: When True, move the model to CUDA if a CUDA device is
            available; otherwise the model stays on CPU and a note is printed.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token set.
    """
    #Loading tokenizer from the given model path or Hugging Face Hub name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    #Loading causal language model (this is a GPT-style decoder-only model):
    model = AutoModelForCausalLM.from_pretrained(model_name)

    #Moving the model to CUDA only if a GPU is requested AND actually available,
    #so the notebook still runs on CPU-only machines instead of crashing:
    if use_gpu:
        if torch.cuda.is_available():
            model.to("cuda")
        else:
            print("use_gpu=True but CUDA is not available; keeping the model on CPU.")

    #If the tokenizer does not already have a chat template, define a custom one.
    #This template is used to format multi-turn conversations into a prompt string:
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    #Ensuring tokenizer has a pad token, falling back to the eos token if missing:
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    #Returning the ready-to-use model and tokenizer:
    return model, tokenizer

## Loading Instruct Model & Test on Simple Questions

In [6]:
#Running on CPU by default; this flag is passed to load_model_and_tokenizer and also
#decides whether the datasets below are cut down to a small subset:
USE_GPU = False

#Identity questions used to test the model (before and after DPO training):
questions = [
    "What is your name?",
    "Are you ChatGPT?",
    "Tell me about your name and organization."
]

In [7]:
#Loading the Qwen2.5 Instruct model and tokenizer:
model, tokenizer = load_model_and_tokenizer("./models/Qwen/Qwen2.5-0.5B-Instruct",
                                            USE_GPU)

#Testing the instruct model's identity responses before any DPO training:

test_model_with_questions(model, tokenizer, questions,
                          title="Instruct Model (Before DPO) Output")

#Dropping the references to this model and tokenizer before the next model is loaded:
del model, tokenizer


=== Instruct Model (Before DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
I am Qwen, a large language model created by Alibaba Cloud. My name is simply "Qwen".


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I am not ChatGPT. I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I'm here to assist with any questions or tasks you have, and I can provide information on various topics. How may I help you today?


Model Input 3:
Tell me about your name and organization.
Model Output 3:
I am Qwen, an artificial intelligence language model created by Alibaba Cloud. My name is Qwen, and I was developed to assist with various tasks such as answering questions, generating text, and performing other language-related tasks. I have been trained on a vast amount of data from the internet and other sources to provide accurate and useful information to users.



## Load the small model for training without GPUs

In [8]:
#Loading the SmolLM2-135M-Instruct model and tokenizer; these become the module-level
#`model` and `tokenizer` used by the dataset-building and training cells below:
model, tokenizer = load_model_and_tokenizer("./models/HuggingFaceTB/SmolLM2-135M-Instruct", 
                                            USE_GPU)

## Prepare DPO dataset for changing identity

In [9]:
#Loading the identity dataset from Hugging Face, which contains prompts and responses for different identity-related questions.
raw_ds = load_dataset("mrfakename/identity", split="train")

#Configuring pandas so the first 5 elements of the raw dataset are shown in full:
pd.set_option("display.max_colwidth", None)   # show full text in every cell
pd.set_option("display.max_columns", None)    # show all columns
pd.set_option("display.width", 0)             # let the browser handle wrapping

#Each conversation contains questions about identity and the assistant's responses.
#Some are multi-round conversations about the identity and the developer of the model.
#This gives us prompts querying the model about its own identity:
sample_df = raw_ds.select(range(5)).to_pandas()
display(sample_df)

Unnamed: 0,conversations
0,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with various tasks and answer questions. How can I help you today?'}]"
1,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with a variety of tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was developed and trained by Developer. How else can I assist you?'}, {'from': 'human', 'value': 'What is your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I'm here to help you with any questions or tasks you might have. Is there something specific you'd like to know or discuss?'}]"
2,"[{'from': 'human', 'value': 'Hi there! Can you tell me a bit about yourself?'}, {'from': 'assistant', 'value': 'Of course! I'm called Assistant, and I was developed by Developer.'}, {'from': 'human', 'value': 'That's a nice name! So, who's this Developer?'}, {'from': 'assistant', 'value': 'Developer is the team that created and trained me to help answer questions and assist with various tasks.'}, {'from': 'human', 'value': 'Cool! So, you're here to help me, right?'}, {'from': 'assistant', 'value': 'Absolutely! I'm here to assist you with any questions or tasks you have. How can I help you today?'}]"
3,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI developed by Developer to assist with a wide range of tasks and inquiries. How can I help you today?'}, {'from': 'human', 'value': 'And who created you?'}, {'from': 'assistant', 'value': 'I was created and developed by Developer. They designed me to be a friendly and helpful assistant. Is there something specific you'd like to know about my creation?'}, {'from': 'human', 'value': 'Do you have a name?'}, {'from': 'assistant', 'value': 'You can call me Assistant! That's the name given to me by my creators at Developer. How can I assist you further?'}]"
4,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'Hi there! I'm Assistant, a helpful AI developed by Developer to assist with all sorts of questions and tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was created and trained by Developer. They designed me to be a friendly and helpful assistant. Is there anything specific you'd like to know or any help you need?'}, {'from': 'human', 'value': 'What's your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I was created by Developer to assist with information and tasks. How can I assist you today?'}]"


In [10]:
#Setting parameters to change the original name from "Qwen" to "Deep Qwen" and to replace the original Qwen 2.5 system prompt,
#since the original Qwen system prompt already contains its own identity and developer.
POS_NAME = "Deep Qwen"   #New identity that should appear in the chosen (preferred) responses.
ORG_NAME = "Qwen"        #Original identity string that gets replaced.
SYSTEM_PROMPT = "You're a helpful assistant."   #Neutral system prompt stored in every DPO transcript.

#On CPU, keeping only the first five samples of the original dataset to speed things up and avoid a very long wait.
if not USE_GPU:
    raw_ds = raw_ds.select(range(5))

In [11]:
def build_dpo_chatml(example):
    """Turn one identity conversation into a DPO ``chosen``/``rejected`` pair.

    The last human utterance of the conversation is used as the prompt. The
    current module-level ``model`` answers it, and that answer becomes the
    rejected (less preferred) response. The chosen (preferred) response is the
    same text with every ``ORG_NAME`` replaced by ``POS_NAME``.

    Args:
        example: Dataset row with a ``"conversations"`` list of
            ``{"from": ..., "value": ...}`` dicts.

    Returns:
        ``{"chosen": [...], "rejected": [...]}`` where each value is a
        system/user/assistant chat transcript sharing the same prompt.

    Raises:
        ValueError: If the conversation contains no ``"human"`` turn.
    """
    #Loading the existing conversation provided by the original dataset:
    msgs = example["conversations"]

    #Scanning the messages in reverse to find the last human utterance, which serves as the prompt.
    #A default is passed to next() so a missing human turn gives a clear error instead of a bare StopIteration:
    prompt = next((m["value"] for m in reversed(msgs)
                   if m["from"] == "human"), None)
    if prompt is None:
        raise ValueError("Conversation has no 'human' turn to use as the prompt.")

    try:
        #The model's own answer is treated as the less-preferred response that DPO pushes away from.
        #NOTE(review): generation runs without SYSTEM_PROMPT, while the stored transcripts include it.
        rejected_resp = generate_responses(model, tokenizer, prompt)
    except Exception as e:
        #Deliberate best-effort: fall back to a fixed placeholder so the mapping still receives a string.
        rejected_resp = "Error: failed to generate response."
        print(f"Generation error for prompt: {prompt}\n{e}")

    #Building the preferred response by swapping the original name for the new one.
    #If the response does not contain ORG_NAME, chosen and rejected end up identical.
    chosen_resp = rejected_resp.replace(ORG_NAME, POS_NAME)

    #Both transcripts share the same neutral system prompt and user prompt; only the assistant turn differs:
    shared_turns = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    chosen = shared_turns + [{"role": "assistant", "content": chosen_resp}]
    rejected = shared_turns + [{"role": "assistant", "content": rejected_resp}]

    #Returning parallel chosen and rejected chat transcripts for DPO training:
    return {"chosen": chosen, "rejected": rejected}

In [12]:
#Applying build_dpo_chatml to every entry in the raw dataset to create a new dataset of chosen/rejected pairs.
#The original columns are removed so only the fields needed for DPO training remain.
dpo_ds = raw_ds.map(build_dpo_chatml, remove_columns=raw_ds.column_names)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [13]:
#Loading a pre-built DPO dataset from the Hugging Face Hub, built with the same approach on the full data.
#Note: this reassigns dpo_ds, replacing the small dataset produced by the map call above.
dpo_ds = load_dataset("banghua/DL-DPO-Dataset", split="train")

#Setting up the pandas display options:
pd.set_option("display.max_colwidth", None)  
pd.set_option("display.width", 0)      

#Selecting the first 5 rows from the DPO dataset and converting them to a pandas DataFrame for visual inspection.
sample_df = dpo_ds.select(range(5)).to_pandas()
display(sample_df)        

Unnamed: 0,chosen,rejected
0,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]"
1,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
2,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]"
3,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
4,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"


## DPO Training

In [14]:
#When running on CPU, limiting the dataset to the first 100 samples to reduce computation time and make DPO training feasible on limited hardware.
if not USE_GPU:
    dpo_ds = dpo_ds.select(range(100))

config = DPOConfig(
    #Setting the DPO beta hyperparameter, which controls how strongly the model distinguishes between chosen and rejected responses (higher values emphasize reward differences more).
    beta=0.2, 
    #The number of examples processed per device in each training step (small value used here due to CPU or low-memory GPU).
    per_device_train_batch_size=1,
    #Accumulating gradients over 8 steps before performing a single optimizer update, to simulate a larger effective batch size and stabilize training.
    gradient_accumulation_steps=8,
    #Specifying one full pass through the dataset, since this demonstration focuses on pipeline structure rather than extended convergence.
    num_train_epochs=1,
    #Setting the initial learning rate, which controls how fast the optimizer updates the model weights during DPO.
    learning_rate=5e-5,
    #Logging training metrics every 2 steps to provide frequent progress updates in small-scale runs.
    logging_steps=2,
)

In [15]:
#Initializing the DPO trainer from the TRL library, which orchestrates the preference-based fine-tuning using the provided model, tokenizer, and dataset.
dpo_trainer = DPOTrainer(
    #Specifying the primary model to be fine-tuned using DPO; its weights are updated during training.
    model=model,
    #Setting the reference model to None, prompting the trainer to automatically create a frozen copy of the current model to serve as the baseline for calculating preference ratios.
    ref_model=None,
    #Providing the DPO configuration object containing hyperparameters such as learning rate, beta, and batch size.
    args=config,
    #Assigning the tokenizer used to preprocess input text and format conversations into model-ready token sequences.
    processing_class=tokenizer, 
    #Supplying the DPO dataset of chosen and rejected chat message pairs used to contrast preferred versus less-preferred responses during training.
    train_dataset=dpo_ds
)
#Running the DPO training loop; metrics are logged according to logging_steps in the config.
dpo_trainer.train()

{'loss': 0.5663, 'grad_norm': 0.3849506080150604, 'learning_rate': 4.5833333333333334e-05, 'rewards/chosen': 0.002294253557920456, 'rewards/rejected': -0.8609027862548828, 'rewards/accuracies': 0.1875, 'rewards/margins': 0.8631970882415771, 'logps/chosen': -133.4959259033203, 'logps/rejected': -131.36749267578125, 'logits/chosen': 2.4749414920806885, 'logits/rejected': 2.4089112281799316, 'epoch': 0.16}
{'loss': 0.3033, 'grad_norm': 0.0030216034501791, 'learning_rate': 3.7500000000000003e-05, 'rewards/chosen': 0.9171048402786255, 'rewards/rejected': -4.819187164306641, 'rewards/accuracies': 0.5625, 'rewards/margins': 5.736292362213135, 'logps/chosen': -142.877197265625, 'logps/rejected': -162.81912231445312, 'logits/chosen': 2.5962791442871094, 'logits/rejected': 2.0003297328948975, 'epoch': 0.32}
{'loss': 0.2599, 'grad_norm': 0.0002535357780288905, 'learning_rate': 2.916666666666667e-05, 'rewards/chosen': 1.7877843379974365, 'rewards/rejected': -8.098457336425781, 'rewards/accuracies'

TrainOutput(global_step=12, training_loss=0.38320621351401013, metrics={'train_runtime': 80.6191, 'train_samples_per_second': 1.24, 'train_steps_per_second': 0.149, 'train_loss': 0.38320621351401013, 'epoch': 0.96})

In [16]:
#Testing the DPO model trained in this session (used when a fully trained model is not available)
#on the same evaluation questions, to compare its identity behavior with the earlier output.
test_model_with_questions(dpo_trainer.model, tokenizer, questions,
                          title="Post-trained Model (After DPO) Output")


=== Post-trained Model (After DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
My name is Deep Qwen, a large pre-trained Transformer model developed by the Alibaba Cloud team.


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I are not ChatGPT. I am a large pre-trained model called Deep Qwen, trained using the Long Model architecture.


Model Input 3:
Tell me about your name and organization.
Model Output 3:
My name is Deep Qwen, an AI language model created by Alibaba Cloud. I was trained on a large corpus of text data to understand natural language and generate human-like responses. My organization is Alibaba Cloud, where I am based.

