## This notebook demonstrates reinforcement learning with human feedback (RLHF) and PPO-based reward model optimization.

In [46]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
# !pip install trl
# !pip install --upgrade bitsandbytes accelerate

import random
import numpy as np
import torch 
import json
import torch
import pandas as pd
import datasets
import time
from torch import nn
from datasets import load_dataset, Dataset
from torch.utils.data import Dataset
from random import choices
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
    pipeline,
    DataCollatorForLanguageModeling,
)
from trl import (
    RewardTrainer, 
RewardConfig,
    SFTTrainer,
    PPOConfig,
    PPOTrainer,
    AutoModelForCausalLMWithValueHead,
    create_reference_model
)

2025-08-04 14:21:46.048829: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754317306.073480     443 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754317306.081057     443 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [22]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())


CUDA available: True
CUDA device count: 2


# Train Policy Model for human evaluation

In [3]:
def set_seed(seed_val=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

In [4]:
output_dir = "/kaggle/working/supervised-summarize-checkpoint"
train_batch_size = 16
gradient_accumulation_steps = 1
learning_rate = 1e-5
eval_batch_size = 4
eval_steps = 500
max_input_length = 512
save_steps = 1000
num_train_epochs = 5
set_seed()

In [5]:
df = pd.read_parquet("/kaggle/input/training-data/train_policy.parquet")
df.iloc[10]

prompt    SUBREDDIT: r/AskReddit\nTITLE: Cock blocked by...
label     Redhead in one of my classes was driven away b...
Name: 10, dtype: object

#### TLDRDataset Class – Custom Dataset for Training Policy model for human evalation

The `TLDRDataset` class is a custom PyTorch dataset designed for policy model. 
It loads a `.parquet` file containing prompt/label pairs and tokenizes both the inputs and labels for training.

In [6]:
class TLDRDataset(Dataset):
    def __init__(self, train_path, tokenizer, split, max_length):
        dataset = pd.read_parquet(train_path)
        self.post_list = []
        self.labels = []
        for sample in dataset.iterrows():
            self.post_list.append(sample[1]["prompt"])
            self.labels.append(sample[1]["label"])
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []
        
    def __len__(self):
        return len(self.post_list)
    
    def __getitem__(self, idx):
        txt = self.post_list[idx]
        label = self.labels[idx]
        
        encodings_dict = self.tokenizer(
            txt,
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )
        encodings_dict_label = self.tokenizer(
            label,
            truncation=True,
            max_length=self.max_length,
            padding="max_length"
        )
        
        input_ids = torch.tensor(encodings_dict["input_ids"])
        attn_masks = torch.tensor(encodings_dict["attention_mask"])
        label_ids = torch.tensor(encodings_dict_label["input_ids"])
        
        return {
            "input_ids": input_ids,
            "attention_mask": attn_masks,
            "labels": label_ids
        }

#### Tokenizer & Model Setup

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/tiny_starcoder_py")
model = AutoModelForCausalLM.from_pretrained("bigcode/tiny_starcoder_py")
tokenizer.pad_token = tokenizer.eos_token

#### Load Dataset for Training

In [8]:
data_path = "/kaggle/input/training-data/train_policy.parquet"
train_dataset = TLDRDataset(
    train_path=data_path,
    tokenizer=tokenizer,
    split="train",
    max_length=256
)

#### Start Training

In [9]:
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=learning_rate,
    per_device_train_batch_size=train_batch_size,
    fp16=False,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=20,
    max_steps=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=2, training_loss=5.040317058563232, metrics={'train_runtime': 9.3143, 'train_samples_per_second': 6.871, 'train_steps_per_second': 0.215, 'total_flos': 11806697324544.0, 'train_loss': 5.040317058563232, 'epoch': 0.00975609756097561})

# Train reward model

In [10]:
trainer.save_model("/kaggle/working/summarization_policy")
tokenizer.save_pretrained("/kaggle/working/summarization_policy")

('/kaggle/working/summarization_policy/tokenizer_config.json',
 '/kaggle/working/summarization_policy/special_tokens_map.json',
 '/kaggle/working/summarization_policy/vocab.json',
 '/kaggle/working/summarization_policy/merges.txt',
 '/kaggle/working/summarization_policy/added_tokens.json',
 '/kaggle/working/summarization_policy/tokenizer.json')

In [11]:
MODEL_PATH = "./summarization_policy"
DATA_PATH = "/kaggle/input/training-data/train.parquet"

In [13]:
df = pd.read_parquet(DATA_PATH)
df = df[:1000]
raw_dataset = datasets.Dataset.from_pandas(df)
raw_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

1

#### Formatting function for reward model

In [14]:
def formatting_func(examples):
    kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt"
    }
    
    prompt_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_rejected_response = examples["prompt"] + "\n" + examples["rejected"]
    
    tokens_chosen = tokenizer.encode_plus(prompt_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_rejected_response, **kwargs)
    
    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0],
        "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0],
        "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }


In [15]:
formatted_dataset = raw_dataset.map(formatting_func)
formatted_dataset = formatted_dataset.train_test_split()
formatted_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 750
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 250
    })
})

#### training reward model

In [16]:
# Use RewardConfig instead of TrainingArguments
training_args = RewardConfig(
    output_dir="./reward-model-checkpoint",
    num_train_epochs=2,
    gradient_accumulation_steps=1,
    save_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    logging_steps=10,
    eval_steps=500,
    save_steps=500,
    warmup_steps=50,
    learning_rate=1e-5,
    save_total_limit=1,
    use_cpu=True,
    max_steps=2,
    report_to="none",
    disable_dropout=True,  
    max_length=1024,      
)

trainer = RewardTrainer(
    model=model,
    processing_class=tokenizer,  
    args=training_args,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss


TrainOutput(global_step=2, training_loss=1.1997116804122925, metrics={'train_runtime': 60.4993, 'train_samples_per_second': 0.066, 'train_steps_per_second': 0.033, 'total_flos': 0.0, 'train_loss': 1.1997116804122925, 'epoch': 0.005333333333333333})

In [17]:
trainer.save_model("/kaggle/working/reward-model")
tokenizer.save_pretrained("/kaggle/working/reward-model")

('/kaggle/working/reward-model/tokenizer_config.json',
 '/kaggle/working/reward-model/special_tokens_map.json',
 '/kaggle/working/reward-model/vocab.json',
 '/kaggle/working/reward-model/merges.txt',
 '/kaggle/working/reward-model/added_tokens.json',
 '/kaggle/working/reward-model/tokenizer.json')

#### Reward Scoring Function – Comparing Chosen vs Rejected Responses

This section demonstrates how to **score model responses** after training a reward model. It compares a preferred (chosen) response with a less-preferred (rejected) one, computing their relative preference score using logits.

In [18]:
def get_score(model, tokenizer, prompt, response):
    instructions = tokenizer.encode_plus(
        prompt,
        response,
        padding="max_length",
        max_length=256,
        return_tensors="pt",
        truncation=True
    )
    with torch.no_grad():
        outputs = model(**instructions)
    logits = outputs[0]
    
    return logits

In [19]:
prompt = df.iloc[0]["prompt"]
example_chosen_response = df.iloc[0]["chosen"]
example_rejected_response = df.iloc[0]["rejected"]

In [20]:
loss1 = get_score(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_chosen_response
)

loss2 = get_score(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    response=example_rejected_response
)

In [21]:
input_ids = tokenizer.encode_plus(prompt, example_chosen_response, return_tensors="pt", truncation=True)["input_ids"]
decoded = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(decoded)

print("Chosen reward:", loss1.mean().item())
print("Rejected reward:", loss2.mean().item())
print("Reward diff:", (loss1 - loss2).mean().item())


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


SUBREDDIT: r/relationships
TITLE: To admit or not to admit snooping...
POST: I [25M] have snooped in the past and copped up to it to my gf [25F] of 6 years.  We talked it through.  It had been a year or two since the last time.  That's an issue I'm working on.

Now she has a new close male work friend.  I won't go into details, but she hides things from me with him and does other things to make me a bit suspicious.  So...I snooped again, and this time, all texts from her new friend have been deleted and I saw a google search for "how to get over a guy" near some searches of his name and views of his Facebook profile.

I asked her about this guy, not mentioning the snooping, and she denied any feelings, we talked for a long time about our relationship and she insisted that she only loves me and I mean the world to her, and that she really wants to work towards getting this relationship back out of the rut we've been in (we both work all the time and barely see each other).

I think if I

### Reward Scoring Example – Interpreting Preference Alignment

In this example, we applied the reward model to a real-world Reddit post to evaluate two model-generated responses: one preferred ("chosen") and one less preferred ("rejected").

The reward model produced the following scores:

- **Chosen reward**: -5.31  
- **Rejected reward**: -5.52  
- **Reward difference**: +0.205

Even though both scores are negative (which is common for raw logits-based outputs), the *chosen* response received a higher score, indicating that the reward model correctly identified it as more aligned with human preferences.

This reward difference will later be used in PPO fine-tuning to guide the policy model toward generating higher-quality, more preferred responses.


# PPO Training

In [23]:
MODEL_PATH = "/kaggle/working/reward-model"
DATA_PATH = "/kaggle/input/training-data/train.parquet"

In [34]:
df = pd.read_parquet(DATA_PATH)
dataset = datasets.Dataset.from_pandas(df)
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 92534
})

In [35]:

# Set PPO configuration with additional parameters
config = PPOConfig(
    reward_model_path=MODEL_PATH,
    learning_rate=1.41e-5,
    remove_unused_columns=True,
    # Additional parameters from the list
    exp_name="sentiment_tuning",  # Custom experiment name
    model_adapter_name="default",  # Name of train target PEFT adapter
    ref_adapter_name="reference",  # Name of reference PEFT adapter
    num_ppo_epochs=1,             # Number of PPO training epochs
    whiten_rewards=False,           
    kl_coef=0.01,                  # KL divergence coefficient
    kl_estimator="k1",             # KL estimator type
    cliprange=0.3,                 # Policy clip range
    vf_coef=0.05,                   # Value function coefficient
    cliprange_value=0.3,           # Value function clip range
    gamma=0.95,                    # Discount factor
    lam=0.90,                      # GAE lambda
    ds3_gather_for_generation=True,  # DeepSpeed ZeRO-3 optimization
    batch_size=8,                 # Batch size
    mini_batch_size=2,             # Mini-batch size
)

# Process dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        [" " + text for text in examples["chosen"]],
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=32
    )

dataset = dataset.rename_columns({"prompt": "review"})
dataset = dataset.filter(lambda x: len(x["review"]) > 512, batched=False)
dataset = dataset.map(lambda x: {"review": x["review"][:1000]}, batched=False)
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.map(lambda x: {"query": tokenizer.decode(x["input_ids"])}, batched=False)
dataset = dataset.select(range(100))  # Use first 100 samples
dataset.set_format("pt")


Filter:   0%|          | 0/92534 [00:00<?, ? examples/s]

Map:   0%|          | 0/92486 [00:00<?, ? examples/s]

Map:   0%|          | 0/92486 [00:00<?, ? examples/s]

Map:   0%|          | 0/92486 [00:00<?, ? examples/s]

In [64]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH).to(device)
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH).to(device)
reward_model_module = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)
value_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=1).to(device)

model.config.return_dict = True
model_ref.config.return_dict = True


Some weights of GPTBigCodeForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/reward-model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPTBigCodeForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/reward-model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [66]:
from transformers import AutoModelForSequenceClassification

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
reward_model_module = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
reward_model_module = reward_model_module.to(device)
reward_model_module.eval()  


Some weights of GPTBigCodeForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/reward-model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPTBigCodeForSequenceClassification(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 768)
    (wpe): Embedding(8192, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-19): 20 x GPTBigCodeBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear(in_features=768, out_features=896, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(i

In [67]:
import copy

model = AutoModelForCausalLMWithValueHead.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, load_in_4bit=False).to(device)

model.config.return_dict = True
model_ref.config.return_dict = True

model_ref = copy.deepcopy(model).eval()
for p in model_ref.parameters(): p.requires_grad = False

# sync generation config
from transformers import GenerationConfig
gen_cfg = GenerationConfig.from_model_config(model.config)
gen_cfg.eos_token_id = gen_cfg.pad_token_id = tokenizer.eos_token_id
model.generation_config = gen_cfg
model_ref.generation_config = gen_cfg


In [68]:
value_model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=1).to(device).eval()
print(value_model)

Some weights of GPTBigCodeForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/reward-model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPTBigCodeForSequenceClassification(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 768)
    (wpe): Embedding(8192, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-19): 20 x GPTBigCodeBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear(in_features=768, out_features=896, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(i

In [69]:
from transformers import AutoModelForSequenceClassification
from trl import PPOTrainer
from torch.optim import AdamW

# — 1. Ensure reward_model is nn.Module —
reward_model_module = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device).eval()

# — 2. Build optimizers tuple —
optimizer_policy = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
optimizers = (optimizer_policy, None)  # or add scheduler

# — 3. Construct the trainer —
ppo_trainer = PPOTrainer(
    args             = config,                 # PPOConfig instance
    processing_class = tokenizer,              # not “tokenizer=…”, but “processing_class=…”
    model            = model,                  # AutoModelWithValueHead
    ref_model        = model_ref,              # frozen copy
    reward_model     = reward_model_module,    # must be nn.Module
    train_dataset    = train_dataset,          # HF Dataset
    optimizers       = optimizers,
    value_model = value_model,
    callbacks        = [],                     # customize if needed
)


Some weights of GPTBigCodeForSequenceClassification were not initialized from the model checkpoint at /kaggle/working/reward-model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
print("Policy model device:", next(model.parameters()).device)
print("Reward model device:", next(reward_model_module.parameters()).device)
print("Value model device:", next(value_model.parameters()).device)


Policy model device: cuda:0
Reward model device: cuda:0
Value model device: cuda:0


In [71]:
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())
print("Device name:", torch.cuda.get_device_name(0))


CUDA available: True
Device count: 2
Device name: Tesla T4


In [72]:
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Aug  4 14:56:27 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   77C    P0             33W /   70W |   10297MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
ppo_trainer.train()
ppo_trainer.save_model("ppo_out")

===training policy===


<IPython.core.display.Javascript object>