
## RLHF with DPO and GPTs


In [1]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


In [2]:


vocab_size  = 100     # Small vocab for synthetic data
embed_size  = 128
num_heads   = 4
num_layers  = 2
hidden_dim  = 256
max_seq_len = 32
seq_len     = 16
batch_size  = 32
epochs      = 10
lr          = 1e-3


In [3]:

class GPT(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_seq_len):
        super(GPT, self).__init__()
        self.embedding           = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_len, embed_size))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim),
            num_layers
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.fc(x)


In [4]:


def generate_synthetic_data(batch_size, seq_len, vocab_size):
    
    # Generate random token sequences
    seq_a = torch.randint(0, vocab_size, (batch_size, seq_len))
    seq_b = torch.randint(0, vocab_size, (batch_size, seq_len))
    # Randomly assign preferences (1 means seq_a preferred over seq_b, 0 otherwise)
    preferences = torch.randint(0, 2, (batch_size,))
    return seq_a, seq_b, preferences


In [5]:



class RewardModel(nn.Module):
    def __init__(self, embed_size, hidden_dim):
        super(RewardModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(embed_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, embeddings):
        return self.fc(embeddings).squeeze(-1)


In [6]:


def dpo_loss( reward_a, reward_b, preferences, beta=0.1 ):
    
    logits = (reward_a - reward_b) / beta
  
  
    loss   = -torch.mean( preferences * torch.log_softmax(logits, dim=0))
   
    
    return loss


In [7]:


def train_dpo(gpt_model, reward_model, optimizer_gpt, optimizer_reward, vocab_size, seq_len, epochs, batch_size):
    
    for epoch in range(epochs):
        
        seq_a, seq_b, preferences = generate_synthetic_data(batch_size, seq_len, vocab_size)

       
        logits_a = gpt_model(seq_a)
        logits_b = gpt_model(seq_b)


        
        reward_a = reward_model(logits_a.mean(dim=1))
        reward_b = reward_model(logits_b.mean(dim=1))

        loss_dpo_gpt = dpo_loss(reward_a, reward_b, preferences)
        optimizer_gpt.zero_grad()
        loss_dpo_gpt.backward()
        optimizer_gpt.step()

        # Recompute logits for the Reward Model update
        logits_a = gpt_model(seq_a).detach()  # Detach to avoid tracking gradients for GPT again
        logits_b = gpt_model(seq_b).detach()

        # Forward pass through the reward model
        reward_a = reward_model(logits_a.mean(dim=1))
        reward_b = reward_model(logits_b.mean(dim=1))

        # Calculate DPO loss and backpropagate for reward model
        loss_dpo_reward = dpo_loss(reward_a, reward_b, preferences)
        optimizer_reward.zero_grad()
        loss_dpo_reward.backward()
        optimizer_reward.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss (GPT): {loss_dpo_gpt.item()}, Loss (Reward): {loss_dpo_reward.item()}")




In [8]:

gpt_model        = GPT(vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_seq_len)




In [9]:

## reward_model     = RewardModel(embed_size, hidden_dim)
## vocab_size
reward_model     = RewardModel(vocab_size, hidden_dim)


In [10]:

optimizer_gpt    = optim.Adam(gpt_model.parameters(), lr=lr)
optimizer_reward = optim.Adam(reward_model.parameters(), lr=lr)


In [11]:


train_dpo(gpt_model, reward_model, optimizer_gpt, optimizer_reward, vocab_size, seq_len, epochs, batch_size)


Epoch 1/10, Loss (GPT): 1.329703450202942, Loss (Reward): 1.2532249689102173
Epoch 2/10, Loss (GPT): 1.2963676452636719, Loss (Reward): 1.2614021301269531
Epoch 3/10, Loss (GPT): 1.8905909061431885, Loss (Reward): 1.815993309020996
Epoch 4/10, Loss (GPT): 1.4970364570617676, Loss (Reward): 1.485247015953064
Epoch 5/10, Loss (GPT): 1.2721004486083984, Loss (Reward): 1.2359671592712402
Epoch 6/10, Loss (GPT): 2.0174336433410645, Loss (Reward): 1.9564580917358398
Epoch 7/10, Loss (GPT): 1.782792091369629, Loss (Reward): 1.735109567642212
Epoch 8/10, Loss (GPT): 2.339132070541382, Loss (Reward): 2.3257839679718018
Epoch 9/10, Loss (GPT): 1.5593814849853516, Loss (Reward): 1.536184310913086
Epoch 10/10, Loss (GPT): 1.5795890092849731, Loss (Reward): 1.538716435432434
