
## RLHF with DPO and GPTs


In [3]:

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np


In [4]:


vocab_size  = 100     # Small vocab for synthetic data
embed_size  = 128
num_heads   = 4
num_layers  = 2
hidden_dim  = 256
max_seq_len = 32
seq_len     = 16
batch_size  = 32
epochs      = 10
lr          = 1e-3


In [5]:

class GPT(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_seq_len):
        super(GPT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, max_seq_len, embed_size))
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(embed_size, num_heads, hidden_dim),
            num_layers
        )
        self.fc = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer(x)
        return self.fc(x)


In [6]:

# Generate synthetic preference data
def generate_synthetic_data(batch_size, seq_len, vocab_size):
    # Generate random token sequences
    seq_a = torch.randint(0, vocab_size, (batch_size, seq_len))
    seq_b = torch.randint(0, vocab_size, (batch_size, seq_len))
    # Randomly assign preferences (1 means seq_a preferred over seq_b, 0 otherwise)
    preferences = torch.randint(0, 2, (batch_size,))
    return seq_a, seq_b, preferences


In [7]:


# Define reward model
class RewardModel(nn.Module):
    def __init__(self, embed_size, hidden_dim):
        super(RewardModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(embed_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, embeddings):
        return self.fc(embeddings).squeeze(-1)


In [8]:

# Calculate preference loss for DPO
def dpo_loss(reward_a, reward_b, preferences, beta=0.1):
    logits = (reward_a - reward_b) / beta
    loss = -torch.mean(preferences * torch.log_softmax(logits, dim=0))
    return loss


In [9]:


# Main training loop
def train_dpo(gpt_model, reward_model, optimizer_gpt, optimizer_reward, vocab_size, seq_len, epochs, batch_size):
    for epoch in range(epochs):
        # Generate synthetic data
        seq_a, seq_b, preferences = generate_synthetic_data(batch_size, seq_len, vocab_size)

        # Forward pass for both sequences
        logits_a = gpt_model(seq_a)
        logits_b = gpt_model(seq_b)

        # Reward computation
        reward_a = reward_model(logits_a.mean(dim=1))  # Mean embeddings
        reward_b = reward_model(logits_b.mean(dim=1))  # Mean embeddings

        # Compute DPO loss
        loss_dpo = dpo_loss(reward_a, reward_b, preferences)

        # Backpropagation for GPT model
        optimizer_gpt.zero_grad()
        loss_dpo.backward(retain_graph=True)
        optimizer_gpt.step()

        # Train reward model (optional)
        optimizer_reward.zero_grad()
        loss_dpo.backward()
        optimizer_reward.step()

        print(f"Epoch {epoch + 1}, DPO Loss: {loss_dpo.item():.4f}")




In [10]:

# Initialize models and optimizers
gpt_model        = GPT(vocab_size, embed_size, num_heads, num_layers, hidden_dim, max_seq_len)
reward_model     = RewardModel(embed_size, hidden_dim)
optimizer_gpt    = optim.Adam(gpt_model.parameters(), lr=lr)
optimizer_reward = optim.Adam(reward_model.parameters(), lr=lr)

# Train with DPO
train_dpo(gpt_model, reward_model, optimizer_gpt, optimizer_reward, vocab_size, seq_len, epochs, batch_size)




RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x100 and 128x256)