In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import deque
import random

2026-01-09 12:14:44.306644: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
class MiniPPO:
    """Minimal reward-weighted policy-gradient fine-tuner for GPT-2.

    Despite the name, ``train_step`` performs a plain REINFORCE-style update:
    the log-probabilities of the sampled tokens are weighted by one scalar
    reward. No clipped PPO objective is implemented, and ``epsilon`` is stored
    but not read by any method of this class.
    """

    def __init__(self):
        """Load pretrained GPT-2 plus tokenizer and create the Adam optimizer."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # Reuse the EOS token as the pad token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-5)
        # NOTE(review): presumably intended as the PPO clip range; currently unused.
        self.epsilon = 0.2

    @staticmethod
    def _score_text(text):
        """Return the heuristic reward for a generated continuation.

        The reward is the whitespace-separated word count divided by 25,
        capped at 1.0, plus a 0.5 bonus when the lower-cased text contains the
        substring 'good' or 'great'.
        """
        reward = min(len(text.split()) / 25, 1.0)
        lowered = text.lower()
        if 'good' in lowered or 'great' in lowered:
            reward += 0.5
        return reward

    def train_step(self, prompt, max_new_tokens=30):
        """Sample one continuation of ``prompt`` and take one gradient step.

        Args:
            prompt: Text to condition generation on.
            max_new_tokens: Upper bound on the number of sampled tokens
                (default 30, matching the previous hard-coded budget).

        Returns:
            A ``(text, reward)`` tuple: the decoded continuation (prompt
            excluded) and its scalar heuristic reward.

        Raises:
            ValueError: If ``prompt`` encodes to zero tokens.
        """
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
        prompt_len = input_ids.shape[1]
        if prompt_len == 0:
            # With no prompt tokens the slice below would start at -1 and
            # silently pick the wrong log-probabilities.
            raise ValueError("prompt must encode to at least one token")

        # Sample in eval mode so dropout does not perturb the sampling
        # distribution; no gradients are needed for generation.
        self.model.eval()
        with torch.no_grad():
            sequences = self.model.generate(
                input_ids,
                # Single unpadded sequence: every position is a real token.
                attention_mask=torch.ones_like(input_ids),
                max_new_tokens=max_new_tokens,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )
        sequence = sequences[0]
        gen_tokens = sequence[prompt_len:]
        text = self.tokenizer.decode(gen_tokens, skip_special_tokens=True)
        reward = self._score_text(text)

        # Gradient pass in train mode. No ``labels`` are passed because the
        # model's built-in cross-entropy loss is not used here.
        self.model.train()
        logits = self.model(sequence.unsqueeze(0)).logits[:, :-1, :]
        log_probs = torch.log_softmax(logits, dim=-1)
        # The logits at position i score token i + 1, so the rows that score
        # the generated tokens start at prompt_len - 1.
        token_log_probs = (
            log_probs[0, prompt_len - 1:sequence.shape[0] - 1]
            .gather(1, gen_tokens.unsqueeze(1))
            .squeeze(1)
        )

        reward_tensor = torch.tensor([reward], device=self.device)
        loss = -(token_log_probs * reward_tensor).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return text, reward

In [18]:
# Build a fresh trainer so this cell uses the current MiniPPO definition.
ppo = MiniPPO()

# Run a single update on one example prompt.
prompt = "Write a story about"
generated_text, reward = ppo.train_step(prompt)

# Report the prompt, the sampled continuation, and its reward, one per line.
print(
    f"Prompt: {prompt}",
    f"Generated text: {generated_text}",
    f"Reward: {reward:.4f}",
    sep="\n",
)

Prompt: Write a story about
Generated text:  what

Digg is the site that's been running for nearly 20 minutes: a day in the latest, most recent update to the latest version
Reward: 0.9600
