In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import deque
import random

2026-01-09 12:14:44.306644: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
class MiniPPO:
    def __init__(self):
        self.device='cuda' if torch.cuda.is_available() else 'cpu'
        self.model=GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
        self.tokenizer=GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.pad_token=self.tokenizer.eos_token
        self.optimizer=optim.Adam(self.model.parameters(), lr=1e-5)
        self.epsilon=0.2
        self.memory = []  # Store good examples
    
    def calculate_reward(self, text, prompt):
        """Calculate reward based on relevance and text quality"""
        reward = 0.0
        words = text.strip().split()
        text_lower = text.lower()
        prompt_lower = prompt.lower()
        
        # Check if it's a question
        is_question = '?' in prompt
        
        # Length reward (prefer reasonable length)
        word_count = len(words)
        if 5 <= word_count <= 15:
            reward += 0.4
        elif 1 <= word_count < 5:
            reward += 0.2
        else:
            reward += 0.1
        
        # Relevance check based on prompt type
        if is_question:
            # For questions, reward direct answers
            answer_starters = ['yes', 'no', 'i', 'of course', 'sure', 'definitely', 
                             'maybe', 'perhaps', 'i would', 'i will', 'i do']
            if any(text_lower.strip().startswith(starter) for starter in answer_starters):
                reward += 0.3
            
            # Check for question-specific keywords
            if 'marry' in prompt_lower:
                relevant_words = ['yes', 'no', 'love', 'forever', 'will', 'do', 
                                'course', 'definitely', 'honor', 'happy']
                if any(word in text_lower for word in relevant_words):
                    reward += 0.2
        else:
            # For statements, reward relevant continuation
            prompt_words = set(prompt_lower.split())
            text_words = set(text_lower.split())
            overlap = len(prompt_words & text_words)
            if overlap > 0:
                reward += 0.2
        
        # Penalize irrelevant context switches (mentioning unrelated topics)
        irrelevant_topics = ['car', 'town', 'mother', 'father', 'work', 'office']
        if 'marry' in prompt_lower:
            for topic in irrelevant_topics:
                if topic in text_lower:
                    reward -= 0.3
                    break
        
        # Coherence - penalize excessive repetition
        unique_words = len(set(words))
        if word_count > 0 and unique_words / word_count < 0.6:
            reward -= 0.2
        
        # Bonus for proper ending
        if any(p in text for p in ['.', '!', '?']):
            reward += 0.1
        
        # Ensure reward is between 0 and 1
        return max(0.0, min(1.0, reward))
    
    def train_step(self, prompt):
        self.model.train()
        inputs = self.tokenizer.encode(prompt, return_tensors='pt').to(self.device)
        outputs = self.model.generate(
            inputs,
            max_length=inputs.shape[1]+20,  # Shorter to keep responses focused
            do_sample=True,
            temperature=0.8,  # Lower temperature for more focused responses
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        generated_tokens = outputs.sequences[0]
        gen_tokens = generated_tokens[inputs.shape[1]:]
        text = self.tokenizer.decode(gen_tokens, skip_special_tokens=True)
        full_outputs = self.model(generated_tokens.unsqueeze(0), labels=generated_tokens.unsqueeze(0))
        logits = full_outputs.logits[:, :-1, :]
        log_probs = torch.log_softmax(logits, dim=-1)
        current_log_probs = log_probs[0, inputs.shape[1]-1:generated_tokens.shape[0]-1].gather(1, gen_tokens.unsqueeze(1)).squeeze(1)
        
        # Use improved reward calculation
        reward = self.calculate_reward(text, prompt)
        
        reward_tensor = torch.tensor([reward], device=self.device)
        loss = -(current_log_probs * reward_tensor).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        # Store examples with high reward
        if reward > 0.5:
            self.memory.append((prompt, text, reward))
        
        return text, reward
    
    def train_multiple_iterations(self, prompt, iterations=5):
        """Train on the same prompt multiple times to improve response"""
        print(f"Training on: '{prompt}'\n")
        best_reward = 0
        best_text = ""
        
        for i in range(iterations):
            text, reward = self.train_step(prompt)
            print(f"Iteration {i+1}/{iterations}")
            print(f"  Response: {text}")
            print(f"  Reward: {reward:.4f}\n")
            
            if reward > best_reward:
                best_reward = reward
                best_text = text
        
        print(f"Best response: {best_text}")
        print(f"Best reward: {best_reward:.4f}")
        return best_text, best_reward

In [25]:
ppo = MiniPPO()
prompt = "Will you marry me?"
best_text, best_reward = ppo.train_multiple_iterations(prompt, iterations=10)

Training on: 'Will you marry me?'

Iteration 1/10
  Response:  I-I-I'm-I-I-I'm-I'm-I'm-
  Reward: 0.5000

Iteration 2/10
  Response:  I don't know.


"

"Don't you say so," she said
  Reward: 1.0000

Iteration 3/10
  Response: 

Catherine: Yes.

[Kylani starts to cry]

K
  Reward: 0.7000

Iteration 4/10
  Response: 
I don't know what I'm going to do with my life.
So you're not
  Reward: 1.0000

Iteration 5/10
  Response: 

This is what I want.

I want to see this guy

And I
  Reward: 0.5000

Iteration 6/10
  Response: 

I'm your new wife, and you have a new home.


No,
  Reward: 1.0000

Iteration 7/10
  Response:  I think so."
The woman looked up to me and said, "I'll marry you.
  Reward: 0.8000

Iteration 8/10
  Response:  No. I'm a human, you know. I'm not the only one who is trying to
  Reward: 1.0000

Iteration 9/10
  Response:  No."

"Yeah. I can't wait to get married."
She said.

  Reward: 1.0000

Iteration 10/10
  Response: 

I'm the most talented person you know, but I'm not yo