<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/GRPO_11_02_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

In [2]:
# A simple vocabulary
vocab = ["apple", "banana", "cherry", "date", "elderberry"]

In [3]:
np.random.seed(42)
logits = np.random.randn(len(vocab))

In [4]:
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x)

In [5]:
def sample_word(logits):
    """Sample a word index from the probability distribution defined by logits."""
    probs = softmax(logits)
    word_idx = np.random.choice(len(vocab), p=probs)
    return word_idx, probs

In [6]:
def reward(word_idx):
    """
    Define a reward function.
    For illustration, suppose our alignment target is words starting with a vowel.
    Return 1.0 for words starting with a vowel, else 0.0.
    """
    word = vocab[word_idx]
    return 1.0 if word[0].lower() in "aeiou" else 0.0

In [7]:
def group_label(word_idx):
    """
    Assign a group based on the word.
    Group "vowel" for words starting with a vowel,
    and "consonant" otherwise.
    """
    word = vocab[word_idx]
    return "vowel" if word[0].lower() in "aeiou" else "consonant"

In [8]:
# Hyperparameters
num_iterations = 50   # how many update iterations
num_episodes = 20     # how many samples per iteration
learning_rate = 0.1

In [9]:
for iteration in range(1, num_iterations + 1):
    # Collect a batch of episodes (each is a one-step “episode”)
    trajectories = []  # each entry: (word_idx, reward, group, probabilities)
    for _ in range(num_episodes):
        word_idx, probs = sample_word(logits)
        r = reward(word_idx)
        grp = group_label(word_idx)
        trajectories.append((word_idx, r, grp, probs))

    # Compute the average reward within each group
    group_rewards = {}
    group_counts = {}
    for word_idx, r, grp, probs in trajectories:
        group_rewards.setdefault(grp, 0.0)
        group_counts.setdefault(grp, 0)
        group_rewards[grp] += r
        group_counts[grp] += 1
    # Avoid division by zero; if a group did not appear, we can set its average to 0.
    group_avg = {grp: group_rewards[grp] / group_counts[grp] for grp in group_rewards if group_counts[grp] > 0}

    # Compute the gradient using the group-relative advantage
    grad = np.zeros_like(logits)
    total_advantage = 0.0  # for reporting
    for word_idx, r, grp, probs in trajectories:
        # Advantage for this sample: reward minus group average
        adv = r - group_avg[grp]
        total_advantage += adv

        # The gradient of the log probability for a softmax is:
        #   ∇_log π(i) = (1 - p_i) for the taken index, and (-p_j) for all other indices.
        # Compute the gradient vector for this sample.
        grad_sample = -probs.copy()   # start with -p for all indices
        grad_sample[word_idx] += 1.0    # add 1 for the taken index
        # Weight by the advantage (this pushes up probabilities for actions with above-average reward)
        grad += grad_sample * adv

    # Update the logits (simple gradient ascent step)
    logits += learning_rate * grad

     # Reporting for this iteration
    avg_advantage = total_advantage / num_episodes
    probs = softmax(logits)
    print(f"Iteration {iteration:02d}:")
    print("  Updated probabilities:")
    for word, p in zip(vocab, probs):
        print(f"    {word:12s}: {p:.3f}")
    print(f"  Average group-relative advantage: {avg_advantage:.3f}\n")

Iteration 01:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000

Iteration 02:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000

Iteration 03:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000

Iteration 04:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000

Iteration 05:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantag

In [11]:
import numpy as np

for iteration in range(1, num_iterations + 1):
    # Collect a batch of episodes (each is a one-step “episode”)
    trajectories = []  # (word_idx, reward, group, probabilities)
    for _ in range(num_episodes):
        word_idx, probs = sample_word(logits)
        r = reward(word_idx)
        grp = group_label(word_idx)
        trajectories.append((word_idx, r, grp, probs))

    # Compute the average reward within each group
    group_rewards = {}
    group_counts = {}
    for word_idx, r, grp, probs in trajectories:
        group_rewards[grp] = group_rewards.get(grp, 0.0) + r
        group_counts[grp] = group_counts.get(grp, 0) + 1

    # Compute group averages safely (default to 0.0 if missing)
    group_avg = {grp: (group_rewards[grp] / group_counts[grp]) if group_counts[grp] > 0 else 0.0
                 for grp in group_rewards}

    # Compute the gradient using the group-relative advantage
    grad = np.zeros_like(logits)
    total_advantage = 0.0  # for reporting
    for word_idx, r, grp, probs in trajectories:
        # Advantage: reward minus group average
        adv = r - group_avg[grp]
        total_advantage += adv

        # Ensure small variations in advantage aren't ignored
        if abs(adv) < 1e-6:
            adv += np.random.uniform(-1e-3, 1e-3)  # Add small noise

        # Compute the gradient vector for this sample
        grad_sample = -probs.copy()
        grad_sample[word_idx] += 1.0  # Adjust for the taken action
        grad += grad_sample * adv  # Scale by advantage

    # Normalize gradient to avoid excessive updates
    grad /= num_episodes

    # Update the logits using gradient ascent
    logits += learning_rate * grad

    # Check if logits are changing significantly
    if np.linalg.norm(grad) < 1e-6:
        print(f"Warning: Small gradient updates at iteration {iteration}")

    # Reporting
    avg_advantage = total_advantage / num_episodes
    probs = softmax(logits)

    print(f"Iteration {iteration:02d}:")
    print("  Updated probabilities:")
    for word, p in zip(vocab, probs):
        print(f"    {word:12s}: {p:.3f}")

    print(f"  Average group-relative advantage: {avg_advantage:.6f}\n")

Iteration 01:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000000

Iteration 02:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000000

Iteration 03:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000000

Iteration 04:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relative advantage: 0.000000

Iteration 05:
  Updated probabilities:
    apple       : 0.168
    banana      : 0.089
    cherry      : 0.195
    date        : 0.468
    elderberry  : 0.081
  Average group-relat