In [28]:
# Basic 5 state mean example, 100 epochs
# Basic binary classification task
# Agent predicts whether state.mean() < 0.5 (binary decision)

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

STATE_SIZE = (5,)
TRAIN_STEPS = 100
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, two discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=2,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

# Each training step collects MAX_TURNS independent random states, then calls
# the model's end-of-epoch hook followed by its training step.
for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # The target is a bool; it compares equal to action 1 when the mean is
        # below 0.5 and to action 0 otherwise.
        mean_is_low = state.mean() < 0.5
        if action[0] == mean_is_low:
            reward = 1.
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states. A sample is a hit when the chosen
# action equals the truth value of `state.mean() < 0.5`.
goodness = []
for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)
    mean_is_low = state.mean() < 0.5
    hit = action[0] == mean_is_low
    goodness.append(hit)
print(np.count_nonzero(goodness))

81


In [29]:
# 10 state mean example, 200 train steps
# Test 2: Same binary classification but with larger input
# Slightly harder, same logic as Test 1

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

STATE_SIZE = (10,)
TRAIN_STEPS = 200
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, two discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=2,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

# Each training step collects MAX_TURNS independent random states, then calls
# the model's end-of-epoch hook followed by its training step.
for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # The target is a bool; it compares equal to action 1 when the mean is
        # below 0.5 and to action 0 otherwise.
        mean_is_low = state.mean() < 0.5
        if action[0] == mean_is_low:
            reward = 1.
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states. A sample is a hit when the chosen
# action equals the truth value of `state.mean() < 0.5`.
goodness = []
for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)
    mean_is_low = state.mean() < 0.5
    hit = action[0] == mean_is_low
    goodness.append(hit)
print(np.count_nonzero(goodness))

76


In [30]:
# 30 state mean example, 500 train steps
# Larger state space, binary decision remains
# Tests whether model generalizes under increased noise/dimensionality

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

STATE_SIZE = (30,)
TRAIN_STEPS = 500
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, two discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=2,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

# Each training step collects MAX_TURNS independent random states, then calls
# the model's end-of-epoch hook followed by its training step.
for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # The target is a bool; it compares equal to action 1 when the mean is
        # below 0.5 and to action 0 otherwise.
        mean_is_low = state.mean() < 0.5
        if action[0] == mean_is_low:
            reward = 1.
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states. A sample is a hit when the chosen
# action equals the truth value of `state.mean() < 0.5`.
goodness = []
for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)
    mean_is_low = state.mean() < 0.5
    hit = action[0] == mean_is_low
    goodness.append(hit)
print(np.count_nonzero(goodness))

76


In [31]:
# Multi-action reward task
# 4 actions with partial rewards. Requires mapping from state.mean() to the optimal action (state[0] is not used in this cell)
# Introduces conditional branching and reward ambiguity. Higher complexity

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

from collections import Counter

STATE_SIZE = (5,)
TRAIN_STEPS = 500
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, four discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=4,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # Each half of the mean range has a full-reward action and a
        # half-reward action: (0, 1) below 0.5 and (2, 3) otherwise.
        if state.mean() < 0.5:
            best, second_best = 0, 1
        else:
            best, second_best = 2, 3

        if action[0] == best:
            reward = 1.
        elif action[0] == second_best:
            reward = 0.5
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 0. if turn != MAX_TURNS - 1 else 1.
        model.memory.add(state, action, reward, done)

    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states and tally which actions it picks.
goodness = []
action_counts = Counter()
for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # Either rewarded action for the state's mean bucket counts as correct.
    rewarded = [0, 1] if state.mean() < 0.5 else [2, 3]
    goodness.append(action[0] in rewarded)

    action_counts[int(action[0])] += 1

print("Action counts:", action_counts)
print(np.count_nonzero(goodness))

Action counts: Counter({0: 62, 2: 18, 3: 14, 1: 6})
83


In [32]:
# Same two-bucket reward scheme as the previous cell (per mean bucket: full reward for one action, half reward for a second), now on a 30-value state. Still 4 action space
# Tests model’s ability to associate thresholds with discrete actions

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

from collections import Counter

STATE_SIZE = (30,)
TRAIN_STEPS = 500
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, four discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=4,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # Each half of the mean range has a full-reward action and a
        # half-reward action: (0, 1) below 0.5 and (2, 3) otherwise.
        if state.mean() < 0.5:
            best, second_best = 0, 1
        else:
            best, second_best = 2, 3

        if action[0] == best:
            reward = 1.
        elif action[0] == second_best:
            reward = 0.5
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 0. if turn != MAX_TURNS - 1 else 1.
        model.memory.add(state, action, reward, done)

    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states and tally which actions it picks.
goodness = []
action_counts = Counter()
for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # Either rewarded action for the state's mean bucket counts as correct.
    rewarded = [0, 1] if state.mean() < 0.5 else [2, 3]
    goodness.append(action[0] in rewarded)

    action_counts[int(action[0])] += 1

print("Action counts:", action_counts)
print(np.count_nonzero(goodness))

Action counts: Counter({0: 47, 2: 35, 3: 17, 1: 1})
60


In [33]:
# Most complex decision logic
# Uses both state.mean() and state[0] to decide rewards
# All 4 actions can be correct depending on input, mix of full, partial, and zero rewards

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np
from collections import Counter

STATE_SIZE = (5,)
TRAIN_STEPS = 500
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, four discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=4,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        low_mean = state.mean() < 0.5
        first = state[0]
        choice = action[0]

        # Six reward regions: (mean below / not below 0.5) x (state[0] in
        # [0, 0.3), [0.3, 0.7), [0.7, 1)). The chain is tried top to bottom, so
        # each branch only sees states the earlier ones rejected.
        if low_mean and first < 0.3:
            reward = 1. if choice == 0 else 0.2
        elif low_mean and first < 0.7:
            if choice == 1:
                reward = 1.
            elif choice == 0:
                reward = 0.5
            else:
                reward = 0.
        elif low_mean:
            reward = 1. if choice == 2 else 0.2
        elif first < 0.3:
            reward = 1. if choice == 3 else 0.2
        elif first < 0.7:
            reward = 0.5 if choice in [2, 3] else 0.
        else:
            reward = 1. if choice == 1 else 0.

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states and tally which actions it picks.
goodness = []
action_counts = Counter()

for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # Pick the set of accepted actions from the state[0] band first, then
    # from whether the mean is below 0.5.
    low_mean = state.mean() < 0.5
    first = state[0]
    if first < 0.3:
        accepted = [0] if low_mean else [3]
    elif first < 0.7:
        accepted = [0, 1] if low_mean else [2, 3]
    else:
        accepted = [2] if low_mean else [1]

    goodness.append(action[0] in accepted)
    action_counts[int(action[0])] += 1

print("Action counts:", action_counts)
print(np.count_nonzero(goodness))


Action counts: Counter({1: 38, 0: 28, 3: 23, 2: 11})
56


In [34]:
# High branching, partial reward structure (only actions 0 and 1 yield rewards out of the 4)
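# ************ THIS TEST PERFORMS THE WORST ************
# Note: states with mean >= 0.5 give zero reward for every action and are always scored as misses,
# so the printed score tops out at roughly half of the 100 evaluation samples.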

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

from collections import Counter

STATE_SIZE = (5,)
TRAIN_STEPS = 500
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, four discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=4,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=.001,
    lr_critic=.001,
)

for _epoch in range(TRAIN_STEPS):
    model.memory.clear()
    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        action = model.take_action(state)

        # Rewards exist only when the mean is below 0.5, and only for actions
        # 0 and 1. Both earn 0.5, except that action 1 earns the full 1.0 when
        # state[0] is above 0.8. Everything else earns nothing.
        reward = 0.
        if state.mean() < 0.5:
            if action[0] == 1:
                reward = 1. if state[0] > 0.8 else 0.5
            elif action[0] == 0:
                reward = 0.5

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states and tally which actions it picks.
#
# A sample is a hit when the state offers a reward (mean below 0.5) and the
# agent picked one of the two rewarded actions (0 or 1). The state[0] threshold
# changes how much action 1 earns but not which actions are rewarded, so it
# plays no part in the hit test.
#
# States with mean >= 0.5 pay nothing for any action, so they can never be
# hits. They stay in `goodness` as misses (the printed hit count is unchanged),
# and the number of states where a hit was possible is reported separately so
# the hit count can be read against its real ceiling.
goodness = []
action_counts = Counter()
scorable = 0  # states in which at least one action earns a reward
for _ in range(100):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    has_reward = bool(state.mean() < 0.5)
    scorable += has_reward
    goodness.append(has_reward and (action[0] in [0, 1]))

    action_counts[int(action[0])] += 1

print("Action counts:", action_counts)
print(np.count_nonzero(goodness))
print("Scorable states (mean < 0.5):", scorable)

Action counts: Counter({1: 38, 0: 30, 2: 27, 3: 5})
48


In [35]:
# Reward depends on state[0] and state[1] conditions
# Each action can lead to +1, 0, or -1 depending on the state
# Run for 1000 epochs (train steps)

from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np
from collections import Counter

STATE_SIZE = (5,)
TRAIN_STEPS = 1000
MAX_TURNS = 50

# PPO agent for this cell: observations have shape STATE_SIZE, four discrete actions.
model = PyTorchPPO(
    input_size=STATE_SIZE,
    action_space=4,
    layer_size=64,
    max_turns=MAX_TURNS,
    device="cpu",
    epsilon=0.,
    gamma=0.9,
    eps_clip=0.2,
    entropy_coef=0.1,
    k_epochs=10,
    lr_actor=0.001,
    lr_critic=0.001,
)

for _epoch in range(TRAIN_STEPS):
    model.memory.clear()

    for turn in range(MAX_TURNS):
        state = np.random.random(size=STATE_SIZE)
        x, y = state[0], state[1]
        action = model.take_action(state)
        choice = action[0]

        # Reward table (only the first two state entries matter):
        #   action 0: +1 when x < 0.3, -1 when x > 0.7, otherwise 0
        #   action 1: +1 when y > 0.7, -1 when y < 0.3, otherwise 0
        #   action 2: +1 when both x and y lie strictly between 0.4 and 0.6
        #   action 3: always 0
        if choice == 0:
            if x < 0.3:
                reward = 1.
            elif x > 0.7:
                reward = -1.
            else:
                reward = 0.
        elif choice == 1:
            if y > 0.7:
                reward = 1.
            elif y < 0.3:
                reward = -1.
            else:
                reward = 0.
        elif choice == 2:
            in_centre = 0.4 < x < 0.6 and 0.4 < y < 0.6
            reward = 1. if in_centre else 0.
        else:
            reward = 0.

        # Only the final turn of the epoch is flagged as done.
        done = 1. if turn == MAX_TURNS - 1 else 0.
        model.memory.add(state, action, reward, done)
    model.end_epoch_action()
    model.train_step()

# Score the agent on 100 fresh random states and tally which actions it picks.
goodness = []
test_action_counts = Counter()

for _trial in range(100):
    state = np.random.random(size=STATE_SIZE)
    x, y = state[0], state[1]
    action = model.take_action(state)
    choice = action[0]

    # Actions 0-2 are correct only inside their +1 reward region; action 3
    # (always zero reward) is always accepted.
    correct = (
        x < 0.3 if choice == 0
        else y > 0.7 if choice == 1
        else (0.4 < x < 0.6 and 0.4 < y < 0.6) if choice == 2
        else True
    )

    goodness.append(correct)
    test_action_counts[int(action[0])] += 1

print("Action counts:", test_action_counts)
print(np.count_nonzero(goodness))


Action counts: Counter({0: 39, 1: 26, 3: 22, 2: 13})
68
