In [21]:
# Basic 5 state mean example, 100 train steps
from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

# Experiment settings: shape of each random state vector, number of
# collect-then-train iterations, turns collected per iteration, and the
# number of fresh states used for the final check.
# NOTE(review): no random seed is set in this cell, so the printed count
# will vary from run to run.
STATE_SIZE = (5, )
TRAIN_STEPS = 100
MAX_TURNS = 50
EVAL_SAMPLES = 100

# Initialize model
model = PyTorchPPO(
  input_size=STATE_SIZE,
  action_space=2,
  layer_size=64,
  lr_actor=0.001,
  lr_critic=0.001,
  k_epochs=10,
  gamma=0.9,
  eps_clip=0.2,
  entropy_coef=0.1,
  epsilon=0.0,
  max_turns=MAX_TURNS,
  device="cpu",
)

# Training: every step empties the model's memory, stores MAX_TURNS freshly
# sampled turns in it, and then calls the model's train step.
last_turn = MAX_TURNS - 1
for _ in range(TRAIN_STEPS):
  model.memory.clear()
  for turn in range(MAX_TURNS):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # action[0] is compared with the boolean "mean is below 0.5"; because
    # True == 1 in Python, an action of 1 matches a low mean and an action
    # of 0 matches a mean of 0.5 or more.
    mean_is_low = state.mean() < 0.5
    if action[0] == mean_is_low:
      reward = 1.
    else:
      reward = 0.

    # Only the final turn of the batch is flagged as done.
    done = 1. if turn == last_turn else 0.
    model.memory.add(state, action, reward, done)
  model.end_epoch_action()
  model.train_step()

# Evaluation: count how many fresh random states receive the matching action.
goodness = []
for _ in range(EVAL_SAMPLES):
  state = np.random.random(size=STATE_SIZE)
  action = model.take_action(state)
  mean_is_low = state.mean() < 0.5
  goodness.append(action[0] == mean_is_low)
print(np.count_nonzero(goodness))

82


In [16]:
# 10 state mean example, 200 train steps
from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

# Experiment settings: shape of each random state vector, number of
# collect-then-train iterations, turns collected per iteration, and the
# number of fresh states used for the final check.
# NOTE(review): no random seed is set in this cell, so the printed count
# will vary from run to run.
STATE_SIZE = (10, )
TRAIN_STEPS = 200
MAX_TURNS = 50
EVAL_SAMPLES = 100

# Initialize model
model = PyTorchPPO(
  input_size=STATE_SIZE,
  action_space=2,
  layer_size=64,
  lr_actor=0.001,
  lr_critic=0.001,
  k_epochs=10,
  gamma=0.9,
  eps_clip=0.2,
  entropy_coef=0.1,
  epsilon=0.0,
  max_turns=MAX_TURNS,
  device="cpu",
)

# Training: every step empties the model's memory, stores MAX_TURNS freshly
# sampled turns in it, and then calls the model's train step.
last_turn = MAX_TURNS - 1
for _ in range(TRAIN_STEPS):
  model.memory.clear()
  for turn in range(MAX_TURNS):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # action[0] is compared with the boolean "mean is below 0.5"; because
    # True == 1 in Python, an action of 1 matches a low mean and an action
    # of 0 matches a mean of 0.5 or more.
    mean_is_low = state.mean() < 0.5
    if action[0] == mean_is_low:
      reward = 1.
    else:
      reward = 0.

    # Only the final turn of the batch is flagged as done.
    done = 1. if turn == last_turn else 0.
    model.memory.add(state, action, reward, done)
  model.end_epoch_action()
  model.train_step()

# Evaluation: count how many fresh random states receive the matching action.
goodness = []
for _ in range(EVAL_SAMPLES):
  state = np.random.random(size=STATE_SIZE)
  action = model.take_action(state)
  mean_is_low = state.mean() < 0.5
  goodness.append(action[0] == mean_is_low)
print(np.count_nonzero(goodness))

90


In [23]:
# 30 state mean example, 500 train steps
from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

# Experiment settings: shape of each random state vector, number of
# collect-then-train iterations, turns collected per iteration, and the
# number of fresh states used for the final check.
# NOTE(review): no random seed is set in this cell, so the printed count
# will vary from run to run.
STATE_SIZE = (30, )
TRAIN_STEPS = 500
MAX_TURNS = 50
EVAL_SAMPLES = 100

# Initialize model
model = PyTorchPPO(
  input_size=STATE_SIZE,
  action_space=2,
  layer_size=64,
  lr_actor=0.001,
  lr_critic=0.001,
  k_epochs=10,
  gamma=0.9,
  eps_clip=0.2,
  entropy_coef=0.1,
  epsilon=0.0,
  max_turns=MAX_TURNS,
  device="cpu",
)

# Training: every step empties the model's memory, stores MAX_TURNS freshly
# sampled turns in it, and then calls the model's train step.
last_turn = MAX_TURNS - 1
for _ in range(TRAIN_STEPS):
  model.memory.clear()
  for turn in range(MAX_TURNS):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # action[0] is compared with the boolean "mean is below 0.5"; because
    # True == 1 in Python, an action of 1 matches a low mean and an action
    # of 0 matches a mean of 0.5 or more.
    mean_is_low = state.mean() < 0.5
    if action[0] == mean_is_low:
      reward = 1.
    else:
      reward = 0.

    # Only the final turn of the batch is flagged as done.
    done = 1. if turn == last_turn else 0.
    model.memory.add(state, action, reward, done)
  model.end_epoch_action()
  model.train_step()

# Evaluation: count how many fresh random states receive the matching action.
goodness = []
for _ in range(EVAL_SAMPLES):
  state = np.random.random(size=STATE_SIZE)
  action = model.take_action(state)
  mean_is_low = state.mean() < 0.5
  goodness.append(action[0] == mean_is_low)
print(np.count_nonzero(goodness))

80


In [31]:
# 5 state, 4 action example with graded rewards, 1000 train steps
from sorrel.models.pytorch.ppo import PyTorchPPO
import numpy as np

from collections import Counter

# Experiment settings: shape of each random state vector, number of
# collect-then-train iterations, turns collected per iteration, and the
# number of fresh states used for the final check.
# NOTE(review): no random seed is set in this cell, so the printed numbers
# will vary from run to run.
STATE_SIZE = (5, )
TRAIN_STEPS = 1000
MAX_TURNS = 50
EVAL_SAMPLES = 100

# Initialize model
model = PyTorchPPO(
  input_size=STATE_SIZE,
  action_space=4,
  layer_size=64,
  lr_actor=0.001,
  lr_critic=0.001,
  k_epochs=10,
  gamma=0.9,
  eps_clip=0.2,
  entropy_coef=0.1,
  epsilon=0.0,
  max_turns=MAX_TURNS,
  device="cpu",
)

# Training: every step empties the model's memory, stores MAX_TURNS freshly
# sampled turns in it, and then calls the model's train step.
last_turn = MAX_TURNS - 1
for _ in range(TRAIN_STEPS):
  model.memory.clear()
  for turn in range(MAX_TURNS):
    state = np.random.random(size=STATE_SIZE)
    action = model.take_action(state)

    # Graded reward, only available when the state mean is below 0.5:
    #   state[0] <= 0.8 : actions 0 and 1 both earn 0.5
    #   state[0] >  0.8 : action 1 earns 1.0, action 0 earns 0.5
    # Every other combination (including any state with mean >= 0.5)
    # earns nothing.
    reward = 0.
    if state.mean() < 0.5:
      if state[0] <= 0.8:
        if action[0] == 0 or action[0] == 1:
          reward = 0.5
      elif state[0] > 0.8:
        if action[0] == 1:
          reward = 1.
        elif action[0] == 0:
          reward = 0.5

    # Only the final turn of the batch is flagged as done.
    done = 1. if turn == last_turn else 0.
    model.memory.add(state, action, reward, done)
  model.end_epoch_action()
  model.train_step()

# Evaluation: tally which actions are chosen on fresh random states and how
# many choices count as "good".
goodness = []
action_counts = Counter()
for _ in range(EVAL_SAMPLES):
  state = np.random.random(size=STATE_SIZE)
  action = model.take_action(state)

  # A choice is good when the mean is below 0.5 and the action is 0 or 1.
  # state[0] does not need to be checked: it is drawn by np.random.random,
  # so exactly one of "<= 0.8" / "> 0.8" holds and both cases accept the
  # same actions.
  # NOTE: action 0 is counted as good even when state[0] > 0.8, although it
  # only earns the partial 0.5 reward there; states with mean >= 0.5 can
  # never be counted as good.
  is_good = bool(state.mean() < 0.5 and action[0] in [0, 1])
  goodness.append(is_good)

  action_counts[int(action[0])] += 1

print("Action counts:", action_counts)
print(np.count_nonzero(goodness))

Action counts: Counter({1: 53, 0: 27, 2: 12, 3: 8})
41
