In [7]:
import pdb
import numpy as np
import policy_gradient_reinforce as policy_gradient
import tensorflow.compat.v1 as tf
# Morvan's code is against TF 1.0
tf.disable_v2_behavior() 

# Each policy can only increase, decrease, or stay by a certain ACTION_DELTA.
NUM_ACTION = 3
ACTION_DELTA = 0.01
# How many timesteps to look back for state. Ke li 2017 used 25.
# TODO: Maybe this is too big and slow.
NUM_TIMESTEPS_FOR_STATE = 3
# TODO: Tweak episode length and num episodes
# The number of episodes / full game. Each episode starts from the initial state w0.
NUM_EPISODES = 10
# Number of steps per episode
EPISODE_LEN = 10

def optimize_with_rl(f, w0):
  """
  Params
    f - The objective function to be minimized. f(w) should return a scalar.
    w0 - Initial w.
  Returns
    w such that f(w) is minimized
  """
  tf.reset_default_graph()
  opt_param_dim = len(w0)
  
  # Number of dimensions in each state x_t.
  # For each timestep to look back, we store the full w(t) vector, and f(w(t))
  state_dim = NUM_TIMESTEPS_FOR_STATE * (opt_param_dim + 1)

  # Each policies[i] is an RL model for tweaking the 1 parameter dimension w[i]
  policies = []
  for i in range(opt_param_dim):
    RL = policy_gradient.PolicyGradient(
      name_suffix=str(i),
      n_actions=NUM_ACTION,
      n_features=state_dim,
      learning_rate=0.02,
      reward_decay=1.0
    )
    policies.append(RL)
    print("Created policy" + str(i))

  # TODO: Use scipy to get x0. Right now everything is just set to w0, f(w0)
  f_w0 = f(w0)
  x0 = np.array((w0 + [f_w0]) * NUM_TIMESTEPS_FOR_STATE)
  xt = x0
  for ep in range(NUM_EPISODES):
    for t in range(EPISODE_LEN):
      wt = get_most_recent_weights(xt, opt_param_dim)
      # Every policy will have its own reward.
      rs = np.zeros(opt_param_dim)
      for i in range(opt_param_dim):
        action = policies[i].choose_action(xt)
        # TODO: Bounding?
        # TODO: Encode magnitude to action already?
        wt[i] += action
        rs[i] = -f(wt)
        # TODO: Should we make the state observed by each policy change?
        # Right now this means that every time an agent acts,
        # the other agents are part of the stochastic environment,
        # And yet my reward is only computed off my immediate action.
        # Ez change: Just do the xt updates here.
        policies[i].store_transition(xt, action, rs[i])
      # The end of 1 episode timestep
      last_f_wt = -rs[-1]
      new_chunk = np.append(wt, last_f_wt) 
      # Remove the oldest chunk, which is at the front.
      xt = np.append(xt[opt_param_dim+1:], new_chunk)
    # The end of 1 episode  
    for policy in policies:
      policy.learn()
    print("Episode %d, f(w) = %.2f, w = %s" % (ep+1, last_f_wt, wt))

"""
Useful methods for dealing with the state vector that will be used by RL.
We have NUM_TIMESTEPS_FOR_STATE chunks, where each chunk has this order
  - w(t) vector (There are 'opt_param_dim' of these)
  - f(w(t)) - Just one.
The chunks are ordered in ascending chronological order.
So, the last chunk is the most recent entry
"""
def get_most_recent_weights(state, opt_param_dim):
  """
  Get w_t vector, the most recent weights
  """
  return state[-opt_param_dim-1:-1]

In [8]:
def f(w):
  x = w[0]
  y = w[1]
  return (x-1)**2 + (y-1.5)**2
optimize_with_rl(f, [2,1])

Created policy0
Created policy1
Episode 1, f(w) = 331.25, w = [12. 16.]
Episode 2, f(w) = 718.25, w = [17. 23.]
Episode 3, f(w) = 1521.25, w = [24. 33.]
Episode 4, f(w) = 2306.25, w = [31. 39.]
Episode 5, f(w) = 3606.25, w = [35. 51.]
Episode 6, f(w) = 4126.25, w = [35. 56.]
Episode 7, f(w) = 5632.25, w = [41. 65.]
Episode 8, f(w) = 6456.25, w = [43. 70.]
Episode 9, f(w) = 8406.25, w = [43. 83.]
Episode 10, f(w) = 9074.25, w = [43. 87.]
