In [112]:
import pdb
import numpy as np
import policy_gradient_reinforce as policy_gradient
import tensorflow.compat.v1 as tf
# Morvan's code is against TF 1.0
tf.disable_v2_behavior() 

# Each policy can only increase, decrease, or stay by a certain ACTION_DELTA.
NUM_ACTION = 3

# TODO: Tweak all the parameters below per optimization problem
ACTION_DELTA = 0.1
# How many timesteps to look back for state. Ke li 2017 used 25.
NUM_TIMESTEPS_FOR_STATE = 25
# The number of episodes / full game. Each episode starts from the initial state w0.
NUM_EPISODES = 100
# Number of steps per episode
EPISODE_LEN = 15

def optimize_with_rl(f, w0):
  """
  Params
    f - The objective function to be minimized. f(w) should return a scalar.
    w0 - Initial w.
  Returns
    w such that f(w) is minimized
  """
  tf.reset_default_graph()
  opt_param_dim = len(w0)
  
  # Number of dimensions in each state x_t.
  # State space is as described by Ke li 2017:
  #    Changes in the objective value at the current location relative to the objective value
  #    at the ith most recent location for all i
  # We do NOT include the current weights as state like Ke li 2017:
  # "The current location is only used to compute the cost; because the policy
  # "should not depend on the absolute coordinates of the current location,
  # "we exclude it from the input that is fed into the neural net"
  state_dim = NUM_TIMESTEPS_FOR_STATE

  # Each policies[i] is an RL model for tweaking the 1 parameter dimension w[i]
  policies = []
  for i in range(opt_param_dim):
    RL = policy_gradient.PolicyGradient(
      name_suffix=str(i),
      n_actions=NUM_ACTION,
      n_features=state_dim,
      learning_rate=0.02,
      reward_decay=1.0
    )
    policies.append(RL)

  # TODO: Use scipy to get x0. Right now everything is just set to w0, f(w0)
  f_w0 = f(w0)
  
  # wt and f_histories are not explicitly encoded as states, but are necessary to be updated
  # because they are used for next state calculations.
  wt = w0
  # Stores the most recent f(w(t)) histories. First element is the most recent.
  # So, we go [f(wt), f(wt-1),...]
  f_histories = np.array([f_w0] * NUM_TIMESTEPS_FOR_STATE)
  # Initially, we don't have any changes in obj functions
  x0 = np.array([0.0] * NUM_TIMESTEPS_FOR_STATE)
  xt = x0
  for ep in range(NUM_EPISODES):
    for t in range(EPISODE_LEN):
      # Every policy will have its own reward.
      rs = np.zeros(opt_param_dim)
      for i in range(opt_param_dim):
        # The actions are numbers from 0 to NUM_ACTION - 1
        action = policies[i].choose_action(xt)
        # TODO: Bounding?
        wt[i] += convert_action_to_dw(action)
        rs[i] = -f(wt)
        # TODO: Should we make the state observed by each policy change?
        # Right now this means that every time an agent acts,
        # the other agents are part of the stochastic environment,
        # And yet my reward is only computed off my immediate action.
        # Ez change: Just do the xt updates here.
        policies[i].store_transition(xt, action, rs[i])
        #print("t=%d i=%d a=%d wt=%s rs = %s xt=%s" %
        #      (t, i, action, wt, rs, xt))
      # The end of 1 episode timestep
      last_f_wt = -rs[-1]
      xt = last_f_wt - f_histories
      # Rotate f_histories w/ the most recent f_wt entry
      f_histories = np.append(last_f_wt, f_histories[:-1])
    # The end of 1 episode  
    for policy in policies:
      policy.learn()
    print("Episode %d, f(w) = %.2f, w = %s" % (ep+1, last_f_wt, wt))

def convert_action_to_dw(action):
  if action == 0:
    return 0
  elif action == 1:
    return -ACTION_DELTA
  else:
    return ACTION_DELTA

In [113]:
def f2d(w):
  x = w[0]
  y = w[1]
  return (x-1)**2 + (y-1.5)**2
f2d_w0 = [2,1]

def f1d(w):
  return (w[0]-1)**2
f1d_w0 = [2]

optimize_with_rl(f1d, f1d_w0)

Episode 1, f(w) = 1.21, w = [2.1]
Episode 2, f(w) = 1.21, w = [2.1]
Episode 3, f(w) = 1.00, w = [2.0]
Episode 4, f(w) = 0.64, w = [1.7999999999999998]
Episode 5, f(w) = 0.64, w = [1.7999999999999998]
Episode 6, f(w) = 0.25, w = [1.4999999999999996]
Episode 7, f(w) = 0.49, w = [1.6999999999999997]
Episode 8, f(w) = 0.64, w = [1.7999999999999998]
Episode 9, f(w) = 0.25, w = [1.4999999999999996]
Episode 10, f(w) = 0.49, w = [1.6999999999999997]
Episode 11, f(w) = 0.09, w = [1.2999999999999994]
Episode 12, f(w) = 0.04, w = [1.1999999999999993]
Episode 13, f(w) = 0.04, w = [1.1999999999999993]
Episode 14, f(w) = 0.64, w = [1.7999999999999998]
Episode 15, f(w) = 1.00, w = [2.0]
Episode 16, f(w) = 0.64, w = [1.7999999999999998]
Episode 17, f(w) = 1.96, w = [2.4000000000000004]
Episode 18, f(w) = 3.61, w = [2.900000000000001]
Episode 19, f(w) = 6.25, w = [3.5000000000000013]
Episode 20, f(w) = 7.29, w = [3.7000000000000015]
Episode 21, f(w) = 11.56, w = [4.4]
Episode 22, f(w) = 16.00, w = [4.9