<a href="https://colab.research.google.com/github/skywalker0803r/Ricky/blob/master/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install mxnet
import sys
import mxnet  
import gym
import numpy as np  
from mxnet import nd,autograd,init
from mxnet.gluon import nn,trainer
import matplotlib.pyplot as plt
from mxnet.gluon import Trainer

# PolicyNetwork

In [0]:
class PolicyNetwork(nn.Block):
  """Small feed-forward policy that maps a state to action probabilities.

  The network is a 128-unit dense layer with ReLU followed by a dense layer
  with one output per action; a softmax turns those outputs into a
  probability distribution over actions.
  """

  def __init__(self,num_actions):
    """Create the two dense layers; `num_actions` is the size of the output layer."""
    super(PolicyNetwork,self).__init__()
    self.linear1 = nn.Dense(128)
    self.linear2 = nn.Dense(num_actions)

  def forward(self,x):
    """Return the softmax action probabilities for `x`.

    `x` is passed through `nd.array`, so array-likes such as numpy arrays
    or lists of arrays are accepted as well as NDArrays.
    """
    inputs = nd.array(x)
    hidden = nd.relu(self.linear1(inputs))
    scores = self.linear2(hidden)
    return nd.softmax(scores)

  def get_action(self,x):
    """Sample one action per row of `x` from the policy.

    Returns a pair `(action, log_prob)` as produced by
    `nd.random.multinomial(..., get_prob=True)`: the sampled action indices
    and the log-likelihood of each sample.
    """
    action_probs = self.forward(x)
    sampled_action,sampled_log_prob = nd.random.multinomial(action_probs,get_prob=True)
    return sampled_action,sampled_log_prob

In [3]:
# Smoke test: sample actions from a freshly initialised two-action policy,
# first for a single state wrapped in a list, then for a stacked batch of two.
p = PolicyNetwork(2)
p.initialize(init=init.Xavier())
s1 = np.array([1,2,3,4])
s2 = np.array([2,3,4,5])
batch_state = np.stack([s1,s2])
for sample_input in ([s1], batch_state):
  print('===============')
  print(p.get_action(sample_input))

(
[1]
<NDArray 1 @cpu(0)>, 
[-0.44218192]
<NDArray 1 @cpu(0)>)
(
[1 1]
<NDArray 2 @cpu(0)>, 
[-0.44218192 -0.36352086]
<NDArray 2 @cpu(0)>)


# discounted_rewards

In [0]:
def get_returns(rewards,discount_factor=0.9):
  """Compute mean-centered discounted returns for one episode.

  For each step t the return is
  rewards[t] + discount_factor * rewards[t+1] + discount_factor**2 * rewards[t+2] + ...
  The mean over the episode is then subtracted (no division by the
  standard deviation is performed).

  rewards: per-step rewards in chronological order.
  discount_factor: multiplier applied to the return of the following step.
  Returns an NDArray with one entry per step.
  """
  discounted = [0.] * len(rewards)
  running = 0.
  # Walk the episode back to front so each return reuses the one after it.
  for idx in range(len(rewards) - 1, -1, -1):
    running = rewards[idx] + discount_factor*running
    discounted[idx] = running
  returns_nd = nd.array(discounted)
  return returns_nd - nd.mean(returns_nd)

In [5]:
# Sanity check: one terminal reward of 1 with the default discount of 0.9.
# The raw returns are 0.9**4, 0.9**3, 0.9**2, 0.9, 1 (mean ~0.819), so the
# mean-centered result rises from about -0.163 to about +0.181.
get_returns([0,0,0,0,1])


[-0.16292    -0.09002    -0.00901997  0.08098     0.18098003]
<NDArray 5 @cpu(0)>

# update: per-step policy-gradient loss terms

In [0]:
def compute_gradient(rewards,log_probs):
  """Build the per-step REINFORCE loss terms.

  Each log-probability is paired with the return at the same position and
  multiplied by the negated return, i.e. term_t = log_prob_t * (-G_t).
  Pairing stops at the shorter of the two sequences.

  rewards: per-step returns G_t.
  log_probs: per-step log-probabilities of the actions taken.
  Returns a list with one term per paired step.
  """
  return [step_log_prob*(-step_return)
          for step_log_prob,step_return in zip(log_probs,rewards)]

# main loop

In [0]:
# REINFORCE training on CartPole-v0: each iteration rolls out one episode with
# the current policy, converts its rewards into normalized discounted returns,
# back-propagates sum_t(-log_prob_t * G_t) and takes a single Adam step.
env = gym.make('CartPole-v0')
policy_net = PolicyNetwork(env.action_space.n)
policy_net.initialize(init=init.Xavier())
trainer = Trainer(policy_net.collect_params(),'adam',{'learning_rate':0.001})

# Hyperparameters.
max_episode_num = 5000
max_steps = 10000       # upper bound on steps per episode
discount_factor = 0.9   # weight applied to the return of the following step

# Bookkeeping lists; the loop below does not fill them in.
numsteps = []
avg_numsteps = []
all_rewards = []

for episode in range(max_episode_num):
  # The code expects reset() to return the observation and step() to return
  # a 4-tuple (observation, reward, done, info).
  state = env.reset()
  log_probs = []
  rewards = []
  with autograd.record():
    # Roll out one episode. Sampling happens under record() so that the
    # stored log-probabilities can be differentiated afterwards.
    for t in range(max_steps):
      state = nd.array(np.expand_dims(state, 0))  # add a leading batch axis
      action, log_prob = policy_net.get_action(state)
      state, reward, done, _ = env.step(action.asnumpy()[0])
      log_probs.append(log_prob)
      rewards.append(reward)
      if done:
        break

    # Replace each reward with its discounted return, accumulating from the
    # last step backwards, then standardize to zero mean and unit variance.
    R = 0
    for i in reversed(range(len(rewards))):
      R = rewards[i] + discount_factor * R
      rewards[i] = R
    rewards = np.array(rewards)
    rewards -= rewards.mean()
    # eps keeps the division finite when every return is identical (std == 0).
    rewards /= rewards.std() + np.finfo(rewards.dtype).eps

    # One loss term per step: -log_prob_t * G_t; backward() sums their gradients.
    policy_gradient = [step_log_prob*(-step_return)
                       for step_log_prob,step_return in zip(log_probs,rewards)]
    autograd.backward(policy_gradient)

  # Trainer.step divides the gradient by its argument, so pass the number of
  # steps actually recorded. The last loop index `t` is one less than that
  # and would be 0 (division by zero) for an episode that ends immediately.
  trainer.step(len(log_probs))

  print(episode,t)