<a href="https://colab.research.google.com/github/skywalker0803r/Ricky/blob/master/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install mxnet
import sys
import mxnet  
import gym
import numpy as np  
from mxnet import nd,autograd,init
from mxnet.gluon import nn,trainer
import matplotlib.pyplot as plt
from mxnet.gluon import Trainer

# PolicyNetwork

In [0]:
class PolicyNetwork(nn.Block):
  """Small feed-forward policy: Dense(128) -> ReLU -> Dense(num_actions) -> softmax.

  Parameters
  ----------
  num_actions : int
      Number of units in the output layer, i.e. the number of discrete
      actions the policy distributes probability over.
  """

  def __init__(self,num_actions):
    super(PolicyNetwork,self).__init__()
    # Attribute names are kept as-is: Gluon registers child blocks by attribute.
    self.linear1 = nn.Dense(128)
    self.linear2 = nn.Dense(num_actions)

  def forward(self,x):
    """Return the action probabilities for `x`.

    `x` is passed through `nd.array`, so array-likes (lists, NumPy arrays)
    are accepted as well as NDArrays.
    """
    inputs = nd.array(x)
    hidden = nd.relu(self.linear1(inputs))
    scores = self.linear2(hidden)
    return nd.softmax(scores)

  def get_action(self,x):
    """Sample one action per row of `x` from the policy.

    Returns a tuple `(action, log_prob)`; `log_prob` is the second output of
    `nd.random.multinomial(..., get_prob=True)`, i.e. the log-likelihood of
    each sampled action.
    """
    sampled_action,sampled_log_prob = nd.random.multinomial(self.forward(x),get_prob=True)
    return sampled_action,sampled_log_prob

In [3]:
# Smoke test: sample actions for a single state and for a batch of two states.
p = PolicyNetwork(2)
p.initialize(init=init.Xavier())
s1,s2 = np.array([1,2,3,4]),np.array([2,3,4,5])
batch_state = np.stack([s1,s2])
separator = '==============='
# First the single state wrapped in a list (batch of one), then the real batch.
for states in ([s1],batch_state):
  print(separator)
  print(p.get_action(states))

(
[1]
<NDArray 1 @cpu(0)>, 
[-0.44218192]
<NDArray 1 @cpu(0)>)
(
[1 1]
<NDArray 2 @cpu(0)>, 
[-0.44218192 -0.36352086]
<NDArray 2 @cpu(0)>)


# Main training loop (REINFORCE on CartPole-v0)

In [4]:
# REINFORCE training loop: roll out one episode with the current policy,
# turn the rewards into normalized discounted returns, and take one Adam step
# on the loss  sum_t( -log_prob_t * return_t ).
env = gym.make('CartPole-v0')
policy_net = PolicyNetwork(env.action_space.n)
policy_net.initialize(init=init.Xavier())
trainer = Trainer(policy_net.collect_params(),'adam',{'learning_rate':0.001})

max_episode_num = 5000
max_steps = 10000
gamma = 0.9  # discount factor used when accumulating returns
numsteps = []
avg_numsteps = []
all_rewards = []

for episode in range(max_episode_num):
  state = env.reset()
  log_probs = []
  rewards = []
  with autograd.record():
    # Roll out one episode; the forward passes must be recorded so that the
    # sampled log-probabilities can be differentiated later.
    for t in range(max_steps):
      state = nd.array(np.expand_dims(state, 0))  # add a batch axis of size 1
      action, log_prob = policy_net.get_action(state)
      state, reward, done, _ = env.step(action.asnumpy()[0])
      log_probs.append(log_prob)
      rewards.append(reward)
      if done:
        break

    # Discounted return for every step, accumulated from the end of the
    # episode backwards. Kept in its own array so `rewards` stays the raw
    # per-step reward list.
    returns = np.zeros(len(rewards))
    running_return = 0.0
    for i in reversed(range(len(rewards))):
      running_return = rewards[i] + gamma * running_return
      returns[i] = running_return
    # Normalize to zero mean / unit variance; eps guards against a zero std
    # (e.g. a one-step episode).
    returns -= returns.mean()
    returns /= returns.std() + np.finfo(returns.dtype).eps

    # Per-step loss terms; still inside record() so the scaling by the return
    # is part of the recorded graph.
    policy_gradient = [log_prob*(-Gt) for log_prob,Gt in zip(log_probs,returns)]

  # Backpropagate all loss terms at once, outside the recording scope.
  autograd.backward(policy_gradient)
  # Trainer.step rescales gradients by 1/batch_size. Pass the number of
  # collected transitions: `t` is the zero-based index of the last step, so it
  # is one short and would be 0 for an episode that ends on its first step.
  trainer.step(len(log_probs))

  # t is the zero-based index of the final step of this episode.
  print(episode,t)

0 7
1 10
2 22
3 11
4 11
5 28
6 27
7 28
8 12
9 12
10 35
11 13
12 10
13 19
14 11
15 28
16 34
17 11
18 15
19 14
20 10
21 13
22 28
23 15
24 19
25 34
26 59
27 26
28 30
29 19
30 8
31 25
32 17
33 24
34 15
35 18
36 12
37 23
38 8
39 18
40 17
41 20
42 13
43 20
44 70
45 24
46 15
47 20
48 10
49 20
50 25
51 11
52 26
53 13
54 11
55 26
56 40
57 18
58 12
59 14
60 17
61 30
62 17
63 32
64 23
65 27
66 22
67 61
68 32
69 35
70 11
71 12
72 18
73 29
74 30
75 27
76 19
77 53
78 13
79 41
80 19
81 33
82 31
83 141
84 31
85 12
86 15
87 53
88 14
89 22
90 12
91 36
92 31
93 58
94 14
95 14
96 29
97 34
98 17
99 18
100 33
101 11
102 16
103 15
104 11
105 30
106 23
107 45
108 54
109 14
110 18
111 30
112 37
113 8
114 43
115 96
116 16
117 11
118 30
119 15
120 42
121 32
122 11
123 11
124 50
125 42
126 33
127 52
128 27
129 58
130 29
131 13
132 38
133 17
134 52
135 9
136 34
137 32
138 58
139 11
140 50
141 16
142 37
143 53
144 92
145 28
146 12
147 20
148 39
149 97
150 27
151 31
152 17
153 13
154 33
155 108
156 24
157 21
158 60


KeyboardInterrupt: ignored