In [4]:
import numpy as np
import pickle
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline
import math
import gym


Determining the indices of the probabilities that correspond to the actions taken during the learning phase

``` python
# NOTE 1. Example of how the probabilities for the loss function are found.
# probabilities (3, 2): one row per sample, one column per action
p = np.array([[ 0.43075615,  0.56924385],
             [ 0.63075615,  0.36924385],
             [ 0.43075615,  0.56924385]])

# actions taken (3, ): the column index of the executed action for every row of p
a = np.array([1, 0, 1])

# indexes of the probabilities responsible for the action:
# row i starts at position i * p.shape[1] of the flattened array, adding a[i]
# moves to the column of the action that was taken in that row.
indexes = np.arange(0, a.shape[0])
indexes = indexes * p.shape[1] + a
np.take(p.ravel(), indexes)
```

Output:

array([ 0.56924385,  0.63075615,  0.56924385])

In [20]:
# https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/

class Agent:
    """
    Policy-gradient agent: a softmax policy network with one hidden ReLU layer (no biases).

    Graph nodes exposed as attributes:
      input_s            placeholder for a batch of states, shape (None, D)
      p                  action probabilities, shape (None, n_actions)
      executed_actions   placeholder for the one-hot encoded executed actions
      gained_reward      placeholder for the reward weight of every step
      loss               -mean(log(p[executed action]) * gained_reward)
      all_weights        [w1, w2]
      gradients          gradients of loss with respect to all_weights
      batched_gradients  placeholders through which (accumulated) gradients are fed
      train              op that applies batched_gradients to all_weights with Adagrad
    """
    def __init__(self, D, H, learning_rate, n_actions=2):
        """
        Build the policy network, the loss and the update op in the default graph.

        :param D: (int) Size of the state vector.
        :param H: (int) Number of hidden neurons.
        :param learning_rate: (flt) Learning rate of the Adagrad optimizer.
        :param n_actions: (int) Number of discrete actions (size of the softmax output).
        """
        # Step 1: Feed forward
        # Neural net with one hidden layer. Outputs one probability per action,
        # e.g. shape (1, 2) -> [[ 0.43075615,  0.56924385]] for a single state.
        # The caller decides how to turn the probabilities into an action
        # (sampling while learning, argmax while evaluating).
        self.input_s = tf.placeholder(tf.float32, [None, D], name="input_s")
        self.w1 = tf.get_variable("w1", shape=[D, H], initializer=tf.contrib.layers.xavier_initializer())
        self.layer_1 = tf.nn.relu(tf.matmul(self.input_s, self.w1))

        self.w2 = tf.get_variable("w2", shape=[H, n_actions], initializer=tf.contrib.layers.xavier_initializer())
        # probability
        self.p = tf.nn.softmax(tf.matmul(self.layer_1, self.w2))

        # Step 2: Determine loss / gradients.
        # One-hot rows marking the action that was actually executed in every step.
        self.executed_actions = tf.placeholder(tf.float32, [None, n_actions], name="executed_actions")
        # One reward weight per step (1-D, so it multiplies log_prob element-wise without broadcasting).
        self.gained_reward = tf.placeholder(tf.float32, [None], name="gained_reward")

        # The executed action is not necessarily the argmax, so select its probability
        # with the one-hot mask: all other columns are multiplied by zero.
        # https://theneuralperspective.com/2016/11/25/reinforcement-learning-rl-policy-gradients-i/
        # y * log(y')
        action_prob = tf.reduce_sum(self.p * self.executed_actions, 1)
        # Clip before the log: a probability that underflows to 0 would give -inf and NaN gradients.
        log_prob = tf.log(tf.clip_by_value(action_prob, 1e-10, 1.0))
        self.loss = -tf.reduce_mean(log_prob * self.gained_reward)
        # Only this agent's own weights, in the same order as batched_gradients below.
        self.all_weights = [self.w1, self.w2]
        self.gradients = tf.gradients(self.loss, self.all_weights)

        # Step 3: Update weights
        # Gradients are computed per episode (self.gradients), accumulated outside the graph
        # and fed back in through these placeholders.
        self.batched_gradients = [tf.placeholder(tf.float32, name="batched_gradients_w1"),
                                  tf.placeholder(tf.float32, name="batched_gradients_w2")]
        optimizer = tf.train.AdagradOptimizer(learning_rate)
        self.train = optimizer.apply_gradients(list(zip(self.batched_gradients, self.all_weights)))
        



In [6]:
def discounted_reward(r, gamma):
    """
    Discounted return for every step of one episode.

    For step t the result is r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ..., i.e. the
    reward for that state plus the discounted sum of future rewards.

    :param r: (array) Rewards of one episode in chronological order.
    :param gamma: (flt) Discount factor
    :return: (array) Float array with the same length as r holding the discounted returns.
    """
    rewards = np.asarray(r, dtype=np.float64)
    returns = np.zeros_like(rewards)
    running = 0.0
    # Walk backwards so that every return re-uses the return of the following step:
    # G[t] = r[t] + gamma * G[t + 1]
    for t in range(len(rewards) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


In [31]:
tf.reset_default_graph()

H = 16 # hidden neurons
D = 4 # input (state of the environment)
learning_rate = 3e-2 
gamma = 0.99 # discount factor
epochs = 5000
max_frames = 999
action_space = 2
update_frequency = 5 # number of episodes whose gradients are summed before one weight update
report_every = 200 # print the mean running time of the last `report_every` episodes

agent = Agent(D, H, learning_rate)
env = gym.make("CartPole-v0")

init = tf.global_variables_initializer()

# The session is intentionally not wrapped in a `with` block:
# the evaluation cell further down re-uses `sess`.
sess = tf.Session()
sess.run(init)

running_time = []

# One zero array per weight matrix. Created ONCE, outside the episode loop, so the
# gradients of `update_frequency` episodes really accumulate before they are applied.
gradients_batch = [np.zeros_like(w) for w in sess.run(agent.all_weights)]

for ep in range(epochs):
    if (ep + 1) % report_every == 0:
        print("running_time", np.mean(running_time[-report_every:]))

    s = env.reset()
    states = [s]
    actions = []
    rewards = []
    probs = [] # action probabilities of every step of this episode (for inspection)

    for i in range(max_frames):
        p = sess.run(agent.p, {agent.input_s: [s]})
        probs.append(p)
        # Sample the action from the policy so the agent keeps exploring.
        a = np.random.choice(np.arange(action_space), p=p[0]) # choose an action index

        s, r, done, _ = env.step(a)

        actions.append(np.eye(action_space)[a])
        rewards.append(r)

        if not done:
            # The terminal observation is never acted upon, so it is not stored;
            # states, actions and rewards therefore have the same length.
            states.append(s)

        else: # game is done. Update weights.
            running_time.append(i)

            feed = {
                agent.input_s: np.vstack(states),
                agent.executed_actions: np.vstack(actions),
                agent.gained_reward: discounted_reward(rewards, gamma)
            }

            loss, gradients = sess.run([agent.loss, agent.gradients], feed_dict=feed)
            # Element-wise sum per weight matrix (a plain `+=` on lists would concatenate them).
            gradients_batch = [acc + grad for acc, grad in zip(gradients_batch, gradients)]

            if (ep + 1) % update_frequency == 0:
                # update weights
                sess.run(agent.train, dict(zip(agent.batched_gradients, gradients_batch)))
                # start the next batch from zero
                gradients_batch = [np.zeros_like(grad) for grad in gradients_batch]

            break

            
        
        
        

[2017-10-31 22:23:16,898] Making new env: CartPole-v0


6.9451
running_time 23.3216080402
7.15753
running_time 23.965
7.85798
running_time 27.11
9.00564
running_time 34.065
9.13331
running_time 34.56
10.198
running_time 41.785
11.1733
running_time 50.715
10.3928
running_time 43.225
11.3053
running_time 52.125
11.7537
running_time 54.355
12.2253
running_time 60.125
12.4728
running_time 63.035
12.8118
running_time 66.7
12.5943
running_time 66.055
12.7572
running_time 66.355
13.5309
running_time 77.23
13.3261
running_time 75.375
13.427
running_time 77.5
14.5608
running_time 94.195
14.6467
running_time 94.19
14.5983
running_time 95.485
15.4137
running_time 107.34
15.6474
running_time 112.155
15.6686
running_time 115.3
15.9861
running_time 124.815


In [32]:
import time

# Watch the trained agent: act greedily (argmax of the policy) for 1000 steps
# and sum up the collected reward over all episodes played in that time.
s = env.reset()

total_r = 0
for _ in range(1000):
    time.sleep(0.01) # slow the loop down so the rendering can be followed
    a_dst = sess.run(agent.p, {agent.input_s: [s]})
    a = np.argmax(a_dst)
    env.render(close=0)
    s, r, d, _ = env.step(a)
    total_r += r
    
    if d:
        # Episode finished: continue from the first observation of the NEW episode,
        # not from the terminal observation of the old one.
        s = env.reset()
    
    print("\r", total_r, end="")

 1000.0

In [33]:
# Presumably closes the render window opened by the evaluation cell
# (legacy gym `close` flag) - TODO confirm against the installed gym version.
env.render(close=1)