In [None]:
import gym
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Build the CartPole environment that is handed to the learners further down.
ENV_ID = 'CartPole-v1'
env = gym.make(ENV_ID)

# Reset once before use; being the last expression of the cell, the value
# returned by reset() is also shown as the cell output.
env.reset()

In [None]:
import sys
sys.path.append("..")

from rl_agent import RL_Agent
from rl_learner import PG_Learner
from rl_learner import A3C_Learner

## Defining the RL agent

In [None]:
class Cartpole_Agent(RL_Agent):
    """Concrete CartPole model plugged into the (supposedly abstract) ``RL_Agent`` base.

    The only thing this subclass provides is the specific network used to choose
    an action. Two conventions are still implied (per the original author's notes):

    1) ``__init__`` defines all of its variables inside the ``model_name`` scope;
    2) the instance exposes ``self.session``, ``self.prob_layer`` and
       ``self.log_prob_layer`` attributes.

    The remaining functionality needed by the PG and TRPO learners is defined in
    the base class.

    Attributes created here:
        n_actions: number of discrete actions (2).
        input_layer: float32 placeholder of shape ``[None, 4]``.
        dense1_layer: hidden dense layer, 4 units, ReLU.
        dense2_layer: action logits, ``n_actions`` units, no activation.
        prob_layer: softmax over the logits.
        log_prob_layer: log-softmax over the logits.
        state_value: single-unit linear head on top of ``dense1_layer``.
    """

    def __init__(self, session, model_name):
        """Build the network inside the ``model_name`` variable scope.

        Args:
            session: forwarded unchanged to ``RL_Agent.__init__``
                (presumably stored as ``self.session`` there -- confirm in rl_agent).
            model_name: forwarded to ``RL_Agent.__init__`` and used as the
                TensorFlow variable scope for every variable created here.
        """
        RL_Agent.__init__(self, session, model_name)
        with tf.variable_scope(model_name):
            self.n_actions = 2

            self.input_layer = tf.placeholder(shape=[None, 4], dtype=tf.float32)
            self.dense1_layer = tf.layers.dense(self.input_layer,
                                                units=4, use_bias=True,
                                                activation=tf.nn.relu, name="dense1_weights"
                                               )

            # Logits must stay linear: a ReLU here clamps them to >= 0 and yields
            # zero gradient whenever both pre-activations are negative, which
            # freezes the policy at the uniform distribution.
            self.dense2_layer = tf.layers.dense(self.dense1_layer,
                                                units=self.n_actions, use_bias=True,
                                                activation=None, name="dense2_weights"
                                               )

            self.prob_layer = tf.nn.softmax(self.dense2_layer)
            # log_softmax on the logits is numerically stable; log(softmax(x))
            # produces -inf / NaN gradients once a probability underflows to 0.
            self.log_prob_layer = tf.nn.log_softmax(self.dense2_layer)

            # Value head shares the hidden layer with the policy head.
            self.state_value = tf.layers.dense(self.dense1_layer, units=1, use_bias=True, activation=None)
                        

## Policy gradient

In [None]:
import time

# Train a single policy-gradient learner on the CartPole environment.
# Start from an empty default graph so variable scopes from earlier cells
# do not collide with the ones created below.
tf.reset_default_graph()
sess = tf.Session()

actor_agent = Cartpole_Agent(sess, "2018_02_02_cartpole_pg")
# Variables must be initialised after the agent has added them to the graph.
sess.run(tf.global_variables_initializer())

pg = PG_Learner(sess,
                rl_agent=actor_agent,
                game_env=env,
                discount=0.99,
                batch_size=100,
                frame_cap=None,
                lr=0.01)

# Run 100 learner steps and report the wall-clock time spent.
start_time = time.time()
for i in range(100):
    pg.step()
# Parenthesised single-argument print works on both Python 2 and Python 3
# (the former `print "..."` statement is a SyntaxError on Python 3).
print("Used time: {} seconds".format(time.time() - start_time))

In [None]:
# Plot the reward history of the policy-gradient run from the array saved on disk.
# The file can be regenerated from the learner with:
#   np.save("simulations/pg_rewards", pg.reward_history)
pg_rewards = np.load("simulations/pg_rewards.npy")

plt.plot(pg_rewards)
plt.xlabel("# game")
plt.ylabel("score")
plt.title("Reward history for a single policy gradient agent (cartpole)")
plt.show()

## Considering the Actor-Critic learner

In [None]:
import time

# Train the actor-critic learner on the CartPole environment.
# Start from an empty default graph so the actor/critic scopes are created fresh.
tf.reset_default_graph()
sess = tf.Session()

# Actor and critic are two separate networks living in different variable scopes.
rl_agent = Cartpole_Agent(sess, "2018_02_02_cartpole_actor")
critic_agent = Cartpole_Agent(sess, "2018_02_02_cartpole_critic")
# Variables must be initialised after both agents have added them to the graph.
sess.run(tf.global_variables_initializer())

a3c = A3C_Learner(sess,
                  actor_agent=rl_agent,
                  critic_agent=critic_agent,
                  game_env=env,
                  discount=0.99,
                  batch_size=100,
                  frame_cap=None,
                  actor_lr=0.01,
                  critic_lr=0.001)

# Run 100 learner steps and report the wall-clock time spent.
start_time = time.time()
for i in range(100):
    a3c.step()
# Parenthesised single-argument print works on both Python 2 and Python 3
# (the former `print "..."` statement is a SyntaxError on Python 3).
print("Used time: {} seconds".format(time.time() - start_time))

In [None]:
# Compare both learners through a 200-point rolling mean of their saved reward
# histories. The A3C file can be regenerated from the learner with:
#   np.save("simulations/a3c_rewards", a3c.reward_history)
# (The raw A3C rewards can be overlaid by plotting the loaded array directly
# with label="all rewards".)
reward_curves = [
    ("simulations/a3c_rewards.npy", "a3c rolling mean"),
    ("simulations/pg_rewards.npy", "pg rolling mean"),
]
for rewards_path, curve_label in reward_curves:
    smoothed = pd.Series(np.load(rewards_path)).rolling(window=200).mean()
    plt.plot(smoothed, label=curve_label)

plt.xlabel("# game")
plt.ylabel("score")
plt.title("Reward history for a single A3C agent (cartpole)")
plt.legend()
plt.show()