In [None]:
import gym
import numpy as np
import tensorflow as tf

In [None]:
# Create the CartPole-v1 environment; the learner cells in this notebook
# receive this object through their `game_env` argument.
env = gym.make('CartPole-v1')
# Reset once up front. Because this is the last expression of the cell,
# whatever reset() returns is echoed as the cell's output.
env.reset()

In [None]:
import sys
sys.path.append("..")

from rl_agent import RL_Agent
from rl_learner import PG_Learner
from rl_learner import A3C_Learner

## Defining RL agent

In [None]:
class Cartpole_Agent(RL_Agent):
    """Concrete RL_Agent that supplies the action-selection model for CartPole.

    RL_Agent is treated as an abstract base: this subclass only provides the
    network. The contract (as stated by the notebook author) is that
      1) __init__ creates all of its variables inside the `model_name` scope;
      2) the instance exposes the attributes `self.session`,
         `self.prob_layer` and `self.log_prob_layer`.
    Everything else the PG / TRPO learners need is expected to come from the
    base class.
    """

    def __init__(self, model_name):
        """Build the policy/value graph and initialise its variables.

        Args:
            model_name: name passed to RL_Agent.__init__ and used as the
                TensorFlow variable scope for every variable created here.
        """
        RL_Agent.__init__(self, model_name)
        with tf.variable_scope(model_name):
            self.session = tf.Session()
            self.n_actions = 2

            # States are fed as rows of 4 float32 features.
            self.input_layer = tf.placeholder(shape=[None, 4], dtype=tf.float32)

            # Shared hidden layer: feeds both the policy head and the value head.
            self.dense1_layer = tf.layers.dense(
                self.input_layer,
                units=4,
                use_bias=True,
                activation=tf.nn.relu,
                name="dense1_weights",
            )

            # Policy logits. These must stay linear: a ReLU here clamps every
            # logit to >= 0 and, whenever both pre-activations are negative,
            # pins the policy at uniform with zero gradient through this layer.
            self.dense2_layer = tf.layers.dense(
                self.dense1_layer,
                units=self.n_actions,
                use_bias=True,
                activation=None,
                name="dense2_weights",
            )

            self.prob_layer = tf.nn.softmax(self.dense2_layer)
            # log_softmax is computed directly from the logits, so it cannot
            # produce -inf the way log(softmax(x)) does when a probability
            # underflows to 0.
            self.log_prob_layer = tf.nn.log_softmax(self.dense2_layer)

            # Scalar state-value head on top of the shared hidden layer
            # (linear output, no activation).
            self.state_value = tf.layers.dense(
                self.dense1_layer, units=1, use_bias=True, activation=None
            )

            # Initialise every global variable that exists in the graph at
            # this point.
            self.session.run(tf.global_variables_initializer())


## Policy gradient

In [None]:
# Start from an empty default graph so that re-running this cell builds the
# agent's variables from scratch instead of adding to a previous graph.
tf.reset_default_graph()

# Hyper-parameters are forwarded as-is to PG_Learner; their exact meaning is
# defined in rl_learner (not visible from this notebook).
pg = PG_Learner(rl_agent=Cartpole_Agent("2018_02_02_cartpole_pg"),
                game_env=env,
                discount=0.99,
                batch_size=100,
                frame_cap=None,
                lr=0.01)

import time

# Run 100 learner steps and report the wall-clock time they took.
start_time = time.time()
for i in range(100):
    pg.step()
# Single-argument print(...) emits the same text on Python 2 and Python 3,
# whereas the bare `print "..."` statement is a SyntaxError on Python 3.
print("Used time: {} seconds".format(time.time() - start_time))

In [None]:
# No other cell in this notebook imports `plt`, so import it here; without
# this the cell raises NameError after Restart Kernel -> Run All.
import matplotlib.pyplot as plt

# Plot the reward history recorded on the policy-gradient learner.
plt.plot(pg.reward_history)
plt.show()

## Considering Actor-Critic learner

In [None]:
# Start from an empty default graph so the agent below is built from scratch.
tf.reset_default_graph()

# One agent instance is used as both actor and critic.
rl_agent = Cartpole_Agent("2018_02_02_cartpole_pg")

# Hyper-parameters are forwarded as-is to A3C_Learner; their exact meaning is
# defined in rl_learner (not visible from this notebook).
a3c = A3C_Learner(actor_agent=rl_agent,
                  critic_agent=rl_agent,
                  game_env=env,
                  discount=0.99,
                  batch_size=100,
                  frame_cap=None,
                  actor_lr=0.01,
                  critic_lr=0.0001)

import time

start_time = time.time()
for i in range(100):
    # Diagnostic game played before each learner step: it is only used to
    # report the mean chosen action and the mean critic estimate.
    states, actions, rewards = a3c.play_single_game()
    baselines = a3c.critic_agent.evaluate_states(states)
    mean_action = sum(actions) / float(len(actions))
    mean_baseline = sum(baselines) / float(len(baselines))
    # Format explicitly: `print(a, b)` would print a tuple on Python 2, and
    # `print a, b` is a SyntaxError on Python 3. This yields "a b" on both.
    print("{} {}".format(mean_action, mean_baseline))
    a3c.step()
print("Used time: {} seconds".format(time.time() - start_time))