In [1]:
import sys
import numpy as np
import gym
from matplotlib import pyplot as plt
from dqn import DQN
from drqn import DRQN, Memory
from reinforce import REINFORCE, PiApproximationWithNN, Baseline
from reinforce_Buffer import REINFORCE as RF_Buffer, PiApproximationWithNN as Pi_Buffer, ReplayMemory
import warnings
warnings.filterwarnings('ignore')

def test_DQN(env, run):
    gamma = 1.0
    return DQN(env, gamma, 1000, run)


def test_DRQN(env, run):
    gamma = 1.0
    return DRQN(env, gamma, 1000, run)

def test_reinforce(env,runs):
    gamma = 1.
    alpha = 3e-4

    if 'tensorflow' in sys.modules:
        import tensorflow as tf
        tf.reset_default_graph()

    pi = PiApproximationWithNN(
        env.observation_space.shape[0],
        env.action_space.n,
        alpha)

    B = Baseline(0.)

    return REINFORCE(env, gamma, 1000, runs, pi, B)


def test_reinforce_Buffer(env, mem_size, runs):
    gamma = 1.
    alpha = 3e-4

    if 'tensorflow' in sys.modules:
        import tensorflow as tf
        tf.reset_default_graph()

    pi = Pi_Buffer(
        env.observation_space.shape[0],
        env.action_space.n,
        alpha,
        mem_size)

    B = Baseline(0.)

    return RF_Buffer(env, gamma, 1000, runs, pi, B, mem_size)




  from ._conv import register_converters as _register_converters


In [None]:
num_iter = 10
env = gym.make("CartPole-v0")


def _average_over_runs(banner, train_once, n_runs):
    """Repeat a training call and average the first element of its results.

    Args:
        banner: Format string with one ``{}`` slot, printed before each run
            with the run index filled in.
        train_once: Callable taking the run index; its result is indexed
            with ``[0]`` (collected for averaging) and ``[1]`` (kept as the
            most recent policy).
        n_runs: Number of times ``train_once`` is invoked.

    Returns:
        ``(mean_curve, last_policy)`` where ``mean_curve`` is
        ``np.mean(..., axis=0)`` over the collected ``[0]`` elements.
    """
    separator = '***************************************'
    collected = []
    last_policy = None
    for run_index in range(n_runs):
        print(separator)
        print(banner.format(run_index))
        outcome = train_once(run_index)
        collected.append(outcome[0])
        last_policy = outcome[1]
    print(separator)
    return np.mean(collected, axis=0), last_policy


# Plain REINFORCE, averaged over num_iter independent runs.
without_buffer, pi = _average_over_runs(
    "----------------> Without Buffer: {}",
    lambda run_index: test_reinforce(env, run_index),
    num_iter)
# play(env,pi)

# REINFORCE with a replay buffer of size 2 (the size-5 variant is not run here).
with_buffer2, pi_buff = _average_over_runs(
    "----------------> With Buffer = 2: {}",
    lambda run_index: test_reinforce_Buffer(env, 2, run_index),
    num_iter)
    

***************************************
----------------> Without Buffer: 0


In [None]:
# Draw the averaged REINFORCE curves (with and without buffer) on one axis.
fig, ax = plt.subplots()

curves = [
    (without_buffer, 'No Buffer'),
    (with_buffer2, 'Buffer - Size 2'),
    # (with_buffer5, 'Buffer - Size 5'),  # disabled: with_buffer5 is not computed in the cells shown
]
for curve, legend_label in curves:
    # x-axis is simply the position within the averaged curve.
    ax.plot(np.arange(len(curve)), curve, label=legend_label)

ax.set_xlabel('iteration')
ax.set_ylabel('G_0')
ax.legend()

plt.show()


In [None]:
def running_mean(values, window):
    """Return the simple moving average of ``values`` over ``window`` samples.

    Only fully-overlapping windows are kept, so the result has
    ``len(values) - window + 1`` entries (empty when the input is shorter
    than the window).

    NOTE(review): this notebook called ``running_mean`` without defining or
    importing it; the 'valid' moving-average semantics here are an
    assumption -- confirm against the originally intended helper.

    Args:
        values: 1-D sequence of numbers.
        window: Positive integer window length.

    Returns:
        1-D ``numpy.ndarray`` of window means.

    Raises:
        ValueError: If ``window`` is not positive.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")
    values = np.asarray(values, dtype=float)
    if values.size < window:
        # np.convolve would silently swap operands here; return nothing instead.
        return np.empty(0, dtype=float)
    return np.convolve(values, np.ones(window) / window, mode='valid')


# Window length used to smooth the averaged reward curves.
SMOOTHING_WINDOW = 10

# Test DQN: collect rewards and policy from num_iter independent runs.
dqn_list = []
dqn_policies = []
for q in range(num_iter):
    dqn_rew, dqn_pi = test_DQN(env, q)
    dqn_list.append(dqn_rew)
    dqn_policies.append(dqn_pi)
# Average across runs, then smooth for plotting.
dqn_result = np.mean(dqn_list, axis=0)
smoothed_dqn_result = running_mean(dqn_result, SMOOTHING_WINDOW)

# Test DRQN: same protocol as DQN above.
drqn_list = []
drqn_policies = []
for q in range(num_iter):
    drqn_rew, drqn_pi = test_DRQN(env, q)
    drqn_list.append(drqn_rew)
    drqn_policies.append(drqn_pi)
drqn_result = np.mean(drqn_list, axis=0)
smoothed_drqn_result = running_mean(drqn_result, SMOOTHING_WINDOW)

In [None]:
# Overlay the smoothed curves and the faint raw curves for DQN and DRQN.
fig, ax = plt.subplots()

# Kept in the original drawing order so default colours and the legend
# come out exactly as before.
traces = [
    (smoothed_dqn_result, dict(label='DQN_smoothed')),
    (dqn_result, dict(label='DQN', color='red', alpha=0.3)),
    (smoothed_drqn_result, dict(label='DRQN_smoothed')),
    (drqn_result, dict(label='DRQN', color='grey', alpha=0.3)),
]
for series, line_style in traces:
    ax.plot(np.arange(len(series)), series, **line_style)

ax.set_xlabel('iteration')
ax.set_ylabel('G_0')
ax.legend()

plt.show()
