In [5]:
import numpy as np
import pandas as pd
import random
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pickle

# Initialize OpenAI for Breakout
env = gym.make("Breakout-v0")

# Hyper parameters for policy gradient model
# (Names assigned at module level are already global, so no `global`
# declaration is needed here.)
num_actions = 3 # env.action_space.n # 6 -- the training loop sends action+1 to env.step
discount = 0.99 # Per-step discount applied to rewards in discount_norm
save_every = 10 # Save a checkpoint every this many episodes
train_every = 1 # Run a training step when episode_number is a multiple of this
reward_factor = 1 # Multiplier applied to every raw reward before training
max_episode = 100000 # When to terminate training
# Initial exploration probability. With -1 the linear schedule in the training
# loop stays below zero for almost all of training, so the random-action
# branch effectively never fires and actions are sampled from the policy.
initial_exp = -1
final_exp = 0.01 # Final exploration probability

# Hyper parameters for Network
learning_rate=0.0005
decay_ = 0.95 # RMSProp decay
epsilon_ = 1e-8 # RMSProp epsilon
FC_param = {'FC_1':600} # Hidden layers: layer name -> number of units

# Initialize parameter for policy gradient model
observation = env.reset()
image_old  = None # Previous processed frame; None until the first frame is seen
images, fake_labels, rewards_std, action_hist, reward_hist, reward_runn = [], [], [], [], [], []
reward_episode = 0 # Sum of rewards collected in the current episode

# Build network
def Network(image_, FC_param):
    """Build the fully connected policy network.

    The input is flattened, passed through one bias-free ReLU dense layer per
    entry of ``FC_param`` (layer name -> number of units, in the dict's
    iteration order), and then through a bias-free dense layer named
    'Logits' with ``num_actions`` units.

    Args:
        image_: input tensor handed to ``tf.contrib.layers.flatten``.
        FC_param: dict mapping hidden layer names to their unit counts.

    Returns:
        The softmax of the logits, i.e. one probability per action.
    """

    def _truncated_normal(scale):
        # Fresh zero-mean initializer with stddev 1/sqrt(scale).
        return tf.truncated_normal_initializer(mean=0,
                                               stddev=1./np.sqrt(scale),
                                               dtype=tf.float32)

    hidden = tf.contrib.layers.flatten(image_, scope='Flatten')

    # Stack the hidden layers described by FC_param.
    for layer_name, n_units in FC_param.items():
        hidden = tf.layers.dense(inputs=hidden,
                                 units=n_units,
                                 kernel_initializer=_truncated_normal(5000),
                                 activation=tf.nn.relu,
                                 use_bias=False,
                                 name=layer_name)

    # Linear output layer, one logit per action.
    logits = tf.layers.dense(inputs=hidden,
                             units=num_actions,
                             kernel_initializer=_truncated_normal(500),
                             use_bias=False,
                             name='Logits')

    return tf.nn.softmax(logits, name='SoftMax')

# Get discounted reward and normalize it
def discount_norm(rew):
    """Return the discounted, standardized returns for a reward sequence.

    Args:
        rew: reward tensor with time along axis 0 (the graph feeds a
            ``[None, 1]`` float32 placeholder).

    Returns:
        A tensor of the same shape holding, for every step, the discounted
        sum of that step's reward and all later rewards (using the
        module-level ``discount``), shifted to zero mean and scaled to unit
        variance along axis 0.
    """
    def accumulate(running, reward):
        # Discounted running sum: G_t = discount * G_{t+1} + r_t
        return running * discount + reward

    # Since TF 1.0, tf.reverse takes the indices of the axes to flip rather
    # than a boolean mask; axis 0 is the time axis.
    time_axis = [0]

    # Scan from the last step backwards, then flip back to chronological order.
    rew_reverse = tf.scan(accumulate, tf.reverse(rew, time_axis))
    discounted_rew = tf.reverse(rew_reverse, time_axis)

    mean, variance = tf.nn.moments(discounted_rew, [0])
    discounted_rew -= mean
    # The small constant guards against division by zero when all returns match.
    discounted_rew /= tf.sqrt(variance + 1e-6)

    return discounted_rew

# Process image by cropping and binarizing
def process_obs(obs):
    """Crop, downsample and binarize one frame without modifying it.

    Args:
        obs: array indexable as ``obs[row, col, channel]``; rows 32..195 and
            columns 8..151 of channel 0 are used.

    Returns:
        A new float64 array holding every second row and column of the
        cropped region, with 1.0 where the source pixel is non-zero and 0.0
        elsewhere. A 210x160x3 frame yields an (82, 72) array.
    """
    cropped = obs[32:196, 8:152]
    channel = cropped[::2, ::2, 0]
    # The comparison allocates a fresh array, so the caller's frame is left
    # untouched (assigning into the sliced view would write through to it).
    return (channel != 0).astype(np.float64)

# Calculate running mean for plotting
def get_running_mean(reward_hist):
    """Return the exponentially weighted running mean of ``reward_hist``.

    The average starts at 0 and, for every reward, is updated to
    ``0.99 * average + 0.01 * reward``. The result is a list holding the
    average after each update, so it has the same length as the input.
    """
    smoothed = []
    average = 0
    for reward in reward_hist:
        average = 0.99*average + 0.01*reward
        smoothed.append(average)

    return smoothed

# Build model
# Everything below is created in its own graph `g`, which the training
# session later re-enters with g.as_default().
with tf.Graph().as_default() as g:
    with tf.device("/cpu:0"):
        # Inputs: frame differences, one-hot labels of the actions that were
        # taken, and the raw per-step rewards of the recorded steps.
        image_ = tf.placeholder(dtype=tf.float32, shape=[None, 82, 72,1],name="image")
        fake_label_ = tf.placeholder(dtype=tf.float32, shape=[None, num_actions],name="fake_label")
        reward_ = tf.placeholder(dtype=tf.float32, shape=[None,1], name="reward")
        
        # Get policy gradient
        # (discounted and standardized returns computed from the fed rewards)
        discounted_epr = discount_norm(reward_) 

        tf_aprob = Network(image_,FC_param)
        # Half the summed squared difference between the taken-action labels
        # and the predicted action probabilities.
        loss = tf.nn.l2_loss(fake_label_-tf_aprob) # Define loss
        optimizer = tf.train.RMSPropOptimizer(learning_rate, decay=decay_, epsilon=epsilon_)
        
        # Assign optimizer with artificial gradient
        # The returns are passed as grad_loss, i.e. as the gradient flowing
        # into `loss`, instead of the default gradient of 1.
        # NOTE(review): `loss` is a scalar while discounted_epr is [None, 1];
        # presumably this relies on broadcasting inside the l2_loss gradient
        # to weight each step by its return -- confirm.
        grads = optimizer.compute_gradients(loss, var_list=tf.trainable_variables(), grad_loss=discounted_epr)
        
        train_op = optimizer.apply_gradients(grads)

# Main program
# Plays Breakout episode after episode, records every step, trains the policy
# when an episode ends, and periodically saves a checkpoint plus a reward plot.
with g.as_default(), tf.Session() as sess:

    tf.global_variables_initializer().run()
    saver = tf.train.Saver(tf.global_variables())

    # Load from last session or start over.
    # Both pickles are read before the weights are restored, so a missing or
    # corrupt bookkeeping file can no longer leave restored weights paired
    # with an episode counter that was reset to 0.
    # NOTE(review): pickle.load can execute arbitrary code; only resume from
    # checkpoint files you created yourself.
    try:
        with open('PG-Breakout-ckpt-1/last-episode.p', 'rb') as f:
            episode_number = pickle.load(f)
        print("episode number:", episode_number)
        with open('PG-Breakout-ckpt-1/all_reward.p', 'rb') as f:
            saved_reward_hist = pickle.load(f)
        saver.restore(sess, 'PG-Breakout-ckpt-1/Breakout.ckpt-'+str(episode_number))
        reward_hist = saved_reward_hist
        print('Continue from last saved episode')
    except Exception as err:  # not a bare except, so Ctrl-C still interrupts
        print('Could not resume ({}: {})'.format(type(err).__name__, err))
        print('Training from beginning')
        # A failed restore may have overwritten some variables; start clean.
        tf.global_variables_initializer().run()
        episode_number = 0
        print("episode number:", episode_number)

    # Steps taken in the episode currently being played (for logging only).
    episode_duration = 0

    # Starting playing
    while episode_number <= max_episode:

        # Process observation: the network sees the difference between the
        # current and the previous processed frame (all zeros on the first
        # frame of an episode).
        image_proc = process_obs(observation)
        image_diff = image_proc - image_old if image_old is not None else np.zeros((82, 72))
        image_old = image_proc

        # Ask the policy network for the action probabilities of this frame
        feed_step = {image_: np.reshape(image_diff, (1, 82, 72, 1))}
        aprob = sess.run(tf_aprob, feed_step)[0, :]

        # Epsilon greedy exploration, annealed linearly over max_episode.
        # The built-in max clamps the ratio at 0; np.max(x, 0) would treat the
        # 0 as an axis argument instead.
        ratio = max((max_episode - episode_number)/max_episode, 0)
        exploration_prob = (initial_exp - final_exp) * ratio + final_exp
        if random.random() < exploration_prob:
            action = random.randint(0, num_actions-1)
        else:
            action = np.random.choice(num_actions, p=aprob)

        # Generate fake label for back prop (one-hot of the chosen action)
        label = np.zeros_like(aprob)
        label[action] = 1

        # Input action to OpenAI and get feedback.
        # Policy outputs 0..num_actions-1 are sent to the env as 1..num_actions.
        observation, reward, done, info = env.step(action+1)
        reward_episode += reward
        episode_duration += 1

        # Record for training
        images.append(image_diff)
        fake_labels.append(label)
        rewards_std.append(reward)
        action_hist.append(action+1)

        # Training
        if done:

            if episode_number % train_every == 0:

                feed_episode = {image_: np.array(images).reshape(-1, 82, 72, 1),
                        fake_label_: np.array(fake_labels).reshape(-1,num_actions),
                        reward_: np.array([i * reward_factor for i in rewards_std]).reshape(-1,1)}
                sess.run(train_op, feed_episode)

                # Clear memory
                images, fake_labels, rewards_std, action_hist = [], [], [], []

            # Bookkeeping
            reward_hist.append(reward_episode)
            reward_runn.append(reward_episode)
            episode_number += 1
            observation = env.reset()
            # Forget the last frame of the finished episode so the first
            # frame difference of the new episode does not mix two games.
            image_old = None
            print('\tep {}: reward: {} duration: {}'.format(episode_number, reward_episode,episode_duration))
            reward_episode = 0
            episode_duration = 0

            # Save model
            if episode_number % save_every == 0:
                saver.save(sess, 'PG-Breakout-ckpt-1/Breakout.ckpt', global_step=episode_number)
                print('Model saved, mean reward is',np.mean(reward_runn))
                with open('PG-Breakout-ckpt-1/last-episode.p', 'wb') as f:
                    pickle.dump(episode_number, f)
                with open('PG-Breakout-ckpt-1/all_reward.p', 'wb') as f:
                    pickle.dump(reward_hist, f)
                # NOTE(review): the 55000-episode offset and the reference
                # level 7 are hard-coded; both curves are empty until more
                # than 55000 episodes have been recorded.
                plt.figure(figsize=(18,3))
                plt.plot([7]*(len(reward_hist)-55000),'r--')
                plt.plot(np.array(get_running_mean(reward_hist[55000:episode_number])))
                plt.show()
                # Release the figure so repeated saves do not pile up open figures.
                plt.close()
                reward_runn = []


[2017-12-26 10:05:58,072] Making new env: Breakout-v0


episode number: 310
INFO:tensorflow:Restoring parameters from PG-Breakout-ckpt-1\Breakout.ckpt-310


[2017-12-26 10:05:58,796] Restoring parameters from PG-Breakout-ckpt-1\Breakout.ckpt-310


Training from beginning
episode number: 0
	ep 1: reward: 2.0 duration: 259
	ep 2: reward: 0.0 duration: 180
	ep 3: reward: 0.0 duration: 170


KeyboardInterrupt: 

In [None]:
# Plotting
# Rolling window length (in episodes) used by get_mean_var
interval = 1000
# 20 evenly spaced colours taken from the rainbow colormap
colors = cm.rainbow(np.linspace(0, 1, num=20))

def get_mean_var(reward_hist, window=None):
    """Return the rolling mean and rolling variance of ``reward_hist``.

    Args:
        reward_hist: sequence of per-episode rewards.
        window: rolling window length; when omitted, the module-level
            ``interval`` is used.

    Returns:
        A ``(mean, variance)`` tuple of pandas Series with the same length as
        the input. Positions that do not yet have a full window are NaN.
    """
    if window is None:
        window = interval
    # pd.rolling_mean / pd.rolling_var were removed from pandas; the
    # Series.rolling window object is the supported replacement.
    rolling = pd.Series(reward_hist).rolling(window)
    return rolling.mean(), rolling.var()

running_mean, running_var = get_mean_var(reward_hist)

# Figure title, axis labels and a fixed y-range
plt.title('Breakout result', fontsize=20)
plt.xlabel('Episode')
plt.ylabel('Rewards')
plt.ylim(0, 60)

# Raw rewards, their running mean, and a dashed reference line at 7
raw_rewards = np.array(reward_hist)
smoothed_rewards = np.array(running_mean)
plt.plot(raw_rewards, label='All rewards', c=colors[3])
plt.plot(smoothed_rewards, label='Running mean', c='y')
plt.plot([7]*len(running_mean), 'r--')

# Legend in the upper-right corner, then write the figure to disk and show it
plt.legend(loc=1, fontsize=10)
plt.savefig('PG-Breakout-ckpt-1/ANN1.jpg')
plt.show()