In [67]:
import tensorflow as tf
import numpy as np
import random
import sys
import base64, io, time, gym
import IPython, functools
import matplotlib.pyplot as plt
from copy import copy
from tqdm import trange

In [50]:
# Show which GPUs TensorFlow has detected (an empty list means none were found).
print(tf.config.experimental.list_physical_devices(device_type='GPU'))

[]


In [51]:
# setup environment
# Create the Pong environment (frameskip=5 is forwarded to gym.make) and
# record how many discrete actions it exposes.
env = gym.make("Pong-v0", frameskip=5)
n_actions = env.action_space.n
print(f"Environment has observation space = {env.observation_space}")
print(f"Number of possible actions that the agent can choose from = {n_actions}")

Environment has observation space = Box(210, 160, 3)
Number of possible actions that the agent can choose from = 6


In [52]:
#encode observations (vertical position of left paddle, position of ball, vertical position of right paddle)

def obs_preprocess(observation):
    """Reduce a raw Pong frame to a compact 4-number game state.

    Args:
        observation: image array indexed as [row, column, channel]; only the
            red channel (index 0) of rows 35..193 is inspected.

    Returns:
        float32 array of shape (1, 4) holding
        ``[lpaddle, rpaddle, xball, yball]``. ``xball, yball`` are the
        (row, column) indices of the first ball pixel inside the cropped
        frame, in the order ``np.argwhere`` yields them. Any object that is
        not visible is reported as 0.
    """
    # Red channel only, cropped to rows 35..193.
    red = observation[:, :, 0][35:194, :]

    # Red-channel values used to tell the objects apart:
    #   213 -> left paddle (looked up in column 17)
    #   236 -> ball
    #    92 -> right paddle (looked up in column 140)
    def _paddle_position(column, colour):
        # First matching row plus a fixed offset of 9
        # (presumably the paddle centre -- TODO confirm paddle height).
        rows = np.flatnonzero(red[:, column] == colour)
        return int(rows[0]) + 9 if rows.size else 0

    lpaddle = _paddle_position(17, 213)
    rpaddle = _paddle_position(140, 92)

    # The ball is the 236-valued object. The previous code searched for 92
    # (the right paddle colour) and therefore reported a paddle pixel as the
    # ball position.
    ball_pixels = np.argwhere(red == 236)
    if ball_pixels.shape[0]:
        xball, yball = ball_pixels[0]
    else:
        xball, yball = 0, 0

    return np.expand_dims(np.array([lpaddle, rpaddle, xball, yball], dtype=np.float32), 0)

In [53]:
# initialize PPO policy network

class policy_network(tf.keras.Model):
    """Policy network: maps a preprocessed observation to action probabilities.

    Three ReLU hidden layers (32, 16, 8 units) followed by a linear layer with
    one unit per action and a softmax over the last axis.

    Args:
        n_actions: number of discrete actions, i.e. the width of the output
            layer. Defaults to 6, the value that was previously hard-coded.
    """

    def __init__(self, n_actions=6):
        super().__init__()

        self.fc1 = tf.keras.layers.Dense(32, activation="relu")
        self.fc2 = tf.keras.layers.Dense(16, activation="relu")
        self.fc3 = tf.keras.layers.Dense(8, activation="relu")
        # Output width follows the action count instead of a fixed literal.
        self.fc4 = tf.keras.layers.Dense(n_actions)
        self.softmax = tf.keras.layers.Softmax(-1)

    def call(self, x):
        """Return the softmax action probabilities for input batch ``x``."""
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.softmax(self.fc4(x))
        return x

In [54]:
# initialize value function network

class value_network(tf.keras.Model):
    """Value network: maps a preprocessed observation to one scalar output.

    Three ReLU hidden layers (16, 16, 8 units) followed by a single linear unit.
    """

    def __init__(self):
        super().__init__()

        relu = "relu"
        self.fc5 = tf.keras.layers.Dense(16, activation=relu)
        self.fc6 = tf.keras.layers.Dense(16, activation=relu)
        self.fc7 = tf.keras.layers.Dense(8, activation=relu)
        self.fc8 = tf.keras.layers.Dense(1)

    def call(self, x):
        """Run ``x`` through the four dense layers in order."""
        hidden = x
        for layer in (self.fc5, self.fc6, self.fc7, self.fc8):
            hidden = layer(hidden)
        return hidden

In [55]:
# create choose action function

def choose_action(model, observation):
    """Sample one action index from the policy's output distribution.

    Args:
        model: callable returning action probabilities for ``observation``
            (e.g. a Keras model); the result is flattened to one vector.
        observation: input forwarded unchanged to ``model``.

    Returns:
        The sampled action index, in ``[0, number of probabilities)``.
    """
    # Call the model directly: for a single small input this avoids the
    # per-call overhead of model.predict, which dominates the rollout time.
    prob_weights = np.asarray(model(observation), dtype=np.float64).reshape(-1)

    # float32 softmax outputs may miss summing to 1 by more than
    # np.random.choice tolerates, so renormalise in float64.
    prob_weights = prob_weights / prob_weights.sum()

    # The action count comes from the distribution itself, not a global env.
    action = np.random.choice(range(prob_weights.size), size=1, p=prob_weights)[0]

    return action

In [56]:
# create memory for agent

class Memory:
    """Rollout buffer keeping parallel lists of observations, actions and rewards."""

    def __init__(self):
        # A new buffer starts out empty.
        self.clear()

    def clear(self):
        """Discard everything stored so far by rebinding fresh empty lists."""
        self.observations, self.actions, self.rewards = [], [], []

    def add_memory(self, action, observation, reward):
        """Append one step's action, observation and reward to their lists."""
        step = (
            (self.actions, action),
            (self.observations, observation),
            (self.rewards, reward),
        )
        for buffer, item in step:
            buffer.append(item)

In [57]:
# create discount reward function

def normalize(x):
    """Return a zero-mean, unit-variance copy of ``x``.

    The input is never modified. Floating-point inputs keep their dtype, any
    other input is converted to float64. An empty input is returned as an
    empty array, and an input with zero variance is only centred, so no
    NaN/inf is produced by dividing by zero.
    """
    values = np.asarray(x)
    out_dtype = values.dtype if np.issubdtype(values.dtype, np.floating) else np.float64
    values = values.astype(out_dtype)  # astype copies, so the caller's data is untouched
    if values.size == 0:
        return values
    values -= values.mean()
    std = values.std()  # population std, same definition as tf.math.reduce_std
    if std > 0:
        values /= std
    return values

def discount_reward(rewards, gamma = 0.99):
    """Compute normalised discounted returns for a sequence of step rewards.

    Args:
        rewards: sequence with the reward received at each time step.
        gamma: decay factor applied per step.

    Returns:
        float64 array with one normalised discounted return per time step.
    """
    # Always accumulate in floating point: np.zeros_like(rewards) would
    # inherit an integer dtype from integer rewards and truncate the returns.
    discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
    R = 0.0

    for t in reversed(range(len(rewards))):
        # A non-zero reward restarts the running return at that step
        # (the original comment describes this as the game ending).
        if rewards[t] != 0:
            R = 0.0
        R = R * gamma + rewards[t]
        discounted_rewards[t] = R

    return normalize(discounted_rewards)

In [58]:
# create an Advantage Function

def advantage(R, observation, value_model):
    """Return ``R`` minus ``value_model.predict(observation)``."""
    baseline = value_model.predict(observation)
    return R - baseline

In [59]:
# create objective function

@tf.function
def objective_func(policy_logits, policy_old, advantage, action, observation, epsilon=0.2):
    """Clipped surrogate objective for one (observation, action) sample.

    Args:
        policy_logits: output of the current policy for ``observation``; the
            value for ``action`` is used as its probability.
        policy_old: callable giving the old policy's output for ``observation``.
        advantage: advantage estimate, cast to float32.
        action: index of the action that was taken.
        observation: input passed to ``policy_old``.
        epsilon: the ratio is clipped to ``[1 - epsilon, 1 + epsilon]``.

    Returns:
        Element-wise minimum of ``ratio * advantage`` and
        ``clipped_ratio * advantage``.
    """
    advantage = tf.cast(advantage, tf.float32)
    # Take the action count from the policy output rather than a global env.
    n_actions = policy_logits.shape[-1]
    hot_action = tf.squeeze(tf.one_hot([action], n_actions))
    policy_logits = tf.squeeze(policy_logits)
    new_prob = tf.tensordot(policy_logits, hot_action, 1)
    # The old policy is a constant in the surrogate objective. Without
    # stop_gradient, an old policy that shares weights with the current one
    # makes the ratio identically 1 with zero gradient, so nothing is learned.
    old_prob = tf.stop_gradient(tf.tensordot(policy_old(observation), hot_action, 1))
    policy_ratio = new_prob / old_prob
    clip_ratio = tf.clip_by_value(policy_ratio, 1-epsilon, 1+epsilon)
    raw_obj = tf.math.multiply(policy_ratio, advantage)
    clip_obj = tf.math.multiply(clip_ratio, advantage)
    return tf.math.minimum(raw_obj, clip_obj)

In [60]:
# create value loss

@tf.function
def value_loss(value_logits, reward):
    """Squared difference between ``value_logits`` and ``reward`` (cast to float32)."""
    target = tf.cast(reward, tf.float32)
    residual = value_logits - target
    return tf.math.pow(residual, 2)

In [61]:
# create a function that sums a list of per-sample gradients

def add_grads(grads):
    """Sum a list of per-sample gradient lists variable by variable.

    Args:
        grads: sequence of length n; each entry is a list with one gradient
            tensor per trainable variable, in the same order for every entry.

    Returns:
        A list with one summed tensor per variable (empty if ``grads`` is empty).
    """
    # zip(*grads) regroups (n, variables) into (variables, n) in plain Python.
    # Going through np.asarray fails on recent NumPy because the per-variable
    # tensors have different shapes (a ragged nested sequence).
    return [tf.math.add_n(list(per_variable)) for per_variable in zip(*grads)]

In [62]:
#create loss function for a batch of trajectories

def get_grads(R_hat, policy_net, policy_old, value_net, observations, actions):
    """Average policy and value gradients over the steps of one trajectory.

    Args:
        R_hat: return target used both for the advantage and the value loss.
        policy_net: current policy model (gradients are taken w.r.t. its
            trainable variables).
        policy_old: old policy, forwarded to ``objective_func``.
        value_net: value model (gradients are taken w.r.t. its trainable
            variables).
        observations: sequence of preprocessed observations.
        actions: sequence of the actions taken, aligned with ``observations``.

    Returns:
        ``(policy_grad, value_grad)``: two lists with one averaged gradient
        per trainable variable. Both are *loss* gradients, i.e. suitable for
        ``optimizer.apply_gradients``, which performs descent.
    """
    #create grad storage
    policy_grads = []
    value_grads = []

    for observation, action in zip(observations, actions):
        # The advantage is a constant for the policy update, so compute it
        # before the tape starts recording.
        adv = advantage(R_hat, observation, value_net)

        with tf.GradientTape(persistent=True) as tape:
            #get network outputs
            policy_logits = policy_net(observation)
            value_logits = value_net(observation)
            #compute losses
            objective = tf.squeeze(objective_func(policy_logits, policy_old, adv, action, observation, epsilon=0.2), 0)
            # The surrogate objective is to be maximised while optimizers
            # minimise, so the policy loss is its negation.
            L = -objective
            L_value = tf.squeeze(value_loss(value_logits, R_hat), 0)

        #compute and store gradients
        policy_grads.append(tape.gradient(L, policy_net.trainable_variables))
        value_grads.append(tape.gradient(L_value, value_net.trainable_variables))
        # A persistent tape keeps its resources until it is dropped.
        del tape

    n_steps = len(policy_grads)
    if n_steps == 0:
        # Nothing recorded: no gradients to average.
        return [], []

    policy_grad = [x / n_steps for x in add_grads(policy_grads)]
    value_grad = [x / n_steps for x in add_grads(value_grads)]

    return policy_grad, value_grad


In [68]:
# training loop

lr_p = 1e-4
lr_v = 1e-4
max_iter = 10
batch_size = 5

policy_net = policy_network()
# copy.copy() on a Keras model is shallow: the "copy" would share its layers
# and weights with policy_net. Use a separate instance and copy the weights.
policy_old = policy_network()
value_net = value_network()

# Subclassed models only create their weights on the first call, so run both
# policies once on a dummy state (obs_preprocess yields shape (1, 4)) before
# copying weights between them.
dummy_state = np.zeros((1, 4), dtype=np.float32)
policy_net(dummy_state)
policy_old(dummy_state)
policy_old.set_weights(policy_net.get_weights())

policy_optimizer = tf.keras.optimizers.Adam(learning_rate = lr_p)
value_optimizer = tf.keras.optimizers.SGD(learning_rate = lr_v)
memory = Memory()

for i_iter in trange(max_iter, desc="Training Iteration"):
    # One gradient list per episode, collected over the whole batch.
    # (Previously these were reset inside the episode loop and never filled.)
    policy_grads = []
    value_grads = []

    # The old policy is the one that collects this iteration's trajectories.
    policy_old.set_weights(policy_net.get_weights())

    for i_batch in trange(batch_size, desc="Batch Iteration"):
        #reset environment
        observation = env.reset()
        memory.clear()

        while True:
            # get action
            observation = obs_preprocess(observation)
            action = choose_action(policy_net, observation)
            # get next observation
            next_observation, reward, done, info = env.step(action)
            # add experience to memory
            memory.add_memory(action, observation, reward)

            # if play through is over
            if done:
                #compute discounted reward of run
                # NOTE(review): discount_reward returns zero-mean normalised
                # returns, so this sum is ~0 for every episode; a per-step
                # return target is probably intended -- confirm and rework
                # together with get_grads.
                R_hat = tf.math.reduce_sum(discount_reward(memory.rewards))
                #make forward pass and get this episode's averaged gradients
                policy_grad, value_grad = get_grads(R_hat, policy_net, policy_old, value_net, memory.observations, memory.actions)
                policy_grads.append(policy_grad)
                value_grads.append(value_grad)
                #clear memory
                memory.clear()
                break

            observation = next_observation

    #sum and average grads of trajectories
    policy_grad = [x / len(policy_grads) for x in add_grads(policy_grads)]
    value_grad = [x / len(value_grads) for x in add_grads(value_grads)]

    #make backwards prop and step (apply the averaged gradients, not the raw lists)
    policy_optimizer.apply_gradients(zip(policy_grad, policy_net.trainable_variables))
    value_optimizer.apply_gradients(zip(value_grad, value_net.trainable_variables))








Training Iteration:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A[A[A[A







Batch Iteration:   0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







Batch Iteration:  20%|██        | 1/5 [02:44<10:58, 164.62s/it][A[A[A[A[A[A[A[A







Batch Iteration:  40%|████      | 2/5 [03:12<06:10, 123.53s/it][A[A[A[A[A[A[A[A







Batch Iteration:  60%|██████    | 3/5 [05:29<04:15, 127.69s/it][A[A[A[A[A[A[A[A







Batch Iteration:  80%|████████  | 4/5 [06:00<01:38, 98.53s/it] [A[A[A[A[A[A[A[A

KeyboardInterrupt: 