In [306]:
import tensorflow as tf
import numpy as np
import random
import base64, io, time, gym
import IPython, functools
import matplotlib.pyplot as plt
from copy import copy
from tqdm import tqdm

In [307]:
print(tf.config.experimental.list_physical_devices('GPU'))

[]


In [308]:
# setup environment
env = gym.make("Pong-v0", frameskip=5)
print("Environment has observation space =", env.observation_space)
n_actions = env.action_space.n
print("Number of possible actions that the agent can choose from =", n_actions)

Environment has observation space = Box(210, 160, 3)
Number of possible actions that the agent can choose from = 6


In [309]:
#encode observations (vertical position of left paddle, position of ball, vertical position of right paddle)

def obs_preprocess(observation):
    #take in an observation: Box(210, 160,3) *just an array
    # return game state: int left paddle, int xball, int yball, int right paddle
    red = observation[:,:,0]
    red = red[35:194,:]
    #213 is left paddle (?, 17)
    #236 ball (?,?)
    #92 is right paddle (?, 140)
    try:
        lpaddle = int(np.argwhere(red[:,17] == 213)[0]) + 9
    except IndexError:
        lpaddle = 0
    try:
        rpaddle = int(np.argwhere(red[:,140] == 92)[0]) + 9
    except IndexError:
        rpaddle = 0
    
    try:
        xball, yball = np.argwhere(red == 92)[3]
    except IndexError:
        xball, yball = 0,0
    
    return np.expand_dims(np.array([lpaddle, rpaddle, xball, yball], dtype=np.float32), 0)

In [310]:
# initialize PPO policy network

class policy_network(tf.keras.Model):
    #network takes in preprocessed obs and outputs probability of actions
    
    def __init__(self):
        super(policy_network, self).__init__()
        
        self.fc1 = tf.keras.layers.Dense(32, activation="relu")
        self.fc2 = tf.keras.layers.Dense(16, activation="relu")
        self.fc3 = tf.keras.layers.Dense(8, activation="relu")
        self.fc4 = tf.keras.layers.Dense(6)
        self.softmax = tf.keras.layers.Softmax(-1)
    
    def call(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.softmax(self.fc4(x))
        return x

In [311]:
# initialize Value function netowrk

class value_network(tf.keras.Model):
    #network takes in preprocessed obs and value of given state
    
    def __init__(self):
        super(value_network, self).__init__()
        
        self.fc5 = tf.keras.layers.Dense(16, activation="relu")
        self.fc6 = tf.keras.layers.Dense(16, activation="relu")
        self.fc7 = tf.keras.layers.Dense(8, activation="relu")
        self.fc8 = tf.keras.layers.Dense(1)
    
    def call(self, x):
        x = self.fc5(x)
        x = self.fc6(x)
        x = self.fc7(x)
        x = self.fc8(x)
        return x

In [312]:
# create choose action function

def choose_action(model, observation):
    #takes in and model and observation
    #outputs a single action from model output
    
    prob_weights = model.predict(observation)

    action = np.random.choice(range(env.action_space.n), size=1, p=np.squeeze(prob_weights))[0]
    
    return action

In [313]:
# create memory for agent

class Memory:
    def __init__(self):
        self.clear()
        
    def clear(self):
        self.observations = []
        self.actions = []
        self.rewards = []
        
    def add_memory(self, action, observation, reward):
        self.actions.append(action)
        self.observations.append(observation)
        self.rewards.append(reward)

In [322]:
# create discount reward function

def normalize(x):
    x -= tf.math.reduce_mean(x)
    x /= tf.math.reduce_std(x)
    return x

def discount_reward(rewards, gamma = 0.99):
    #input rewards: list of reward at each time step, gamma: decay factor
    
    discounted_rewards = np.zeros_like(rewards)
    R = 0
    
    for t in reversed(range(0, len(rewards))):
        #reset reward when game ends
        if rewards[t] != 0:
            R = 0
        R = R * gamma + rewards[t]
        discounted_rewards[t] = R
        
    return normalize(discounted_rewards)

In [329]:
# create an Advantage Function

def advantage(R, observation, value_model):
    # return list of advantages of an action given observation
    A = R - value_model.predict(observation)
    return A

In [330]:
# create objective function

def objective_func(policy, policy_old, advantage, action, observation, epsilon=0.2):
    advantage = tf.cast(advantage, tf.float32)
    policy_ratio = policy.predict(observation)[0,action]/policy_old.predict(observation)[0,action]
    clip_ratio = tf.clip_by_value(policy_ratio, 1-epsilon, 1+epsilon)
    raw_obj = tf.math.multiply(policy_ratio, advantage)
    clip_obj = tf.math.multiply(clip_ratio, advantage)
    return tf.math.minimum(raw_obj, clip_obj)

In [331]:
# create value loss

def value_loss(value_model, observation, reward):
    reward = tf.cast(reward, tf.float32)
    return tf.math.pow(value_model.predict(observation)-reward, 2)

In [334]:
print(value_net.trainable_variables)

[<tf.Variable 'value_network_79/dense_636/kernel:0' shape=(4, 16) dtype=float32>, <tf.Variable 'value_network_79/dense_636/bias:0' shape=(16,) dtype=float32>, <tf.Variable 'value_network_79/dense_637/kernel:0' shape=(16, 16) dtype=float32>, <tf.Variable 'value_network_79/dense_637/bias:0' shape=(16,) dtype=float32>, <tf.Variable 'value_network_79/dense_638/kernel:0' shape=(16, 8) dtype=float32>, <tf.Variable 'value_network_79/dense_638/bias:0' shape=(8,) dtype=float32>, <tf.Variable 'value_network_79/dense_639/kernel:0' shape=(8, 1) dtype=float32>, <tf.Variable 'value_network_79/dense_639/bias:0' shape=(1,) dtype=float32>]


In [337]:
#create loss function for a batch of trajectories

def training_step(traj, R_hat, policy_net, policy_old, value_net):
    with tf.GradientTape(persistent=True) as tape:
        #get losses
        L_traj = tf.constant([], dtype=tf.float32)
        L_traj_value = tf.constant([], dtype=tf.float32)

        for k in range(len(traj)): 
            #compute loss for each time step in trajectory
            L_time = tf.constant([], dtype=tf.float32)
            L_time_value = tf.constant([], dtype=tf.float32)

            for t in range(len(traj[k].observations)):

                observation = traj[k].observations[t]
                action = traj[k].actions[t]
                adv = advantage(R_hat[k], observation, value_net)
                
                L_t = tf.squeeze(objective_func(policy_net, policy_old, adv, action, observation, epsilon=0.2), 0)
                L_t_value = tf.squeeze(value_loss(value_net, observation, R_hat[k]), 0)

                L_time = tf.concat([L_time, L_t],0)
                L_time_value = tf.concat([L_time_value, L_t_value],0)

            #average time step losses
            L_t_avg = tf.math.reduce_mean(L_time, keepdims=True)
            L_t_avg_value = tf.math.reduce_mean(L_time_value, keepdims=True)
            
            L_traj = tf.concat([L_traj, L_t_avg],0)
            L_traj_value = tf.concat([L_traj_value, L_t_avg_value],0)

        #average trajectory losses
        L_policy = -1*tf.math.reduce_sum(L_traj, keepdims=True)
        L_value = tf.math.reduce_sum(L_traj_value, keepdims=True)
        
    #make backwards pass
    policy_grads = tape.gradient(L_policy, policy_net.trainable_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, policy_net.trainable_variables))
    
    value_grads = tape.gradient(L_value, value_net.trainable_variables)
    value_optimizer.apply_gradients(zip(value_grads, value_net.trainable_variables))

In [338]:
# training loop

lr_p = 1e-4
lr_v = 1e-4
max_iter = 1
batch_size = 1

policy_net = policy_network()
policy_old = copy(policy_net)
value_net = value_network()
policy_optimizer = tf.keras.optimizers.Adam(learning_rate = lr_p)
value_optimizer = tf.keras.optimizers.SGD(learning_rate = lr_v)

for i_iter in range(max_iter):
    
    # collect trajectories and rewards
    # list of memories
    traj = []
    # list of summed discounted rewards from memories
    R_hat = []

    memory = Memory()
    for i_batch in range(batch_size):
        #reset envirnoment
        observation = env.reset()

        while True:
            # get action
            observation = obs_preprocess(observation)
            action = choose_action(policy_net, observation)
            # get next observation
            next_observation, reward, done, info = env.step(action)
            # add experience to memory
            memory.add_memory(action, observation, reward)

            # if play through is over
            if done:
                # append memory to trajectories 
                traj.append(memory)
                R_hat.append(tf.math.reduce_sum(discount_reward(memory.rewards)))
                break
            observation = next_observation
            
    #copy old policy        
    policy_old = copy(policy_net)
    
    #make backwards prop and step
    training_step(traj, R_hat, policy_net, policy_old, value_net)

[None, None, None, None, None, None, None, None]


ValueError: No gradients provided for any variable: ['policy_network_81/dense_648/kernel:0', 'policy_network_81/dense_648/bias:0', 'policy_network_81/dense_649/kernel:0', 'policy_network_81/dense_649/bias:0', 'policy_network_81/dense_650/kernel:0', 'policy_network_81/dense_650/bias:0', 'policy_network_81/dense_651/kernel:0', 'policy_network_81/dense_651/bias:0'].