In [1]:
import tensorflow as tf
import numpy as np
import random
import base64, io, time, gym
import IPython, functools
import matplotlib.pyplot as plt
from copy import copy
from tqdm import tqdm

In [13]:
# Report whether TensorFlow can see at least one GPU.
# tf.test.is_gpu_available() is deprecated; TensorFlow's documentation points
# to tf.config.list_physical_devices("GPU") as the replacement.
len(tf.config.list_physical_devices("GPU")) > 0

InternalError: cudaGetDevice() failed. Status: cudaGetErrorString symbol not found.

In [2]:
# Create the Pong environment (frameskip=5) and read the size of its
# discrete action space before reporting both to the reader.
env = gym.make("Pong-v0", frameskip=5)
n_actions = env.action_space.n
print("Environment has observation space =", env.observation_space)
print("Number of possible actions that the agent can choose from =", n_actions)

Environment has observation space = Box(210, 160, 3)
Number of possible actions that the agent can choose from = 6


In [3]:
#encode observations (vertical position of left paddle, position of ball, vertical position of right paddle)

def obs_preprocess(observation):
    """Reduce a raw Pong frame to four integer features.

    Only the red channel is inspected, cropped to rows 35..193. Objects are
    located by their red intensity:

    * 213 -> left paddle, searched in column 17
    * 92  -> right paddle, searched in column 140
    * 236 -> ball, searched in the whole cropped frame

    Args:
        observation: array indexable as ``observation[:, :, 0]``; the
            environment reports ``Box(210, 160, 3)`` frames.

    Returns:
        ``[lpaddle, rpaddle, xball, yball]`` as plain Python ints.
        ``lpaddle`` / ``rpaddle`` are the first matching row of each paddle
        plus 9. ``xball`` / ``yball`` are the (row, column) of the first ball
        pixel in row-major order, so despite its name ``xball`` is a row
        index. Every object that is not found is reported as 0.
    """
    red = np.asarray(observation)[:, :, 0][35:194, :]

    def first_row(column, value):
        # np.flatnonzero gives a flat index array, so no 1-element array has
        # to be squeezed through int() (deprecated in recent NumPy).
        rows = np.flatnonzero(red[:, column] == value)
        return int(rows[0]) + 9 if rows.size else 0

    lpaddle = first_row(17, 213)
    rpaddle = first_row(140, 92)

    # The ball's red value is 236. The previous code searched for 92, which is
    # the right paddle's colour, and required at least four matching pixels.
    ball_pixels = np.argwhere(red == 236)
    if ball_pixels.size:
        xball, yball = (int(v) for v in ball_pixels[0])
    else:
        xball, yball = 0, 0

    return [lpaddle, rpaddle, xball, yball]

In [4]:
# initialize PPO policy network

class policy_network(tf.keras.Model):
    """Policy head: maps preprocessed observations to one score per action.

    The final layer has no activation, so the outputs are unnormalised
    scores (logits), not probabilities.

    Args:
        n_actions: number of output units. Defaults to 6, the value that was
            previously hard-coded.
    """

    def __init__(self, n_actions=6):
        super(policy_network, self).__init__()

        self.fc1 = tf.keras.layers.Dense(32, activation="relu")
        self.fc2 = tf.keras.layers.Dense(16, activation="relu")
        self.fc3 = tf.keras.layers.Dense(8, activation="relu")
        # Linear output layer: one logit per action.
        self.fc4 = tf.keras.layers.Dense(n_actions)

    def call(self, x):
        """Run ``x`` through the four dense layers and return the logits."""
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        return x

In [5]:
# initialize value function network

class value_network(tf.keras.Model):
    """State-value head: maps preprocessed observations to a single output.

    Three ReLU dense layers (16, 16 and 8 units) feed a final dense layer
    with one unit and no activation.
    """

    def __init__(self):
        super().__init__()

        dense = tf.keras.layers.Dense
        self.fc1 = dense(16, activation="relu")
        self.fc2 = dense(16, activation="relu")
        self.fc3 = dense(8, activation="relu")
        self.fc4 = dense(1)

    def call(self, x):
        """Apply the four layers in order and return the final output."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = layer(x)
        return x

In [6]:
# create choose action function

def choose_action(model, observation):
    """Sample one action from the logits ``model`` produces for ``observation``.

    Args:
        model: callable that maps a float tensor of shape
            ``(batch, features)`` to logits of shape ``(batch, n_actions)``.
        observation: either a single feature vector (list, array or rank-1
            tensor) or an already batched rank-2 array/tensor.

    Returns:
        The value of ``tf.random.categorical(logits, 1)[0]``: the sampled
        action index for the first batch row, as a tensor with one element.
    """
    # Dense layers need a float input with a batch axis; a bare feature list
    # such as the one returned by obs_preprocess() has neither.
    obs = tf.cast(tf.convert_to_tensor(observation), tf.float32)
    if obs.shape.rank == 1:
        obs = tf.expand_dims(obs, axis=0)

    prob_logits = model(obs)
    # Sample according to the probabilities implied by the logits.
    action = tf.random.categorical(prob_logits, 1)[0]

    return action

In [7]:
# create memory for agent

class Memory:
    """Per-episode store of observations, actions and rewards.

    The three lists are kept in step order: entry ``i`` of each list belongs
    to the ``i``-th call of ``add_memory``.
    """

    def __init__(self):
        self.clear()

    def clear(self):
        """Drop everything recorded so far by rebinding three fresh lists."""
        self.observations, self.actions, self.rewards = [], [], []

    def add_memory(self, action, observation, reward):
        """Append one step's action, observation and reward."""
        for store, item in (
            (self.actions, action),
            (self.observations, observation),
            (self.rewards, reward),
        ):
            store.append(item)

In [8]:
# create discount reward function

def normalize(x):
    """Standardise ``x`` to zero mean and unit standard deviation.

    Uses NumPy (the previous version referenced ``torch``, which this
    notebook never imports) and the population standard deviation
    (``np.std`` default).

    Args:
        x: array-like of numbers.

    Returns:
        A new float64 array; ``x`` itself is not modified. An empty input
        yields an empty array, and an input whose values are all equal yields
        all zeros instead of dividing by zero.
    """
    values = np.asarray(x, dtype=np.float64)
    if values.size == 0:
        return values.copy()

    centered = values - values.mean()
    spread = values.std()
    if spread == 0:
        # Constant input: nothing to scale, and dividing would produce NaN.
        return centered
    return centered / spread

def discount_reward(rewards, gamma = 0.99):
    """Compute normalised discounted returns for a sequence of rewards.

    Walks the rewards backwards, accumulating ``R = R * gamma + reward``.
    The running return is reset to 0 whenever a non-zero reward is met, so
    returns do not leak across the step at which a reward was handed out.

    Args:
        rewards: sequence with one reward per time step.
        gamma: decay factor applied per step.

    Returns:
        Whatever ``normalize`` returns for the float64 array of discounted
        returns (one entry per time step).
    """
    # Explicit float dtype: np.zeros_like(rewards) would inherit an integer
    # dtype from integer rewards and truncate every discounted value.
    discounted_rewards = np.zeros(len(rewards), dtype=np.float64)
    R = 0.0

    for t in reversed(range(len(rewards))):
        # A non-zero reward starts a fresh accumulation.
        if rewards[t] != 0:
            R = 0.0
        R = R * gamma + rewards[t]
        discounted_rewards[t] = R

    return normalize(discounted_rewards)

In [9]:
# create an Advantage Function

def advantage(R, observation, value_model):
    """Return ``R`` minus the value ``value_model`` predicts for ``observation``."""
    baseline = value_model(observation)
    return R - baseline

In [10]:
# create objective function

def g(epsilon, A):
    """Scale ``A`` by ``1 + epsilon`` when it is non-negative, else by ``1 - epsilon``."""
    scale = (1+epsilon) if A >= 0 else (1-epsilon)
    return scale*A

def objective_func(policy, policy_old, advantage, action, observation, epsilon=0.2):
    """Clipped surrogate objective for a single (observation, action) pair.

    Both policies are expected to return logits. The previous version divided
    those raw logits by each other; the ratio is now taken between the
    softmax probabilities of ``action`` under the two policies.

    Args:
        policy: callable mapping a ``(batch, features)`` float tensor to
            logits of shape ``(batch, n_actions)``; gradients flow through it.
        policy_old: same kind of callable; treated as a constant.
        advantage: value holding exactly one element (number or tensor).
        action: index of the action taken (int, or array/tensor with one
            element).
        observation: single feature vector or rank-2 batch with one row.
        epsilon: clipping parameter forwarded to ``g``.

    Returns:
        Scalar tensor ``min(ratio * advantage, g(epsilon, advantage))``.
    """
    obs = tf.cast(tf.convert_to_tensor(observation), tf.float32)
    if obs.shape.rank == 1:
        obs = tf.expand_dims(obs, axis=0)
    action_index = int(np.ravel(action)[0])

    # Work in log space and exponentiate the difference: this equals
    # p_new / p_old without dividing two small probabilities.
    log_prob = tf.nn.log_softmax(policy(obs))[0, action_index]
    log_prob_old = tf.stop_gradient(
        tf.nn.log_softmax(policy_old(obs))[0, action_index]
    )
    policy_ratio = tf.exp(log_prob - log_prob_old)

    # Collapse e.g. a (1, 1) value-network output to a scalar so that the
    # comparison inside g() and the product below are plain scalars.
    adv = tf.reshape(tf.cast(advantage, tf.float32), [])
    clip_objective = g(epsilon, adv)
    return tf.minimum(policy_ratio * adv, clip_objective)

In [11]:
# create value loss

def value_loss(value_model, observation, reward):
    """Squared difference between the model's output for ``observation`` and ``reward``."""
    error = value_model(observation) - reward
    return error ** 2

In [12]:
#create loss function for a batch of trajectories

def loss_fn(traj, R_hat, policy_net, policy_old, value_net):
    """Average policy and value losses over a batch of trajectories.

    Losses are averaged over the time steps of each trajectory and then over
    the trajectories. The policy loss is the negated mean objective.

    Args:
        traj: non-empty sequence of objects exposing ``observations`` and
            ``actions`` lists of equal length (e.g. ``Memory``).
        R_hat: one scalar return per trajectory; ``R_hat[k]`` is used for
            every time step of ``traj[k]``.
        policy_net, policy_old: forwarded to ``objective_func``.
        value_net: forwarded to ``advantage`` and ``value_loss``.

    Returns:
        ``(L_policy, L_value)``.

    Raises:
        ValueError: if ``traj`` is empty or contains an empty trajectory.
    """
    if len(traj) == 0:
        raise ValueError("loss_fn needs at least one trajectory")

    def mean(values):
        # Built-in sum() adds the items with their own "+" operator, so tensor
        # losses stay tensors (np.sum would convert them to NumPy values and
        # detach them from any gradient tape).
        return sum(values) / len(values)

    L_traj = []
    L_traj_value = []
    for k in range(len(traj)):
        n_steps = len(traj[k].observations)
        if n_steps == 0:
            raise ValueError("trajectory %d is empty" % k)

        # Plain Python float so it combines with tensors of any float dtype.
        R_k = float(R_hat[k])

        # Compute the loss for each time step in the trajectory.
        L_time = []
        L_time_value = []
        for t in range(n_steps):
            # observation is already the step-t entry; it must not be
            # indexed with t a second time.
            observation = traj[k].observations[t]
            action = traj[k].actions[t]
            adv = advantage(R_k, observation, value_net)

            L_time.append(objective_func(policy_net, policy_old, adv, action, observation, 0.2))
            L_time_value.append(value_loss(value_net, observation, R_k))

        # Average the time step losses.
        L_traj.append(mean(L_time))
        L_traj_value.append(mean(L_time_value))

    # Average the trajectory losses; the objective is maximised, so negate it.
    L_policy = -mean(L_traj)
    L_value = mean(L_traj_value)

    return L_policy, L_value

In [25]:
# training loop

lr_p = 1e-4
lr_v = 1e-4
max_iter = 1
batch_size = 1

policy_net = policy_network()
# policy_old needs its own variables. copy.copy() is shallow, so the "copy"
# shared every layer with policy_net and could never differ from it.
policy_old = policy_network()
value_net = value_network()
policy_optimizer = tf.keras.optimizers.Adam(learning_rate = lr_p)
value_optimizer = tf.keras.optimizers.SGD(learning_rate = lr_v)

# Keras creates a model's weights on its first call. Run one dummy forward
# pass through both policies, then start policy_old as an exact copy.
n_features = len(obs_preprocess(env.reset()))
dummy_state = tf.zeros((1, n_features), dtype=tf.float32)
policy_net(dummy_state)
policy_old(dummy_state)
policy_old.set_weights(policy_net.get_weights())

for i_iter in range(max_iter):

    # ---- collect trajectories ------------------------------------------
    # No gradient tape here: loss_fn re-evaluates the networks itself.
    traj = []    # one Memory per episode
    R_hat = []   # one summed discounted reward per episode

    for i_batch in range(batch_size):
        # Fresh memory per episode; a shared one would merge all episodes.
        memory = Memory()
        observation = env.reset()
        done = False

        while not done:
            # Batched float state of shape (1, n_features) for the Dense layers.
            state = tf.constant([obs_preprocess(observation)], dtype=tf.float32)
            action = choose_action(policy_net, state)
            # env.step() gets a plain Python int, not a tensor.
            action_id = int(np.ravel(action)[0])

            observation, reward, done, info = env.step(action_id)
            memory.add_memory(action_id, state, reward)

        traj.append(memory)
        # NOTE(review): discount_reward() returns zero-mean values, so this
        # sum is ~0 for every episode. Per-step returns are probably what
        # loss_fn should receive - confirm before tuning.
        R_hat.append(float(np.sum(discount_reward(memory.rewards))))

    # ---- forward pass ----------------------------------------------------
    # persistent=True because gradient() is called twice on this tape.
    with tf.GradientTape(persistent=True) as tape:
        L_policy, L_value = loss_fn(traj, R_hat, policy_net, policy_old, value_net)

    # ---- backward pass ---------------------------------------------------
    policy_grads = tape.gradient(L_policy, policy_net.trainable_variables)
    value_grads = tape.gradient(L_value, value_net.trainable_variables)
    del tape  # release the resources held by the persistent tape

    # Snapshot the pre-update policy, as the original loop did before stepping.
    policy_old.set_weights(policy_net.get_weights())

    policy_optimizer.apply_gradients(zip(policy_grads, policy_net.trainable_variables))
    value_optimizer.apply_gradients(zip(value_grads, value_net.trainable_variables))
            


InternalError: cudaGetDevice() failed. Status: cudaGetErrorString symbol not found.