# Import Libraries

In [1]:
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential, load_model
from tensorflow.keras.models import Model
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf
import numpy as np
import random
import gym

  logger.warn(


# Setup Environment

In [2]:
# Environment id and per-episode step cap, named so later cells can refer to them.
ENV_ID = 'CartPole-v0'
MAX_EPISODE_STEPS = 500

# Create the environment, then overwrite the `_max_episode_steps` attribute on
# the object returned by gym.make so an episode may last up to 500 steps.
env = gym.make(ENV_ID)
env._max_episode_steps = MAX_EPISODE_STEPS

  logger.warn(


# Define Agent

In [3]:
class Agent(Model):
    """Monte-Carlo policy-gradient agent with a small softmax policy network.

    The agent samples actions from the policy network's output distribution,
    records the log-probability of every sampled action, and at the end of an
    episode minimises ``-sum(log_p_t * G_t)`` where ``G_t`` is the standardised
    discounted return from step ``t``.
    """

    def __init__(self,
                 lr = 1e-3,
                 gamma = 0.99,
                 n_actions = 2,
                 **kwargs):
        """Build the policy network and store the agent's settings.

        Args:
            lr: Accepted for interface compatibility; not read by this class.
                The learning rate actually used is the one carried by the
                optimizer handed to ``compile``.
            gamma: Discount factor applied to future rewards in ``learn``.
            n_actions: Number of discrete actions; sizes both the policy
                network's output layer and ``action_space``.
            **kwargs: Forwarded to the Keras ``Model`` constructor.
        """
        super().__init__(**kwargs)

        # Size the output layer from n_actions so that it always agrees with
        # action_space (previously the network was always built with 2 outputs).
        self.p_net = self.FCN(n_actions = n_actions)

        self.gamma = gamma

        self.action_space = list(range(n_actions))
        self.step = 0

        self.p_net_filename = 'p_net.h5'

    def compile(self, opt, **kwargs):
        """Compile the Keras model and remember ``opt`` for use in ``learn``."""
        super().compile(**kwargs)
        self.opt = opt

    def FCN(self, n_inputs = 4, n_hiddens = [128], n_actions = 2):
        """Create a fully connected softmax policy network.

        Args:
            n_inputs: Size of the observation vector.
            n_hiddens: Hidden layer widths; only the first entry is used.
            n_actions: Number of output units (one probability per action).

        Returns:
            A new ``Sequential`` model: Dense(relu) -> Dropout(0.5) -> Dense(softmax).
        """
        model = Sequential()
        model.add(tf.keras.Input(shape = (n_inputs,)))
        model.add(Dense(units = n_hiddens[0], activation = "relu"))
        model.add(Dropout(0.5))
        model.add(Dense(units = n_actions, activation = "softmax"))

        return model

    def select_action(self, observation, is_training = True):
        """Sample an action from the policy for a single observation.

        Args:
            observation: One environment observation (a 1-D sequence of numbers).
            is_training: Passed on as the ``training`` flag of the network call,
                which controls whether the Dropout layer is active.

        Returns:
            ``(a, log_p)``: the sampled action index and a scalar tensor holding
            the log-probability of that action under the current policy.
        """
        # np.asarray instead of np.array(..., copy=False): under NumPy >= 2 the
        # latter raises whenever a copy is unavoidable, which is always the
        # case when wrapping the observation in a list.
        state = tf.convert_to_tensor(np.asarray([observation], dtype = np.float32))
        p = self.p_net(state, training = bool(is_training))

        # np.random.choice checks that the probabilities sum to 1 with a tight
        # tolerance, so renormalise the float32 softmax output in float64.
        probs = p.numpy()[0].astype(np.float64)
        probs = probs/probs.sum()

        a = np.random.choice(a = len(self.action_space),
                             size = 1,
                             replace = False,
                             p = probs)[0]

        # Keep the log-probability as a tensor so the gradient tape opened by
        # the caller can differentiate through it.
        log_p = tf.math.reduce_sum(tf.math.log(p[:, a]))

        return a, log_p

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return a float32 array with G_t = sum_k gamma**k * rewards[t + k]."""
        returns = np.zeros(len(rewards), dtype = np.float32)
        running = 0.
        # Walk backwards so each return reuses the next one: G_t = r_t + gamma*G_{t+1}.
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma*running
            returns[t] = running
        return returns

    def learn(self, rewards, log_ps, g):
        """Apply one policy-gradient update from a finished episode.

        Args:
            rewards: Per-step rewards of the episode, in order.
            log_ps: Per-step scalar log-probability tensors returned by
                ``select_action``; must have been produced while ``g`` was
                recording.
            g: The ``tf.GradientTape`` that recorded the forward passes.
        """
        # The buffer must be floating point: an integer array would silently
        # truncate every discounted return on assignment.
        d_rewards = self._discounted_returns(rewards, self.gamma)

        d_rewards = tf.convert_to_tensor(d_rewards, dtype = tf.float32)
        # Standardise the returns; the epsilon guards against a zero std.
        d_rewards = (d_rewards - tf.math.reduce_mean(d_rewards))/(tf.math.reduce_std(d_rewards) + 1e-9)

        # Loss: -sum_t log_p_t * G_t
        loss_fcn = -tf.reduce_sum(tf.stack(log_ps)*d_rewards)

        grad = g.gradient(loss_fcn, self.p_net.trainable_variables)
        self.opt.apply_gradients(zip(grad, self.p_net.trainable_variables))

    def train(self, n_episodes = 500, n_trial = 500, windows = 20):
        """Run the training loop against the module-level ``env``.

        The environment is used through the call shapes visible here:
        ``env.reset()`` returning an observation and ``env.step(action)``
        returning ``(obs, reward, done, info)``.

        Args:
            n_episodes: Number of episodes to play.
            n_trial: Maximum number of steps per episode.
            windows: Number of most recent episodes averaged for the running
                score that decides when the model is saved.
        """
        rewards_sum = []
        rewards_avg = []
        best_rewards = -np.inf
        for episode in range(n_episodes):

            obs = env.reset()
            log_ps = []
            rewards = []
            # The whole episode runs inside one tape so that every recorded
            # log-probability can be differentiated in learn().
            with tf.GradientTape() as tape:
                for i in range(n_trial):
                    #select action
                    action, log_p = self.select_action(obs)

                    #interact with the environment
                    obs_, reward, done, info = env.step(action)

                    log_ps.append(log_p)
                    rewards.append(reward)

                    if done or i == n_trial - 1:
                        #learn from experience
                        self.learn(rewards = rewards, log_ps = log_ps, g = tape)

                        rewards_sum.append(np.sum(rewards))
                        rewards_avg.append(np.mean(rewards_sum[-windows:]))

                        # Checkpoint whenever the running average improves.
                        if rewards_avg[-1] > best_rewards:
                            best_rewards = rewards_avg[-1]
                            self.save_models()
                        print(f"episode: {episode}, rewards_sum: {rewards_sum[episode]}, rewards_avg: {rewards_avg[episode]}, best_rewards: {best_rewards}")
                        break
                    obs = obs_

    def save_models(self):
        """Save the policy network to ``self.p_net_filename``."""
        self.p_net.save(self.p_net_filename)
        print("... saving model ...")

    def load_models(self):
        """Replace the policy network with the one stored in ``self.p_net_filename``."""
        self.p_net = load_model(self.p_net_filename)
        print('... loading models ...')

# Start Learning

In [4]:
n_episode = 500
n_trial = env._max_episode_steps

# Inverse-time decay: lr_k = 1e-3 / (1 + decay * k), with k the number of
# optimizer updates. `decay` is chosen so that after n_episode updates the
# learning rate is 0.75 * 1e-3.
decay = (1./0.75 - 1)/n_episode

# Expressed as a schedule rather than the `decay=` keyword of Adam, which the
# current tf.keras optimizers reject; the schedule computes the same rate.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate = 1e-3,
    decay_steps = 1,
    decay_rate = decay)
opt = tf.keras.optimizers.Adam(learning_rate = lr_schedule)

agent = Agent()
agent.compile(opt = opt)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2023-11-08 01:07:11.952654: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-08 01:07:11.952766: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# Summarise the network the agent actually trains; calling agent.FCN() here
# would build a second, unused network just to print it.
agent.p_net.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 128)               640       
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 2)                 258       
                                                                 
Total params: 898
Trainable params: 898
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for n_episode episodes, each capped at n_trial steps.
agent.train(n_episodes=n_episode, n_trial=n_trial)

# Demo

In [7]:
#start playing
agent.load_models()

N_GAMES = 10  # number of evaluation episodes

s_steps = 0
for i in range(N_GAMES):
    step = 0
    observation = env.reset()
    done = False
    while not done:
        # is_training=False switches Dropout off, so the demo evaluates the
        # learned policy itself rather than a randomly thinned network.
        action, _log_p = agent.select_action(observation, is_training = False)
        observation, reward, done, info = env.step(action)
        env.render()

        step += 1
    s_steps += step
    print(f'game: {i + 1}, step: {step}')

print(f"average step: {s_steps/N_GAMES}")

... loading models ...
game: 1, step: 500
game: 2, step: 500
game: 3, step: 500
game: 4, step: 500
game: 5, step: 383
game: 6, step: 500
game: 7, step: 500
game: 8, step: 500
game: 9, step: 500
game: 10, step: 500
average step: 488.3


In [None]:
# Close the environment now that the demo has finished.
env.close()