# Import Libraries

In [1]:
from keras.layers import Activation, Dense, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras import backend as K

import tensorflow as tf
import numpy as np
import random
import gym

  logger.warn(


# Setup Evironment 

In [23]:
env = gym.make('CartPole-v0')
env._max_episode_steps = 500

  logger.warn(


# Define Experience Class

In [3]:
class Experience_replay(object):
    def __init__(self, max_size, input_shape = 4):
        #define max memory size
        self.mem_size = max_size 
        
        #memory counter
        self.mem_cntr = 0 
        
        #initialise state memory
        self.state_memory = np.zeros((self.mem_size, input_shape), dtype = np.float32)
        
        #initialise new state memory
        self.new_state_memory = np.zeros((self.mem_size, input_shape), dtype = np.float32)
        
        #initialise action memory
        self.action_memory = np.zeros(self.mem_size, dtype = np.int32)
        
        #initialise reward memory
        self.reward_memory = np.zeros(self.mem_size, dtype = np.float32)
        
        #initialise terminal memory (is it done?)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.uint8)
        
    def push(self, state, action, reward, state_, done):
        
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done
        
        self.mem_cntr += 1
        
    def sample(self, batch_size):
        #randamly sample memory in the size of batch_size 
        #replace = False => index cannot be chosen multiple times
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace = False)
        
        #extract batch of memory
        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]
        
        return states, actions, rewards, new_states, dones        


# Define DQN

In [4]:
def DQN(n_inputs = 4, n_hiddens = [128, 64], n_actions = 2, lr = 1e-3):
    
    model = Sequential() 
    model.add(tf.keras.Input(shape = (n_inputs,)))
    model.add(Dense(units = n_hiddens[0], activation = "relu"))
    model.add(Dense(units = n_hiddens[1], activation = "relu"))
    model.add(Dropout(0.2))
    model.add(Dense(units = n_actions))
    
    model.compile(optimizer=Adam(lr=lr), loss='mean_squared_error')
    
    return model

# Define Agent

In [5]:
class Agent():
    def __init__(self, 
                 lr = 1e-3, 
                 eps_start = 1.0, 
                 eps_end = 0.01, 
                 eps_decay = 200, 
                 batch_size = 128, 
                 target_update = 40, 
                 gamma = 0.99,
                 n_actions = 2):
        
        self.experience = Experience_replay(max_size = 100000, input_shape = 4)
        
        self.q_eval   = DQN()
        self.q_target = DQN()
        self.q_target.set_weights(self.q_eval.get_weights())
        
        self.eps_start = eps_start
        self.eps_decay = eps_decay
        self.eps_end = eps_end 
        self.eps_threshold = None
        
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        
        self.action_space = [_ for _ in range(n_actions)]
        self.step = 0
        
        self.training_result = {"reward": [], 
                                "mean reward": []}
        
        self.q_eval_model_file = 'q_eval.h5'
        self.q_target_model_file = 'q_target.h5'
        
    def select_action(self, observation):

        sample = random.random()
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end)*np.exp(-self.step/self.eps_decay)
        
        if sample > self.eps_threshold:
            state = np.array([observation], copy=False, dtype = np.float32)
            actions = self.q_eval.predict(state, verbose = 0)
            action = np.argmax(actions)
        else:
            action = np.random.choice(self.action_space)
        
        return action
    
    def replace_target_network(self):
        if self.target_update != 0 and self.step % self.target_update == 0:
            self.q_target.set_weights(self.q_eval.get_weights())
    
    def learn(self):
        if self.experience.mem_cntr > self.batch_size:
            states, actions, rewards, new_states, dones = self.experience.sample(self.batch_size)
            
            self.replace_target_network()
            
            q_eval = self.q_eval.predict(states, verbose = 0)
            q_next = self.q_target.predict(new_states, verbose = 0)
            q_next[dones] = 0
            
            indices = np.arange(self.batch_size)
            q_target = q_eval[:]
        
            q_target[indices, actions] = rewards + self.gamma*np.max(q_next, axis = 1) 

            self.q_eval.train_on_batch(states, q_target)
             
            self.step += 1
            
    def train(self, n_episodes = 100, n_trial = 500, windows = 10):
        steps = 0
        sliding_reward = []
        mean_reward = 0
        best_reward = -np.inf
        for episode in range(n_episodes):
            c_samples = 0
            rewards   = 0
            
            obs = env.reset()
            for i in range(n_trial):
                #select action
                action = self.select_action(obs)
                
                #interact with the environment
                obs_, reward, done, info = env.step(action)
                
                #if the agent is fail before time limit => reward = -10
                if done and i < env._max_episode_steps - 1: 
                    reward = -10
                
                #store experience
                self.experience.push(obs, action, reward, obs_, done)
                obs = obs_
                
                #learn from experience
                self.learn()
                
                rewards += reward
                
                if done:
                    break   
                
                steps += 1
                c_samples += self.batch_size
            
            sliding_reward.append(rewards)
            mean_reward = np.mean(sliding_reward[-windows:])
            
            if mean_reward > best_reward:
                self.save_models()
                best_reward = mean_reward
                
            self.training_result["reward"].append(rewards)
            self.training_result["mean reward"].append(mean_reward)
            

            print(f'episode: {episode}, rewards: {rewards}, mean reward: {mean_reward}, best reward: {best_reward}, threshold: {self.eps_threshold}')
                
    def save_models(self):
        self.q_eval.save(self.q_eval_model_file)
        self.q_target.save(self.q_target_model_file)
        print("... saving model ...")
        
    def load_models(self):
        self.q_eval = load_model(self.q_eval_model_file)
        self.q_target = load_model(self.q_target_model_file)
        print('... loading models ...')

# Start Learning

In [6]:
agent = Agent()

2023-10-26 19:47:48.396654: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-10-26 19:47:48.398106: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



  super(Adam, self).__init__(name, **kwargs)


In [7]:
agent.train(n_episodes = 200, n_trial = env._max_episode_steps)

... saving model ...
episode: 0, rewards: 16.0, mean reward: 16.0, best reward: 16.0, threshold: 1.0
episode: 1, rewards: 9.0, mean reward: 12.5, best reward: 16.0, threshold: 1.0
episode: 2, rewards: 20.0, mean reward: 15.0, best reward: 16.0, threshold: 1.0
episode: 3, rewards: 3.0, mean reward: 12.0, best reward: 16.0, threshold: 1.0
episode: 4, rewards: 5.0, mean reward: 10.6, best reward: 16.0, threshold: 1.0


2023-10-26 01:54:56.366824: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-10-26 01:54:56.398142: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-10-26 01:54:56.470514: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-10-26 01:54:56.660005: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


episode: 5, rewards: 11.0, mean reward: 10.666666666666666, best reward: 16.0, threshold: 0.9950623544007555


2023-10-26 01:54:57.311042: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


episode: 6, rewards: 8.0, mean reward: 10.285714285714286, best reward: 16.0, threshold: 0.9057890438555999
episode: 7, rewards: 10.0, mean reward: 10.25, best reward: 16.0, threshold: 0.8165008432473004
episode: 8, rewards: 5.0, mean reward: 9.666666666666666, best reward: 16.0, threshold: 0.7544941117761887
episode: 9, rewards: -1.0, mean reward: 8.6, best reward: 16.0, threshold: 0.7181847054890343
episode: 10, rewards: 20.0, mean reward: 9.0, best reward: 16.0, threshold: 0.616500130242572
episode: 11, rewards: 21.0, mean reward: 10.2, best reward: 16.0, threshold: 0.5268253189934059
episode: 12, rewards: 4.0, mean reward: 8.6, best reward: 16.0, threshold: 0.48948132326580884
episode: 13, rewards: 8.0, mean reward: 9.1, best reward: 16.0, threshold: 0.4460273379609393
episode: 14, rewards: 30.0, mean reward: 11.6, best reward: 16.0, threshold: 0.3652085007518921
... saving model ...
episode: 15, rewards: 94.0, mean reward: 19.9, best reward: 19.9, threshold: 0.22012549408847562
..

# Demo

In [32]:
#start playing
agent.eps_threshold = 0.00
agent.step = 1e5
agent.load_models()

s_steps = 0
for i in range(10):
    step = 0
    observation = env.reset()
    done = False
    while not done:
        action = agent.select_action(observation)
        observation_, reward, done, info = env.step(action)
        observation = observation_
        env.render()
    
        step += 1
    s_steps += step
    print(f'game: {i + 1}, step: {step}')
    
print(f"average step: {s_steps/10.}")

... loading models ...


2023-10-26 19:54:46.190658: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


game: 1, step: 500
game: 2, step: 500
game: 3, step: 500
game: 4, step: 164
game: 5, step: 214
game: 6, step: 163
game: 7, step: 164
game: 8, step: 196
game: 9, step: 182
game: 10, step: 500
average step: 308.3


In [21]:
env.close()