In [1]:
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import numpy as np
import gym
import sklearn
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Softmax
from tensorflow.keras.optimizers import Adam

In [3]:
import Box2D

In [None]:
class ReplayBuffer():
    def __init__(self, max_size, input_dims):
        
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_dims),dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims),dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size,dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
        
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[ index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = 1 - int(done)
        self.mem_cntr+= 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self,mem_size)
        batch = np.random.choice(max_mem, batch_size,replace=False)
        
        states = self.state_memory[batch]
        states = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        return states, actions, rewards, states_, terminal

In [None]:
"""
def build_model(input_dims):
    
    n_iterations = 200
    n_episodes_per_update = 15
    n_max_steps = 1000
    discount_rate = 0.95

    keras.backend.clear_session()

    model = keras.models.Sequential([
    keras.layers.Dense(64, activation="relu", input_shape=(input_dims,)),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(10, activation="softmax"),])
    
    model.compile(optimizer=Adam(learning_rate=0.01), loss = 'mse')
    
    return model"""

def build_model(lr,n_actions, input_dims, fc1_dims, fc2_dims):
    
    model = keras.models.Sequential([
        keras.layers.Dense(fc1_dims, activation="relu", input_shape=(input_dims,)),
        keras.layers.Dense(fc2_dims, activation="relu"),
        keras.layers.Dense(n_actions, activation=None),])
    
    model.compile(optimizer=Adam(learning_rate=0.01), loss = 'mean _squared error')
    
    return model

In [None]:
class Agent():
    def __init__(self, lr, gamma,n_actions, epsilon,batch_size,input_dims, epsilon_dec=1e-3, epsilon_end=0.01,
                 mem_size=1000000, fname='lunar_model.h5' ):
        
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.q_eval = build_model(lr, n_actions, input_dims, 64, 64)
        
        def store_transition(self, state, action,reward,new_state, done):
            self.memory.store_transition(state, action, reward, new_state, done)
            
        def choose_action(self, observation):
                if np.random.random()< self.epsilon:
                    action = np.random.choice(self.action_space)
                else:
                    state = np.array([observation])
                    actions = self.q_eval.predict(state)
                    action = np.argmax(actions)
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, states_, dones = \
                self.memory.sample_buffer(self.batch_size)
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)
        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        q_target[batch_index,actions] = rewards +self.gamma * np.max(q_next, axis=1)*dones
        
        self.q_eval.train_on_batch(states, q_target)
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
        
    def save_model(self):
        self.q_eval.save(self.model_file)
        
    def load_model(self):
        self.q_eval = load_model(self.model_file)

In [None]:
if __name__ == '__main__':
    env = gym.make("LunarLander-v2")
    # Set the initial values of the model
    n_games = 200
    agent = Agent(gamma = 0.99, epsilon = 1.0 , lr = 0.01 ,input_dims = 8, 
                  n_actions = env.action_space.n, mem_size = 1000000, batch_size = 64, epsilon_end = 0.01)
    
    #agent.load_model()
    scores = []
    eps_history = []
    
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
            
        eps_history.append(agent.epsilon)
        scores.append(score)

        avg_score = np.mean(scores[-100:])
        print(f"episode ", i ,'score %.2f%' % score, 
              'average score %.2f' % avg_score)
        
       
        
    filename = 'lunarlander.png'
    x = [i+1 for i in range(n_games)]
    plotLearning(x, scores, eps_history, filename)
        
        
            


In [None]:
def plotLearning(x, scores, epsilons, filename, lines=None) :
    
    fig=pit.figure()
    ax=fig.add_subplot(111, label="1" )
    ax2=fig.add_subplot(111,label="2", frame_on=False)
    
    ax.plot(x, epsilons,color="CO")
    ax.set_xlabel("Game",color="CO")
    ax.set_ylabel( "Epsilon", color="CO")
    ax.tick_params(axis="x",colors="CO")
    ax.tick_params(axis='y',colors="CO")
    
    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N) :
        running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])
        
    ax2.scatter(x, running_avg, color="C1" )
    #ax2_xaxis.tick top()
    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    #ax2.set xlabel(*x label 2'color="C1")
    ax2.set_ylabel('Score', color="C1")
    #ax2.xaxis,set_label_position('top
    ax2.yaxis.set_label_position('right')
    #ax2.tick params(axis='x', colors="C1")
    ax2.tick_params(axis='y',colors="Ci")
    if lines is not None:
        for line in lines:
            plt.axvline(x=line)
            
    pit. savefig(filename)

In [5]:
keras.backend.clear_session()
env = gym.make("LunarLander-v2")
n_inputs = env.observation_space.shape[0]

model = keras.models.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=[8,]),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(4, activation="softmax"),
])


### Policy gradient
# - Let the NN play the game several times
# - At each step calculate gradients, but don't apply it
# - After running severl times, compute each action's advantage by
#    aggregating the results with a discount factor (0.9-0.99)
# - If overall action is positive, apply averge gradient
#    if overall action is negative, apply negative average gradient  

def play_one_step(env, obs, model, loss_fn):
    with tf.GradientTape() as tape:
        left_proba = model(obs[np.newaxis]) # calling the model with a single observation (obs has been reshaped to become a batch of size one)
        action = (tf.random.uniform([1, 1]) > left_proba)
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32)
        loss = tf.reduce_mean(loss_fn(y_target, left_proba))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, info = env.step(int(action[0, 0].numpy()))
    return obs, reward, done, grads


def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = []
    all_grads = []
    for episode in range(n_episodes):
        current_rewards = []
        current_grads = []
        obs = env.reset()
        for step in range(n_max_steps):
            obs, reward, done, grads = play_one_step(env, obs, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

def discount_rewards(rewards, discount_rate):
    discounted = np.array(rewards)
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted

def discount_and_normalize_rewards(all_rewards, discount_rate):
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]



# Set the initial values of the model
n_iterations = 200
n_episodes_per_update = 15
n_max_steps = 1000
discount_rate = 0.95

optimizer = keras.optimizers.Adam(learning_rate=0.01)
loss_fn = keras.losses.binary_crossentropy

keras.backend.clear_session()

model = keras.models.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=[8,]),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(4, activation="softmax"),
])



Metal device set to: Apple M1


2021-12-11 16:46:17.773866: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-11 16:46:17.774009: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
if __name__ == '__main__':
    env = gym.make("LunarLander-v2")
    env.seed(42);

    for iteration in range(n_iterations):
        all_rewards, all_grads = play_multiple_episodes(
            env, n_episodes_per_update, n_max_steps, model, loss_fn)
        total_rewards = sum(map(sum, all_rewards))                     
        print("\rIteration: {}, mean rewards: {:.1f}".format(          
            iteration, total_rewards / n_episodes_per_update), end="") 
        all_final_rewards = discount_and_normalize_rewards(all_rewards,
                                                           discount_rate)
        all_mean_grads = []
        for var_index in range(len(model.trainable_variables)):
            mean_grads = tf.reduce_mean(
                [final_reward * all_grads[episode_index][step][var_index]
                 for episode_index, final_rewards in enumerate(all_final_rewards)
                     for step, final_reward in enumerate(final_rewards)], axis=0)
            all_mean_grads.append(mean_grads)
        optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

    env.close()

Iteration: 1, mean rewards: -447.3