<a href="https://colab.research.google.com/github/prikmm/MLprojects/blob/main/LunarLander_v2_RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym
!pip install tf-agents
!pip install 'gym[atari]'
!pip install gym[box2d]

Collecting tf-agents
[?25l  Downloading https://files.pythonhosted.org/packages/e7/cd/a0710b1caae042b7a4d54fc74073fb4df7adf073934798443bdc0059813a/tf_agents-0.7.1-py3-none-any.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 6.3MB/s 
Installing collected packages: tf-agents
Successfully installed tf-agents-0.7.1
Collecting box2d-py~=2.3.5; extra == "box2d"
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 6.7MB/s 
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [2]:
import tensorflow as tf
from tensorflow import keras
from tf_agents.environments import suite_gym
import matplotlib.pyplot as plt
import numpy as np
import gym

In [3]:
# Create the Gym "LunarLander-v2" environment; the bare `env` expression
# makes the cell display its repr.
env = gym.make("LunarLander-v2")
env

<TimeLimit<LunarLander<LunarLander-v2>>>

In [4]:
# Show the observation space and the length of its first (only) dimension.
print(env.observation_space)
print(env.observation_space.shape[0])

Box(-inf, inf, (8,), float32)
8


In [5]:
# Show the action space and its `n` attribute (the number of discrete actions).
print(env.action_space)
print(env.action_space.n)

Discrete(4)
4


In [6]:
# Reset the environment and display what reset() returned (an 8-element
# float32 array in the recorded output).
initial_timestep = env.reset()
initial_timestep

array([-0.00533285,  1.4059075 , -0.5401753 , -0.2227978 ,  0.00618622,
        0.12235782,  0.        ,  0.        ], dtype=float32)

# Creating a simple Policy:

In [7]:
# Start from a clean Keras session and fix the NumPy / TensorFlow seeds
# before any weights are created.
K = keras.backend
K.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# Network input/output sizes are read from the environment's spaces.
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.n

# Policy network: two 32-unit ReLU hidden layers followed by a softmax layer
# with one unit per action.
simple_policy_model = keras.models.Sequential([
    keras.layers.Dense(32, activation="relu", input_shape=[n_inputs]),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(n_outputs, activation="softmax"),
])

In [8]:
def lander_play_one_step(env, obs, model, loss_fn):
    """Sample one action from `model`, apply it to `env`, and return gradients.

    Args:
        env: Environment whose `step(action)` returns a 4-tuple
            `(obs, reward, done, info)`.
        obs: Current observation; a leading batch axis is added before it is
            fed to `model`.
        model: Callable returning action probabilities for a batch of
            observations and exposing `trainable_variables`.
        loss_fn: Called as `loss_fn(action, probas)`; its mean is the loss
            that is differentiated.

    Returns:
        Tuple `(obs, reward, done, grads)`: the next observation, the reward
        and done flag reported by `env.step`, and the gradients of the loss
        with respect to `model.trainable_variables`.
    """
    with tf.GradientTape() as tape:
        # Use the `model` argument rather than a notebook-level global so the
        # forward pass and the gradients below refer to the same network.
        probas = model(obs[np.newaxis])
        # tf.random.categorical expects log-probabilities; epsilon avoids log(0).
        logits = tf.math.log(probas + keras.backend.epsilon())
        action = tf.random.categorical(logits, num_samples=1)
        # Treat the sampled action as the target label.
        loss = tf.reduce_mean(loss_fn(action, probas))
    grads = tape.gradient(loss, model.trainable_variables)
    obs, reward, done, _ = env.step(action[0, 0].numpy())
    return obs, reward, done, grads


def lander_play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    """Play `n_episodes` episodes and collect per-step rewards and gradients.

    Each episode starts with `env.reset()` and runs until the environment
    reports done or `n_max_steps` steps have been taken, whichever is first.

    Returns:
        Tuple `(all_rewards, all_grads)`: two lists with one inner list per
        episode, holding the reward and the gradients of every step played.
    """
    rewards_per_episode, grads_per_episode = [], []
    for _ in range(n_episodes):
        episode_rewards, episode_grads = [], []
        observation = env.reset()
        steps_taken = 0
        finished = False
        while steps_taken < n_max_steps and not finished:
            observation, step_reward, finished, step_grads = lander_play_one_step(
                env, observation, model, loss_fn)
            episode_rewards.append(step_reward)
            episode_grads.append(step_grads)
            steps_taken += 1

        rewards_per_episode.append(episode_rewards)
        grads_per_episode.append(episode_grads)
    return rewards_per_episode, grads_per_episode

In [9]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted sum of future rewards for every step.

    Element `i` of the result equals
    `rewards[i] + discount_rate * result[i + 1]`; the last element is the
    last reward unchanged.

    Args:
        rewards: Sequence of per-step rewards for one episode.
        discount_rate: Multiplier applied to the following step's value.

    Returns:
        A float NumPy array with the same length as `rewards`.
    """
    # Force a float array: an all-integer reward list would otherwise create
    # an integer array, and each discounted sum would be truncated when it is
    # written back.
    discounted = np.array(rewards, dtype=float)
    # Walk backwards from the second-to-last step so each entry can reuse the
    # already-discounted value of its successor.
    for step in range(len(rewards) - 2, -1, -1):
        discounted[step] += discounted[step + 1] * discount_rate
    return discounted


def discounted_and_normalize_rewards(all_rewards, discount_rate):
    """Discount every episode's rewards, then standardize them jointly.

    Args:
        all_rewards: List with one sequence of per-step rewards per episode.
        discount_rate: Discount factor forwarded to `discount_rewards`.

    Returns:
        A list with one array per episode: the discounted rewards minus the
        mean over all steps of all episodes, divided by the standard
        deviation over the same values.
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate)
                              for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # Every discounted reward is identical, so the centred values are all
        # zero; dividing by 1 keeps them at 0 instead of producing 0/0 = NaN.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]

In [10]:
n_iterations = 200  # number of policy updates performed by the training loop
n_episodes_per_update = 16  # episodes played before each update
n_max_steps = 1000  # upper bound on the steps played per episode
discount_rate = 0.99  # factor used when discounting future rewards

In [11]:
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
optimizer = keras.optimizers.Nadam(learning_rate=0.005)
# Called with the sampled action as the label and the policy's probabilities.
loss_fn = keras.losses.sparse_categorical_crossentropy

In [12]:
# Seed the environment before training.
env.seed(42)

# One entry per iteration: the mean total reward of that iteration's episodes.
mean_rewards = []

for iteration in range(n_iterations):
    # Play a batch of episodes with the current policy, keeping every step's
    # reward and gradients.
    all_rewards , all_grads = lander_play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, simple_policy_model, loss_fn)
    # Total reward of each episode, averaged over the episodes of this batch.
    mean_reward = sum(map(sum, all_rewards)) / n_episodes_per_update
    # "\r" rewrites the same output line on every iteration.
    print("\rIteration: {}/{}, mean reward:{:.1f} ".format(
        iteration + 1, n_iterations, mean_reward), end="")
    mean_rewards.append(mean_reward)
    all_final_rewards = discounted_and_normalize_rewards(all_rewards,
                                                       discount_rate)
    all_mean_grads = []
    # For each trainable variable, weight every step's gradient by that step's
    # normalized discounted reward and average over all steps of all episodes.
    for var_index in range(len(simple_policy_model.trainable_variables)):
        mean_grads = tf.reduce_mean(
            [final_reward * all_grads[episode_index][step][var_index]
             for episode_index, final_rewards in enumerate(all_final_rewards)
                for step, final_reward in enumerate(final_rewards)], axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, simple_policy_model.trainable_variables))

Iteration: 1/200, mean reward:-183.1 

KeyboardInterrupt: ignored

In [None]:
# mean_rewards holds one value per training iteration (each is the mean over
# that iteration's episodes), so the x-axis counts iterations, not episodes.
plt.plot(mean_rewards)
plt.xlabel("Iteration")
plt.ylabel("Mean reward")
plt.grid()
plt.show()

# Using TF-Agents:

In [28]:
!pip install gym
!pip install tf-agents
!pip install 'gym[atari]'
!pip install gym[box2d]
!pip install pyvirtualdisplay

Collecting pyvirtualdisplay
  Downloading https://files.pythonhosted.org/packages/19/88/7a198a5ee3baa3d547f5a49574cd8c3913b216f5276b690b028f89ffb325/PyVirtualDisplay-2.1-py3-none-any.whl
Collecting EasyProcess
  Downloading https://files.pythonhosted.org/packages/48/3c/75573613641c90c6d094059ac28adb748560d99bd27ee6f80cce398f404e/EasyProcess-0.3-py2.py3-none-any.whl
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.3 pyvirtualdisplay-2.1


In [31]:
import tensorflow as tf
from tensorflow import keras
from tf_agents.environments import suite_gym
import matplotlib.pyplot as plt
import numpy as np
import gym
import PIL

import matplotlib as mpl
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

import pyvirtualdisplay
# Create and start a 1400x900 off-screen display for rendering.
# NOTE(review): the recorded run of this cell ended with EasyProcessError;
# presumably the Xvfb binary was not installed — confirm.
_display = pyvirtualdisplay.Display(visible=False,  # use False with Xvfb
                                    size=(1400, 900))
_ = _display.start()

import warnings
warnings.filterwarnings('ignore')

EasyProcessError: ignored

In [3]:
# Load LunarLander-v2 through TF-Agents' gym suite; this rebinds `env`, which
# earlier referred to the plain Gym environment.
env = suite_gym.load("LunarLander-v2")
env

<tf_agents.environments.wrappers.TimeLimit at 0x7fa35fe9e950>

In [4]:
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# Wrap the Python environment in a TFPyEnvironment and display its repr.
tf_env = TFPyEnvironment(env)
tf_env

<tf_agents.environments.tf_py_environment.TFPyEnvironment at 0x7fa35f9eb210>

In [5]:
# Print the observation, action and time-step specs of the wrapped environment.
print(tf_env.observation_spec())
print(tf_env.action_spec())
print(tf_env.time_step_spec())

BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32))
BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(3))
TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))


In [6]:
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
from tf_agents.networks.value_network import ValueNetwork

# Both networks use the same fully connected layer sizes;
# `actor_fc_layer_params` is an alias of the same list object.
fc_layer_params = [32, 32]
actor_fc_layer_params = fc_layer_params

# Actor network built from the environment's observation and action specs.
actor_net = ActorDistributionNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    fc_layer_params=actor_fc_layer_params,
)

# Value network built from the observation spec only.
value_net = ValueNetwork(
    tf_env.observation_spec(),
    fc_layer_params=fc_layer_params,
)

In [7]:
from tf_agents.agents import PPOAgent

# NOTE(review): `train_step` is not passed to the agent or used anywhere else
# in the visible notebook — presumably meant as the agent's train step
# counter; confirm before wiring it in.
train_step = tf.Variable(0)
# Number of environment steps the collect driver runs per training iteration.
update_period = 4
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias.
actor_optimizer = keras.optimizers.Nadam(learning_rate=2.5e-4)

# PPO agent built from the environment specs and the actor / value networks.
agent = PPOAgent(
    time_step_spec=tf_env.time_step_spec(),
    action_spec=tf_env.action_spec(),
    optimizer=actor_optimizer,
    actor_net=actor_net,
    value_net=value_net,
)
agent.initialize()

In [8]:
# Display the agent's collect data spec (the replay buffer below is created
# with this same spec).
agent.collect_data_spec

_TupleWrapper(Trajectory(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), observation=BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)), action=BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0), maximum=array(3)), policy_info=DictWrapper({'dist_params': DictWrapper({'logits': TensorSpec(shape=(4,), dtype=tf.float32, name='CategoricalProjectionNetwork_logits')})}), next_step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32))))

In [9]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Replay buffer for collected trajectories; its data spec and batch size come
# from the agent and the environment, with max_length set to 1000.
transitions_storage = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000,
)

In [10]:
# Keep a reference to the buffer's add_batch method so it can be passed to a
# driver as an observer.
transitions_storage_observer = transitions_storage.add_batch

In [11]:
from tf_agents.metrics import tf_metrics

# Metric objects that are registered as observers on the collect driver.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

In [12]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Driver that runs the agent's collect policy in tf_env for `update_period`
# steps per call, sending each result to the replay-buffer observer and to
# the training metrics.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[transitions_storage_observer] + train_metrics,
    num_steps=update_period)

In [13]:
# Dataset view of the replay buffer: each element is a batch of 2-step
# trajectories; prefetch(1) keeps one element ready in advance.
transitions_dataset = transitions_storage.as_dataset(
                                sample_batch_size=tf_env.batch_size,
                                num_steps=2).prefetch(1)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [14]:
# Display the dataset's repr (element shapes and dtypes).
transitions_dataset

<PrefetchDataset shapes: (Trajectory(step_type=(1, 2), observation=(1, 2, 8), action=(1, 2), policy_info=DictWrapper({dist_params: DictWrapper({logits: (1, 2, 4)})}), next_step_type=(1, 2), reward=(1, 2), discount=(1, 2)), BufferInfo(ids=(1, 2), probabilities=(1,))), types: (Trajectory(step_type=tf.int32, observation=tf.float32, action=tf.int64, policy_info=DictWrapper({dist_params: DictWrapper({logits: tf.float32})}), next_step_type=tf.int32, reward=tf.float32, discount=tf.float32), BufferInfo(ids=tf.int64, probabilities=tf.float32))>

In [15]:
from tf_agents.utils.common import function

# Replace the driver's run and the agent's train with versions wrapped by
# tf_agents.utils.common.function.
# NOTE(review): presumably this compiles them into TensorFlow graph functions
# for speed — confirm against the TF-Agents documentation.
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

def train_agent(n_iterations):
    """Alternate between collecting experience and training the agent.

    Each iteration runs `collect_driver` once, takes the next batch from
    `transitions_dataset`, trains `agent` on it and prints the loss. The
    driver, dataset, agent and `tf_env` are notebook-level objects.

    Args:
        n_iterations: Number of collect-and-train iterations to run.
    """
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    dataset_iterator = iter(transitions_dataset)
    for iteration in range(n_iterations):
        # Carry the time step and policy state over so that collection
        # continues where the previous iteration stopped.
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(dataset_iterator)
        train_loss = agent.train(trajectories)
        # The leading "\r" returns to the start of the line so each update
        # overwrites the previous one; without it, `end=""` glues every
        # message into one unreadable line.
        print("\r{} loss:{:.5f}".format(
            iteration, train_loss.loss.numpy()), end="")

In [16]:
# Run 200 collect-and-train iterations.
train_agent(200)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
0 loss:24.707621 loss:5.563912 loss:-0.003973 loss:3.521964 loss:-0.046575 loss:0.477976 loss:0.361607 loss:0.638288 loss:0.313039 loss:-0.3557910 loss:-0.3418811 loss:0.2070112 loss:0.4780313 loss:-0.0423214 loss:0.4236715 loss:0.3546516 loss:-0.3992917 loss:0.4544318 loss:0.3501019 loss:0.5035920 loss:0.3758821 loss:-0.4123422 loss:0.3031923 loss:-0.3417424 loss:0.4617325 loss:-0.2944826 loss:0.5726227 loss:-0.2030128 loss:-0.3715129 loss:-0.2734930 loss:-0.4375231 loss:-0.3793732 loss:-0.4239033 loss:-0.3869034 loss:0.4914635 loss:0.3594036 loss:-0.3657237 loss:-0.4220038 loss:0.3388239 loss:0.4413340 loss:-0.3215341 loss:0.5670142 loss:-0.0268043 loss:0.2991244 loss:-0.2672045 loss:0.0303746 loss:0.3568347 loss:-0.0601548 loss:0.5934649 los

In [17]:
def update_scene(num, frames, patch):
    """Animation callback: show frame number `num` on `patch`.

    Returns a one-element tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays `frames` as images.

    Args:
        frames: Indexable sequence of images; the first one is drawn at once.
        repeat: Forwarded to `FuncAnimation`.
        interval: Forwarded to `FuncAnimation`.

    Returns:
        The `FuncAnimation` object. The current figure is closed before
        returning.
    """
    figure = plt.figure()
    image_artist = plt.imshow(frames[0])
    plt.axis('off')
    animation_options = dict(
        fargs=(frames, image_artist),
        frames=len(frames),
        repeat=repeat,
        interval=interval,
    )
    result = animation.FuncAnimation(figure, update_scene, **animation_options)
    plt.close()
    return result

In [19]:
class ShowProgress:
    """Observer that counts non-boundary trajectories and reports progress.

    The progress line "<counter>/<total>" is printed every time the counter
    reaches a multiple of 100.
    """

    def __init__(self, total):
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        # Boundary trajectories are not counted.
        if trajectory.is_boundary():
            return
        self.counter += 1
        if self.counter % 100:
            return
        print("\r{}/{}".format(self.counter, self.total), end="")

In [30]:
# Render one RGB frame through the TF wrapper and through the first wrapped
# Python environment.
# NOTE(review): the recorded run raised NameError and this cell's execution
# count is out of order — presumably it ran before `tf_env` was defined;
# confirm.
tf_env.render(mode="rgb_array")
tf_env.pyenv.envs[0].render(mode="rgb_array")

NameError: ignored

In [20]:
# Rendered RGB frames, filled by the observer below while the driver runs.
frames = []


def save_frames(trajectory):
    """Observer: append the wrapped environment's current RGB frame to `frames`."""
    rendered = tf_env.pyenv.envs[0].render(mode="rgb_array")
    frames.append(rendered)


# Run the agent's policy for 1000 steps, capturing a frame and updating the
# progress counter on every step.
watch_driver = DynamicStepDriver(
    tf_env, agent.policy,
    observers=[save_frames, ShowProgress(1000)],
    num_steps=1000,
)
final_time_step, final_policy_state = watch_driver.run()

plot_animation(frames)

NoSuchDisplayException: ignored

In [None]:
# `os` is not imported anywhere else in the notebook, so import it here.
import os

# Import the submodule explicitly: a bare `import PIL` does not guarantee
# that `PIL.Image` has been loaded.
import PIL.Image

image_path = os.path.join("images", "rl", "lunarlander.gif")
# Saving fails if the target directory does not exist yet.
os.makedirs(os.path.dirname(image_path), exist_ok=True)
# Only the first 150 captured frames go into the GIF.
frame_images = [PIL.Image.fromarray(frame) for frame in frames[:150]]
frame_images[0].save(image_path, format='GIF',
                     append_images=frame_images[1:],
                     save_all=True,
                     duration=30,
                     loop=0)

In [None]:
%%html
<!-- Embed the GIF file located at images/rl/lunarlander.gif. -->
<img src="images/rl/lunarlander.gif" />