<a href="https://colab.research.google.com/github/rennyatwork/CegepSteFoy_ReinfLearn_Work/blob/main/TP01/taxi_gradient_policy_ppo_v01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [22]:
import gymnasium as gym
import numpy as np
import tensorflow as tf
import time

class ActorCritic(tf.keras.Model):
    def __init__(self, num_actions, hidden_units=64):
        super(ActorCritic, self).__init__()
        self.dense1 = tf.keras.layers.Dense(hidden_units, activation="relu")
        self.policy = tf.keras.layers.Dense(num_actions, activation="softmax")
        self.value = tf.keras.layers.Dense(1)

    def call(self, input_data):
        x = self.dense1(input_data)
        return self.policy(x), self.value(x)

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
    gae = 0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * next_value * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
        next_value = values[step]
    return returns

def ppo_loss(new_policy, old_policy, actions, advantages, clip_param=0.2):
    prob_ratio = tf.exp(tf.math.log(tf.reduce_sum(new_policy * actions, axis=1) + 1e-10) -
                        tf.math.log(tf.reduce_sum(old_policy * actions, axis=1) + 1e-10))
    surr1 = prob_ratio * advantages
    surr2 = tf.clip_by_value(prob_ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -tf.reduce_mean(tf.minimum(surr1, surr2))

def train_ppo(env, num_episodes=2000, max_timesteps=1000, update_timesteps=2000, print_interval=0.1, weights=None, **kwargs):
    optimizer = tf.keras.optimizers.Adam(learning_rate=kwargs.get('learning_rate', 0.0003))

    state_dim = env.observation_space.n
    action_dim = env.action_space.n

    model = ActorCritic(action_dim, hidden_units=kwargs.get('hidden_units', 64))

    # Build the model here to ensure weights are created
    dummy_input = tf.keras.Input(shape=(state_dim,))
    model(dummy_input)

    if weights is not None:
        print("Weights shapes from previous training:", [w.shape for w in weights])
        print("Model expected weights shapes:", [w.shape for w in model.trainable_weights])
        if len(weights) != len(model.trainable_weights):
            raise ValueError("The number of weights provided does not match the model's expected weights.")
        model.set_weights(weights)


    history = []
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = tf.one_hot(state, state_dim)
        episode_start = time.time()

        rewards = []
        states, actions, values, log_probs, masks = [], [], [], [], []

        for t in range(max_timesteps):
            state_tensor = tf.expand_dims(state, 0)
            with tf.GradientTape() as tape:
                policy, value = model(state_tensor)
                action_probs = tf.squeeze(policy)
                value = tf.squeeze(value)
                action = tf.random.categorical(tf.math.log([action_probs]), 1)[0, 0]

            next_state, reward, terminated, truncated, _ = env.step(action.numpy())
            log_prob = tf.math.log(action_probs[action])

            states.append(state)
            actions.append(tf.one_hot(action, action_dim))
            values.append(value)
            log_probs.append(log_prob)
            rewards.append(reward)
            masks.append(1.0 - int(terminated or truncated))

            if terminated or truncated:
                break

            state = tf.one_hot(next_state, state_dim)

        _, next_value = model(tf.one_hot(next_state, state_dim).numpy()[np.newaxis, :])
        returns = compute_gae(next_value, rewards, masks, values)
        advantages = np.array(returns) - np.array(values)
        advantages = (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-10)

        old_policy, _ = model(tf.stack(states))
        old_policy = tf.stop_gradient(old_policy)

        for _ in range(10):  # PPO update iterations
            with tf.GradientTape() as tape:
                new_policy, _ = model(tf.stack(states))
                loss = ppo_loss(new_policy, old_policy, tf.stack(actions), advantages)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        total_reward = sum(rewards)
        history.append((t+1, total_reward))

        episode_time = time.time() - episode_start

        if episode % int(num_episodes * print_interval) == 0:
            interval = int(num_episodes * print_interval)
            avg_reward = np.mean([reward for _, reward in history[-interval:]])
            avg_steps = np.mean([steps for steps, _ in history[-interval:]])
            print(f"Progress: {episode/num_episodes*100:.1f}%, "
                  f"Episode {episode+1}/{num_episodes}, "
                  f"Avg Reward: {avg_reward:.2f}, "
                  f"Avg Steps: {avg_steps:.2f}, "
                  f"Time: {episode_time:.4f}s")

    return model.get_weights()

if __name__ == "__main__":
    env = gym.make('Taxi-v3', render_mode='rgb_array')

    # First training session
    weights_1 = train_ppo(env, num_episodes=2, hidden_units=32, learning_rate=0.001)

    # Second training session using weights from the first one
    weights_2 = train_ppo(env, num_episodes=2, hidden_units=32, learning_rate=0.001, weights=weights_1)

Progress: 0.0%, Episode 1/10, Avg Reward: -830.00, Avg Steps: 200.00, Time: 2.4574s
Progress: 10.0%, Episode 2/10, Avg Reward: -731.00, Avg Steps: 200.00, Time: 1.7077s
Progress: 20.0%, Episode 3/10, Avg Reward: -812.00, Avg Steps: 200.00, Time: 2.2680s
Progress: 30.0%, Episode 4/10, Avg Reward: -893.00, Avg Steps: 200.00, Time: 2.2569s
Progress: 40.0%, Episode 5/10, Avg Reward: -785.00, Avg Steps: 200.00, Time: 1.6795s
Progress: 50.0%, Episode 6/10, Avg Reward: -740.00, Avg Steps: 200.00, Time: 1.7190s
Progress: 60.0%, Episode 7/10, Avg Reward: -866.00, Avg Steps: 200.00, Time: 1.8018s
Progress: 70.0%, Episode 8/10, Avg Reward: -722.00, Avg Steps: 200.00, Time: 1.7912s
Progress: 80.0%, Episode 9/10, Avg Reward: -812.00, Avg Steps: 200.00, Time: 1.6655s
Progress: 90.0%, Episode 10/10, Avg Reward: -830.00, Avg Steps: 200.00, Time: 2.1011s
Weights shapes from previous training: [(500, 32), (32,), (32, 6), (6,), (32, 1), (1,)]
Model expected weights shapes: [TensorShape([500, 32]), Tensor

In [21]:
print("Weights shapes from first training:", [w.shape for w in weights_1])
print("Model expected weights shapes:", [w.shape for w in model.trainable_weights])

Weights shapes from first training: [(500, 32), (32,), (32, 6), (6,), (32, 1), (1,)]


NameError: name 'model' is not defined

In [19]:
 env = gym.make('Taxi-v3', render_mode='rgb_array')

# First training session
weights_1 = train_ppo(env, num_episodes=10, hidden_units=32, learning_rate=0.001)

weights_2 = train_ppo(env, num_episodes=10, hidden_units=32, learning_rate=0.001, weights=weights_1)




Progress: 0.0%, Episode 1/10, Avg Reward: -731.00, Avg Steps: 200.00, Time: 1.7672s
Progress: 10.0%, Episode 2/10, Avg Reward: -830.00, Avg Steps: 200.00, Time: 1.7214s
Progress: 20.0%, Episode 3/10, Avg Reward: -794.00, Avg Steps: 200.00, Time: 1.7665s
Progress: 30.0%, Episode 4/10, Avg Reward: -794.00, Avg Steps: 200.00, Time: 2.1558s
Progress: 40.0%, Episode 5/10, Avg Reward: -830.00, Avg Steps: 200.00, Time: 2.5112s
Progress: 50.0%, Episode 6/10, Avg Reward: -839.00, Avg Steps: 200.00, Time: 1.8082s
Progress: 60.0%, Episode 7/10, Avg Reward: -794.00, Avg Steps: 200.00, Time: 1.6851s
Progress: 70.0%, Episode 8/10, Avg Reward: -803.00, Avg Steps: 200.00, Time: 1.6494s
Progress: 80.0%, Episode 9/10, Avg Reward: -704.00, Avg Steps: 200.00, Time: 1.6908s
Progress: 90.0%, Episode 10/10, Avg Reward: -884.00, Avg Steps: 200.00, Time: 1.7041s


ValueError: You called `set_weights(weights)` on layer 'actor_critic_12' with a weight list of length 6, but the layer was expecting 0 weights.

ValueError: You called `set_weights(weights)` on layer 'actor_critic_10' with a weight list of length 6, but the layer was expecting 0 weights.