<a href="https://colab.research.google.com/github/rennyatwork/CegepSteFoy_ReinfLearn_Work/blob/main/TP01/taxi_dqn_v04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install gymnasium
!pip install segment_tree



In [27]:
# Standard libraries
import random
import time
from collections import deque
import numpy as np

# Gym (or Gymnasium, depending on the version)
import gym  # For Gym, or use `import gymnasium as gym` if using Gymnasium specifically

# TensorFlow and Keras
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


In [28]:


# Environment Setup
env = gym.make('Taxi-v3', render_mode="rgb_array")
state_size = env.observation_space.n  # Taxi-v3 has 500 states
action_size = env.action_space.n

# DQN Model
class DQNModel:
    def __init__(self, state_size, action_size):
        self.model = self._build_model(state_size, action_size)
        self.target_model = tf.keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def _build_model(self, state_size, action_size):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(8, activation='relu', input_shape=(state_size,)),
            tf.keras.layers.Dense(8, activation='relu'),
            tf.keras.layers.Dense(action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss='mse')
        return model

    def predict(self, state, pPrint=False):
        if pPrint:
            print(f"[predict] state: {state}")
        if np.isscalar(state):
            state = np.eye(state_size)[state]  # Convert scalar to one-hot encoded array
        return self.model.predict(np.array([state]), verbose=0)[0]

    def update_target_model(self, pPrint=False):
        if pPrint:
            print("[update_target_model] Updating target network weights.")
        self.target_model.set_weights(self.model.get_weights())

  deprecation(
  deprecation(


In [29]:
# DQN Agent
class DQNAgent:

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.99   # exploration rate
        self.epsilon_min = 0.005
        self.epsilon_decay = 0.995
        self.model = DQNModel(state_size, action_size)
        self.update_target_model()

    def update_target_model(self, pPrint=False):
        if pPrint:
            print("[update_target_model] Calling model's update_target_model.")
        self.model.update_target_model()

    def remember(self, state, action, reward, next_state, done, pPrint=False):
        if pPrint:
            print(f"[remember] State: {state}, Action: {action}, Reward: {reward}, Next_state: {next_state}, Done: {done}")
        state = np.eye(self.state_size)[state] if isinstance(state, int) else state
        next_state = np.eye(self.state_size)[next_state] if isinstance(next_state, int) else next_state
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, pPrint=False):
        if pPrint:
            print("[act] Epsilon-greedy action selection.")
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values)

    def replay(self, batch_size, pPrint=False):
        if pPrint:
            print('[replay]')

        # Sample a minibatch from memory
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))

        # Get all states and next states from the minibatch for batch processing
        states = np.array([exp[0] for exp in minibatch])
        next_states = np.array([exp[3] for exp in minibatch])

        # Predict Q-values for all current states in the minibatch
        q_values = self.model.model.predict(states, verbose=0)

        # Predict Q-values for all next states using the target network
        q_values_next = self.model.target_model.predict(next_states, verbose=0)

        # Iterate over the minibatch and adjust Q-values
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            target = reward if done else reward + self.gamma * np.max(q_values_next[i])
            q_values[i][action] = target

        # Train the model on the batch of updated Q-values
        self.model.model.fit(states, q_values, epochs=1, verbose=0)

        # Decay epsilon for exploration
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay



    def train(self, episodes, batch_size, pPrint=False, pModel=None, pPrintStats=True):

        start_time_all = time.time()
        total_score = 0

        if pModel is not None:
            self.model = pModel
        else:
            self.model = DQNModel(self.state_size, self.action_size)

        if pPrint:
            print("[BEGIN train] Starting training.")
        scores = []
        for e in range(episodes):

          if pPrint:
            print(f'FOR loop, episode {e}')

          start_time = time.time()
          state = env.reset()  # Reset the environment correctly
          done = False
          score = 0
          penalties = 0
          iter = 0

          while not done:

            if pPrint:
              print('')
              print('------')
              print(f'[while] - [episode]: {e},  [state]: {state},  [iter]: {iter}, [score]: {score},  [done]: {done}')

            action = self.act(state, pPrint)  # Get action from the agent
            next_state, reward, terminated, _ = env.step(action)

            iter += 1
            penalties += 1 if reward == -10 else 0

            # Store the experience in memory
            self.remember(state, action, reward, next_state, done)

            # Train the model if memory is sufficient
            if len(self.memory) > batch_size:
                self.replay(batch_size)

            state = next_state
            done = terminated

            score += reward

            ### Add additional conditions for early termination
            if score < -20:
              done = True
              print('Too many negative points. Impossible to win. BYE!!!')
            if iter >= 30:
              done = True
              print('Too many steps. Not going ANYWHERE. BYE!!!')
            if penalties >= 2:
              done = True
              print('Too many wrong actions.  BYE!!!')

            total_score += score
            scores.append(score)




          if pPrintStats:
              print(f"Episode {e + 1}/{episodes}, Score: {score}, total iter: {iter}, Avg Reward: {total_score / (e + 1):.2f}, Time: {time.time() - start_time:.2f}s")

        elapsed_time = time.time() - start_time_all
        print('----------------------')
        print('[END training]')
        print(f'Avg score  = {total_score / episodes:.2f}')
        print(f"Training complete in {elapsed_time:.2f} sec.")

        return scores, self.model  # Return scores and the trained model



    def evaluate(self, env, episodes=5, pModel=None):
        """Evaluate the trained DQN agent by letting it play the game."""
        model_to_evaluate = pModel if pModel is not None else self.model
        penalties = 0

        for e in range(episodes):
          #print(f'for loop, episode {e}')
          start_time = time.time()
          state = env.reset()
          state = state[0] if isinstance(state, tuple) else state  # Handle Gym's tuple output
          done = False
          total_reward = 0
          steps = 0

          while not done:
              # Use the trained model to select the best action
              action = np.argmax(model_to_evaluate.predict(state))
              next_state, reward, done, info = env.step(action)
              total_reward += reward
              steps += 1
              state = next_state

              if total_reward < -30:
                  done = True
                  print('Too many negative points. Impossible to win. BYE!!!')
              if steps >= 30:
                  done = True
                  print('Too many steps. Not going ANYWHERE. BYE!!!')
              if reward == -10:
                  penalties += 1
                  if penalties >= 2:
                      done = True
                      print('Too many wrong actions.  BYE!!!')

          elapsed_time = time.time() - start_time  # Calculate elapsed time

          print(f"Episode {e + 1}: Total Reward = {total_reward}, Steps Taken = {steps}, Time Elapsed = {elapsed_time:.2f} seconds")



In [31]:
# Usage
trained_agent = DQNAgent(state_size, action_size)  # Use existing DQNAgent class

# Train the agent and get the trained model
#scores, trained_model_01 = trained_agent.train(episodes=500, batch_size=32, pPrint=True)
#scores, trained_model_02 = trained_agent.train(episodes=500, batch_size=32, pModel=trained_model_01, pPrint=True)


scores, trained_model_01 = trained_agent.train(episodes=10, batch_size=32, pPrint=False)
#scores, trained_model_02 = trained_agent.train(episodes=500, batch_size=32, pModel=trained_model_01, pPrint=False)





Too many negative points. Impossible to win. BYE!!!
Too many wrong actions.  BYE!!!
Episode 1/10, Score: -22, total iter: 4, Avg Reward: -55.00, Time: 0.53s
Too many negative points. Impossible to win. BYE!!!
Too many wrong actions.  BYE!!!
Episode 2/10, Score: -29, total iter: 11, Avg Reward: -105.50, Time: 0.13s
Too many negative points. Impossible to win. BYE!!!
Episode 3/10, Score: -21, total iter: 12, Avg Reward: -99.33, Time: 0.27s
Too many wrong actions.  BYE!!!
Episode 4/10, Score: -20, total iter: 2, Avg Reward: -82.00, Time: 0.03s
Too many negative points. Impossible to win. BYE!!!
Too many wrong actions.  BYE!!!
Episode 5/10, Score: -22, total iter: 4, Avg Reward: -73.00, Time: 1.41s
Too many negative points. Impossible to win. BYE!!!
Too many wrong actions.  BYE!!!
Episode 6/10, Score: -27, total iter: 9, Avg Reward: -78.83, Time: 2.48s
Too many wrong actions.  BYE!!!
Episode 7/10, Score: -20, total iter: 2, Avg Reward: -71.86, Time: 0.49s
Too many negative points. Impossib

In [None]:
def train(self, episodes, batch_size, pPrint=False, pModel=None, pPrintStats=True):

    start_time_all = time.time()
    total_score = 0

    if pModel is not None:
        self.model = pModel
    else:
        self.model = DQNModel(self.state_size, self.action_size)

    if pPrint:
        print("[BEGIN train] Starting training.")
    scores = []
    for e in range(episodes):

        if pPrint:
            print(f'FOR loop, episode {e}')

        start_time = time.time()
        state = env.reset()  # Reset the environment correctly
        # Handle Gym's tuple output in reset()
        state = state[0] if isinstance(state, tuple) else state
        done = False
        score = 0
        penalties = 0
        iter = 0

        while not done:

            if pPrint:
                print('')
                print('------')
                print(f'[while] - [episode]: {e},  [state]: {state},  [iter]: {iter}, [score]: {score},  [done]: {done}')

            action = self.act(state, pPrint)  # Get action from the agent
            # Update this line to handle additional return values from env.step()
            next_state, reward, terminated, truncated, info = env.step(action)
            # Alternatively, if you don't need truncated and info:
            # next_state, reward, terminated, _, _ = env.step(action)

            iter += 1
            penalties += 1 if reward == -10 else 0

            # Store the experience in memory
            self.remember(state, action, reward, next_state, done)

            # Train the model if memory is sufficient
            if len(self.memory) > batch_size:
                self.replay(batch_size)

            state = next_state
            done = terminated  # Update done based on 'terminated'

            score += reward

            ### Add additional conditions for early termination
            if score < -20:
                done = True
                print('Too many negative points. Impossible to win. BYE!!!')
            if iter >= 30:
                done = True
                print('Too many steps. Not going ANYWHERE. BYE!!!')
            if penalties >= 2:
                done = True
                print('Too many wrong actions.  BYE!!!')

            total_score += score
            scores.append(score)



        if pPrintStats:
            print(f"Episode {e + 1}/{episodes}, Score: {score}, total iter: {iter}, Avg Reward: {total_score / (e + 1):.2f}, Time: {time.time() - start_time:.2f}s")

    elapsed

In [None]:
scores, trained_model_02 = trained_agent.train(episodes=1500, batch_size=32, pModel=trained_model_01, pPrint=True)

In [166]:
# Evaluate using the trained model
# Evaluate the trained model
trained_agent.evaluate(env, episodes=10, pModel=trained_model_03)

----------
step: 1
----------
step: 2
----------
step: 3
----------
step: 4
----------
step: 5
----------
step: 6
----------
step: 7
----------
step: 8
----------
step: 9
----------
step: 10
----------
step: 11
----------
step: 12
----------
step: 13
----------
step: 14
----------
step: 15
----------
step: 16
----------
step: 17
----------
step: 18
----------
step: 19
----------
step: 20
----------
step: 21
Too many negative points. Impossible to win. BYE!!!
Episode 1: Total Reward = -21, Steps Taken = 21, Time Elapsed = 2.21 seconds
----------
step: 1
----------
step: 2
----------
step: 3
----------
step: 4
----------
step: 5
----------
step: 6
----------
step: 7
----------
step: 8
----------
step: 9
----------
step: 10
----------
step: 11
----------
step: 12
----------
step: 13
----------
step: 14
----------
step: 15
----------
step: 16
----------
step: 17
----------
step: 18
----------
step: 19
----------
step: 20
----------
step: 21
Too many negative points. Impossible to win. BYE!