In [22]:
import sys
import numpy as np 
import gymnasium as gym

#### $ Discretization : $ 
- Discretization is the process of transforming continuous data or variables into discrete or categorical data or variables. In other words, it involves breaking down a continuous variable into distinct groups or categories. This is often done in order to simplify data analysis, as working with discrete values can be more efficient and easier to interpret than working with continuous values. Discretization is commonly used in fields such as statistics, data analysis, and machine learning, where continuous data must be converted into a form that can be processed by algorithms or models. There are various methods of discretization, including binning, clustering, and decision trees.

#### $ Classes \ and \ Functions $

In [43]:
class DiscretizedEnvironment(gym.Wrapper):
    """
    Wrapper that turns a continuous observation vector into one integer state.

    Every observation dimension is split into `n_bins` equal-width bins between
    the `low` and `high` bounds of the wrapped environment's observation space.
    The per-dimension bin indices are combined into a single integer with a
    base-`n_bins` positional encoding, so the observation space of the wrapper
    is `Discrete(n_bins ** number_of_dimensions)`.

    The action space is left untouched: it is the one of the wrapped environment.

    arguments:
    - env : environment object (e.g. the result of gym.make) whose observation
            space exposes `low` and `high` arrays
    - n_bins: int, number of bins per observation dimension

    notes:
    - reset() returns only the discretized state, not an `(observation, info)` pair.
    - NOTE(review): the bounds are assumed to be finite; evenly spaced edges
      cannot be built between infinite bounds - confirm for other environments.
    """
    def __init__(self, env: gym.Env, n_bins: int = 10):
        super().__init__(env)
        self.n_bins = n_bins

        # Bounds of every observation dimension, as reported by the wrapped environment
        self.high = env.observation_space.high
        self.low = env.observation_space.low

        # Total number of discrete states = n_bins ** number of dimensions.
        # Example: n_bins = 10 and 2 dimensions -> 10 ** 2 = 100 states.
        self.observation_space = gym.spaces.Discrete(n_bins ** len(self.high))

        # Inner bin edges for each dimension.
        # np.linspace(low, high, n_bins + 1) returns n_bins + 1 evenly spaced points,
        # i.e. the edges of n_bins equal-width intervals. The slice [1:-1] drops the
        # two outer edges, leaving n_bins - 1 inner edges, so that np.digitize maps
        # any value (even one outside the bounds) to an index in 0 .. n_bins - 1.
        self.observation_bins = [
            np.linspace(self.low[i], self.high[i], n_bins + 1)[1:-1]
            for i in range(len(self.low))
        ]

        # The action space is intentionally not overridden: gym.Wrapper exposes the
        # wrapped environment's action space, which keeps the wrapper valid for
        # environments with any number of discrete actions.

    def _discretize_observation(self, obs):
        """ Encode a continuous observation as a single integer state
        arguments:
        - obs : one observation, indexable per dimension

        return
        - state : integer in 0 .. n_bins ** number_of_dimensions - 1
        """

        # Dimension i contributes its bin index as the i-th digit in base n_bins
        state = 0
        for i, edges in enumerate(self.observation_bins):
            state += np.digitize(obs[i], edges) * (self.n_bins ** i)
        return state

    def reset(self, *, seed=None, options=None):
        """ Reset the wrapped environment and return the discretized initial state
        arguments:
        - seed : optional seed forwarded to the wrapped environment
        - options : optional reset options forwarded to the wrapped environment

        return
        - state : discretized initial state (the info dictionary is dropped)
        """
        obs, _info = self.env.reset(seed=seed, options=options)
        return self._discretize_observation(obs)

    def step(self, action):
        """ Step the wrapped environment and discretize the resulting observation
        arguments:
        - action : action forwarded unchanged to the wrapped environment

        return
        - (state, reward, terminated, truncated, info)
        """
        observation, reward, terminated, truncated, info = self.env.step(action)
        return self._discretize_observation(observation), reward, terminated, truncated, info

class QLearningAgent:
    """ Tabular Q-Learning agent for an environment with continuous observations

    The environment is wrapped in `DiscretizedEnvironment`, so every observation
    becomes one integer state that indexes a row of the Q-table.

    Arguments:
        - environment : environment object, e.g. the result of gym.make
        - alpha : float, Learning Rate
        - gamma : float, Discount Factor
        - exploration_rate: float, initial probability (epsilon) of taking a random action
        - epsilon_decay_rate: float, factor epsilon is multiplied by at the start of every training episode
        - min_epsilon: float, lower bound that epsilon never drops below
        - discretization_bins:  int, number of bins used per observation dimension

    Important Formulas
        `Q*(s, a) = R(s, a) + gamma * max[a'](Q*(s', a'))`

        `The update rule is: Q(s_t, a_t) <- Q(s_t, a_t) + alpha * (r_t + gamma * max[a'](Q(s_{t+1}, a')) - Q(s_t, a_t))`
    """
    def __init__ (self, environment:"gym.Env", alpha:float = 0.01, gamma:float=0.99,
                  exploration_rate:float =1.0, epsilon_decay_rate:float =0.9995, min_epsilon:float=.01, discretization_bins:int=10):

        # Set Environment & Q-Table Parameters (one row per state, one column per action)
        self.env = DiscretizedEnvironment(env=environment, n_bins = discretization_bins)
        self.Qtable = np.zeros([self.env.observation_space.n, self.env.action_space.n])

        # Set Learning Parameters
        self.alpha =  alpha # Learning Rate
        self.gamma = gamma # Discount Factor

        # Set Exploration Parameters
        self.exploration_rate = exploration_rate
        self.epsilon_decay_rate = epsilon_decay_rate
        self.min_epsilon = min_epsilon
        self.discretization_bins = discretization_bins

    def act (self, state:int):
        """ Select an action using epsilon-greedy policy selection

        Arguments:
        - state: int, current state

        Return:
        - action: int, selected action
        """

        # With probability epsilon explore; epsilon shrinks during training,
        # so the agent becomes less exploratory as it gains experience
        if np.random.uniform () < self.exploration_rate:
            action = self.env.action_space.sample()
        else:
            # np.argmax returns the index of the action with the highest Q-value
            action = np.argmax(self.Qtable[state])

        return action

    def learn_and_update (self, state:int, action:int, reward:float, next_state:int, is_done:bool):
        """ Update Q-table using Q-learning algorithm

        Arguments:
        - state: int, current state
        - action: int, current action
        - reward: float, reward for current state-action pair
        - next_state: int, next state
        - is_done: bool, whether next_state is terminal; when True the value of
                   next_state is not added to the target
        """

        # Get Current Q-Value
        current_q_value = self.Qtable[state, action]

        # Maximum Q-value over all actions available in the next state
        max_next_q_value = np.max(self.Qtable[next_state])

        # TD target: the bootstrap term is multiplied by 0 for a terminal next state
        TD_target= reward + self.gamma * max_next_q_value * (not is_done)

        # TD error: distance between the target and the current estimate
        TD_error = TD_target - current_q_value

        # Move the estimate a fraction alpha towards the target
        self.Qtable[state, action] += self.alpha * TD_error

    def train(self, num_episode:int):
        """
        Train the agent for a specified number of episodes

        Arguments:
        - num_episode: int, number of episodes

        Returns:
        - Qtable: the learned Q-table (the same array that is stored on the agent)
        """

        scores = []
        max_avg_score = -np.inf
        for episode in range(1, num_episode+1):

            # Reset & start Environment
            state = self.env.reset()

            # Decay exploration, never going below min_epsilon
            self.exploration_rate *= self.epsilon_decay_rate
            self.exploration_rate = max(self.exploration_rate, self.min_epsilon)

            episode_reward = 0.0
            done = False

            # Run one episode
            while not done:
                # Select action & step
                action = self.act(state=state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # Update Q-Values. Only a true terminal state stops bootstrapping:
                # a truncated episode was merely cut short, its next state still has value.
                episode_reward += reward
                self.learn_and_update(state, action, reward, next_state, terminated)

                # Set Next State as Current State
                state = next_state

                # The episode is over when the environment terminates OR truncates it;
                # ignoring truncation would keep stepping an episode that has ended.
                done = terminated or truncated

            # Track the best average score over a window of the last 100 episodes
            scores.append(episode_reward)
            if len (scores) >= 100:
                avg_score = np.mean(scores[-100:])
                if avg_score > max_avg_score:
                    max_avg_score = avg_score

            if episode % 100 == 0:
                print(f"Episode:{episode}/{num_episode} | Max Average Score:{max_avg_score}", end="\r")

        return self.Qtable


    def test(self, test_env: "gym.Env", num_episodes:int):
        """
        Run the greedy policy of the current Q-table for a specified number of episodes.
        No exploration and no learning happen here; the Q-table stored on the agent is used as is.

        Arguments:
        - test_env : environment object to evaluate on; it is discretized with the
                     same number of bins as the training environment
        - num_episodes: int, number of episodes to test for

        Returns:
        - total_reward: float, total reward earned over all episodes
        """
        env_test = DiscretizedEnvironment(env=test_env, n_bins = self.discretization_bins)
        total_reward = 0.0


        for episode in range(num_episodes):
            num_actions = 0
            # reset the environment and get initial state
            state = env_test.reset()

            # loop until the episode is terminated or truncated
            done = False
            while not done:
                # select the greedy action
                action = np.argmax(self.Qtable[state])

                # step the environment
                next_state, reward, terminated, truncated, _ = env_test.step(action)

                # update total reward
                total_reward += reward

                # set next state as current state
                state = next_state
                num_actions += 1

                # stop on truncation too, otherwise a policy that never reaches
                # a terminal state would loop forever
                done = terminated or truncated
            print(f"Number of action needed to solve the environment : {num_actions} in episode {episode + 1}")

        return total_reward

#### $ Solving \ MountainCar \ Problem $

In [44]:
# Environment configuration: MountainCar without rendering while training
ENV = dict(id="MountainCar-v0", render_mode=None)
env = gym.make(**ENV)

# Build the agent with its default hyper-parameters and run 25000 training episodes
QAgent = QLearningAgent(environment=env)
Qtable = QAgent.train(num_episode=25000)

In [47]:
# Testing
ENV_TEST = {'id' :"MountainCar-v0", 'render_mode':'human'}
test_env = gym.make(**ENV_TEST)

# Evaluate the agent that was trained in the previous cell. Constructing a new
# QLearningAgent here would start from an all-zero Q-table and throw the training away.
try:
    total_reward = QAgent.test(test_env=test_env, num_episodes=50)
finally:
    # Always release the render window, even if the evaluation is interrupted
    test_env.close()

Number of action needed to solve the environment : 105 in episode 1
Number of action needed to solve the environment : 166 in episode 2
Number of action needed to solve the environment : 167 in episode 3
Number of action needed to solve the environment : 141 in episode 4
Number of action needed to solve the environment : 144 in episode 5
Number of action needed to solve the environment : 146 in episode 6
Number of action needed to solve the environment : 167 in episode 7
Number of action needed to solve the environment : 122 in episode 8
Number of action needed to solve the environment : 101 in episode 9
Number of action needed to solve the environment : 154 in episode 10
Number of action needed to solve the environment : 106 in episode 11
Number of action needed to solve the environment : 97 in episode 12
Number of action needed to solve the environment : 122 in episode 13
Number of action needed to solve the environment : 98 in episode 14
Number of action needed to solve the environm