In [35]:
import sys
import numpy as np 
import gymnasium as gym

#### $ Discretization : $ 
- Discretization is the process of transforming continuous data or variables into discrete or categorical data or variables. In other words, it involves breaking down a continuous variable into distinct groups or categories. This is often done in order to simplify data analysis, as working with discrete values can be more efficient and easier to interpret than working with continuous values. Discretization is commonly used in fields such as statistics, data analysis, and machine learning, where continuous data must be converted into a form that can be processed by algorithms or models. There are various methods of discretization, including binning, clustering, and decision trees.

#### $ Classes \ and \ Functions $

In [52]:
class DiscretizedEnvironment(gym.Wrapper):
    """
    class that discretize continuos data into categorical 

    arguments:
    - env : gym.make environment object
    - n_bins: int, number of bins to discretize

    return :

    
    """
    def __init__(self, env: gym.make, n_bins : int = 10):
        super().__init__(env)
        self.n_bins = n_bins 

        # discretize observation space
        self.high = env.observation_space.high
        self.low = env.observation_space.low
        self.observation_space = gym.spaces.Discrete(n_bins ** len(self.high))   
        # ============ Reason ===========
        # We calculate the total number of possible states by taking the product of the number of bins for each dimension.
        # For example, if n_bins is 10 and there are two dimensions in the observation space, then the total number of possible states is 10 ** 2 = 100.
        # ===============================

        # Define Bins for each dimension         
        self.observation_bins = [np.linspace(self.low[i], self.high[i], n_bins + 1)[1:-1] for i in range(len(self.low))]    
        # ============ Reason ===========
        # low and high are arrays that represent the lower and upper bounds of each dimension
        # np.linspace takes three arguments: the start value (low[i]), the end value (high[i]), and the number of intervals (n_bins + 1) between the start and end values. 
        # We add 1 to n_bins because we want to include both the lower and upper bounds in the bins.
        # We then select only the inner bins (excluding the lower and upper bounds) using the slicing notation [1:-1].
        # We do this for each dimension in the observation space by iterating over range(len(low))   
        #  ===========================================================================================      
                                                                                       
        # Define action space  
        self.action_space = gym.spaces.Discrete(3) # 3 discrete actions: push left, do nothing, push right
         
    def _discretize_observation(self, obs):
        """ discretize the space
        
        arguments:
        - obs : observation space
        
        return 
        - state
        
        """

        # convert continuous spaces to discrete
        state = 0
        for i, b in enumerate(self.observation_bins):
            state += np.digitize(obs[i], b) * ((self.n_bins) ** i)
        return state
    
    def reset(self):
        obs = self.env.reset()[0]
        return self._discretize_observation(obs)
    
    def step(self, action):
        observation, reward, terminated, truncated, info = self.env.step(action)
        return self._discretize_observation(observation), reward, terminated, truncated, info

class QLearningAgent:
    """" Q-Learning agent action on a continuous space

    Arguments:
        - environment : gym.make environment Object 
        - alpha : float, Learning Rate 
        - gamma : float, Discount Factor 
        - exploration_rate: float, probability of taking a random action
        - epsilon_decay_rate: float, how quickly epsilon should decay 
        - discretization_bins:  int, number of bins to discretize the observation space


    
    Important Formulas
        `Q*(s, a) = R(s, a) + gamma * max(a')`

        ` The update rule is: Q(s_t, a_t) <- Q(s_t, a_t) + alpha * (r_t + gamma * max[a'](Q(s_{t+1}, a')) - Q(s_t, a_t))`
    """
    def __init__ (self, environment:gym.make, alpha:float = 0.1, gamma:float=0.99, 
                  exploration_rate:float =1.0, epsilon_decay_rate:float =0.99, discretization_bins:int=10):
        
        # Set Environment & Q-Table Parameters
        self.env = DiscretizedEnvironment(env=environment, n_bins = discretization_bins)
        self.Qtable = np.zeros([self.env.observation_space.n, self.env.action_space.n])

        # Set Learning Parameters 
        self.alpha =  alpha # Learning Rate
        self.gamma = gamma # Discount Factor

        # Set Exploration Parameters
        self.exploration_rate = exploration_rate
        self.epsilon_decay_rate = epsilon_decay_rate

    def act (self, state:int):
        """ Select an action using epsilon-greedy policy selection
        
        Arguments:
        - state: int, current state
        
        Return:
        - action: int, selected action 
        """

        # Choosing a action : it wil becoming less exploratory once it gets more experience 
        if np.random.uniform () < self.exploration_rate:
            action = self.env.action_space.sample()
        else:
            # np.argmax, return the index of the action with the highest value 
            action = np.argmax(self.Qtable[state])
        
        return action 

    def learn_and_update (self, state:int, action:int, reward:float, next_state:int, is_done:bool):
        """ Update Q-table using Q-learning algorithm
        
        Arguments:
        - state: int, current state
        - action: int, current action
        - reward: float, reward for current state-action pair
        - next_state: int, next state
        - is_done: bool, whether the episode is terminated
        """

        # Get Current Q-Value
        current_q_value = self.Qtable[state, action]

        # compute Maximum Q-value for the Next State in the Qtable
        max_next_q_value = np.max(self.Qtable[next_state])

        # Compute the TD : temporal difference target 
        TD_target= reward + self.gamma * max_next_q_value * (not is_done)

        # Compute the TD_error :  temporal-difference error
        TD_error = TD_target - current_q_value

        # Updating Q-table
        self.Qtable[state, action] += self.alpha * TD_error


    def train(self, num_episode:int):
        """
        Train the agent for a specified number of episodes
        
        Arguments:
        - num_episode: int, number of episodes
        """

        scores = []
        max_avg_score = -np.inf
        for episode in range(1, num_episode+1):
    
            # Reset & start Environment 
            state = self.env.reset()

            # decay exploration 
            self.exploration_rate *= self.exploration_rate
            episode_reward = 0.0
            done = False

            # Starting Training 
            while not done:
                # Select action & step 
                action = self.act(state=state)
                next_state, reward, done, _ , _ = self.env.step(action)

                # Update Q-Values 
                episode_reward += reward
                self.learn_and_update(state, action, reward, next_state, done)

                # Set Next State as Current State
                state = next_state

            scores.append(episode_reward)

            if len (scores) > 10:
                avg_score = np.mean(scores[-10:])
                if avg_score > max_avg_score:
                    max_avg_score = avg_score

                if episode % 10 == 0:
                    print(f"Episode:{episode} | Max Average Score:{max_avg_score}")
                    sys.stdout.flush()
    
ENV = {'id' :"MountainCar-v0", 'render_mode':None}
env = gym.make(**ENV)
QAgent = QLearningAgent(env)   
QAgent.train(num_episode=100)



Episode:20 | Max Average Score:-45562.8
Episode:30 | Max Average Score:-23675.7
Episode:40 | Max Average Score:-23675.7
Episode:50 | Max Average Score:-20351.3
Episode:60 | Max Average Score:-20351.3
Episode:70 | Max Average Score:-18424.5
Episode:80 | Max Average Score:-18424.5
Episode:90 | Max Average Score:-16441.0
Episode:100 | Max Average Score:-16441.0


In [33]:
class QLearningAgent:
    """
    Q-learning agent that learns to solve the MountainCar problem.

    Arguments:
    - env: gym.make environment object
    - learning_rate: float, learning rate for updating Q-values
    - discount_factor: float, discount factor for future rewards
    - exploration_rate: float, probability of taking a random action
    - exploration_decay_rate: float, rate at which the exploration rate decays over time
    - discretization_bins: int, number of bins to discretize the observation space

    """
    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, 
                 exploration_decay_rate=0.99, discretization_bins=10):
        
        # set the environment and discretization
        self.env = DiscretizedEnvironment(env, n_bins=discretization_bins)
        
        # initialize Q-values table
        self.q_table = np.zeros([self.env.observation_space.n, self.env.action_space.n])
        
        # set learning parameters
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        
        # set exploration parameters
        self.exploration_rate = exploration_rate
        self.exploration_decay_rate = exploration_decay_rate
        
    def act(self, state):
        """
        Select an action using an epsilon-greedy policy.

        Arguments:
        - state: int, current state

        Returns:
        - action: int, selected action
        """
        # choose a random action with probability of exploration_rate
        if np.random.uniform() < self.exploration_rate:
            action = self.env.action_space.sample()
        # otherwise, choose the action with highest Q-value for the current state
        else:
            action = np.argmax(self.q_table[state])
        return action
    
    def learn(self, state, action, reward, next_state, done):
        """
        Update Q-values using Q-learning update rule.

        Arguments:
        - state: int, current state
        - action: int, selected action
        - reward: float, received reward
        - next_state: int, next state
        - done: bool, whether the episode is finished
        """
        # update Q-value for the current state and action
        current_q = self.q_table[state, action]
        max_q = np.max(self.q_table[next_state])
        td_target = reward + self.discount_factor * max_q * (not done)
        td_error = td_target - current_q
        self.q_table[state, action] += self.learning_rate * td_error
    
    def train(self, num_episodes):
        """
        Train the agent for a specified number of episodes.

        Arguments:
        - num_episodes: int, number of episodes to train for
        """
        for episode in range(num_episodes):
            # reset the environment and get initial state
            state = self.env.reset()
            
            # decay exploration rate
            self.exploration_rate *= self.exploration_decay_rate
            episode_reward = 0.0
            
            # loop until episode is finished
            done = False
            while not done:
                # select an action and step the environment
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                
                # update Q-values
                self.learn(state, action, reward, next_state, done)
                
                # set next state as current state
                state = next_state
                
    def test(self, num_episodes):
        """
        Test the agent for a specified number of episodes.

        Arguments:
        - num_episodes: int, number of episodes to test for

        Returns:
        - total_reward: float, total reward earned over all episodes
        """
        total_reward = 0.0
        
        for episode in range(num_episodes):
            # reset the environment and get initial state
            state = self.env.reset()
            
            # loop until episode is finished
            done = False
            while not done:
                # select an action
                action = np.argmax(self.q_table[state])
                
                # step the environment
                next_state, reward, done, _ = self.env.step(action)
                
                # update total reward
                total_reward += reward
                
                # set next state as current state
                state = next_state
                
        return total_reward



#### $ Solving \ MountainCar \ Problem $