# Solving Cartpole with Q-Learning (Bellman)

## Env Setup

In [1]:
import gymnasium as gym
import numpy as np

In [2]:
# Create the CartPole-v1 environment used by the random-action demo below;
# render_mode='human' asks gymnasium to draw the simulation in a window.
env = gym.make('CartPole-v1', render_mode='human')

## Step and Render for random sample actions

In [3]:
# Roll out a fixed number of episodes with uniformly sampled actions to
# sanity-check the environment before any learning takes place.
episodeNumber = 20  # how many episodes to simulate
timeSteps = 100     # upper bound on steps taken in a single episode

for episodeIndex in range(episodeNumber):
    # The value returned by reset() is stored but not used again in this cell.
    initial_state = env.reset()
    print("Episode: ", episodeIndex)
    env.render()
    # Observations gathered during the current episode (reset every episode).
    observations = []
    for timeIndex in range(timeSteps):
        print("TimeStep:", timeIndex)
        # Sample an action at random from the environment's action space.
        random_action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(random_action)
        observations.append(observation)
        # Only `terminated` ends the episode early here; `truncated` is not checked.
        if terminated:
            print("Episode finished after {} time steps".format(timeIndex+1))
            break

# Release the environment once all episodes are done.
env.close()




Episode:  0
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
Episode finished after 13 time steps
Episode:  1
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
Episode finished after 13 time steps
Episode:  2
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 13
TimeStep: 14
TimeStep: 15
TimeStep: 16
TimeStep: 17
TimeStep: 18
TimeStep: 19
TimeStep: 20
TimeStep: 21
TimeStep: 22
TimeStep: 23
TimeStep: 24
TimeStep: 25
TimeStep: 26
TimeStep: 27
Episode finished after 28 time steps
Episode:  3
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 

: 

## Q-Learning Algorithm

Bins are used to discretize the continuous state space of the environment. Tabular Q-learning stores one Q-value for every (state, action) pair, so without discretization the state space would take a virtually infinite number of values and the Q-matrix could not be stored in memory.

- **alpha**: step size
- **gamma**: discount factor
- **epsilon**: parameter for epsilon-greedy policy
- **number of bins**: a list of four bin counts, one per state variable: [position_bin, velocity_bin, angle_bin, angular_velocity_bin]

In [None]:
class QLearning:
    """Tabular Q-learning agent for a four-dimensional continuous state space.

    The continuous observation is mapped onto a grid of bins so that the
    action values can be stored in a dense array (``Qmatrix``) indexed by
    ``(positionBin, velocityBin, angleBin, angularVelocityBin, action)``.

    Parameters:
        env: environment exposing ``action_space.n``, ``reset()`` returning
            ``(observation, info)`` and ``step(action)`` returning
            ``(observation, reward, terminated, truncated, info)``.
        alpha: step size of the Q-value update.
        gamma: discount factor.
        epsilon: exploration probability of the epsilon-greedy policy.
        numberOfEpisodes: number of training episodes run by ``runEpisodes``.
        numberOfBins: four bin counts, one per state variable.
        lowerBounds: four lower bounds used to build the bin edges.
        upperBounds: four upper bounds used to build the bin edges.
    """

    def __init__(self, env, alpha, gamma, epsilon, numberOfEpisodes, numberOfBins, lowerBounds, upperBounds):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.numberOfActions = env.action_space.n
        self.numberOfEpisodes = numberOfEpisodes
        self.numberOfBins = numberOfBins
        self.lowerBounds = lowerBounds
        self.upperBounds = upperBounds

        # total reward collected in each training episode (one entry per episode)
        self.sumEpisodeRewards = []

        # Q-values start as uniform random numbers in [0, 1)
        self.Qmatrix = np.random.uniform(
            low=0,
            high=1,
            size=(numberOfBins[0], numberOfBins[1], numberOfBins[2], numberOfBins[3], self.numberOfActions),
        )

    def getDiscreteState(self, state):
        """Convert a continuous state into a tuple of four bin indices.

        For every state variable the bin edges are ``numberOfBins[i]`` evenly
        spaced points between ``lowerBounds[i]`` and ``upperBounds[i]``.
        Values below the lower bound are clamped to index 0; values at or
        above the upper bound land in the last index.
        """
        indices = []
        for dim in range(4):
            edges = np.linspace(self.lowerBounds[dim], self.upperBounds[dim], self.numberOfBins[dim])
            # digitize is 1-based for in-range values, so shift down and clamp at 0
            indices.append(max(int(np.digitize(state[dim], edges)) - 1, 0))
        return tuple(indices)

    def _greedyAction(self, discreteState):
        """Return an action with maximal Q-value, breaking ties at random."""
        actionValues = self.Qmatrix[discreteState]
        bestActions = np.where(actionValues == np.max(actionValues))[0]
        return np.random.choice(bestActions)

    def selectAction(self, state, episodeNumber):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        During the first 500 episodes every action is random (pure
        exploration). After episode 7000 ``epsilon`` is multiplied by 0.999
        on every call, making the policy progressively greedier.
        """
        # enabling random actions for exploration
        if episodeNumber < 500:
            return np.random.choice(self.numberOfActions)
        # eventually decreasing the value for epsilon to make the algorithm more greedy
        if episodeNumber > 7000:
            self.epsilon = 0.999 * self.epsilon

        if np.random.random() < self.epsilon:
            return np.random.choice(self.numberOfActions)
        return self._greedyAction(self.getDiscreteState(state))

    def runEpisodes(self):
        """Train the Q-matrix by simulating ``numberOfEpisodes`` episodes.

        After every episode the total reward of that episode is appended to
        ``sumEpisodeRewards``. An episode ends when the environment reports
        either ``terminated`` or ``truncated``.
        """
        for episodeIndex in range(self.numberOfEpisodes):
            state_S, _ = self.env.reset()
            state_S = list(state_S)
            episodeReward = 0

            print("Episode: ", episodeIndex)

            episodeOver = False
            while not episodeOver:
                state_S_discrete = self.getDiscreteState(state_S)
                action_a = self.selectAction(state_S, episodeIndex)

                state_S_prime, reward, terminated, truncated, _ = self.env.step(action_a)
                episodeReward += reward
                state_S_prime = list(state_S_prime)

                if terminated:
                    # no future return from a terminal state
                    target = reward
                else:
                    # bootstrap from the best action value in the successor state;
                    # a truncated (time-limited) state is not terminal, so it bootstraps too
                    target = reward + self.gamma * np.max(self.Qmatrix[self.getDiscreteState(state_S_prime)])

                entry = state_S_discrete + (action_a, )
                self.Qmatrix[entry] += self.alpha * (target - self.Qmatrix[entry])

                state_S = state_S_prime
                # stop on truncation as well, otherwise a time-limited episode never ends
                episodeOver = terminated or truncated

            self.sumEpisodeRewards.append(episodeReward)

    def runOptimalPolicy(self):
        """Run one rendered episode that acts greedily on the learned Q-matrix.

        Returns:
            A tuple ``(optimal_policy_rewards, env1)`` with the list of
            per-step rewards and the environment that was created for the
            run. The environment is not closed here.
        """
        env1 = gym.make('CartPole-v1', render_mode='human')
        curr_state, _ = env1.reset()
        # render the environment itself (the observation array has no render method)
        env1.render()

        timeSteps = 1000

        optimal_policy_rewards = []

        for _ in range(timeSteps):
            optimal_action = self._greedyAction(self.getDiscreteState(curr_state))

            curr_state, reward, terminated, truncated, _ = env1.step(optimal_action)
            optimal_policy_rewards.append(reward)

            if terminated or truncated:
                break

        # return only after the whole episode has been simulated
        return optimal_policy_rewards, env1
