# Solving CartPole with Q-Learning (Bellman)

## Env Setup

In [2]:
import gymnasium as gym
import numpy as np

In [3]:
# Create the CartPole-v1 environment. render_mode='human' asks Gymnasium to
# draw the simulation in an on-screen window while the environment is stepped.
env = gym.make('CartPole-v1', render_mode='human')

## Step and Render for random sample actions

In [10]:
# Play a few episodes with uniformly random actions to sanity-check the
# environment and its rendering before any learning takes place.
episodeNumber = 5  # how many episodes to play
timeSteps = 500    # cap on the number of steps attempted per episode

try:
    for episodeIndex in range(episodeNumber):
        # Gymnasium's reset() returns an (observation, info) pair; unpack it so
        # that initial_state really is the starting observation, not a tuple.
        initial_state, reset_info = env.reset()
        print("Episode: ", episodeIndex)
        env.render()
        # Observations seen during this episode (after each step).
        observations = []
        for timeIndex in range(timeSteps):
            print("TimeStep:", timeIndex)
            random_action = env.action_space.sample()
            observation, reward, terminated, truncated, info = env.step(random_action)
            observations.append(observation)
            # Either flag ends the episode: terminated = failure state reached,
            # truncated = the environment's own time limit was hit.
            if terminated or truncated:
                print("Episode finished after {} time steps".format(timeIndex+1))
                break
finally:
    # Release the render window even if a rollout raises or is interrupted.
    env.close()


Episode:  0
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
Episode finished after 10 time steps
Episode:  1
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 13
TimeStep: 14
TimeStep: 15
TimeStep: 16
TimeStep: 17
TimeStep: 18
TimeStep: 19
TimeStep: 20
TimeStep: 21
TimeStep: 22
TimeStep: 23
TimeStep: 24
TimeStep: 25
TimeStep: 26
TimeStep: 27
TimeStep: 28
TimeStep: 29
TimeStep: 30
TimeStep: 31
TimeStep: 32
TimeStep: 33
TimeStep: 34
TimeStep: 35
TimeStep: 36
TimeStep: 37
TimeStep: 38
TimeStep: 39
TimeStep: 40
TimeStep: 41
TimeStep: 42
Episode finished after 43 time steps
Episode:  2
TimeStep: 0
TimeStep: 1
TimeStep: 2
TimeStep: 3
TimeStep: 4
TimeStep: 5
TimeStep: 6
TimeStep: 7
TimeStep: 8
TimeStep: 9
TimeStep: 10
TimeStep: 11
TimeStep: 12
TimeStep: 13
TimeStep: 14
TimeStep: 15
TimeStep: 16
TimeStep: 

## Q-Learning Algorithm

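CartPole's state is continuous (cart position, cart velocity, pole angle, pole angular velocity), so a table with one entry per state is impossible. Each state variable is therefore discretized into a fixed number of bins, which gives a finite set of states for which a Q-matrix can be stored in memory.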

- **alpha**: step size (learning rate) of the Q-value update
- **gamma**: discount factor applied to future rewards
- **epsilon**: exploration probability of the epsilon-greedy policy

In [None]:
class QLearning:
    def __init__(self, env, alpha, gamma, epsilon, numberEpisodes, numberOfBins, lowerBounds, upperBounds):
        self.env = env
        self.alpha = alpha
        self.gamma = gamma 
        self.epsilon = epsilon 
        self.actionNumber = env.action_space.n 
        self.numberEpisodes = numberEpisodes
        self.numberOfBins = numberOfBins
        self.lowerBounds = lowerBounds
        self.upperBounds = upperBounds

        # rewards sum per episode
        self.sumRewardsEpisode=[]

        self.Qmatrix=np.random.uniform(low=0, high=1, size=(numberOfBins[0],numberOfBins[1],numberOfBins[2],numberOfBins[3],self.actionNumber))

    
    def getQValue(self, state):
        position = state[0]
        velocity = state[1]
        angle = state[2]
        angularVelocity = state[3]

        cartPositionBin = np.linspace(self.lowerBounds[0],self.upperBounds[0],self.numberOfBins[0])
        cartVelocityBin = np.linspace(self.lowerBounds[1],self.upperBounds[1],self.numberOfBins[1])
        poleAngleBin = np.linspace(self.lowerBounds[2],self.upperBounds[2],self.numberOfBins[2])
        poleAngularVelocityBin = np.linspace(self.lowerBounds[3],self.upperBounds[3],self.numberOfBins[3])

        indexPosition=np.maximum(np.digitize(position, cartPositionBin)-1,0)
        indexVelocity=np.maximum(np.digitize(velocity, cartVelocityBin)-1,0)
        indexAngle=np.maximum(np.digitize(angle, poleAngleBin)-1,0)
        indexAngularVelocity=np.maximum(np.digitize(angularVelocity, poleAngularVelocityBin)-1,0)

        return tuple([indexPosition, indexVelocity, indexAngle, indexAngularVelocity])

