In [81]:
import gymnasium

SEED = 0

env = gymnasium.make('MountainCar-v0', max_episode_steps = 1000)
# car moving along a line, the x-axis, between two "mountains"
# the car has to reach the flag at the top of the right mountain
# the car has to build momentum by driving back and forth between the mountains
# the car has to reach the flag before the episode ends
# https://gymnasium.farama.org/environments/classic_control/mountain_car/

In [97]:
import numpy as np

np.linspace(-1.2, 0.6, 20)

array([-1.2       , -1.10526316, -1.01052632, -0.91578947, -0.82105263,
       -0.72631579, -0.63157895, -0.53684211, -0.44210526, -0.34736842,
       -0.25263158, -0.15789474, -0.06315789,  0.03157895,  0.12631579,
        0.22105263,  0.31578947,  0.41052632,  0.50526316,  0.6       ])

In [82]:
env.reset(seed=SEED) # must reset before using step
# from the gym documentation:
# start position is uniformly random value between -0.6 and -0.4
# start velocity is 0
# use seed to make the start position deterministic

(array([-0.47260767,  0.        ], dtype=float32), {})

In [83]:
# Get the bounds for position and velocity
position_bounds = (env.observation_space.low[0], env.observation_space.high[0])
velocity_bounds = (env.observation_space.low[1], env.observation_space.high[1])

# Define the state bounds
STATE_BOUNDS = [position_bounds, velocity_bounds]

print("Position Bounds:", position_bounds)
print("Velocity Bounds:", velocity_bounds)


Position Bounds: (-1.2, 0.6)
Velocity Bounds: (-0.07, 0.07)


In [96]:
# choose discrete step delta for position and velocity
# to discretize the state space
delta_position = 0.1
delta_velocity = 0.01

DELTA = [delta_position, delta_velocity]

**NUM_BINS**
- decide the number of bins for each state value
- decides the *granularity* or *resulotion* of the state space

- the more bins, the more states, the finer resolution and capture more details of the state space
- but also increases the size of the Q-table, and the time to train the agent


- the fewer bins, the fewer states, the coarser resolution and capture less details of the state space
- but also decreases the size of the Q-table, and the time to train the agent

In [95]:
# calculate the number of bins for position and velocity based on the bounds and delta
NUM_BINS = [int(STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0] / DELTA[i])+1 for i in range(2)]
NUM_BINS

[13, 1]

In [85]:
actions = env.action_space
print(actions)
# so actions are 0, 1, 2
# 0 = accelerate to the left
# 1 = don't accelerate
# 2 = accelerate to the right

Discrete(3)


In [86]:
#notes for myself
# state is a tuple of 2 floats (position, velocity)
# reward is a float (negative for each time step) and 0 at the end (i assume)
# terminated if find end goal
# turnicated if reach max_episode_steps or something else
# info is a dictionary containing extra information about the environment: here nothing
state, reward, terminated, turnicated,info = env.step(0)
# hence 2 ways to be done
done = terminated or turnicated

env.step(0)

(array([-0.47674024, -0.00275165], dtype=float32), -1.0, False, False, {})

so discretize the state: 

from:

&emsp;combination of x poistion and velocity as floats

into:

&emsp;(x,v) where 0<=x<= NUM_BINS[0] and 0<=x<= NUM_BINS[1]

In [87]:
# Discretize the state space
def discretize_state(state):
    discretized_state = []
    for i in range(len(state)):
        lower_bound, upper_bound = STATE_BOUNDS[i]
        # make sure the state is within the bounds
        # above the lower bound and below the upper bound
        value = min(max(state[i], lower_bound), upper_bound)
        # discretize the value --> from foat to int
        discretized_value = int((value - lower_bound) / (upper_bound - lower_bound) * NUM_BINS[i])
        discretized_state.append(discretized_value)
    return tuple(discretized_state)
discretize_state(state)

(8, 9)