Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "CartPole-v1"

In [2]:
import gymnasium as gym
import numpy as np
import random

In [3]:
# Cambiar render_mode a rgb_array para entrenar/testear
env = gym.make('CartPole-v1', render_mode='rgb_array')

Observation Space

In [4]:
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

Action Space

In [5]:
env.action_space

Discrete(2)

Discretización de los estados

**Nota:** es importante que chequeen el espacio de observación y el espacio de acción del entorno. Los números usados son ejemplos y pueden no ser correctos

In [6]:
cart_position = np.linspace(-0.000001, 100., 2)
cart_velocity = np.linspace(-0.000001, 100., 2)
pole_angle = np.linspace(-0.000001, 100., 2)
pole_angular_velocity = np.linspace(-0.000001, 100., 2)
cart_position

array([-1.e-06,  1.e+02])

Obtener el estado a partir de la observación

In [7]:
def get_state(obs):
    cart_pos = obs[0]
    cart_vel = obs[1]
    pole_ang = obs[2]
    pole_ang_vel = obs[3]
    cart_pos_idx = np.digitize(cart_pos, cart_position)
    cart_vel_idx = np.digitize(cart_vel, cart_velocity)
    pole_angle_idx = np.digitize(pole_ang, pole_angle)
    pole_ang_vel_idx = np.digitize(pole_ang_vel, pole_angular_velocity)
    return cart_pos_idx, cart_vel_idx, pole_angle_idx, pole_ang_vel_idx

In [8]:
obs = env.observation_space.sample()
print(obs)
state = get_state(obs)
state

[ 4.6401935   0.43055007  0.11140139 -1.1124599 ]


(np.int64(1), np.int64(1), np.int64(1), np.int64(0))

Inicilización de la tabla Q

In [9]:
Q = np.zeros((len(cart_position), len(cart_velocity), len(pole_angle), len(pole_angular_velocity), env.action_space.n))
Q

array([[[[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]],


        [[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]]],



       [[[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]],


        [[[0., 0.],
          [0., 0.]],

         [[0., 0.],
          [0., 0.]]]]])

Obtención de la acción a partir de la tabla Q

In [10]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

Epsilon-Greedy Policy

In [11]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    else:
        action = optimal_policy(state, Q)
        
    return action

Ejemplo de episodio 

In [14]:
obs,_ = env.reset()
print(obs)
done = False
total_reward = 0
state = get_state(obs)
steps = 0
while not done:
    steps += 1
    
    # Acción del modelo
    action = epsilon_greedy_policy(state, Q, 0.5)
    print('action', action)

    obs, reward, done, _, _ = env.step(action)
    next_state = get_state(obs)
    
   # Q[state][action_idx] = ... # Completar
   
   # Actualizar estado
    state = next_state
   
    total_reward += reward

    env.render()

env.close() # Para cerrar la ventana, hay que crear el ambiente de nuevo si queremos correrlo otra vez   
print('total_reward', total_reward)
print('steps', steps)

[-0.00881185  0.04519531 -0.04260313 -0.01280498]
action 0
action 1
action 0
action 0
action 0
action 1
action 1
action 0
action 0
action 0
action 0
action 0
action 0
action 0
action 0
action 0
total_reward 16.0
steps 16
