In [1]:
import gym
import time
from math import sin, cos, radians, log10
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from ipywidgets import IntProgress
from IPython.display import display
from cartpoles import Cart, DCMotor, Pole
from colors import Colors
from cartpolesenv import CartPolesEnv

In [2]:
# Assemble the simulated system piece by piece: the motor and the pole are
# built first and then handed to the Cart as its last two arguments.
# All numeric arguments are positional and forwarded unchanged to the
# project classes from `cartpoles`; see those classes for their meaning.
motor = DCMotor(-12, 12, 0.05, 0.5, 0.05, 0.01, 0.05, Colors.black)
pole = Pole(0.2, radians(10), 0.2, 0.005, Colors.green, None)  # 2nd arg: 10 degrees, in radians
cart = Cart(0.5, 0.05, 0, -0.8, 0.8, Colors.red, motor, pole)

# Constants passed to the environment together with the cart.
# NOTE(review): presumably the integration time step and gravitational
# acceleration -- confirm units against CartPolesEnv.
dt, g = 0.01, 9.81
env = CartPolesEnv(cart, dt, g)

In [3]:
# Number of bins for each of the two observation components that are used
# for the Q-table index (passed to `discretizer` as `angle` and
# `pole_velocity`).
n_bins = (6, 12)
# Lower/upper edge of the binned range for (angle, pole_velocity). The angle
# range is taken from index 2 of the environment's observation space; the
# velocity range is fixed to +/- radians(50).
lower_bounds = [env.observation_space.low[2], -radians(50)]
upper_bounds = [env.observation_space.high[2], radians(50)]

# The binning depends only on the constants above, so the estimator is fitted
# once here instead of on every call (the function runs once per environment
# step). Re-run this cell after changing `n_bins` or the bounds.
_state_binner = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="uniform")
_state_binner.fit([lower_bounds, upper_bounds])


def discretizer(_, __, angle, pole_velocity) -> tuple[int,...]:
    """Map a continuous observation to a discrete Q-table index.

    The first two observation components are accepted (so the function can be
    called as ``discretizer(*obs)``) but ignored.

    Args:
        _: ignored observation component.
        __: ignored observation component.
        angle: value binned into ``n_bins[0]`` uniform bins.
        pole_velocity: value binned into ``n_bins[1]`` uniform bins.

    Returns:
        A tuple ``(angle_bin, velocity_bin)`` of ints.
    """
    binned = _state_binner.transform([[angle, pole_velocity]])[0]
    return tuple(map(int, binned))

In [4]:
# One Q-value per (state bin..., action) combination, all starting at zero.
q_shape = (*n_bins, env.action_space.n)
Q_table = np.zeros(q_shape)
Q_table.shape

(6, 12, 2)

In [5]:
def policy(state: tuple):
    """Return the greedy action for `state`.

    Looks up the row of `Q_table` addressed by `state` and returns the index
    of its largest entry (the first one if several are equal).
    """
    action_values = Q_table[state]
    return action_values.argmax()

In [6]:
def new_Q_value(reward: float, new_state: tuple, discount_factor=1) -> float:
    """Compute the temporal-difference target for one transition.

    Args:
        reward: reward received for the transition.
        new_state: discrete state reached; used as an index into `Q_table`.
        discount_factor: weight applied to the best Q-value of `new_state`.

    Returns:
        ``reward + discount_factor * max(Q_table[new_state])``.
    """
    best_next_value = Q_table[new_state].max()
    return reward + discount_factor * best_next_value

In [7]:
def learning_rate(n: int, min_rate=0.01) -> float:
    """Learning rate for episode `n`, decaying with the log of the episode.

    The raw schedule is ``1 - log10((n + 1) / 25)``; the result is clamped
    to the interval ``[min_rate, 1.0]``.
    """
    decayed = 1.0 - log10((n + 1) / 25)
    # Upper clamp: never exceed 1.0.
    capped = decayed if decayed < 1.0 else 1.0
    # Lower clamp: never drop below the floor.
    return capped if capped > min_rate else min_rate

In [8]:
def exploration_rate(n: int, min_rate=0.1) -> float:
    """Probability of taking a random action in episode `n`.

    The raw schedule is ``1 - log10((n + 1) / 25)``; the result is clamped
    to the interval ``[min_rate, 1.0]``.

    Args:
        n: zero-based episode index.
        min_rate: floor for the returned rate.

    Returns:
        The clamped rate. The upper clamp uses the float ``1.0`` so the
        function returns a float while capped, as the annotation states and
        as `learning_rate` does.
    """
    decayed = 1.0 - log10((n + 1) / 25)
    return max(min_rate, min(1.0, decayed))

In [9]:
# Tabular Q-learning training loop.
n_episodes = 1000
RENDER_EVERY = 100      # every RENDER_EVERY-th episode runs on the rendering env
DISCOUNT_FACTOR = 0.9   # discount passed to new_Q_value for every update

# `env_render` is not created by any cell visible in this notebook, so on a
# fresh kernel the original loop failed with NameError at episode 0. Use it
# when it exists, otherwise fall back to the plain environment.
# NOTE(review): create `env_render` explicitly in the setup cell if rendering
# is wanted; the right CartPolesEnv arguments are not visible from here.
try:
    render_env = env_render
except NameError:
    render_env = env

f = IntProgress(min=0, max=n_episodes)  # progress bar, one tick per episode
display(f)

env_e = env
try:
    for e in range(n_episodes):
        f.value += 1

        # Episodes 0, RENDER_EVERY, 2*RENDER_EVERY, ... use the rendering
        # environment; all others use the plain one.
        env_e = render_env if e % RENDER_EVERY == 0 else env

        # Both schedules depend only on the episode index, so evaluate them
        # once per episode rather than once per step.
        epsilon = exploration_rate(e)
        lr = learning_rate(e)

        obs, info = env_e.reset()
        current_state, done = discretizer(*obs), False

        while not done:
            # Epsilon-greedy action selection.
            if np.random.random() < epsilon:
                action = env_e.action_space.sample()
            else:
                action = policy(current_state)

            # Gym's 5-tuple step API: the episode ends on termination OR
            # truncation. The original bound the 4th value to `info` and
            # ignored it, so a truncated episode would never finish.
            # NOTE(review): assumes CartPolesEnv follows the Gym >= 0.26
            # return order (obs, reward, terminated, truncated, info).
            obs, reward, terminated, truncated, info = env_e.step(action)
            done = terminated or truncated
            new_state = discretizer(*obs)

            # Blend the old estimate with the TD target.
            # NOTE(review): the target bootstraps from `new_state` even on
            # the final step of an episode; consider using the bare reward
            # when `terminated` is true.
            learnt_value = new_Q_value(reward, new_state, DISCOUNT_FACTOR)
            q_index = current_state + (action,)
            Q_table[q_index] = (1 - lr) * Q_table[q_index] + lr * learnt_value

            current_state = new_state
finally:
    # Release the environments even if training is interrupted or fails.
    env.close()
    if render_env is not env:
        render_env.close()

IntProgress(value=0, max=1000)