In [1]:
import gym
import time
from math import sin, cos, radians, log10
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
from ipywidgets import IntProgress
from IPython.display import display
from lib.cartpolesystem import CartPoleSystem
from lib.colors import Colors
from lib.cartpolesenv import CartPolesEnv

In [2]:
dt = 0.025
g = 9.81

system = CartPoleSystem(
    (0.0, 0.5, 0.05, -0.8, 0.8, Colors.red),
    (0.05, 0.05, 0.01, 0.5, 0.05, -24.0, 24.0, Colors.black),
    [
        (radians(10), 0.2, 0.2, 0.005, Colors.green),
    ],
    g,
    "rk4"
)

env = CartPolesEnv(system, dt, g)
env.observation_space.shape

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


(4,)

In [3]:
n_obs_bins = 10
n_action_bins = 2

obs_est = KBinsDiscretizer(n_bins=n_obs_bins, encode="ordinal", strategy="uniform")
action_est = KBinsDiscretizer(n_bins=n_action_bins, encode="ordinal", strategy="uniform")

obs_low = env.observation_space.low
obs_high = env.observation_space.high

def discretizer(data, est: KBinsDiscretizer, low: list[float], high: list[float]) -> tuple[int,...]:
    est.fit([low, high])
    return tuple(map(int, est.transform([data])[0]))

def obs_discretizer(obs):
    return discretizer(obs, obs_est, obs_low, obs_high)

def action_discretizer(action):
    return discretizer(action, action_est, env.action_space.low, env.action_space.high)

def undiscretizer(data: int, est: KBinsDiscretizer, low: list[float], high: list[float]):
    est.fit([low, high])
    return est.inverse_transform([data])[0]

def obs_undiscretizer(obs):
    return undiscretizer(obs, obs_est, obs_low, obs_high)

def action_undiscretizer(action):
    return undiscretizer(action, action_est, env.action_space.low, env.action_space.high)

In [4]:
Q_table = np.zeros(tuple(np.repeat(np.array([n_obs_bins]), obs_low.shape[0])) + tuple(np.repeat(np.array([n_action_bins]), env.action_space.shape[0])), dtype=int)
print("Sum", Q_table.sum())

save = False

if save:
    with open('q_table.npy', 'wb') as f:
        np.save(f, Q_table)
Q_table.shape

Sum 0


(10, 10, 10, 10, 2)

In [5]:
def policy(state: tuple):
    return (np.argmax(Q_table[state]),)

In [6]:
def new_Q_value(reward: float, new_state: tuple, discount_factor=1.0) -> float:
    future_optimal_value = np.max(Q_table[new_state])
    learned_value = reward + discount_factor * future_optimal_value
    return learned_value

In [7]:
def learning_rate(n: int, min_rate=0.1) -> float:
    return max(min_rate, min(1.0, 1.0 - log10((n+1)/100)))

In [8]:
def exploration_rate(n: int, min_rate=0.05) -> float:
    return max(min_rate, min(1, 1.0-log10((n+1)/100)))

In [9]:
with open('q_table.npy', 'rb') as f:
    Q_table = np.load(f)
print("Before sum", Q_table.sum())

n_episodes = 10000

f = IntProgress(min=0, max=n_episodes) # instantiate the bar
display(f) # display the bar

env_e = env
for e in range(2000,n_episodes):
    f.value += 1

    obs, info = env_e.reset()
    current_state, done = obs_discretizer(obs), False

    i = 0

    while not done:
        action = policy(current_state)
        # print("policy action!", action)

        er = exploration_rate(e)
        exploring = np.random.random() < er
        if exploring:
            action = action_discretizer(env_e.action_space.sample())
            # print("random!")

        # print("action", action)
        # print("undiscretized", action_undiscretizer(action))

        obs, reward, done, info, _ = env_e.step(*action_undiscretizer(action))

        if i == 0 and done:
            print(obs)
            print("x", obs[0])
            print("max_x", env_e.system.max_x)
            print("min_x", env_e.system.min_x)

            print("y", env_e.system.end_height())

        new_state = obs_discretizer(obs)

        lr = learning_rate(e)
        learnt_value = new_Q_value(reward, new_state, 1)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value

        current_state = new_state
        
        if e % 100 == 0:
            env.render([
                "",
                f"Learning rate: {round(lr,2)}",
                f"Exploration rate: {round(er,2)}",
                "Exploring!" if exploring else "Not exploring",
                f"Reward: {round(reward,2)}"
            ])
            time.sleep(dt)
        i += 1

with open('q_table.npy', 'wb') as f:
    np.save(f, Q_table)
print("After sum:", Q_table.sum())

Before sum 0


IntProgress(value=0, max=10000)