In [81]:
from sklearn.preprocessing import KBinsDiscretizer

In [82]:
import numpy as np
import time, math, random
from typing import Tuple

import gym

In [83]:
# Build the CartPole-v1 environment used by every cell below.
# NOTE(review): render_mode="human" presumably opens a window and draws each
# step, which slows training considerably — confirm this is wanted for a
# 10000-episode run.
env = gym.make('CartPole-v1', render_mode="human")

In [84]:
# Discretization grid for the two observation components the agent uses:
# observation index 2 (passed to `discretizer` as `angle`) and the pole
# velocity. Six bins for the first feature, twelve for the second.
n_bins = (6, 12)

# The first feature spans the environment's own limits for observation index 2;
# the second is clamped to +/- math.radians(50).
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
# The upper bound must come from `.high`: using `.low` here collapsed the first
# feature to a zero-width range, so every angle landed in the same bin.
upper_bounds = [env.observation_space.high[2], math.radians(50)]

# Fitted estimators keyed by the grid they were built from. Re-running this
# cell resets the cache; changing the grid globals simply produces a new key.
_binner_cache = {}


def _fitted_binner():
    """Return a KBinsDiscretizer fitted on the current bounds, building it once per grid."""
    key = (tuple(n_bins), tuple(lower_bounds), tuple(upper_bounds))
    est = _binner_cache.get(key)
    if est is None:
        est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
        est.fit([lower_bounds, upper_bounds])
        _binner_cache[key] = est
    return est


def discretizer(_, __, angle, pole_velocity) -> Tuple[int, ...]:
    """Convert a continuous observation into a tuple of bin indices.

    The first two positional arguments are accepted and ignored so that a full
    four-element observation can be splatted in with ``discretizer(*obs)``.

    Args:
        _: Ignored.
        __: Ignored.
        angle: Value binned against the first entry of the bounds.
        pole_velocity: Value binned against the second entry of the bounds.

    Returns:
        A tuple of ints, one ordinal bin index per feature, usable directly as
        an index into ``Q_table``.
    """
    # The estimator is fitted once per grid instead of on every call; this
    # function runs on every environment step, so refitting was pure overhead.
    binned = _fitted_binner().transform([[angle, pole_velocity]])[0]
    return tuple(map(int, binned))

In [85]:
# Zero-initialised Q-table: one axis per discretized feature in `n_bins`,
# followed by a final axis with one slot per action.
Q_table = np.zeros((*n_bins, env.action_space.n))
Q_table.shape

(6, 12, 2)

In [86]:
def policy(state: tuple):
    """Greedy action choice: index of the largest Q-value stored for `state`."""
    action_values = Q_table[state]
    return action_values.argmax()

In [87]:
def new_Q_value(reward: float, state_new: tuple, discount_factor=1) -> float:
    """Temporal-difference target for a transition into `state_new`.

    Args:
        reward: Reward received for the transition.
        state_new: Discretized state reached, used to index ``Q_table``.
        discount_factor: Weight applied to the best Q-value of `state_new`.

    Returns:
        ``reward`` plus the discounted maximum Q-value currently stored for
        `state_new`.
    """
    best_next = Q_table[state_new].max()
    return reward + discount_factor * best_next


In [88]:
def learning_rate(n: int, min_rate=0.01) -> float:
    """Learning rate for episode `n`, decaying logarithmically.

    The raw value is ``1.0 - log10((n + 1) / 25)``; it is capped at 1.0 from
    above and at `min_rate` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    capped = decayed if decayed < 1.0 else 1.0
    return capped if capped > min_rate else min_rate

In [89]:
def exploration_rate(n: int, min_rate=0.1) -> float:
    """Probability of taking a random action during episode `n`.

    The raw value is ``1.0 - log10((n + 1) / 25)``; it is capped at 1 from
    above and at `min_rate` from below.
    """
    decayed = 1.0 - math.log10((n + 1) / 25)
    capped = decayed if decayed < 1 else 1
    return capped if capped > min_rate else min_rate

In [None]:
# Tabular Q-learning: run `n_episodes` episodes, choosing actions epsilon-greedily
# and blending each TD target into Q_table with the episode's learning rate.
n_episodes = 10000
for e in range(n_episodes):
    # reset() returns (observation, info) in the same Gym API that gives step()
    # five return values. Discretize the real starting observation; splatting
    # the reset tuple into `discretizer` binned the literal 0, 0 instead.
    obs, info = env.reset()
    current_state, done = discretizer(*obs), False

    # Both schedules depend only on the episode index, so compute them once.
    lr = learning_rate(e)
    epsilon = exploration_rate(e)

    while not done:
        # Epsilon-greedy: usually exploit the table, sometimes act randomly.
        action = policy(current_state)
        if np.random.random() < epsilon:
            action = env.action_space.sample()

        obs, reward, terminated, truncated, info = env.step(action)
        # An episode ends on failure (terminated) or on the time-limit cutoff
        # (truncated); ignoring the latter kept stepping past the limit.
        done = terminated or truncated
        new_state = discretizer(*obs)

        # Move the stored value towards the TD target by a factor of `lr`.
        learnt_value = new_Q_value(reward, new_state)
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1 - lr) * old_value + lr * learnt_value

        current_state = new_state

        # With n_episodes = 10000 this condition only holds for episode 0.
        if e % 10000 == 0:
            env.render()

