In [12]:
import gym
import math
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from typing import Tuple

In [13]:
class CartPole():
    def __init__(self, n_bins=(6, 12),
                 n_episodes=1000,):
        self.n_episodes = n_episodes
        self.n_bins = n_bins

        self.env = gym.make('CartPole-v1')
        self.steps = np.zeros(self.n_episodes)

        # This is the action-value function being initialized to 0's
        self.Q_table = np.zeros(self.n_bins + (self.env.action_space.n,))

        # Used for discretizer
        self.upper_bounds = [self.env.observation_space.high[2], math.radians(50)]
        self.lower_bounds = [self.env.observation_space.low[2], -math.radians(50)]
        
    def discretizer(self, _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
        #Continuous state to discrete state
        est = KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='uniform')
        est.fit([self.lower_bounds, self.upper_bounds ])
        return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))
        
    def policy(self, state : tuple ):
        #Picking best state
        return np.argmax(self.Q_table[state])
        
    def new_Q_value(self, reward : float ,  new_state : tuple , discount_factor=1 ) -> float:
        #Choosing new Q-value
        future_optimal_value = np.max(self.Q_table[new_state])
        learned_value = reward + discount_factor * future_optimal_value
        return learned_value
        
        # Adaptive learning of Learning Rate
    def learning_rate(n : int , min_rate=0.01 ) -> float  :
        #The rate of learning reduces a little for each iteration
        return max(min_rate, min(1.0, 1.0 - math.log10((n + 1) / 25)))
        
    def exploration_rate(n : int, min_rate= 0.1 ) -> float :
        #The rate of exploration reduces a little for each iteration
        return max(min_rate, min(1, 1.0 - math.log10((n  + 1) / 25)))
        
    def train(self):
        for e in range(self.n_episodes):
            # Siscretize state into buckets
            current_state, done = self.discretizer(*self.env.reset()), False
            if (e % 50 == 0): print(e)
            while done==False:

                # policy action 
                action = self.policy(current_state) # exploit

                if np.random.random() < max(0.1, min(1.0, 1.0 - math.log10((e + 1) / 25))) : 
                    action = self.env.action_space.sample() # explore 

                # increment enviroment
                obs, reward, done, _ = self.env.step(action)
                new_state = self.discretizer(*obs)

                # Update Q-Table
                lr = max(0.1, min(1.0, 1.0 - math.log10((e + 1) / 25)))
                learnt_value = self.new_Q_value(reward , new_state )
                old_value = self.Q_table[current_state][action]
                self.Q_table[current_state][action] = (1-lr)*old_value + lr*learnt_value

                current_state = new_state

                # Render the cartpole environment
                # self.env.render()
        print("FINISHED")
                
    def run(self):
        # Runs an episode while displaying the cartpole enviroment
        steps = 0
        done = False
        current_state = self.discretizer(*self.env.reset())
        while done==False:

            # policy action 
            action = self.policy(current_state) # exploit


            # increment enviroment
            obs, reward, done, _ = self.env.step(action)
            new_state = self.discretizer(*obs)

            current_state = new_state

            # Render the cartpole environment
            self.env.render()
        return steps
            
                    

In [None]:
model = CartPole()
model.train()
    

0
50
100
150


In [11]:
model.run()

0