In [1]:
'''
    File: cart-pole-problem.ipynb
    
    The Care-Pole Problem using OpenAI Gym.
    Agent is made to solve the problem with a random policy and then with an "optimal" policy.
'''

from IPython.display import clear_output
from sklearn.preprocessing import KBinsDiscretizer
import gym as g
import random as rnd
import numpy as np
import time, math

In [2]:
env = g.make("CartPole-v1").env

In [3]:
''' having the agent try solve the environment with a random policy '''

total_epochs, total_penalties, total_rewards = 0, 0, 0
done = False
episodes = 100

for _ in range(100):
    epochs, penalties, episode_rewards, reward = 0, 0, 0, 0
    obserservation = env.reset()
        
    while not done:
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
            
        epochs += 1
        episode_rewards += reward
        
    if episode_rewards < 200:
        penalties += 1

    total_epochs += epochs
    total_penalties += penalties
    total_rewards += episode_rewards

print("Results after {} episodes -".format(episodes))
print("Mean average of timesteps: {}".format(total_epochs / episodes))
print("Mean average of rewards: {}".format(total_rewards / episodes))
print("Mean average of penalties incurred: {}".format(total_penalties / episodes))

Results after 100 episodes -
Mean average of timesteps: 0.21
Mean average of rewards: 0.21
Mean average of penalties incurred: 1.0


In [4]:
n_bins = (4, 8) # selected arbitarily low, used as bins/buckets for discretizing the problem space
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

q_table = np.zeros(n_bins + (env.action_space.n,))

def discretizer(state):
    # discretizer(state) changes the state from a continuous one to one that conforms to the selected bins
    _, __, pole_angle, pole_velocity = state
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int, est.transform([[pole_angle, pole_velocity]])[0]))

# hyperparameters, not optimized
alpha = 0.3
gamma = 0.8
epsilon = 0.2

for episode in range(1, 20001):
    state = discretizer(env.reset())
    done = False

    while not done:
        if rnd.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])
            
        observation, reward, done, info = env.step(action)
        next_state = discretizer(observation)

        # the next three lines relate to applying the equation to update the q-table
        old_value = q_table[state][action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        q_table[state][action] = new_value # write result to q-table

        state = next_state
        
    if episode % 100 == 0:
        clear_output(wait=True)
        print("Episode: {}".format(episode))

print("Training complete.")

Episode: 20000
Training complete.


In [5]:
''' Evaluating the agent in the environment after training with Q-Learning '''

total_epochs, total_penalties, total_rewards = 0, 0, 0
episodes = 100

for _ in range(episodes):
    epochs, penalties, episode_rewards, reward = 0, 0, 0, 0
    state = discretizer(env.reset())
    done = False

    while not done:
        # env.render() # uncomment this and the environment closer to render
        action = np.argmax(q_table[state])
        observation, reward, done, info = env.step(action)
        state = discretizer(observation)

        epochs += 1
        episode_rewards += reward

    if episode_rewards < 200:
        # if the agent lasted fewer than 200 timesteps in the episode, consider it a penalty (for analysis purposes)
        penalties += 1

    total_epochs += epochs
    total_penalties += penalties
    total_rewards += episode_rewards
# env.close()

print("\nResults after {} episodes -".format(episodes))
print("Mean average of timesteps: {}".format(total_epochs / episodes))
print("Mean average of rewards: {}".format(total_rewards / episodes))
print("Mean average of penalties incurred: {}".format(total_penalties / episodes))


Results after 100 episodes -
Mean average of timesteps: 30.06
Mean average of rewards: 30.06
Mean average of penalties incurred: 1.0
