In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
from IPython import display

from bareml.machinelearning.reinforcement import QLearning

In [3]:
# CartPole environment. Reference implementation:
# https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
env = gym.make('CartPole-v0')
max_episodes = 2000  # upper bound on the number of episodes the training loop runs
max_timesteps = 200  # upper bound on the number of steps taken within one episode

In [4]:
# data transform utils

def bins(clip_min, clip_max, num):
    """Return the interior edges that cut ``[clip_min, clip_max]`` into ``num`` equal chunks.

    Parameters
    ----------
    clip_min, clip_max : float
        Lower and upper end of the range being divided.
    num : int
        Number of equal-width chunks.

    Returns
    -------
    numpy.ndarray
        The ``num - 1`` boundaries strictly between ``clip_min`` and
        ``clip_max``, in ascending order (for ``clip_min < clip_max``).
    """
    # ``num`` chunks are delimited by ``num + 1`` evenly spaced boundaries,
    # the two outer ones being clip_min and clip_max themselves.
    boundaries = np.linspace(clip_min, clip_max, num=num + 1)
    # Drop the outer pair so that only the interior edges remain.
    interior = boundaries[1:-1]
    return interior

def digitize_state(observation, num_digitized=6):
    """Encode a continuous 4-component observation as one discrete state index.

    Each component is bucketed into one of ``num_digitized`` chunks over a
    fixed clip range. The four chunk indices are then combined as the digits
    of a base-``num_digitized`` number, the first component being the least
    significant digit.

    Parameters
    ----------
    observation : sequence of 4 numbers
        ``(cart_pos, cart_v, pole_angle, pole_v)``.
    num_digitized : int, optional
        Number of chunks each component is divided into. Defaults to 6,
        which yields indices in ``[0, 6**4 - 1]``. A table indexed by the
        result needs ``num_digitized ** 4`` rows.

    Returns
    -------
    integer
        State index in ``[0, num_digitized**4 - 1]``.

    Raises
    ------
    ValueError
        If ``observation`` does not unpack into exactly four values.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation
    # (value, clip_min, clip_max) for every component. Values outside the clip
    # range are not rejected: np.digitize places them in the first or last chunk.
    components = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )
    digitized = [
        np.digitize(value, bins=bins(low, high, num_digitized))
        for value, low, high in components
    ]
    # Positional encoding: the i-th chunk index is the i-th base-num_digitized digit.
    return sum(digit * (num_digitized ** i) for i, digit in enumerate(digitized))

In [5]:
# Q-learning agent. 6**4 matches the range of digitize_state (4 observation
# components, 6 chunks each, so indices 0 .. 6**4 - 1).
# NOTE(review): the arguments are presumably (number of states, number of
# actions) — confirm against bareml's QLearning signature.
q = QLearning(6**4, 2)

past_rewards = []     # total (penalty-adjusted) reward of every finished episode
is_satisfied = False  # becomes True once the success criterion below is met
imgs = []             # frames rendered during the single episode run after success

for episode in range(max_episodes):
    # reset the environment to start a new episode
    observation = env.reset()
    # encode the initial observation as a discrete state index
    state = digitize_state(observation)
    episode_reward = 0

    for t in range(max_timesteps):

        # once the success criterion has been met, record a frame for every step
        # of this (final) episode
        if is_satisfied:
            imgs.append(env.render(mode="rgb_array"))

        # decide which action to take, based on the current state
        # NOTE(review): the episode index is passed as well, presumably to
        # schedule exploration — confirm in QLearning.action
        action = q.action(state, episode)
        # apply the action and observe the resulting state of the environment
        observation, reward, done, info = env.step(action)
        # penalty: an episode that ends on one of its first 195 steps (t is
        # 0-based) has its last reward replaced by -200
        if done and t < 195:
            reward = -200
        # remember the state the action was taken from, then encode the new one
        prev_state = state
        state = digitize_state(observation)
        # update the Q table with the (state, action, reward, next state) transition
        q.update(prev_state, action, reward, state)
        # accumulate the episode's cumulative reward (penalty included)
        episode_reward += reward

        if done:
            break

    past_rewards.append(episode_reward)
    # t + 1 is the number of steps executed; the average covers at most the
    # 100 most recent episodes
    print('Episode %d: %d steps | Avg rewards in recent 100 ep: %f' %
                  (episode, t + 1, sum(past_rewards[-100:])/len(past_rewards[-100:])))

    # this was the recorded episode: save its frames to disk and stop training
    if is_satisfied:
        np.save('gym_imgs.npy', np.array(imgs))
        break

    # success criterion: at least 100 episodes played and the last 100 rewards
    # sum to >= 19500 (a mean of >= 195); the next episode is then recorded
    if len(past_rewards) >= 100 and sum(past_rewards[-100:]) >= 19500:
        print('Success in episode %d' % episode)
        is_satisfied = True

Episode 0: 12 steps | Avg rewards in recent 100 ep: -189.000000
Episode 1: 16 steps | Avg rewards in recent 100 ep: -187.000000
Episode 2: 65 steps | Avg rewards in recent 100 ep: -170.000000
Episode 3: 21 steps | Avg rewards in recent 100 ep: -172.500000
Episode 4: 16 steps | Avg rewards in recent 100 ep: -175.000000
Episode 5: 16 steps | Avg rewards in recent 100 ep: -176.666667
Episode 6: 12 steps | Avg rewards in recent 100 ep: -178.428571
Episode 7: 43 steps | Avg rewards in recent 100 ep: -175.875000
Episode 8: 28 steps | Avg rewards in recent 100 ep: -175.555556
Episode 9: 26 steps | Avg rewards in recent 100 ep: -175.500000
Episode 10: 21 steps | Avg rewards in recent 100 ep: -175.909091
Episode 11: 25 steps | Avg rewards in recent 100 ep: -175.916667
Episode 12: 63 steps | Avg rewards in recent 100 ep: -173.000000
Episode 13: 29 steps | Avg rewards in recent 100 ep: -172.928571
Episode 14: 92 steps | Avg rewards in recent 100 ep: -168.666667
Episode 15: 84 steps | Avg rewards 

Episode 127: 200 steps | Avg rewards in recent 100 ep: -5.020000
Episode 128: 200 steps | Avg rewards in recent 100 ep: -2.820000
Episode 129: 54 steps | Avg rewards in recent 100 ep: -2.450000
Episode 130: 200 steps | Avg rewards in recent 100 ep: -2.450000
Episode 131: 200 steps | Avg rewards in recent 100 ep: 0.090000
Episode 132: 200 steps | Avg rewards in recent 100 ep: 2.500000
Episode 133: 196 steps | Avg rewards in recent 100 ep: 4.640000
Episode 134: 200 steps | Avg rewards in recent 100 ep: 4.640000
Episode 135: 173 steps | Avg rewards in recent 100 ep: 2.360000
Episode 136: 200 steps | Avg rewards in recent 100 ep: 5.910000
Episode 137: 200 steps | Avg rewards in recent 100 ep: 9.060000
Episode 138: 200 steps | Avg rewards in recent 100 ep: 11.920000
Episode 139: 40 steps | Avg rewards in recent 100 ep: 11.970000
Episode 140: 200 steps | Avg rewards in recent 100 ep: 14.590000
Episode 141: 200 steps | Avg rewards in recent 100 ep: 18.020000
Episode 142: 200 steps | Avg rewar

Episode 255: 200 steps | Avg rewards in recent 100 ep: 161.760000
Episode 256: 200 steps | Avg rewards in recent 100 ep: 161.760000
Episode 257: 200 steps | Avg rewards in recent 100 ep: 161.760000
Episode 258: 200 steps | Avg rewards in recent 100 ep: 161.760000
Episode 259: 160 steps | Avg rewards in recent 100 ep: 159.350000
Episode 260: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 261: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 262: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 263: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 264: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 265: 200 steps | Avg rewards in recent 100 ep: 159.350000
Episode 266: 136 steps | Avg rewards in recent 100 ep: 156.700000
Episode 267: 200 steps | Avg rewards in recent 100 ep: 156.700000
Episode 268: 155 steps | Avg rewards in recent 100 ep: 154.240000
Episode 269: 200 steps | Avg rewards in recent 100 ep: 154.240000
Episode 27

Episode 381: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 382: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 383: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 384: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 385: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 386: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 387: 200 steps | Avg rewards in recent 100 ep: 187.150000
Episode 388: 200 steps | Avg rewards in recent 100 ep: 189.750000
Episode 389: 200 steps | Avg rewards in recent 100 ep: 189.750000
Episode 390: 200 steps | Avg rewards in recent 100 ep: 189.750000
Episode 391: 200 steps | Avg rewards in recent 100 ep: 189.750000
Episode 392: 200 steps | Avg rewards in recent 100 ep: 192.340000
Episode 393: 200 steps | Avg rewards in recent 100 ep: 192.340000
Episode 394: 200 steps | Avg rewards in recent 100 ep: 192.340000
Episode 395: 200 steps | Avg rewards in recent 100 ep: 192.340000
Episode 39

Episode 506: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 507: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 508: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 509: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 510: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 511: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 512: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 513: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 514: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 515: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 516: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 517: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 518: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 519: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 520: 200 steps | Avg rewards in recent 100 ep: 122.550000
Episode 52