In [1]:
import gym

In [28]:
env = gym.make("Taxi-v2").env

In [29]:
# Example starting state
env.s = env.encode(3, 1, 2, 0) # 328
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [4]:
from IPython.display import clear_output
from time import sleep
        
def print_agent_run_starting_at_state(env, agent, state, sleep_val = 0.01):
    env.s = state  # set environment to illustration's state
    epochs = 0
    penalties, reward = 0, 0
    frames = [] # for animation
    done = False
    while not done:
        state, reward, done, info = agent.act(env, state)

        if reward == -10:
            penalties += 1

        # Put each rendered frame into dict for animation
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'reward': reward
            }
        )

        epochs += 1

    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print("Timestep:" + str(i + 1))
        print("State: " + str(frame['state']))
        print("Reward: " + str(frame['reward']))
        sleep(sleep_val)

    print("Timesteps taken: {}".format(epochs))
    print("Penalties incurred: {}".format(penalties))
    

In [30]:
# Define random agent that chooses a random action at every step
class RandomAgent:
    def act(self, env, state):
        action = env.action_space.sample()
        return env.step(action)
    
# Animation showing behavior of RandomAgent. Warning: May take several thousand timesteps to complete!
print_agent_run_starting_at_state(env, RandomAgent(), 328)

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+
  (North)

Timestep:619
State: 324
Reward: -1


KeyboardInterrupt: 

In [7]:
import numpy as np
import random
from IPython.display import clear_output

# Basic Q Learning algorithm, as described here:
# https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

def q_learning_train(env, alpha = 0.1, gamma = 0.6, epsilon = 0.1, epochs = 30000):
    # Q Table is initialized to all zeroes
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    # Training using Q-Learning
    for i in range(1, epochs + 1):
        state = env.reset()

        epochs, penalties, reward, = 0, 0, 0
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(q_table[state]) # Exploit learned values

            next_state, reward, done, info = env.step(action) 

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            # Q learning equation
            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            if reward == -10:
                penalties += 1

            state = next_state
            epochs += 1

        if i % 100 == 0:
            clear_output(wait=True)
            print("Episode: " + str(i))

    print("Training finished.\n")
    return q_table



In [8]:
def evaluate_agent(env, agent, episodes = 100):
    total_epochs, total_penalties, total_reward = 0, 0, 0

    for _ in range(episodes):
        state = env.reset()
        epochs, penalties, reward = 0, 0, 0

        done = False
        reward = 0

        while not done:
            state, reward, done, info = agent.act(env, state)

            if reward == -10 or (done and reward == 0): # TODO: don't hard-code penalty calculation but put it on env object
                penalties += 1

            epochs += 1
            if epochs > 10000:
                penalties += 10000
                break

        total_penalties += penalties
        total_epochs += epochs
        total_reward += reward

    print("Results after " + str(episodes) + " episodes")
    print("Average reward per episode: " + str(total_reward / episodes))
    print("Average timesteps per episode: " + str(total_epochs / episodes))
    print("Average penalties per episode: " + str(total_penalties / episodes))


In [9]:
%%time
"""Training the agent"""
taxi_q_table = q_learning_train(env)

Episode: 30000
Training finished.

CPU times: user 16.1 s, sys: 0 ns, total: 16.1 s
Wall time: 15.2 s


In [31]:
# Define a new agent that uses trained Q Table for all decisions
class QAgent:
    def __init__(self, q_table):
        self.q_table = q_table
    
    def act(self, env, state):
        action = np.argmax(self.q_table[state])
        return env.step(action)

# Show animation with behavior of this agent
print_agent_run_starting_at_state(env, QAgent(taxi_q_table), 328, sleep_val=0.25)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep:10
State: 0
Reward: 20
Timesteps taken: 10
Penalties incurred: 0


In [11]:
print("Evaluation for RandomAgent")
evaluate_agent(env, RandomAgent(), 200)

print("\nEvaluation for QAgent")
evaluate_agent(env, QAgent(taxi_q_table), 200)

Evaluation for RandomAgent
Results after 200 episodes
Average reward per episode: 20.0
Average timesteps per episode: 2144.225
Average penalties per episode: 694.63

Evaluation for QAgent
Results after 200 episodes
Average reward per episode: 20.0
Average timesteps per episode: 12.185
Average penalties per episode: 0.0


In [32]:
# Show 10 random episodes
for _ in range(10):
    print_agent_run_starting_at_state(env, QAgent(taxi_q_table), env.reset(), sleep_val=0.1)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep:12
State: 410
Reward: 20
Timesteps taken: 12
Penalties incurred: 0


In [13]:
env = gym.make("FrozenLake8x8-v0").env
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [25]:
print_agent_run_starting_at_state(env, RandomAgent(), 0)

  (Up)
SFFFFFFF
FFFFFFFF
FFF[41mH[0mFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

Timestep:31
State: 19
Reward: 0.0
Timesteps taken: 31
Penalties incurred: 0


In [15]:
# Adopting Q Learning for the different environment, that only has 1 reward at very end
def q_learning_train2(env, alpha = 0.1, gamma = 0.6, epsilon = 0.1, epochs = 30000):
    # Q Table is initialized to all zeroes
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    # Training using Q-Learning
    for i in range(1, epochs + 1):
        state = env.reset()

        epochs, reward, = 0, 0
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(q_table[state]) # Exploit learned values

            next_state, reward, done, info = env.step(action) 
            
            # This is the only difference from previous algorithm; we need a way to code negative reward for holes
            # If we had control of the env, we wouldn't need this hack.
            if done and reward == 0:
                reward = -1

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            # Q learning equation
            new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            q_table[state, action] = new_value

            state = next_state
            epochs += 1

        if i % 100 == 0:
            clear_output(wait=True)
            print("Episode: " + str(i))

    print("Training finished.\n")
    return q_table

In [16]:
q_table2 = q_learning_train2(env, gamma=0.9, epochs = 5000)

Episode: 5000
Training finished.



In [27]:
# Show animation with behavior of this agent
print_agent_run_starting_at_state(env, QAgent(q_table2), 0, sleep_val=0.05)

  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG

Timestep:262
State: 5
Reward: 0.0


KeyboardInterrupt: 

In [19]:
# Here the "average reward" is going to be a percentage of how many successful
# and "average penalties" will be how many failed.
print("Evaluation for RandomAgent")
evaluate_agent(env, RandomAgent(), 200)

print("\nEvaluation for QAgent")
evaluate_agent(env, QAgent(q_table2), 200)

Evaluation for RandomAgent
Results after 200 episodes
Average reward per episode: 0.0
Average timesteps per episode: 30.68
Average penalties per episode: 1.0

Evaluation for QAgent
Results after 200 episodes
Average reward per episode: 0.875
Average timesteps per episode: 103.4
Average penalties per episode: 0.125


In [22]:
frozen_q_table_5k = q_learning_train2(env, gamma=0.6, epochs = 5000)
frozen_q_table_10k = q_learning_train2(env, gamma=0.6, epochs = 10000)
frozen_q_table_20k = q_learning_train2(env, gamma=0.6, epochs = 20000)

Episode: 20000
Training finished.



In [21]:
print("\nEvaluation for QAgent (5k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_5k), 200)

print("\nEvaluation for QAgent (10k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_10k), 200)

print("\nEvaluation for QAgent (20k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_20k), 200)


Evaluation for QAgent (5k epoch)
Results after 200 episodes
Average reward per episode: 0.915
Average timesteps per episode: 88.51
Average penalties per episode: 0.085

Evaluation for QAgent (10k epoch)
Results after 200 episodes
Average reward per episode: 0.835
Average timesteps per episode: 125.93
Average penalties per episode: 0.165

Evaluation for QAgent (20k epoch)
Results after 200 episodes
Average reward per episode: 0.8
Average timesteps per episode: 79.635
Average penalties per episode: 0.2


In [23]:
print("\nEvaluation for QAgent (5k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_5k), 200)

print("\nEvaluation for QAgent (10k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_10k), 200)

print("\nEvaluation for QAgent (20k epoch)")
evaluate_agent(env, QAgent(frozen_q_table_20k), 200)


Evaluation for QAgent (5k epoch)
Results after 200 episodes
Average reward per episode: 0.61
Average timesteps per episode: 107.645
Average penalties per episode: 0.39

Evaluation for QAgent (10k epoch)
Results after 200 episodes
Average reward per episode: 0.84
Average timesteps per episode: 119.985
Average penalties per episode: 0.16

Evaluation for QAgent (20k epoch)
Results after 200 episodes
Average reward per episode: 0.655
Average timesteps per episode: 90.66
Average penalties per episode: 0.345
