In [1]:
import numpy as np
from numpy import random as rd
import matplotlib.pyplot as plt
import operator
import abc
import RLModule as rl

### Set Bob, the agent

In [2]:
# Starting cell shared by every experiment in this notebook.
init_state = (0, 0)

# Create the agent and place it on the starting cell.
Bob = rl.Agent()
Bob.state = init_state

# Prints the agent's current state, its state history and its total reward.
Bob.summary()

Current state : (0, 0)
State history : [(0, 0)]
Total reward  : 0


### Basic Gridworld

In [3]:
# 10 x 10 basic gridworld whose terminal state is the corner (n_h - 1, n_v - 1).
n_h = n_v = 10
terminal_state = (n_h - 1, n_v - 1)
Env = rl.BasicGridworld(n_h, n_v, terminal_state=terminal_state)

In [4]:
# Action-selection policy settings: epsilon-greedy with epsilon = 0.1.
policy_params = dict(policy_name='EpsilonGreedy', epsilon=0.1)

# Agent settings: the agent's state is set to the shared start state.
agent_params = dict(state=init_state)

# Full configuration handed to both learning algorithms.
params = dict(
    policy_params=policy_params,
    agent_params=agent_params,
    learning_rate=0.5,
    discount_factor=1,
)

# Build the Sarsa learner first, then the Q-learning learner, with identical
# arguments so the two algorithms can be compared on the same environment.
# NOTE(review): both learners receive the same Agent instance (Bob) and the
# same `params` dict -- confirm that RLModule does not let one learner's run
# leak into the other through that shared state.
Sarsa_RL, Q_RL = (
    Learner(Agent=Bob, Environment=Env, params=params)
    for Learner in (rl.Sarsa, rl.QLearning)
)

In [5]:
# Train both learners on the basic gridworld for N_EPOCHS iterations each,
# always starting from init_state.
#
# The epsilon-greedy policy is stochastic, so without a fixed seed every run of
# this cell can learn a different path. Seeding NumPy's global RNG right before
# training makes this experiment repeatable on its own, regardless of which
# cells were executed earlier.
# NOTE(review): assumes RLModule draws its random numbers from numpy.random's
# global state -- TODO confirm; if it uses the stdlib `random` module instead,
# that generator has to be seeded as well.
RANDOM_SEED = 0
N_EPOCHS = 1000

np.random.seed(RANDOM_SEED)
Sarsa_RL.learn(N_EPOCHS, init_state=init_state)
Q_RL.learn(N_EPOCHS, init_state=init_state)

In [6]:
# For each trained learner (Sarsa first, then Q-learning), compute the path it
# follows from the start state and print it on the grid with its total reward.
for learner in (Sarsa_RL, Q_RL):
    optimal_path, reward_history = learner.optimalPath(init_state)
    Env.printPath(optimal_path, reward_history)

[[  .   .   .   .   .   .   .   .   . -1.]
 [  .   .   .   .   .   .   .   . -1. -1.]
 [  .   .   .   .   .   .   .   . -1.   .]
 [  .   .   .   .   .   . -1. -1. -1.   .]
 [  .   . -1. -1. -1. -1. -1.   .   .   .]
 [  .   . -1.   .   .   .   .   .   .   .]
 [  .   . -1.   .   .   .   .   .   .   .]
 [  .   . -1.   .   .   .   .   .   .   .]
 [-1. -1. -1.   .   .   .   .   .   .   .]
 [ 0.   .   .   .   .   .   .   .   .   .]]
Total reward : -18
[[  .   .   .   .   .   .   .   .   . -1.]
 [  . -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [  . -1.   .   .   .   .   .   .   .   .]
 [-1. -1.   .   .   .   .   .   .   .   .]
 [-1.   .   .   .   .   .   .   .   .   .]
 [-1.   .   .   .   .   .   .   .   .   .]
 [-1.   .   .   .   .   .   .   .   .   .]
 [-1.   .   .   .   .   .   .   .   .   .]
 [-1.   .   .   .   .   .   .   .   .   .]
 [ 0.   .   .   .   .   .   .   .   .   .]]
Total reward : -18


### Windy Gridworld

In [16]:
# 10 x 10 windy gridworld whose terminal state is the corner (n_h - 1, n_v - 1).
n_h = n_v = 10
terminal_state = (n_h - 1, n_v - 1)
Env = rl.WindyGridworld(n_h, n_v, terminal_state=terminal_state)

In [17]:
# Configuration for the windy-gridworld learners: epsilon-greedy exploration
# (epsilon = 0.1), agent state set to init_state, learning rate 0.5 and no
# discounting (discount factor 1).
params = {
    'policy_params': {'policy_name': 'EpsilonGreedy', 'epsilon': 0.1},
    'agent_params': {'state': init_state},
    'learning_rate': 0.5,
    'discount_factor': 1,
}

# Keep the sub-dictionaries reachable under their own names; these are the
# very same objects stored inside `params`.
policy_params = params['policy_params']
agent_params = params['agent_params']

# One learner per algorithm, both bound to the same agent and environment.
# NOTE(review): Bob and `params` are shared by both learners -- confirm that
# RLModule does not carry state from one learner's run into the other.
Sarsa_RL = rl.Sarsa(Agent=Bob, Environment=Env, params=params)
Q_RL = rl.QLearning(Agent=Bob, Environment=Env, params=params)

In [18]:
# Train both learners on the windy gridworld for N_EPOCHS iterations each,
# always starting from init_state.
#
# Exploration is epsilon-greedy and therefore random; fixing the seed of
# NumPy's global RNG immediately before training makes the learned paths
# repeatable no matter which cells ran before this one.
# NOTE(review): assumes RLModule samples through numpy.random's global state
# -- TODO confirm; seed the stdlib `random` module too if it is used there.
RANDOM_SEED = 0
N_EPOCHS = 1000

np.random.seed(RANDOM_SEED)
Sarsa_RL.learn(N_EPOCHS, init_state=init_state)
Q_RL.learn(N_EPOCHS, init_state=init_state)

In [19]:
# Print the path found by each learner on the windy grid, Sarsa first and
# Q-learning second, together with the total reward collected along it.
for learner in (Sarsa_RL, Q_RL):
    optimal_path, reward_history = learner.optimalPath(init_state)
    Env.printPath(optimal_path, reward_history)

[[  .   .   .   .   .   .   .   .   . -1.]
 [  .   .   .   .   .   . -1. -1. -1. -1.]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   . -1. -1. -1. -1.   .   .   .   .]
 [ 0. -1. -1.   .   .   .   .   .   .   .]]
Total reward : -14
[[  .   .   .   .   .   . -1. -1. -1. -1.]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   . -1.   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [ 0. -1. -1. -1. -1. -1.   .   .   .   .]]
Total reward : -13


### Cliff Gridworld

In [24]:
# 10 x 10 cliff gridworld with its terminal state at (n_h - 1, 0).
n_h = n_v = 10
terminal_state = (n_h - 1, 0)

# NOTE(review): presumably the cell the agent is sent back to after stepping
# onto the cliff -- confirm against RLModule.CliffGridworld.
startover_state = (0, 0)

Env = rl.CliffGridworld(
    n_h,
    n_v,
    terminal_state=terminal_state,
    startover_state=startover_state,
)

# Prints the grid layout, with cliff cells shown as '*'.
Env.printCliff()

[['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '.' '.' '.' '.' '.' '.' '.' '.' '.']
 ['.' '*' '*' '*' '*' '*' '*' '*' '*' '.']]


In [25]:
# Epsilon-greedy action selection with a 0.1 exploration rate.
policy_params = dict(policy_name='EpsilonGreedy', epsilon=0.1)

# The agent's state entry points at the shared start state.
agent_params = dict(state=init_state)

# Settings passed unchanged to both learners on the cliff gridworld.
params = dict(
    policy_params=policy_params,
    agent_params=agent_params,
    learning_rate=0.5,
    discount_factor=1,
)

# Same agent, environment and settings for both algorithms so that their
# learned paths can be compared directly.
# NOTE(review): the Agent instance and the `params` dict are shared between
# the two learners -- confirm RLModule keeps their runs independent.
learner_kwargs = dict(Agent=Bob, Environment=Env, params=params)
Sarsa_RL = rl.Sarsa(**learner_kwargs)
Q_RL = rl.QLearning(**learner_kwargs)

In [26]:
# Train both learners on the cliff gridworld for N_EPOCHS iterations each,
# always starting from init_state.
#
# The comparison between Sarsa and Q-learning below is only meaningful if it
# can be reproduced, but epsilon-greedy exploration is random. Fix the seed of
# NumPy's global RNG right before training so that re-running this cell gives
# the same learned paths.
# NOTE(review): assumes RLModule relies on numpy.random's global state for its
# sampling -- TODO confirm; seed the stdlib `random` module too if needed.
RANDOM_SEED = 0
N_EPOCHS = 1000

np.random.seed(RANDOM_SEED)
Sarsa_RL.learn(N_EPOCHS, init_state=init_state)
Q_RL.learn(N_EPOCHS, init_state=init_state)

See how Sarsa follows a safer path that stays away from the cliff, whereas Q-learning finds the optimal (shortest) path, which is riskier because it runs right along the cliff edge.

In [27]:
# Print the path found by each learner on the cliff grid: Sarsa's first, then
# Q-learning's, each followed by the total reward collected along it.
for learner in (Sarsa_RL, Q_RL):
    optimal_path, reward_history = learner.optimalPath(init_state)
    Env.printPath(optimal_path, reward_history)

[[  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [-1. -1. -1. -1. -1. -1. -1. -1.   .   .]
 [-1.   .   .   .   .   .   . -1. -1. -1.]
 [-1.   .   .   .   .   .   .   .   . -1.]
 [-1.   .   .   .   .   .   .   .   . -1.]
 [ 0.   .   .   .   .   .   .   .   . -1.]]
Total reward : -17
[[  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [  .   .   .   .   .   .   .   .   .   .]
 [-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [ 0.   .   .   .   .   .   .   .   . -1.]]
Total reward : -11
