In [1]:
import gym
import numpy as np

In [2]:
# Baseline: play a single episode with uniformly random actions, rendering the
# board after every step and accumulating the reward returned by each step.
env = gym.make('Taxi-v2')
state = env.reset()
env.render()

total_reward, done = 0, False
while True:
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    total_reward = total_reward + reward
    env.render()
    if done:
        # The environment signalled the end of the episode.
        break

print('Total reward:', total_reward)

+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (Pickup)
+---------+
|[35mR[0m: | : :G|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (Dropoff)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
+---------+
|[35mR[0m: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
+---------+
|[35mR[0m: 

# Beat Random

In [3]:
# Show the environment's action and observation spaces; their sizes determine
# the dimensions of the Q-table built below.
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Discrete(6)
Observation Space: Discrete(500)


In [4]:
def init_Q_table(n_states, n_actions):
    """Create an all-zero action-value table.

    Parameters
    ----------
    n_states : int
        Number of rows (one per state).
    n_actions : int
        Number of columns (one per action).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_states, n_actions)`` filled with zeros.
    """
    return np.zeros(shape=(n_states, n_actions))

In [5]:
def epsilon_greedy_selection(state, Q_table, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy rule.

    A uniform random number is drawn first; if it falls below ``epsilon`` a
    random column index of ``Q_table`` is returned (exploration). Otherwise the
    index of the largest value in row ``state`` is returned (exploitation);
    ``np.argmax`` resolves ties in favour of the lowest index.

    Parameters
    ----------
    state : int
        Row index into ``Q_table``.
    Q_table : numpy.ndarray
        2-D table indexed as ``[state, action]``.
    epsilon : float
        Exploration probability; ``0`` always exploits.

    Returns
    -------
    int
        Index of the selected action.
    """
    explore = np.random.random() < epsilon
    if explore:
        n_actions = Q_table.shape[1]
        return np.random.choice(n_actions)

    return np.argmax(Q_table[state])

In [6]:
def update(Q_table, reward, last_state, last_action, state, alpha, gamma):
    """Apply one Q-learning update to ``Q_table`` in place.

    The entry for ``(last_state, last_action)`` is moved a fraction ``alpha``
    of the way towards ``reward + gamma * max(Q_table[state])``.

    Parameters
    ----------
    Q_table : numpy.ndarray
        2-D table indexed as ``[state, action]``; modified in place.
    reward : float
        Reward received for the transition.
    last_state, last_action : int
        State/action pair whose value is being updated.
    state : int
        State reached after the transition.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.

    Returns
    -------
    numpy.ndarray
        The same ``Q_table`` object that was passed in.
    """
    old_value = Q_table[last_state, last_action]
    best_next_value = Q_table[state].max()
    td_error = reward + gamma*best_next_value - old_value
    Q_table[last_state, last_action] = old_value + alpha * td_error

    return Q_table

In [10]:
def train(Q_table, gamma, alpha,
          n_training_episodes=20000,
          log_every=2000,
          n_max_steps=1000,
          linear_epsilon_decay=0.9):
    """Run epsilon-greedy Q-learning on the notebook-level ``env``.

    Exploration starts at ``epsilon = 1`` and is lowered by
    ``linear_epsilon_decay / n_training_episodes`` after every episode, so it
    ends close to ``1 - linear_epsilon_decay``.

    Note that ``update`` (as defined in this notebook) writes into the array it
    is given, so the table passed in by the caller is modified.

    Parameters
    ----------
    Q_table : numpy.ndarray
        Table indexed as ``[state, action]`` to continue learning from.
    gamma : float
        Discount factor forwarded to ``update``.
    alpha : float
        Learning rate forwarded to ``update``.
    n_training_episodes : int
        Number of episodes to simulate.
    log_every : int
        A progress line is printed whenever the episode index is a multiple
        of this value.
    n_max_steps : int
        Upper bound on the number of steps taken in one episode.
    linear_epsilon_decay : float
        Total amount by which epsilon is reduced over the whole run.

    Returns
    -------
    numpy.ndarray
        The table returned by the last ``update`` call (or the input table if
        no step was taken).
    """
    exploration_rate = 1
    for episode_idx in range(n_training_episodes):

        should_log = episode_idx % log_every == 0
        if should_log:
            print('{} episodes simulated'.format(episode_idx))

        current_state = env.reset()
        for _ in range(n_max_steps):
            chosen_action = epsilon_greedy_selection(current_state, Q_table, exploration_rate)
            next_state, reward, done, _info = env.step(chosen_action)
            Q_table = update(Q_table, reward, current_state, chosen_action, next_state, alpha, gamma)
            current_state = next_state

            if done:
                break

        # Linear decay of the exploration rate, applied once per episode.
        exploration_rate -= linear_epsilon_decay / n_training_episodes

    print("Done.")
    return Q_table

In [11]:
# Table dimensions: 500 states x 6 actions, matching the Discrete(500) /
# Discrete(6) spaces reported by the environment.
n_states, n_actions = 500, 6

# Discount factor and learning rate for the first training run.
gamma, alpha = 0.9, 0.05

Q_table = init_Q_table(n_states, n_actions)

In [12]:
# Run Q-learning with train()'s default schedule (20000 episodes per its signature).
Q_table = train(Q_table, gamma, alpha)

0 episodes simulated
2000 episodes simulated
4000 episodes simulated
6000 episodes simulated
8000 episodes simulated
10000 episodes simulated
12000 episodes simulated
14000 episodes simulated
16000 episodes simulated
18000 episodes simulated
Done.


In [13]:
def eval_policy(policy, *policy_args, episodes=1000, max_steps=99, environment=None):
    """Estimate a policy's average total reward per episode.

    Parameters
    ----------
    policy : callable
        Called as ``policy(state, *policy_args)``; must return an action.
    *policy_args
        Extra positional arguments forwarded to ``policy`` on every call.
    episodes : int, keyword-only
        Number of evaluation episodes. Defaults to 1000.
    max_steps : int, keyword-only
        Maximum number of steps taken in one episode. Defaults to 99.
    environment : object, keyword-only, optional
        Object providing ``reset()`` and ``step(action)``. When omitted, the
        notebook-level ``env`` is used.

    Returns
    -------
    float
        Mean of the per-episode reward sums. The value is also printed.

    Raises
    ------
    ValueError
        If ``episodes`` is not positive (the mean would be undefined).
    """
    if episodes <= 0:
        raise ValueError('episodes must be a positive integer')

    # Fall back to the notebook-level environment so existing calls keep working.
    eval_env = env if environment is None else environment

    rewards = []
    for _ in range(episodes):
        state = eval_env.reset()
        total_rewards = 0

        for _ in range(max_steps):
            action = policy(state, *policy_args)
            state, reward, done, _info = eval_env.step(action)
            total_rewards += reward
            if done:
                break
        rewards.append(total_rewards)

    ave_score = sum(rewards) / episodes
    print('Average score over time:', ave_score)

    return ave_score

In [14]:
# epsilon = 0 disables the random branch, so this scores the purely greedy policy.
eval_policy(epsilon_greedy_selection, Q_table, 0)

Average score over time: 8.453


8.453

# Hyperparameter Tuning (Beat the 8.453 Baseline)

In [29]:
# Table dimensions (states x actions).
n_states, n_actions = 500, 6

# Candidate discount factors and learning rates for the grid search.
gammas = [0.75, 0.775, 0.8, 0.825, 0.85]
alphas = [0.05, 0.075, 0.1, 0.125, 0.15]


In [31]:
# Exhaustive search over every (gamma, alpha) pair: train a fresh table for
# 30000 episodes, score its greedy policy, and remember the best result.
best_score, best_hyperparams, best_Q = -np.inf, None, None

for gamma in gammas:
    for alpha in alphas:
        print('gamma: {}, alpha: {}'.format(gamma, alpha))
        Q_table = train(init_Q_table(n_states, n_actions), gamma, alpha,
                        n_training_episodes=30000, log_every=15000)

        score = eval_policy(epsilon_greedy_selection, Q_table, 0)

        improved = score > best_score
        if improved:
            # Keep a copy so later training of Q_table cannot alter the winner.
            best_score, best_hyperparams, best_Q = score, [gamma, alpha], np.copy(Q_table)
        print('-'*30)

gamma: 0.75, alpha: 0.05
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.483
------------------------------
gamma: 0.75, alpha: 0.075
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.48
------------------------------
gamma: 0.75, alpha: 0.1
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.423
------------------------------
gamma: 0.75, alpha: 0.125
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.47
------------------------------
gamma: 0.75, alpha: 0.15
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.525
------------------------------
gamma: 0.775, alpha: 0.05
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.561
------------------------------
gamma: 0.775, alpha: 0.075
0 episodes simulated
15000 episodes simulated
Done.
Average score over time: 8.401
------------------------------
gamma: 0.775, alpha: 0.1


In [33]:
# Report the [gamma, alpha] pair stored for the highest-scoring grid-search run.
print('Best gamma: {}, Best alpha: {}'.format(*best_hyperparams))

Best gamma: 0.825, Best alpha: 0.15


In [79]:
# Train for an additional 20000 episodes (train()'s default), starting from a
# copy of the best table. train() updates the array it receives in place, so
# passing `best_Q` directly would overwrite the grid-search winner and make
# re-running this cell give a different starting point each time.
Q_table = train(np.copy(best_Q), *best_hyperparams)
# Evaluate the greedy policy (epsilon = 0).
eval_policy(epsilon_greedy_selection, Q_table, 0)

Average score over time: 8.539


8.539

While this hyperparameter configuration occasionally achieves a high average score, the results are unstable.

I will instead try `gamma=0.775`, `alpha=0.05`, which also scored well in the grid search above (8.561).

In [83]:
# Retrain from scratch with the alternative hyperparameters. The freshly
# initialised table is what gets trained; passing `best_Q` here instead would
# leave the new table unused and continue from values learned under a
# different (gamma, alpha) pair.
Q_table = init_Q_table(n_states, n_actions)
Q_table = train(Q_table, 0.775, 0.05, n_training_episodes=30000, log_every=10000)
# Evaluate the greedy policy (epsilon = 0).
eval_policy(epsilon_greedy_selection, Q_table, 0)

0 episodes simulated
10000 episodes simulated
20000 episodes simulated
Done.
Average score over time: 8.647


8.647

In [92]:
# Repeat the 1000-episode greedy evaluation ten times and average the results
# to reduce the noise of a single evaluation run.
scores = [eval_policy(epsilon_greedy_selection, Q_table, 0) for _ in range(10)]
ave = np.mean(scores)
print('Average score over 10000 episodes', round(ave, 3))

Average score over time: 8.421
Average score over time: 8.465
Average score over time: 8.532
Average score over time: 8.546
Average score over time: 8.526
Average score over time: 8.452
Average score over time: 8.595
Average score over time: 8.417
Average score over time: 8.447
Average score over time: 8.462
Average score over 10000 episodes 8.486
