In [9]:
import gym

import numpy as np

In [2]:
# Build the FrozenLake environment via gym's registry. The cells below all
# act on this one `env` object.
env =  gym.make('FrozenLake-v0')

In [3]:
# Put the environment at the start of an episode, then draw the current grid.
# The return value of reset() is intentionally discarded here; this cell is
# only a visual check of the map.
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [6]:
# Report the action and observation spaces. The output below shows
# Discrete(4) and Discrete(16), which lines up with the default table shape
# used by init_Q_table (16 states x 4 actions).
print("Action Space:", env.action_space)
print("Observation Space:", env.observation_space)

Action Space: Discrete(4)
Observation Space: Discrete(16)


In [10]:
def init_Q_table(states=16, actions=4):
    """Create an all-zero action-value table.

    Parameters
    ----------
    states : int, optional
        Number of rows, one per state. Defaults to 16.
    actions : int, optional
        Number of columns, one per action. Defaults to 4.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(states, actions)`` filled with zeros.
    """
    return np.zeros(shape=(states, actions))

In [21]:
def epsilon_greedy_selection(state, epsilon, q_table=None):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a column index is drawn uniformly at random;
    otherwise the index of the largest value in the table row for ``state`` is
    returned (``np.argmax`` resolves ties to the first maximal index).

    Parameters
    ----------
    state : int
        Row index into the table.
    epsilon : float
        Probability of taking a random action. ``0`` is purely greedy, ``1``
        is purely random.
    q_table : numpy.ndarray, optional
        Table of shape ``(n_states, n_actions)`` to act on. When omitted, the
        notebook-level ``Q_table`` is used, so existing two-argument calls
        behave as before.

    Returns
    -------
    int
        Index of the selected action (a NumPy integer).
    """
    if q_table is None:
        # Backward-compatible fallback: older call sites rely on the global
        # table created in the training cell.
        q_table = Q_table

    # The uniform draw always happens first so the sequence of RNG calls is
    # the same as before for a given seed.
    if np.random.random() < epsilon:
        return np.random.choice(q_table.shape[1])

    return np.argmax(q_table[state])

In [12]:
def update(Q_table, reward, last_state, last_action, state, alpha, gamma):
    """Apply one Q-learning update to a single table entry, in place.

    The entry for ``(last_state, last_action)`` is moved a fraction ``alpha``
    of the way toward ``reward + gamma * max(Q_table[state])``.

    Parameters
    ----------
    Q_table : numpy.ndarray
        Action-value table indexed as ``Q_table[state][action]``; modified
        in place.
    reward : float
        Reward received for the transition.
    last_state : int
        State the action was taken from.
    last_action : int
        Action that was taken.
    state : int
        State reached after the action.
    alpha : float
        Step size of the update.
    gamma : float
        Discount applied to the best value available in ``state``.

    Returns
    -------
    numpy.ndarray
        The same ``Q_table`` object that was passed in.
    """
    old_estimate = Q_table[last_state][last_action]
    best_next = Q_table[state].max()

    # Bootstrapped target: immediate reward plus discounted best follow-up value.
    td_target = reward + gamma*best_next
    Q_table[last_state][last_action] = old_estimate + alpha * (td_target - old_estimate)

    return Q_table

In [40]:
# Hyperparameters for the training run.
n_training_episodes = 20000  # number of episodes to simulate
log_every = 5000             # print a progress line every this many episodes
n_max_steps = 200            # cap on the number of steps in a single episode
alpha = 0.05                 # step size handed to update()
gamma = 0.9                  # discount factor handed to update()

# Exploration rate: starts at 1 (every action is random) and is lowered by
# 0.3/n_training_episodes after each episode, i.e. by 0.3 over the whole run.
epsilon = 1

# Start from a fresh all-zero table so re-running this cell retrains from scratch.
Q_table = init_Q_table()

for episode in range(n_training_episodes):

    if not episode % log_every:
        print('{} episodes simulated'.format(episode))

    state = env.reset()
    for step in range(n_max_steps):
        # Keep the pre-action state: update() needs both ends of the transition.
        last_state = state
        action = epsilon_greedy_selection(last_state, epsilon)
        state, reward, done, info = env.step(action)
        Q_table = update(Q_table, reward, last_state, action, state, alpha, gamma)

        # The environment reported the end of the episode.
        if done:
            break

    epsilon -= 0.3/(n_training_episodes)

print("Done.")

0 episodes simulated
5000 episodes simulated
10000 episodes simulated
15000 episodes simulated
Done.


In [41]:
# Roll out a single episode with the learned table, rendering every step.
state = env.reset()
for step in range(n_max_steps):
    # epsilon = 0 disables exploration: `np.random.random() < 0` never holds,
    # so the action is always the argmax of the current state's row.
    action = epsilon_greedy_selection(state, 0)
    state, reward, done, info = env.step(action)
    
    # Draw the grid after the move (one frame per step in the output below).
    env.render()
    if done:
        break

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
F