In [27]:
import numpy as np
import gym
import random

In [28]:
env = gym.make("FrozenLake-v0")

In [29]:
action_size = env.action_space.n
state_size = env.observation_space.n

Qtable = np.zeros((state_size, action_size))
print(Qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [30]:
total_episodes = 15000        # Total episodes
learning_rate = 0.8           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

In [31]:
# Get list of rewards
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        exper_expl_tradeoff = random.uniform(0,1)
        
        if exper_expl_tradeoff > epsilon: # exploitation
            action = np.argmax(Qtable[state, :])
        
        else: # experimentation
            action = env.action_space.sample()
        
        # # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)
        
        # Update the Qtable using the Bellman equation
        Qtable[state, action] = Qtable[state, action] + learning_rate*(reward + gamma*np.max(Qtable[new_state, :]) - Qtable[state, action])
        
        total_rewards += reward
        
        state = new_state
        
        if done: # finish episode
            break
    
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

In [32]:
print ("Score over time: " +  str(sum(rewards)/total_episodes))
print(Qtable)

Score over time: 0.48
[[7.38949135e-02 6.59328570e-02 5.01419789e-02 6.48885002e-02]
 [2.05556931e-02 7.73264006e-03 1.06875461e-02 5.46148834e-02]
 [1.25857188e-02 3.97285736e-02 3.92224035e-02 3.94954708e-02]
 [3.50109561e-03 1.26131174e-03 9.93401873e-04 3.90705522e-02]
 [8.90026213e-02 3.06119416e-02 1.33319708e-02 2.21164211e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.11617887e-02 4.02121429e-07 1.67105564e-07 1.10772054e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.67518166e-02 1.06637905e-02 2.78479317e-02 1.05053652e-01]
 [4.36460160e-03 5.26648326e-02 2.87836758e-03 1.61865135e-03]
 [1.12137196e-02 8.16571858e-03 1.10119375e-03 5.37978985e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.28510496e-01 1.45286391e-03 4.31275757e-01 1.77335977e-02]
 [1.18461954e-01 2.23323214e-01 1.31249051e-01 6.00162260e-01]
 [0.00000000e+00 0.00000000e+00 0

# Using the Q-table to play FrozenLake

In [38]:
env.reset()

for episode in range(10):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(Qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
#         env.render()
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 23
****************************************************
EPISODE  1
****************************************************
EPISODE  2
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 12
****************************************************
EPISODE  3
****************************************************
EPISODE  4
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 20
****************************************************
EPISODE  5
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 54
****************************************************
EPISODE  6
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 42
****************************************************
EPISODE  7
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 17
****************************************************
EPISODE  8
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 9
**********************************

[Citation](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb)