In [8]:
# Reference : https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb

In [9]:
import numpy as np
import gym
import random

In [10]:
# Instantiate the FrozenLake environment through gym's registry.
# `env` is the single environment object used by every later cell
# (Q-table sizing, training, and the final play-through).
env = gym.make("FrozenLake-v0")

In [11]:
# Build the Q-table: one row per state, one column per action, all zeros.

# Both spaces expose their number of discrete values through `.n`.
state_size, action_size = env.observation_space.n, env.action_space.n

qtable = np.zeros((state_size, action_size), dtype=float)
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [12]:
# Hyperparameters

# --- Training schedule ---
total_episodes = 15000   # number of training episodes
max_steps = 99           # cap on the steps taken within one episode

# --- Q-learning update ---
learning_rate = 0.8      # step size applied to each Q-value correction
gamma = 0.95             # discount factor for future rewards

# --- Epsilon-greedy exploration ---
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor for the exploration probability
decay_rate = 0.005       # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration rate; begins at its maximum

### Q-Learning Algorithm

In [17]:
# Total reward collected in each training episode
rewards = []

# Train for a fixed number of episodes
for episode in range(total_episodes):
    # Every episode starts from a freshly reset environment
    state = env.reset()
    step, done, total_rewards = 0, False, 0

    for step in range(max_steps):
        # Epsilon-greedy action selection: draw a number between 0 and 1
        # and compare it with the current exploration rate.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff <= epsilon:
            # Explore: take a random action
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the largest Q-value in this state
            action = np.argmax(qtable[state, :])

        # Apply the action and observe the next state (s') and the reward (r)
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s,a) := Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        # qtable[new_state, :] holds the values of every action available in s'.
        best_next_value = np.max(qtable[new_state, :])
        td_error = reward + gamma * best_next_value - qtable[state, action]
        qtable[state, action] = qtable[state, action] + learning_rate * td_error

        total_rewards += reward

        # Continue from the state we just landed in
        state = new_state

        # The environment reported the end of the episode
        if done:
            break

    rewards.append(total_rewards)

    # Shrink epsilon exponentially with the episode index so the agent
    # explores less and less, never dropping below min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [21]:
# Mean reward per training episode
average_reward = sum(rewards) / total_episodes
print("Score over time: " + str(average_reward))

Score over time: 0.4786666666666667


### Use our Q-table to play Frozen Lake

- after training (15,000 episodes with the hyperparameters above), the Q-table can be used to test what the agent has learned
- running the cell below shows our agent playing Frozen Lake

In [26]:
# Replay a few episodes acting greedily on the learned Q-table.
# qtable is only read here: there is no exploration and no learning update.
for episode in range(5):
    # Each episode starts from a freshly reset environment
    state = env.reset()
    print("***********************************************")
    print("EPISODE : ", episode)

    for step in range(max_steps):
        # Greedy action: the one with the highest Q-value for the current state
        action = np.argmax(qtable[state, :])

        new_state, reward, done, _ = env.step(action)

        if done:
            # Render only the final frame so we can see where the agent ended up
            env.render()
            # `step` is a 0-based loop index, so the number of actions
            # actually taken is step + 1.
            print("Number of steps", step + 1)
            break
        state = new_state
    else:
        # The step budget ran out before the environment reported `done`;
        # say so instead of ending the episode silently.
        print("Episode not finished after", max_steps, "steps")

env.close()

***********************************************
EPISODE :  0
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
Number of steps 77
***********************************************
EPISODE :  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 12
***********************************************
EPISODE :  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 30
***********************************************
EPISODE :  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 82
***********************************************
EPISODE :  4
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 13
