In [2]:
#www.deeplizard.com
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [3]:
# Build the FrozenLake environment used by every later cell (training and playback)
env = gym.make("FrozenLake-v0")

In [4]:
# Table dimensions come straight from the environment:
# one row per observable state, one column per available action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Start with every state-action value at zero; training fills the table in.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# --- Training budget ---
num_episodes = 10000         # number of episodes the training loop runs
max_steps_per_episode = 100  # upper bound on steps taken inside a single episode

# --- Q-value update ---
learning_rate = 0.1   # weight given to the new estimate versus the old Q-value
discount_rate = 0.99  # weight applied to the best Q-value of the next state

# --- Epsilon-greedy exploration schedule ---
# The exploration rate starts at the maximum and is decayed exponentially
# towards the minimum after each training episode.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [6]:
# Total reward collected in each training episode, in episode order
rewards_all_episodes = []

# Q-learning algorithm
# NOTE: q_table and exploration_rate are read here but not re-initialised, so
# re-running this cell on its own continues from whatever values they hold.
for episode in range(num_episodes):
    state = env.reset()  # Reset game

    done = False  # Flag that indicates the game has ended
    rewards_current_episode = 0  # Reward accumulated during this episode

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off: draw a number in [0, 1] and
        # exploit only when it exceeds the current exploration rate
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])  # Exploitation -> best known action
        else:
            action = env.action_space.sample()  # Exploration -> random action

        new_state, reward, done, info = env.step(action)  # Run action in the game

        # Update Q(s,a): blend the old value with the newly observed target
        # (reward plus discounted best value of the next state)
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state  # Update state
        rewards_current_episode += reward  # Save reward obtained in this step

        if done:
            break

    # Exploration rate decay -> reduce exploration after each episode
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per thousand episodes.
# The rewards are sliced into consecutive groups instead of using np.split, so an
# episode count that is not a multiple of the group size still produces a report
# (the final group is simply shorter) rather than raising.
episodes_per_report = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_report]
    for start in range(0, len(rewards_array), episodes_per_report)
]
count = 0
print("*******************Average reward per thousand episodes**********************\n")
for r in rewards_per_thousand_episodes:
    count += len(r)  # Label each row with the number of episodes completed so far
    print(count, ": ", str(r.mean()))

# Print updated Q-table
print("\n\n************************Q-table******************************************\n")
print(q_table)

*******************Average reward per thousand episodes**********************

1000 :  0.03300000000000002
2000 :  0.18900000000000014
3000 :  0.4100000000000003
4000 :  0.5630000000000004
5000 :  0.6490000000000005
6000 :  0.6990000000000005
7000 :  0.6700000000000005
8000 :  0.7070000000000005
9000 :  0.6860000000000005
10000 :  0.6900000000000005


************************Q-table******************************************

[[0.50342054 0.50102024 0.50115278 0.50138552]
 [0.3019417  0.26320341 0.3713792  0.4804228 ]
 [0.43253619 0.42003587 0.4314511  0.46588879]
 [0.30359911 0.21107235 0.27095985 0.44460458]
 [0.51815917 0.3364136  0.42122458 0.36864471]
 [0.         0.         0.         0.        ]
 [0.21540272 0.10630024 0.32317285 0.15979933]
 [0.         0.         0.         0.        ]
 [0.29859595 0.32846446 0.42899604 0.54926906]
 [0.52389612 0.58274694 0.52587147 0.37142187]
 [0.53145569 0.43194663 0.41468532 0.36389486]
 [0.         0.         0.         0.        ]
 [0.   

In [7]:
# Watch the trained agent play three episodes, always taking the greedy action.
# Note: the agent does not always move where it intends to, because of the
# slippery condition of the ice.
for episode in range(3):
    state = env.reset()  # Start a fresh game
    done = False

    print("**************EPISODE ", episode+1, "**************\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the board in place: clear the previous frame, then render the current one
        clear_output(wait=True)
        env.render()
        time.sleep(0.15)

        # Pick the highest-valued action for the current state and play it
        action = np.argmax(q_table[state])
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state  # Game continues from the new state
            continue

        # Episode finished: show the final board and the outcome, then pause
        clear_output(wait=True)
        env.render()
        print("***You reached the goal!***" if reward == 1 else "***You fell through a hole!***")
        time.sleep(3)
        clear_output(wait=True)
        break

env.close()  # Close the game
        

  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m
***You reached the goal!***
