In [2]:
import gym
import numpy as np
import random
import time
from IPython.display import clear_output

In [4]:
# Instantiate the "FrozenLake-v1" environment through gym.make. This single
# `env` object is what the later cells query for the state/action counts,
# train against, and render during playback.
env = gym.make("FrozenLake-v1")

In [5]:
# Size the Q-table from the environment's discrete spaces: one row per state,
# one column per action, every entry starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(f"The initialised quality table is \n {q_table}")

The initialised quality table is 
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [6]:
# --- Training schedule ---
num_episodes = 100_000           # number of episodes the training loop runs
max_steps_per_episode = 100      # cap on agent moves within a single episode

# --- Q-value update ---
learning_rate = 0.1              # weight given to the new target in each update
discount_rate = 0.99             # weight applied to the next state's best Q-value

# --- Epsilon-greedy exploration ---
max_exploration_rate = 1         # ceiling of the exploration schedule
min_exploration_rate = 0.01      # floor the schedule decays towards
exploration_rate_decay = 0.001   # coefficient inside the exponential decay
exploration_rate = max_exploration_rate  # training starts fully exploratory

In [7]:
# Q-learning algorithm
#
# Trains `q_table` in place with tabular Q-learning under an epsilon-greedy
# policy, then prints the average reward per window of episodes and the final
# table. The cell relies on the call shapes used below: `env.reset()` returning
# the start state directly and `env.step()` returning a 4-tuple.
episodes_per_report = 1000  # window size of the progress summary
reward_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_reward = 0
    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: exploit the current estimates when the draw
        # exceeds the exploration rate, otherwise sample an action from the env.
        explr_th = random.uniform(0, 1)
        if explr_th > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q table: blend the old estimate with the one-step target
        # reward + discount_rate * max over actions of Q(new_state, action).
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        state = new_state
        episode_reward += reward

        if done:
            break

    # Update the Exploration rate: exponential decay from the max towards the
    # min, driven by the index of the episode that just finished.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_rate_decay * episode)

    # Append the episode reward to total reward
    reward_all_episodes.append(episode_reward)

# Calculate and print the rewards per thousand episodes.
# Splitting at explicit integer indices (instead of a float section count)
# also works when num_episodes is not a multiple of the window size; the last
# window is then simply shorter.
split_points = list(range(episodes_per_report, num_episodes, episodes_per_report))
rewards_per_thousand = np.split(np.array(reward_all_episodes), split_points)

for i, r in enumerate(rewards_per_thousand):
    # mean() averages over the window's actual length, so a short final window
    # is not under-reported.
    print("The rewards for {} thousand episodes are {}".format(i, r.mean()))

# Report the real number of training episodes rather than a hard-coded figure.
print("\n *** The q table after {} episodes is *** \n{}".format(num_episodes, q_table))

The rewards for 0 thousand episodes are 0.05500000000000004
The rewards for 1 thousand episodes are 0.20200000000000015
The rewards for 2 thousand episodes are 0.3960000000000003
The rewards for 3 thousand episodes are 0.5550000000000004
The rewards for 4 thousand episodes are 0.6180000000000004
The rewards for 5 thousand episodes are 0.6550000000000005
The rewards for 6 thousand episodes are 0.6640000000000005
The rewards for 7 thousand episodes are 0.6800000000000005
The rewards for 8 thousand episodes are 0.6880000000000005
The rewards for 9 thousand episodes are 0.6930000000000005
The rewards for 10 thousand episodes are 0.6640000000000005
The rewards for 11 thousand episodes are 0.6940000000000005
The rewards for 12 thousand episodes are 0.6730000000000005
The rewards for 13 thousand episodes are 0.6550000000000005
The rewards for 14 thousand episodes are 0.6870000000000005
The rewards for 15 thousand episodes are 0.6610000000000005
The rewards for 16 thousand episodes are 0.68500

In [8]:
# Visualise the Agent play the game
# Plays five episodes using only the greedy action from the learned q_table,
# redrawing the environment in the cell output before every move.
env.reset()
for episode in range(5):
    state = env.reset()
    done = False
    time.sleep(1)
    for step in range(max_steps_per_episode):
        # Replace the previous frame with the current one and hold it briefly.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # No exploration here: always take the highest-valued action.
        action = np.argmax(q_table[state])
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode finished: draw the final frame and report the outcome.
        clear_output(wait=True)
        env.render()
        if reward != 1:
            print("***You've fell through hole***")
            time.sleep(0.3)
        else:
            print("***You've reached the goal***")
        clear_output(wait=True)
        break

env.close()




***You've reached the goal***
