Naive solution to Frozen Lake with a Q-table

In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [2]:
# Id string handed to gym.make to build the environment used by every later cell.
ENV_ID = "FrozenLake-v0"
env = gym.make(ENV_ID)

In [3]:
# Table dimensions are read from the environment's observation and action spaces.
state_space_size, action_space_size = env.observation_space.n, env.action_space.n

# One row per state, one column per action; every entry starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# --- Training schedule ---------------------------------------------------
# Number of training episodes, and the step cap applied inside each one.
num_episodes, max_steps_per_episode = 10000, 100

# --- Q-learning update ---------------------------------------------------
# learning_rate weights the new target against the old estimate;
# discount_rate scales the best Q-value of the next state.
learning_rate, discount_rate = 0.1, 0.99

# --- Epsilon-greedy exploration ------------------------------------------
# The training loop compares a uniform draw with exploration_rate to pick
# between a random and a greedy action, then decays the rate exponentially
# from the max bound toward the min bound as episodes go by.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [5]:
# Train the Q-table with epsilon-greedy tabular Q-learning, then print a
# progress summary and the learned table.
#
# Reads : env, q_table and the hyperparameters defined in earlier cells.
# Writes: q_table (updated in place), exploration_rate, rewards_all_episodes.

report_interval = 1000  # episodes averaged into each progress line

# Begin at the maximum exploration rate so that re-running this cell does not
# play its first episode with the decayed rate left over from a previous run.
exploration_rate = max_exploration_rate

rewards_all_episodes = []

for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: a uniform draw above the exploration rate
        # means "exploit" (greedy action), otherwise take a random action.
        if random.uniform(0, 1) > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target (reward plus discounted best value of the next state).
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * td_target
        )

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exponential decay of the exploration rate from max toward min,
    # driven by the episode index.
    exploration_rate = min_exploration_rate + (
        max_exploration_rate - min_exploration_rate
    ) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)

# Backward-compatible alias for the original (misspelled) variable name.
rewards_all_epsodes = rewards_all_episodes

# Average reward per block of `report_interval` episodes. Slicing (instead of
# np.split with a float section count) also copes with a num_episodes that is
# not a multiple of the interval: the final, shorter block is averaged over
# the episodes it actually contains.
rewards_array = np.array(rewards_all_episodes)
print("*****Average rewards per {} episodes*****\n".format(report_interval))
for start in range(0, num_episodes, report_interval):
    chunk = rewards_array[start:start + report_interval]
    print(min(start + report_interval, num_episodes), ":", str(chunk.mean()))

print("\n\n*****Q-Table*****")
print("LEFT DOWN RIGHT UP")
print(q_table)


*****Average rewards per 1000 episodes*****

1000 : 0.04
2000 : 0.2
3000 : 0.412
4000 : 0.541
5000 : 0.671
6000 : 0.677
7000 : 0.662
8000 : 0.698
9000 : 0.693
10000 : 0.657


*****Q-Table*****
LEFT DOWN RIGHT UP
[[0.57240933 0.48740034 0.49840042 0.48906647]
 [0.33388988 0.34573473 0.29881201 0.49091645]
 [0.40817581 0.40713693 0.41194006 0.44531026]
 [0.22112783 0.28285319 0.41230877 0.42438057]
 [0.59619425 0.28932984 0.39508398 0.38633044]
 [0.         0.         0.         0.        ]
 [0.16717887 0.13348114 0.36635197 0.10993801]
 [0.         0.         0.         0.        ]
 [0.38188776 0.50100333 0.36944746 0.63352074]
 [0.34696123 0.69278414 0.41316551 0.40104796]
 [0.60829066 0.37049301 0.40277046 0.26489526]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.54088636 0.55105224 0.79985092 0.41766628]
 [0.71859267 0.84632002 0.72047395 0.73258703]
 [0.         0.         0.         0.        ]]


In [6]:
# Replay three episodes with the greedy policy read from q_table, animating
# the board in the output cell and announcing how each episode ended.
for episode in range(3):
    state = env.reset()
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Clear the previous frame only once the next one is ready, so the
        # board is redrawn in place.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Pure exploitation: always take the highest-valued action.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final board and the outcome before moving on.
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("Agent reached his freesbie")
            else:
                print("Agent has drown :(")

            time.sleep(3)
            clear_output(wait=True)
            break
        state = new_state
    else:
        # The step cap was reached without the environment signalling `done`;
        # report it instead of silently starting the next episode.
        clear_output(wait=True)
        env.render()
        print("Agent ran out of steps")
        time.sleep(3)
        clear_output(wait=True)

env.close()

(Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Agent reached his freesbie
