In [1]:
import gym
import numpy as np
import random
import time

In [2]:
from IPython.display import clear_output
from tqdm import tqdm_notebook as tqdm

# Environment Setup

In [3]:
# Create the FrozenLake environment; the training and playback loops below
# both reset and step this single `env` instance.
env = gym.make('FrozenLake-v0')

In [4]:
# Tabular Q-function: one row per discrete state, one column per discrete
# action, with every entry starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))

In [5]:
# Sanity check: the shape is (state_space_size, action_space_size).
q_table.shape

(16, 4)

In [6]:
# Training budget.
# max_steps_per_episode caps the inner step loop, so an episode ends after
# this many steps even if the environment has not reported `done`.
max_steps_per_episode = 100
# Number of episodes the training loop runs.
num_episodes = 10000

In [7]:
# Q-learning hyper-parameters used by the table update in the training loop.
dr = 0.99  # Discount rate: weight given to the best next-state value
lr = 0.1   # Learning rate: weight given to the new target vs. the old estimate

Epsilon Greedy Algorithm Parameters

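$r_f = r_{min} + (r_{max} - r_{min}) \cdot e^{-\lambda t}$, where $r_f$ is the exploration rate after episode $t$, $r_{min}$ and $r_{max}$ are its lower and upper bounds, and $\lambda$ is the exploration decay rate.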

In [8]:
# Epsilon-greedy schedule.
# exploration_decay_rate is the exponent coefficient in the decay formula.
exploration_decay_rate = 0.001
# Initial exploration rate; overwritten at the end of every training episode.
exploration_rate = 1

In [9]:
# Lower and upper bounds between which the exploration rate decays.
min_exp_rate, max_exp_rate = 0.01, 1

# Learning Q-Table

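Update Equation: $q_f(s,a) = (1 - \alpha) \cdot q_i(s,a) + \alpha \cdot \left(R_{t+1} + \gamma \cdot \max_{a'} q_i(s',a')\right)$, where $\alpha$ is the learning rate, $\gamma$ is the discount rate, and $s'$ is the state reached after taking action $a$ in state $s$.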

In [10]:
def check_progress(episode, rewards, n):
    """Print the mean reward of the `n` episodes ending at `episode`.

    A report is produced only when `episode + 1` is a multiple of `n`; on
    every other call the function returns without printing anything.

    Args:
        episode: Zero-based index of the most recently finished episode.
        rewards: Sequence of per-episode rewards, indexed by episode.
        n: Window length, which is also the reporting interval.

    Returns:
        None. The report is written to stdout.
    """
    if (episode + 1) % n != 0:
        return

    # Bound the window on BOTH sides. An open-ended slice would also average
    # in any entries stored after index `episode` (for example when `rewards`
    # still holds data from an earlier run), which is no longer "the past n
    # episodes".
    window = rewards[episode - n + 1:episode + 1]
    avg_reward = np.mean(window)
    print('Episode: {} Average Reward: {}'.format(episode + 1, avg_reward))

In [11]:
# Total reward of each training episode, appended once per episode by the
# training loop and read by check_progress.
rewards = []

In [12]:
# Start every run of this cell from a clean slate. Without these two resets,
# re-running the cell would append to the previous run's rewards (skewing the
# averages reported by check_progress) and would play the first episode with
# the previous run's already-decayed exploration rate.
# q_table is deliberately left untouched here: re-running this cell keeps
# refining the existing table; re-create q_table first to train from scratch.
rewards = []
exploration_rate = max_exp_rate

for episode in tqdm(range(num_episodes)):

    state = env.reset()
    done = False  # Overwritten by env.step; True ends the episode
    episode_reward = 0  # Sum of the rewards collected during this episode

    for step in range(max_steps_per_episode):

        # Epsilon-greedy selection: a uniform draw at or below the current
        # exploration rate means "explore", anything above means "exploit".
        threshold = random.uniform(0, 1)
        if threshold > exploration_rate:
            action = np.argmax(q_table[state, :])  # Exploit: best known action
        else:
            action = env.action_space.sample()  # Explore: random action

        new_state, reward, done, info = env.step(action)

        # Q-table update: blend the old estimate with the bootstrapped target
        # (immediate reward plus discounted best value of the next state).
        td_target = reward + dr * np.max(q_table[new_state, :])
        q_table[state, action] = (1 - lr) * q_table[state, action] + lr * td_target

        state = new_state
        episode_reward += reward

        if done:
            break

    # Exponential decay of the exploration rate from max_exp_rate towards
    # min_exp_rate, driven by the episode index.
    exploration_rate = min_exp_rate + (max_exp_rate - min_exp_rate) * np.exp(-exploration_decay_rate * episode)

    rewards.append(episode_reward)
    check_progress(episode, rewards, 1000)

Episode: 1000 Average Reward: 0.039
Episode: 2000 Average Reward: 0.235
Episode: 3000 Average Reward: 0.385
Episode: 4000 Average Reward: 0.542
Episode: 5000 Average Reward: 0.632
Episode: 6000 Average Reward: 0.632
Episode: 7000 Average Reward: 0.687
Episode: 8000 Average Reward: 0.666
Episode: 9000 Average Reward: 0.663
Episode: 10000 Average Reward: 0.675



In [13]:
# Display the learned Q-table (rows index states, columns index actions).
q_table

array([[0.53505349, 0.50484086, 0.50474339, 0.49548158],
       [0.19554661, 0.38072052, 0.23655282, 0.45402011],
       [0.36756438, 0.24424303, 0.28829625, 0.27052928],
       [0.15391262, 0.03965372, 0.03802641, 0.07170435],
       [0.55950198, 0.37385618, 0.38738822, 0.28279125],
       [0.        , 0.        , 0.        , 0.        ],
       [0.19746888, 0.10208733, 0.14211531, 0.12840029],
       [0.        , 0.        , 0.        , 0.        ],
       [0.41206145, 0.44186514, 0.32148788, 0.60543911],
       [0.3347863 , 0.661227  , 0.47620774, 0.33739398],
       [0.58843177, 0.379851  , 0.28765794, 0.30881125],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.43588327, 0.45487625, 0.78966314, 0.57921869],
       [0.66852449, 0.90658595, 0.72376885, 0.73912602],
       [0.        , 0.        , 0.        , 0.        ]])

# Watching Agent Play

In [19]:
# Number of episodes to render in the playback loop below.
n_episodes = 3

In [20]:
# Replay a few episodes with the learned table, always acting greedily
# (no exploration), and render every step in place.
try:
    for episode in range(n_episodes):

        state = env.reset()
        done = False
        print('Episode:', (episode + 1))
        time.sleep(1)

        # Reported if the step loop below runs out without the environment
        # ever returning done=True.
        outcome = 'Step Limit Reached!'

        for step in range(max_steps_per_episode):
            clear_output(wait=True)
            env.render()
            time.sleep(0.2)

            action = np.argmax(q_table[state, :])
            new_state, reward, done, info = env.step(action)

            if done:
                if reward == 1:
                    outcome = 'Goal Reached!'
                elif info.get('TimeLimit.truncated', False):
                    # NOTE(review): assumes gym's TimeLimit wrapper flags a
                    # time-limit cut-off under this info key -- confirm for the
                    # installed gym version. If the key is absent this branch
                    # is skipped and the original message is used.
                    outcome = 'Step Limit Reached!'
                else:
                    outcome = 'You Fell in the Hole!'
                break
            state = new_state

        # Show the final frame together with the outcome, then pause so it
        # can be read before the next episode overwrites it.
        clear_output(wait=True)
        env.render()
        print(outcome)
        time.sleep(3)
        clear_output(wait=True)
finally:
    # Release the environment even if playback is interrupted part-way.
    env.close()

  (Down)
SFFF
FHFH
FFFH
HFFG
Goal Reached!
