### Import needed libraries 

In [75]:
import numpy as np
import gym
import random

### Creating enviroment using Gym

In [76]:
env = gym.make("FrozenLake-v0")    # Regular Frozen Lake 4x4
# env = gym.make("FrozenLake8x8-v0")   # Frozen Lake 8x8 

action_space = env.action_space.n     # Number of actions available
state_size = env.observation_space.n  
print(action_space, state_size)

4 16


### Creating Q-Table

In [77]:
qtable = np.zeros((state_size, action_space)) # Initialize Q-Table with zeros
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


### Defining Hyper-parameters

In [78]:
# General parameters
total_episodes = 15000
learning_rate = 0.8
max_steps = 60
gamma = 0.95

# Exploration parameters
epsilon = 0.1
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

### Q-Learning Algorithm

In [79]:
rewards = []

for episode in range(total_episodes):
    state = env.reset()
    step = 0
    done = False
    total_reward = 0
    
    for step in range(max_steps):
        
        # Decide between exploration and explotation
        if random.uniform(0,1) > epsilon:
            action = np.argmax(qtable[state,:])
            
        else:
            action = env.action_space.sample()
            
        new_state, reward, done, info = env.step(action) # Advance one step in the environment
        
        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        qtable[state, action] = qtable[state, action] + learning_rate*(reward + gamma*np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_reward += reward
        
        state = new_state
        
        if done:
            break
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) # Modify epsilon
    rewards.append(total_reward)
    
print("Score over time: " + str(sum(rewards)/total_episodes)) # Returns average reward per episode
print(qtable)

Score over time: 0.4090666666666667
[[1.77772381e-01 1.55922825e-02 5.20796243e-02 2.25100942e-02]
 [2.39909102e-03 2.05018354e-03 9.62228921e-04 1.29305202e-01]
 [1.03737658e-02 7.15110659e-03 3.02555954e-03 6.81492976e-03]
 [1.60796170e-03 2.81667525e-03 1.28017616e-05 6.95542966e-03]
 [2.71292990e-01 5.83334096e-02 1.73080539e-02 6.35382700e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.31318458e-01 7.19507522e-06 2.93611558e-06 2.34973846e-06]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.13444560e-02 9.63670467e-03 7.51125677e-02 4.26511579e-01]
 [2.21046640e-02 6.81425568e-01 2.52720943e-01 9.52481117e-03]
 [8.52713292e-01 2.57544595e-03 2.31851958e-04 5.14068485e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.51157931e-01 2.31180384e-02 7.23379926e-01 1.51975411e-01]
 [1.34234942e-01 9.49804671e-01 3.29525692e-01 2.80950329e-01]
 [0.00000000e+00 0.

In [80]:
env.reset()

for episode in range(10):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        # env.render()    # Un-comment to watch every game step
            
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 30
****************************************************
EPISODE  1
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 24
****************************************************
EPISODE  2
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 27
****************************************************
EPISODE  3
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 39
****************************************************
EPISODE  4
****************************************************
EPISODE  5
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 21
****************************************************
EPISODE  6
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 41
****************************************************
EPISODE  7
****************************************************
EPISODE  8
  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Number of steps 15
*************************