In [1]:
import numpy as np
import gym
import random

In [2]:
# Create the environment from its id string. `env` is the handle the later
# cells use to reset, step through, render and finally close the game.
env = gym.make("FrozenLake-v0")

  result = entry_point.load(False)


In [3]:
# Q-table dimensions: one row per discrete state, one column per discrete action
state_size = env.observation_space.n
action_size = env.action_space.n

In [4]:
# Every (state, action) value starts at zero; rows are states, columns are actions
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [5]:
# Training hyperparameters
total_episodes = 15000   # number of training episodes
max_steps = 99           # cap on the steps taken within one episode
learning_rate = 0.8      # step size of each Q-value update
gamma = 0.95             # discount applied to future reward

# Epsilon-greedy exploration schedule
max_epsilon = 1.0        # exploration probability at the start of training
min_epsilon = 0.01       # floor that the exploration probability decays towards
decay_rate = 0.005       # exponential decay rate of the exploration probability
epsilon = max_epsilon    # current exploration rate; training begins at the maximum

In [6]:
# Q-learning, in outline:
#   1. Initialise the Q-table.
#   2. Until training stops, repeat:
#        - choose an action in the current state s from the current Q estimates
#        - take it, observe the resulting state s' and the reward r
#        - update Q(s, a)

# Total reward collected in each episode
rewards = []

for episode in range(total_episodes):
    # Start a fresh episode with an empty score
    state = env.reset()
    step, done, total_rewards = 0, False, 0

    for step in range(max_steps):
        # Epsilon-greedy choice: a uniform draw at or below epsilon means
        # EXPLORATION (random action); otherwise EXPLOITATION (the action
        # with the largest Q-value in the current state).
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(qtable[state])

        # Apply the action and observe the outcome state s' and reward r
        new_state, reward, done, info = env.step(action)

        # Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        # qtable[new_state] holds the values of every action available from s'
        best_next_value = np.max(qtable[new_state])
        td_error = reward + gamma * best_next_value - qtable[state, action]
        qtable[state, action] += learning_rate * td_error

        total_rewards += reward

        # Move on to the observed state
        state = new_state

        if done:
            break

    # End of episode: decay epsilon, since less exploration is needed as
    # training progresses, and record the episode's score.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score over time: " + str(sum(rewards) / total_episodes))
print(qtable)


Score over time: 0.4735333333333333
[[8.01319672e-02 5.21787087e-02 4.07178394e-02 6.16008594e-02]
 [9.30743593e-03 3.60267794e-03 6.91455680e-03 1.05984202e-01]
 [9.23260644e-03 8.90009831e-03 8.93741257e-03 1.95112564e-02]
 [2.75494468e-03 1.81585680e-03 7.07173438e-03 1.50256543e-02]
 [7.73169400e-02 1.97513716e-02 1.00984000e-02 7.69168479e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.10868356e-03 2.20818695e-13 3.26232696e-03 1.87159119e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [3.47536136e-02 3.22691606e-02 1.59868815e-03 2.10982960e-01]
 [2.40791306e-02 5.71461599e-01 6.46754961e-03 1.55124839e-03]
 [5.68380366e-01 1.83054288e-03 1.18712048e-03 4.28900553e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.54978690e-02 1.92421046e-02 2.05002492e-01 1.27194014e-01]
 [1.51311863e-01 8.45415683e-01 2.20763031e-01 1.95051716e-01]
 [0.00000000e+00 0.

In [8]:
# Play FrozenLake with the learned Q-table: always take the greedy action,
# with no exploration and no further updates to the table.
# Each episode resets the environment itself, so no reset is needed up front.
for episode in range(5):
    state = env.reset()
    done = False
    print(" **************************************************** ")
    print(" EPISODE ", episode)

    for step in range(max_steps):
        # Pick the action (by index) with the highest Q-value for this state
        action = np.argmax(qtable[state, :])
        print(qtable[state, :])
        print('action', action)

        # Apply the action and observe the new state and reward
        new_state, reward, done, info = env.step(action)
        print('new_state . reward . done', new_state, reward, done)

        if done:
            # Show the final board position for the finished episode
            env.render()
            # `step` is a zero-based loop index, so the number of actions
            # actually taken is step + 1.
            print("Number of steps: ", step + 1)
            break

        state = new_state

env.close()

 **************************************************** 
 EPISODE  0
[0.08013197 0.05217871 0.04071784 0.06160086]
action 0
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 8 0.0 False
[0.03475361 0.03226916 0.00159869 0.21098296]
action 3
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 8 0.0 False
[0.03475361 0.03226916 0.00159869 0.21098296]
action 3
new_state . reward . done 8 0.0 False
[0.03475361 0.03226916 0.00159869 0.21098296]
action 3
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 4 0.0 False
[0.07731694 0.01975137 0.0100984  0.00769168]
action 0
new_state . reward . done 4 0.0 False
[0.