In [1]:
import numpy as np
import gymnasium as gym
import random
import time
from IPython.display import clear_output

#### Create the Environment

In [2]:
# Build the FrozenLake environment. With render_mode="ansi", env.render() returns
# a text picture of the grid, which the playback cell later prints.
env = gym.make("FrozenLake-v1", render_mode="ansi")

In [3]:
# Display the environment's repr, which lists the wrappers gym.make placed around FrozenLakeEnv.
env

<TimeLimit<OrderEnforcing<PassiveEnvChecker<FrozenLakeEnv<FrozenLake-v1>>>>>

#### Creating The Q-Table
* Initialize the `Q-value` of every `state-action` pair to zero.
* The `number of rows` in the table is equivalent to `the size of the state space` in the environment, and the `number of columns` is equivalent to `the size of the action space`. 

In [4]:
# Both spaces are discrete, so `.n` is the number of distinct actions / states.
# FrozenLake action indices: 0 = Left, 1 = Down, 2 = Right, 3 = Up.
action_space_size = env.action_space.n
state_space_size = env.observation_space.n
print(action_space_size, state_space_size, sep="\n")

4
16


In [5]:
# Q-table: one row per state, one column per action, every entry starting at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size), dtype=np.float64)

In [6]:
# Sanity check before training: the (states, actions) shape, then the all-zero contents.
print(q_table.shape, q_table, sep="\n")

(16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


#### Initializing Q-Learning Parameters

In [7]:
# Training budget: how many episodes to play and the step cap for each one.
num_episodes = 10_000
max_steps_per_episode = 100

# Q-update coefficients.
learning_rate, discount_rate = 0.1, 0.99  # alpha, gamma

# Epsilon-greedy schedule. After every episode the training loop sets
#   exploration_rate = min + (max - min) * exp(-exploration_decay_rate * episode)
# so exploration starts at the maximum and decays towards the minimum.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

#### The Q-Learning Algorithm Training Loop

In [8]:
# Total reward of every training episode; the training loop appends one entry per episode.
# Re-run this cell before re-running the training cell, otherwise rewards from the
# earlier run stay in the list and distort the per-thousand-episode averages.
rewards_all_episodes = []

In [9]:
# Q-learning training loop: tabular updates driven by an epsilon-greedy policy.
for episode in range(num_episodes):
    # Every episode starts from the initial state; reset() returns (observation, info).
    state, info = env.reset()
    done = False
    rewards_current_episode = 0  # reward accumulated within this episode

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off: draw a uniform number in [0, 1];
        # exploit the current Q-values when it exceeds the exploration rate,
        # otherwise explore with a uniformly random action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # step() reports two distinct ways an episode can end:
        #   terminated - the agent reached a terminal state (hole or goal)
        #   truncated  - the environment's time limit cut the episode short
        # Either one finishes the episode.
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-learning update:
        #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
        # A terminal state has no future return, so only bootstrap from s' when
        # the episode has not terminated (a truncated episode still bootstraps).
        if terminated:
            best_next_value = 0.0
        else:
            best_next_value = np.max(q_table[new_state, :])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
            learning_rate * (reward + discount_rate * best_next_value)

        # Move to the new state and book the reward.
        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Decay the exploration rate exponentially from max towards min.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    # Record this episode's total reward.
    rewards_all_episodes.append(rewards_current_episode)

In [10]:
# Calculate and print the average reward per thousand episodes.
episodes_per_chunk = 1000
rewards_array = np.asarray(rewards_all_episodes, dtype=float)

# Chunk by the number of rewards actually recorded (an integer count), so the
# averages stay correct if training was run more than once. An incomplete
# trailing chunk is left out, and fewer than 1000 episodes yields no chunks.
num_chunks = len(rewards_array) // episodes_per_chunk
if num_chunks:
    rewards_per_thousand_episodes = np.split(
        rewards_array[: num_chunks * episodes_per_chunk], num_chunks
    )
else:
    rewards_per_thousand_episodes = []
count = episodes_per_chunk

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # mean() avoids the float noise of summing 1000 pre-divided values one by one.
    print(count, ": ", str(r.mean()))
    count += episodes_per_chunk

********Average reward per thousand episodes********

1000 :  0.06200000000000005
2000 :  0.23100000000000018
3000 :  0.4020000000000003
4000 :  0.5460000000000004
5000 :  0.6220000000000004
6000 :  0.6390000000000005
7000 :  0.7130000000000005
8000 :  0.7060000000000005
9000 :  0.6940000000000005
10000 :  0.6950000000000005


In [11]:
# Show the learned Q-values under a banner (rows = states, columns = actions).
print("\n\n********Q-table********\n", q_table, sep="\n")



********Q-table********

[[0.56407027 0.51153636 0.51547462 0.5175049 ]
 [0.44568464 0.40324534 0.27767469 0.50685368]
 [0.43996793 0.4209854  0.42584576 0.46913017]
 [0.25218849 0.30245422 0.27836781 0.45916785]
 [0.57579156 0.46349436 0.47433213 0.32956227]
 [0.         0.         0.         0.        ]
 [0.2018282  0.11529949 0.28694064 0.1647688 ]
 [0.         0.         0.         0.        ]
 [0.45672442 0.53078736 0.44081172 0.60086117]
 [0.42052261 0.62199515 0.54035528 0.32904685]
 [0.64938378 0.49157216 0.45047792 0.28574901]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.4668474  0.45241369 0.71768898 0.5380266 ]
 [0.74582597 0.88179436 0.78405872 0.730314  ]
 [0.         0.         0.         0.        ]]


#### Watching the trained agent play 3 episodes

In [12]:
# Replay three episodes with the greedy policy read from the trained Q-table.
for episode in range(3):
    # Start each episode from the initial state; reset() returns (observation, info).
    state, info = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the grid in place so the output reads like an animation.
        clear_output(wait=True)
        print(env.render())
        time.sleep(0.3)

        # Take the action with the highest Q-value for the current state.
        action = np.argmax(q_table[state, :])
        new_state, reward, terminated, truncated, info = env.step(action)
        # The episode is over on a terminal state (hole/goal) or on the time limit.
        done = terminated or truncated

        if done:
            clear_output(wait=True)
            print(env.render())
            reached_goal = reward == 1
            if reached_goal:
                print("****You reached the goal!****")
            elif terminated:
                print("****You fell through a hole!****")
            else:
                # Truncated: the step limit ran out before a hole or the goal.
                print("****Time limit reached before the episode finished!****")
            time.sleep(3)
            if not reached_goal:
                clear_output(wait=True)
            # Breaking here leaves `state` at the last state acted from.
            break

        # Set new state
        state = new_state

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m

****You reached the goal!****


In [28]:
# `state` is left over from the playback loop. When an episode ends, that loop breaks
# before assigning the new state, so this is the last state the agent acted from,
# not the terminal state it landed on.
state

14

In [29]:
# Q-values of that state, one entry per action; the playback loop picks the argmax of this row.
q_table[state,:]

array([0.71042121, 0.8770418 , 0.73338575, 0.7152258 ])