# Playing Frozen Lake with Q-learning

You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. 

The task is to retrieve the frisbee and come back safely.

The surface is described using a grid like the following:

SFFF       
FHFH       
FFFH       
HFFG

This gives 16 states (one per grid cell) and just 4 actions - up, down, left, right - for a total of 64 state-action pairs.

S: starting point, safe  
F: frozen surface, safe  
H: hole, fall to your doom  
G: goal, where the frisbee is located

The episode ends when you reach the goal or fall in a hole.  
You receive a reward of 1 if you reach the goal, and 0 otherwise.

In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

In [2]:
# Build the Frozen Lake environment from its registered gym id.
env = gym.make("FrozenLake-v0")

In [3]:
# Sizes of the discrete state and action spaces, as reported by the environment.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# Q-table: one row per state, one column per action, every entry starting at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [4]:
# Sanity check: the Q-table dimensions must be 16 states by 4 actions.
assert (state_space_size, action_space_size) == (16, 4)

In [5]:
# --- Training length ---
num_episodes = 10000          # number of episodes to train for
max_steps_per_episode = 100   # upper bound on timesteps within one episode

# --- Q-learning update ---
learning_rate = 0.1           # weight of the new estimate in each Q-value update
discount_rate = 0.99          # factor applied to the best next-state Q-value

# --- Epsilon-greedy exploration schedule ---
# The exploration rate starts at its maximum and is decayed exponentially
# toward the minimum after every episode.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [6]:
# Total reward collected in each episode, kept so the score can be tracked
# over the course of training.
rewards_all_episodes = []

# Q-learning algorithm: each iteration of the outer loop plays one episode.
for episode in range(num_episodes):

    # Every episode begins from a freshly reset environment.
    state = env.reset()

    # `done` tracks whether the episode has finished; the per-episode reward
    # tally starts from zero.
    done = False
    rewards_current_episode = 0

    # Each iteration of the inner loop is one timestep of the current episode.
    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: draw a number between 0 and 1
        # and compare it with the current exploration rate.
        if random.uniform(0, 1) <= exploration_rate:
            # Explore: sample an action at random.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest Q-value for this state.
            action = np.argmax(q_table[state, :])

        # Apply the action; the environment returns the next state, the
        # reward, the episode-finished flag and extra info.
        new_state, reward, done, info = env.step(action)

        # Update Q(s, a): blend the old value with the target
        # (reward plus discounted best Q-value of the next state).
        best_next_value = np.max(q_table[new_state, :])
        td_target = reward + discount_rate * best_next_value
        q_table[state, action] = (
            q_table[state, action] * (1 - learning_rate)
            + learning_rate * td_target
        )

        # Move to the next state and accumulate the reward.
        state = new_state
        rewards_current_episode += reward

        # If the last action ended the episode, leave the timestep loop and
        # move on to the next episode.
        if done:
            break

    # Exploration rate decay: exponential in the episode index, bounded below
    # by min_exploration_rate.
    decay_factor = np.exp(-exploration_decay_rate * episode)
    exploration_rate = (
        min_exploration_rate
        + (max_exploration_rate - min_exploration_rate) * decay_factor
    )

    # Record the reward earned in the episode that just finished.
    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per thousand episodes.
# np.split documents an integer section count, so use floor division instead
# of handing it the float produced by `/`.
episodes_per_chunk = 1000
rewards_per_thosand_episodes = np.split(
    np.array(rewards_all_episodes), num_episodes // episodes_per_chunk
)
count = episodes_per_chunk
print("********Average reward per thousand episodes********\n")
for r in rewards_per_thosand_episodes:
    # Vectorised mean of the chunk; summing r/1000 element by element with the
    # builtin sum is slower and accumulates floating-point noise.
    avg_reward = r.mean()
    print(count, ": ", str(avg_reward))
    count += episodes_per_chunk

# Print updated Q-table
print("\n\n********Q-table********\n")
print(q_table)

********Average reward per thousand episodes********

1000 :  0.03800000000000003
2000 :  0.21500000000000016
3000 :  0.4100000000000003
4000 :  0.5350000000000004
5000 :  0.6050000000000004
6000 :  0.6610000000000005
7000 :  0.6800000000000005
8000 :  0.6670000000000005
9000 :  0.6570000000000005
10000 :  0.6920000000000005


********Q-table********

[[0.55678702 0.506413   0.52140672 0.4871214 ]
 [0.33941874 0.24596305 0.25443055 0.47698089]
 [0.41451739 0.29127795 0.25991448 0.27252175]
 [0.18749375 0.08172572 0.03427734 0.05800487]
 [0.59911034 0.28341184 0.30877972 0.35580102]
 [0.         0.         0.         0.        ]
 [0.18535934 0.16141085 0.32000242 0.08515063]
 [0.         0.         0.         0.        ]
 [0.34401899 0.33345867 0.27120495 0.61851279]
 [0.40385178 0.65604485 0.46408725 0.46903986]
 [0.66659043 0.39571696 0.30359662 0.26398548]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.3740533  0.50034793 0.73048338 

In [7]:
# avg_reward holds the value from the last iteration of the reporting loop,
# i.e. the average over the final block of episodes.
assert avg_reward > 0.6

In [8]:
# Watch the trained agent play Frozen Lake, always taking the action with the
# highest Q-value for the current state.
# Increase the range below to watch more episodes.

for episode in range(1):
    state = env.reset()
    done = False  # whether the last action ended the episode
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Clear the cell output; with wait=True the clear is deferred until
        # there is new output to replace it.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Greedy action from the Q-table, then advance the environment.
        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode continues: carry on from the new state.
            state = new_state
            continue

        # Episode over: show the final board and report the outcome.
        clear_output(wait=True)
        env.render()
        if reward == 1:
            outcome = "****You reached the goal!****"
        else:
            outcome = "****You fell through a hole!****"
        print(outcome)
        time.sleep(3)
        clear_output(wait=True)
        break

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****
