In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

**Winter is here. You and your friends were tossing around a frisbee at the park when you made a wild throw that left the frisbee out in the middle of the lake. The water is mostly frozen, but there are a few holes where the ice has melted. If you step into one of those holes, you'll fall into the freezing water. At this time, there's an international frisbee shortage, so it's absolutely imperative that you navigate across the lake and retrieve the disc. However, the ice is slippery, so you won't always move in the direction you intend.**

In [157]:
# Create the 8x8 Frozen Lake environment ("FrozenLake8x8-v0")
env = gym.make("FrozenLake8x8-v0")
# Read the sizes of the discrete state and action spaces from the environment
n_states = env.observation_space.n
n_actions = env.action_space.n
print("Number of states: {}".format(n_states))
print("Number of possible actions: {}".format(n_actions))

Number of states: 64
Number of possible actions: 4


## Have a look at the setup

**===STATE TYPES===**

(S: starting point, safe, reward = 0)

(F: frozen surface, safe, reward = 0)

(H: hole, failed, reward = 0)

(G: goal, where the frisbee is located, reward = 1)

**===ACTIONS===**

(0: Move Left)

(1: Move Down)

(2: Move Right)

(3: Move Up)

In [158]:
# Reset the environment to the start of an episode and draw the board;
# the rendered grid highlights the tile the agent currently stands on.
env.reset()
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


**Start Q-learning**

Do the necessary initialization

In [159]:
# Initialize the Q-table with zeros: one row per state, one column per action
q_table_shape = (n_states, n_actions)
Q_table = np.zeros(q_table_shape)
print(Q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [169]:
# --- Training schedule ---
max_steps_per_episode = 100  # upper bound on agent moves within a single episode
n_episodes = 10000  # number of episodes the agent learns from

# --- Q-learning hyperparameters ---
gamma = 0.99  # discount factor: values below 1 make delayed rewards worth less
learning_rate = 0.1  # weight given to the new estimate in each Q-table update
epsilon = 0.2  # exploration rate: chance of taking a random action instead of the greedy one


In [170]:
# Q-learning with a fixed epsilon-greedy behaviour policy.
#
# Start every run of this cell from a fresh table: the loop below updates
# Q_table in place, so without this reset re-running the cell would silently
# continue training from the previous run and the reported learning curve
# could not be reproduced from a fresh kernel.
Q_table = np.zeros((n_states, n_actions))
# Cumulative reward collected in each episode, used to see how learning progresses
rewards_all_episodes = np.zeros(n_episodes)

for episode in range(n_episodes):
    # Initialize new episode params
    state = env.reset()
    done = False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off
        if random.uniform(0, 1) > epsilon:
            # Exploit: choose uniformly among all actions that share the highest
            # Q-value, so ties (e.g. an all-zero row) are not always resolved
            # in favour of the lowest action index.
            best_actions = np.flatnonzero(Q_table[state] == np.max(Q_table[state]))
            action = best_actions[random.randrange(best_actions.size)]
        else:
            # Explore: take a random action
            action = env.action_space.sample()

        # Take new action
        new_state, reward, done, info = env.step(action)

        # Update Q-table: blend the old estimate with the one-step target
        # reward + gamma * max_a Q(new_state, a)
        td_target = reward + gamma * np.max(Q_table[new_state])
        Q_table[state, action] = (
            (1 - learning_rate) * Q_table[state, action] + learning_rate * td_target
        )

        # Set new state
        state = new_state
        # Add new reward
        rewards_current_episode += reward
        if done:
            break
        # Exploration rate is kept fixed in this cell (no epsilon decay)

    # Add current episode reward to total rewards list
    rewards_all_episodes[episode] = rewards_current_episode

In [171]:
# Calculate and print the average reward per thousand episodes
episodes_per_block = 1000
all_rewards = np.asarray(rewards_all_episodes)
# Slice with integer bounds rather than np.split(..., n_episodes/1000): that call
# passed a float section count and raised ValueError whenever the number of
# episodes was not an exact multiple of 1000. Here a trailing partial block is
# simply reported on its own.
rewards_per_thosand_episodes = [
    all_rewards[start:start + episodes_per_block]
    for start in range(0, len(all_rewards), episodes_per_block)
]
count = 0
print("********Average reward per thousand episodes********\n")
for r in rewards_per_thosand_episodes:
    # Label each line with the index of the last episode included in the block
    count += len(r)
    # mean() divides by the real block length and avoids the rounding noise of
    # summing a thousand pre-divided terms
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.20400000000000015
2000 :  0.22600000000000017
3000 :  0.21800000000000017
4000 :  0.19200000000000014
5000 :  0.23400000000000018
6000 :  0.21100000000000016
7000 :  0.20800000000000016
8000 :  0.21900000000000017
9000 :  0.22700000000000017
10000 :  0.21500000000000016


In [172]:
# Show the learned Q-table below a banner line
print("\n\n********Q-table********\n", Q_table, sep="\n")



********Q-table********

[[3.79144351e-01 3.80694794e-01 3.88023653e-01 3.79647754e-01]
 [3.86120523e-01 3.93772026e-01 3.99571011e-01 3.89772878e-01]
 [4.07306105e-01 4.12288261e-01 4.15244137e-01 4.11947026e-01]
 [4.25334282e-01 4.33469081e-01 4.39870547e-01 4.35531862e-01]
 [4.50365207e-01 4.53564484e-01 4.67339357e-01 4.50852859e-01]
 [4.70972844e-01 4.81828502e-01 4.99151545e-01 4.79800908e-01]
 [4.99264861e-01 5.03694424e-01 5.24972230e-01 5.04435181e-01]
 [5.09579614e-01 5.34140821e-01 5.14555865e-01 5.05890318e-01]
 [3.78357688e-01 3.76741226e-01 3.78364114e-01 3.81872431e-01]
 [3.80695801e-01 3.84694556e-01 3.82689110e-01 3.91843624e-01]
 [3.86533146e-01 3.93740108e-01 3.97298630e-01 4.07723026e-01]
 [3.01865530e-01 2.39181616e-01 2.57987942e-01 4.31341385e-01]
 [4.27061166e-01 4.35839114e-01 4.25993468e-01 4.69865290e-01]
 [4.73347209e-01 4.73581491e-01 5.08425578e-01 4.74552685e-01]
 [5.06172152e-01 5.23637798e-01 5.32199927e-01 5.12436733e-01]
 [5.29237887e-01 5.60062817e

## Agent plays the game

In [173]:
# Watch our agent play Frozen Lake by playing the best action
# from each state according to the Q-table
for episode in range(3):
    state = env.reset()
    done = False
    print("====EPISODE ", episode+1, "====\n\n\n\n")
    print("=============================\n")
    time.sleep(1)

    for step in range(100):
        # Redraw the board in place so the output cell behaves like an animation
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Always act greedily with respect to the learned Q-values
        action = np.argmax(Q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            # Show the final position before reporting the outcome
            clear_output(wait=True)
            env.render()

            if reward == 1:
                print("****You reached the goal!****")
            else:
                print("****You fell through a hole!****")
            # Leave the message on screen for a moment before the next episode
            time.sleep(3)

            clear_output(wait=True)
            break
        state = new_state
# The environment is intentionally not closed here: the evaluation cell that
# follows keeps using the same `env` and closes it when it is finished.


  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHF[41mF[0m
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [176]:
# Greedy evaluation: count how many of 100 attempts end on the goal tile
n_attempts = 100
n_success = 0
for episode in range(n_attempts):
    state, done = env.reset(), False

    for step in range(100):
        # Follow the learned policy: best-valued action for the current state
        greedy_action = np.argmax(Q_table[state])
        state, reward, done, info = env.step(greedy_action)
        if done:
            break

    # An attempt counts as a success only if it ended with reward 1
    if done and reward == 1:
        n_success += 1
env.close()
print('times success: ' + str(n_success))


times success: 64


# Things for you to try

1. What if the environment were not stochastic, i.e., every action moved you in the intended direction? Do you still need learning? What changes are needed? You can use `env = gym.make("FrozenLake-v0", is_slippery=False)` to make the actions deterministic.

2. What if we set gamma to 1.0? What if we make it too small?

3. Instead of epsilon being fixed, change it such that it starts high (~1) and decays with time.
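4. Try a different board size: this notebook already uses the bigger 8x8 board 'FrozenLake8x8-v0', so compare it with the smaller 4x4 board 'FrozenLake-v0'.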