 Solving the Frozen Lake Problem with Policy Iteration


In [1]:
# Import the libraries
import gym
import numpy as np

In [2]:
# Build the FrozenLake-v1 environment. The functions defined in the cells
# below read its state/action space sizes and its transition table `P`
# through this module-level `env` variable.
env = gym.make("FrozenLake-v1")

In [3]:
# Computing value function using policy

# Define a function called compute_value_function 
# which takes the policy as a parameter:
def compute_value_function(policy, gamma=1.0, num_iterations=1000, threshold=1e-20):
    """Evaluate a fixed policy with iterative, synchronous Bellman backups.

    The number of states and the transition model are taken from the
    module-level ``env``. ``P[s][a]`` is expected to be an iterable of
    ``(prob, next_state, reward, done)`` tuples; the ``done`` field is ignored.

    Args:
        policy: Sequence indexed by state giving the action to take in that
            state. Entries may be floats (e.g. a policy created with
            ``np.zeros``); each one is converted to ``int`` before it is used
            to look up the action's transitions.
        gamma: Discount factor applied to the value of the next state.
        num_iterations: Maximum number of sweeps over all states.
        threshold: Stop early once the summed absolute change of the value
            table between two consecutive sweeps is at most this value.

    Returns:
        np.ndarray: One value estimate per state under ``policy``.
    """
    # Read the model from the unwrapped environment so the lookup does not
    # depend on wrappers forwarding the `P` attribute.
    transitions = env.unwrapped.P
    num_states = env.observation_space.n

    # Every state starts with a value of zero.
    value_table = np.zeros(num_states)

    for _sweep in range(num_iterations):
        # Snapshot of the previous sweep: every backup in this sweep reads
        # from the snapshot, never from values already updated in this sweep.
        previous_values = np.copy(value_table)

        for s in range(num_states):
            # The policy may be stored as floats; use a real integer action.
            a = int(policy[s])

            # Expected one-step return of taking action `a` in state `s`.
            value_table[s] = sum(
                prob * (r + gamma * previous_values[s_])
                for prob, s_, r, _done in transitions[s][a]
            )

        # Converged: the whole table moved by no more than `threshold`.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break

    return value_table


In [4]:
# Extracting policy from the value function

# We define a function called extract_policy to extract 
# a policy given the value function as shown below:
def extract_policy(value_table, gamma=1.0):
    """Derive the greedy policy for a given state-value table.

    For every state the Q value of each action is computed from the
    module-level ``env``'s transition model, and the action with the largest
    Q value is selected (``np.argmax`` keeps the first one on ties).
    ``P[s][a]`` is expected to be an iterable of
    ``(prob, next_state, reward, done)`` tuples; the ``done`` field is ignored.

    Args:
        value_table: Sequence indexed by state holding the state values.
        gamma: Discount factor applied to the value of the next state. It
            should match the discount used to compute ``value_table``.

    Returns:
        np.ndarray: One action index per state. The array keeps the default
        float dtype of ``np.zeros``, so actions are stored as ``0.0``,
        ``1.0``, ...
    """
    # Read the model from the unwrapped environment so the lookup does not
    # depend on wrappers forwarding the `P` attribute.
    transitions = env.unwrapped.P
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    policy = np.zeros(num_states)

    for s in range(num_states):
        # Expected one-step return of every action available in state `s`.
        q_values = [
            sum(
                prob * (r + gamma * value_table[s_])
                for prob, s_, r, _done in transitions[s][a]
            )
            for a in range(num_actions)
        ]

        # Act greedily with respect to the Q values.
        policy[s] = np.argmax(q_values)

    return policy

In [5]:
# Putting it all together
# First, let's define a function called "policy_iteration" 
# which takes the environment as a parameter
def policy_iteration(env):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Args:
        env: Environment whose ``observation_space.n`` sizes the initial
            policy. ``compute_value_function`` and ``extract_policy`` are
            called with the policy / value table only, so they work on the
            module-level ``env`` rather than on this argument.

    Returns:
        The policy at which iteration stopped: either the first policy that
        the improvement step left unchanged, or the policy reached after the
        maximum number of rounds.
    """
    # Upper bound on the number of evaluation/improvement rounds.
    max_rounds = 1000

    # Start from a fixed (not random) policy: action 0 in every state.
    current_policy = np.zeros(env.observation_space.n)

    for _ in range(max_rounds):
        # Evaluate the current policy, then act greedily on its value function.
        candidate_policy = extract_policy(compute_value_function(current_policy))

        # Stop as soon as the improvement step leaves the policy unchanged.
        if np.all(current_policy == candidate_policy):
            return current_policy

        # Otherwise continue from the improved policy.
        current_policy = candidate_policy

    return current_policy

In [6]:
# Now, let's run policy iteration to find the optimal policy
# in the frozen lake environment.

# We pass the frozen lake environment to policy_iteration. The call returns
# the policy at which the loop stopped: either the greedy improvement step no
# longer changed it, or the iteration cap inside policy_iteration was reached.

optimal_policy = policy_iteration(env)

In [7]:
# Print the resulting policy: an array holding one action index per state.
print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [8]:
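# The printed array holds one action index per state, with the 16 states of the 4x4 grid
# numbered row by row; in Gym's FrozenLake the actions are 0 = left, 1 = down, 2 = right, 3 = up.
# As we can observe, our policy assigns an action to every state, telling the agent what to do there.
# Thus, we learned how to perform the policy iteration method to compute the optimal policy.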