In [1]:
# Import libraries
#!pip install --user gym
import gym
import numpy as np

In [3]:
# Create the FrozenLake-v1 environment. The resulting `env` object is shared by
# the cells below, so this cell must be run before any of them.
env = gym.make('FrozenLake-v1')

In [None]:
# Render the environment
# env.render()

In [4]:
# Computing optimal value function
# We define a function called value_iteration that computes the optimal value
# function iteratively: for every state, it takes the maximum over the Q values
# of all actions available in that state.
# The value_iteration function takes the environment as a parameter:

def value_iteration(env, gamma=1.0, num_iterations=1000, threshold=1e-20):
    """Compute the optimal state-value function by synchronous value iteration.

    Every sweep applies the Bellman optimality backup to each state, using the
    value estimates of the *previous* sweep:

        V(s) <- max_a  sum_{s'} P(s' | s, a) * (r + gamma * V(s'))

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        tabular transition model ``P`` such that ``P[s][a]`` is an iterable of
        ``(prob, next_state, reward, done)`` tuples. When the environment has
        an ``unwrapped`` attribute, ``P`` is read from ``env.unwrapped``.
    gamma : float, optional
        Discount factor. Defaults to 1.0.
    num_iterations : int, optional
        Maximum number of sweeps over the state space. Defaults to 1000.
    threshold : float, optional
        The iteration stops early once the summed absolute change of the value
        table between two consecutive sweeps is <= this value.
        Defaults to 1e-20.

    Returns
    -------
    numpy.ndarray
        One-dimensional float array holding one value per state.
    """
    # Environments built with gym.make() are usually wrapped, and not every
    # gym/gymnasium release forwards arbitrary attributes such as `P` through
    # its wrappers. Reading the model from the base environment works in both
    # cases; plain objects without `unwrapped` are used as they are.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Start with a value of zero for every state.
    value_table = np.zeros(n_states)

    for _ in range(num_iterations):
        # Snapshot of the values from the previous sweep; all backups of the
        # current sweep read from this snapshot (synchronous update).
        previous_values = np.copy(value_table)

        for s in range(n_states):
            # Expected return of each action in state s under the old values.
            q_values = [
                sum(prob * (reward + gamma * previous_values[next_state])
                    for prob, next_state, reward, _done in transitions[s][a])
                for a in range(n_actions)
            ]
            # The state's new value is that of its best action.
            value_table[s] = max(q_values)

        # Converged: the value table (almost) stopped changing.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break

    return value_table



In [5]:
# Extracting optimal policy from the optimal value function
# First, we define a function called extract_policy which takes the value_table as a parameter:
def extract_policy(value_table, env=None, gamma=1.0):
    """Derive the greedy policy for a given state-value function.

    For every state the Q value of each action is computed from the transition
    model, and the action with the highest Q value is selected (ties resolve
    to the lowest action index, as with ``numpy.argmax``).

    Parameters
    ----------
    value_table : array-like
        State values, indexable by state index.
    env : object, optional
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        tabular transition model ``P`` such that ``P[s][a]`` is an iterable of
        ``(prob, next_state, reward, done)`` tuples. When omitted, the
        module-level ``env`` variable is used, which keeps existing
        ``extract_policy(value_table)`` calls working.
    gamma : float, optional
        Discount factor. Defaults to 1.0.

    Returns
    -------
    numpy.ndarray
        One-dimensional float array; entry ``s`` is the index of the greedy
        action in state ``s``.

    Raises
    ------
    NameError
        If ``env`` is omitted and no module-level ``env`` exists.
    """
    if env is None:
        # Backward compatibility: earlier versions of this function read the
        # notebook-global `env`. Prefer passing the environment explicitly.
        env = globals().get("env")
        if env is None:
            raise NameError(
                "name 'env' is not defined; pass the environment explicitly"
            )

    # Read the model from the base environment when `env` is wrapped; plain
    # objects without `unwrapped` are used as they are.
    transitions = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Kept as a float array (np.zeros default) so the returned dtype matches
    # what existing callers and recorded outputs expect.
    policy = np.zeros(n_states)

    for s in range(n_states):
        # Expected return of each action in state s under `value_table`.
        q_values = [
            sum(prob * (reward + gamma * value_table[next_state])
                for prob, next_state, reward, _done in transitions[s][a])
            for a in range(n_actions)
        ]
        # Act greedily with respect to the Q values.
        policy[s] = np.argmax(q_values)

    return policy

In [6]:
# Putting it all together

# First, we compute the optimal value function by passing the frozen lake
# environment to our value_iteration function. The result is a NumPy array
# holding one value per state.
optimal_value_function = value_iteration(env=env)

In [7]:
# Next, we extract the optimal policy from the optimal value function using
# our extract_policy function. For every state, it picks the action with the
# highest Q value under the given value function.

optimal_policy = extract_policy(optimal_value_function)

In [8]:
# Print the obtained optimal policy. Entry s of the array is the index of the
# action selected for state s (stored as a float).

print(optimal_policy)

[0. 3. 3. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]
