# DSE 3260
## Week - 8
### Reg. No - 200968216
### Pratinav Seth

In [1]:
!pip install gym

import gym
import numpy as np
# Shared environment instance used by every cell below. The planning functions
# read its tabular transition model (env.P) and the sizes of its discrete
# state/action spaces; the evaluation loop calls env.reset() / env.step().
env = gym.make("FrozenLake-v1")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


  deprecation(
  deprecation(


##  Policy Iteration function with the following parameters 
 - policy: 2D array of a size n(S) x n(A), each cell represents a probability of taking action 'a' in state 's'
 - environment: Initialized Open AI gym environment object
 - discount_factor: MDP discount factor
 - theta: Threshold on the value-function change. Policy evaluation stops once the largest update to the value function in a sweep is smaller than this number
 - max_iterations: Maximum number of policy evaluation / policy improvement rounds

In [2]:
import numpy as np

def policy_iteration(policy, env, discount_factor, theta, max_iterations):
    """Compute a greedy policy and its state values by policy iteration.

    Alternates iterative policy evaluation (in-place sweeps over all states
    until the largest per-state change in a sweep is below ``theta``) with
    greedy policy improvement. Stops when the greedy action is unchanged in
    every state, or after ``max_iterations`` evaluation/improvement rounds.

    Args:
        policy: Array-like of shape ``(n_states, n_actions)``; entry
            ``[s, a]`` is the probability of taking action ``a`` in state
            ``s``. Used as the starting policy. It is copied, so the caller's
            array is left untouched.
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        discount_factor: MDP discount factor.
        theta: Convergence threshold for policy evaluation. Must be positive:
            the evaluation loop only exits when the largest change is
            strictly below it.
        max_iterations: Maximum number of evaluation/improvement rounds. It
            does not bound the number of sweeps inside one evaluation.

    Returns:
        Tuple ``(policy, value_function)``: the improved policy as a float
        array of shape ``(n_states, n_actions)`` (one-hot rows once at least
        one improvement step has run) and the state values of shape
        ``(n_states,)``.

    Raises:
        ValueError: If ``policy`` does not have shape
            ``(n_states, n_actions)``.
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Private float copy: the improvement step overwrites rows, and the
    # caller's starting policy must not be clobbered as a side effect.
    policy = np.array(policy, dtype=float)
    if policy.shape != (num_states, num_actions):
        raise ValueError(
            f"policy must have shape ({num_states}, {num_actions}), "
            f"got {policy.shape}"
        )

    value_function = np.zeros(num_states)
    one_hot = np.eye(num_actions)  # row k is the deterministic choice of action k

    def action_values_for(state):
        """Return the expected one-step return of every action in ``state``."""
        action_values = np.zeros(num_actions)
        for a in range(num_actions):
            for prob, next_state, reward, _done in env.P[state][a]:
                action_values[a] += prob * (
                    reward + discount_factor * value_function[next_state]
                )
        return action_values

    for _ in range(max_iterations):
        # Policy evaluation: in-place sweeps, so later states in a sweep
        # already see the values updated earlier in the same sweep.
        while True:
            delta = 0
            for s in range(num_states):
                old_value = value_function[s]
                value_function[s] = np.sum(policy[s] * action_values_for(s))
                delta = max(delta, np.abs(old_value - value_function[s]))
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        policy_stable = True
        for s in range(num_states):
            old_action = np.argmax(policy[s])
            best_action = np.argmax(action_values_for(s))
            if old_action != best_action:
                policy_stable = False
            policy[s] = one_hot[best_action]

        if policy_stable:
            break

    return policy, value_function


  and should_run_async(code)


## Value Iteration function with the following parameters 
 - environment: Initialized Open AI gym environment object 
 - discount_factor: MDP discount factor 
 - theta: Threshold on the value-function change. Iteration stops once the largest update to the value function in a sweep is below this number
 - max_iterations: Maximum number of iterations

In [3]:
def value_iteration(env, discount_factor, theta, max_iterations):
    """Run value iteration and return the greedy policy with its state values.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` where
            ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` tuples.
        discount_factor: MDP discount factor.
        theta: Sweeps stop once the largest per-state change in a sweep is
            strictly below this threshold.
        max_iterations: Maximum number of sweeps over the state space.

    Returns:
        Tuple ``(policy, value_function)``: a one-hot array of shape
        ``(n_states, n_actions)`` selecting the greedy action per state, and
        the state values of shape ``(n_states,)``.
    """
    n_s = env.observation_space.n
    n_a = env.action_space.n
    values = np.zeros(n_s)

    def backup(state):
        """Return the expected one-step return of every action in ``state``."""
        q = np.zeros(n_a)
        for action in range(n_a):
            total = 0.0
            for p, nxt, r, _ in env.P[state][action]:
                total = total + p * (r + discount_factor * values[nxt])
            q[action] = total
        return q

    # Bellman optimality sweeps, updating values in place so later states in
    # a sweep see the values already refreshed earlier in that sweep.
    for _ in range(max_iterations):
        largest_change = 0
        for state in range(n_s):
            previous = values[state]
            values[state] = backup(state).max()
            largest_change = max(largest_change, np.abs(previous - values[state]))
        if largest_change < theta:
            break

    # Greedy policy extraction from the final value estimates.
    greedy = np.zeros((n_s, n_a))
    for state in range(n_s):
        greedy[state, np.argmax(backup(state))] = 1.0

    return greedy, values



## We need to compare the two methods on the
 - number of wins
 - average return after 1000 episodes

In [4]:
def run_episodes(policy, env, num_episodes):
    """Roll out ``policy`` in ``env`` and summarise the results.

    Works with both Gym step/reset conventions: the older one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        policy: Array of shape ``(n_states, n_actions)``; row ``s`` is the
            action distribution sampled from in state ``s``.
        env: Environment with ``action_space.n``, ``reset()`` and
            ``step(action)``; observations are used directly as row indices
            into ``policy``.
        num_episodes: Number of episodes to run.

    Returns:
        Tuple ``(num_wins, average_return)``. An episode counts as a win when
        its total reward equals exactly 1. If ``num_episodes`` is not
        positive, nothing is run and ``(0, 0.0)`` is returned.
    """
    # Averaging over zero episodes is undefined; report an empty result
    # instead of dividing by zero.
    if num_episodes <= 0:
        return 0, 0.0

    num_actions = env.action_space.n
    total_reward = 0
    num_wins = 0

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer API returns (observation, info); older API returns the bare
        # observation.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        episode_reward = 0
        done = False
        while not done:
            action = np.random.choice(num_actions, p=policy[state])
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API: the episode ends on termination or truncation.
                state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                state, reward, done, _ = step_result
            episode_reward += reward

        total_reward += episode_reward
        if episode_reward == 1:
            num_wins += 1

    return num_wins, total_reward / num_episodes

# --- Experiment configuration ---
num_episodes = 1000      # evaluation episodes per method
max_iterations = 2000    # iteration cap handed to both planners
theta = 1e-8             # convergence threshold on value-function updates
discount_factor = 0.99

n_states, n_actions = env.observation_space.n, env.action_space.n

# --- Policy iteration, started from the uniform random policy ---
policy = np.full((n_states, n_actions), 1.0 / n_actions)
opt_policy, value_func = policy_iteration(
    policy, env, discount_factor, theta, max_iterations
)
num_wins_policy, avg_return_policy = run_episodes(opt_policy, env, num_episodes)

# --- Value iteration (opt_policy / value_func now hold its result) ---
opt_policy, value_func = value_iteration(
    env, discount_factor, theta, max_iterations
)
num_wins_value, avg_return_value = run_episodes(opt_policy, env, num_episodes)

# --- Report ---
print(f"Policy Iteration: Number of wins = {num_wins_policy}, Average Return = {avg_return_policy}")
print(f"Value Iteration: Number of wins = {num_wins_value}, Average Return = {avg_return_value}")


Policy Iteration: Number of wins = 746, Average Return = 0.746
Value Iteration: Number of wins = 746, Average Return = 0.746


### Inference: 
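Both Policy Iteration and Value Iteration learnt a policy that performs well on the FrozenLake-v1 environment: in the run above, each won 746 of the 1000 evaluation episodes, i.e. an average return of 0.746. Because the environment's transitions are stochastic, even a well-chosen policy does not win every episode, so the average return stays below one.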

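In terms of the number of wins and average return, the two methods performed identically in this run (746 wins, average return 0.746). The algorithms differ in how they get there: Policy Iteration alternates a full policy evaluation with a greedy policy improvement, whereas Value Iteration updates the value function directly and then extracts the policy from it. Given enough iterations, both are expected to reach an equally good policy for the same MDP; small differences between runs, if any, would come from sampling noise in the evaluation episodes or from stopping early (via theta or max_iterations), which in some circumstances may leave a suboptimal policy.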

Overall, both strategies are effective for dealing with the FrozenLake-v1 environment. On the evidence above neither is superior in solution quality, so the choice between them in this case comes down mainly to computational cost.