### Q1

In [3]:
import numpy as np

def policy_iteration(policy, environment, discount_factor, theta, max_iterations):
    """Run tabular policy iteration starting from ``policy``.

    Each round performs iterative policy evaluation (in-place sweeps over all
    states until the largest per-state change is below ``theta``) followed by
    greedy policy improvement. The rounds stop as soon as the greedy action is
    unchanged in every state, or after ``max_iterations`` rounds.

    Args:
        policy: Array-like of shape (num_states, num_actions) with the action
            probabilities of the starting policy. It is copied, never modified.
        environment: Object exposing ``P[s][a]`` as an iterable of
            ``(prob_next, next_state, reward, done)`` tuples.
        discount_factor: Discount applied to the value of the next state.
        theta: Convergence threshold for the policy-evaluation sweeps.
        max_iterations: Maximum number of evaluation/improvement rounds.

    Returns:
        A ``(policy, value_function)`` tuple. ``policy`` is a new array of shape
        (num_states, num_actions); after at least one improvement round every
        row is one-hot. ``value_function`` has shape (num_states,).
    """
    # Work on a private float copy so the caller's array is left untouched.
    policy = np.array(policy, dtype=float)
    num_states, num_actions = policy.shape

    transitions = environment.P
    one_hot_rows = np.eye(num_actions)  # built once instead of once per state
    value_function = np.zeros(num_states)

    def expected_return(s, a):
        # One-step lookahead for taking action `a` in state `s` under the
        # current value estimates.
        return sum(
            prob_next * (reward + discount_factor * value_function[next_state])
            for prob_next, next_state, reward, _done in transitions[s][a]
        )

    for _ in range(max_iterations):
        # Policy evaluation: sweep in place until the value estimates settle.
        # NOTE: this loop has no iteration cap of its own; it ends only when a
        # full sweep changes every state by less than theta.
        while True:
            delta = 0
            for s in range(num_states):
                old_value = value_function[s]
                new_value = 0
                for a in range(num_actions):
                    new_value += policy[s][a] * expected_return(s, a)
                value_function[s] = new_value
                delta = max(delta, abs(old_value - new_value))
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        policy_stable = True
        for s in range(num_states):
            old_action = np.argmax(policy[s])
            action_values = np.array(
                [expected_return(s, a) for a in range(num_actions)]
            )
            best_action = np.argmax(action_values)
            if old_action != best_action:
                policy_stable = False
            policy[s] = one_hot_rows[best_action]

        # No greedy action changed in this round, so the policy has converged.
        if policy_stable:
            break

    return policy, value_function


In [4]:
# gym is used below, and no earlier visible cell imports it, so import it here
# to keep this cell runnable on a fresh kernel (later cells do the same).
import gym

# Create the FrozenLake-v1 environment
env = gym.make('FrozenLake-v1')

# Start from the uniform random policy: every action equally likely in every state
policy = np.ones([env.observation_space.n, env.action_space.n]) / env.action_space.n

# Set the discount factor, convergence threshold, and maximum number of iterations
discount_factor = 0.99
theta = 1e-8
max_iterations = 1000

# Run the policy iteration algorithm
optimal_policy, optimal_value_function = policy_iteration(policy, env, discount_factor, theta, max_iterations)

# Print the resulting policy (one row per state) and its value function
print("Optimal Policy:")
print(optimal_policy)

print("Optimal Value Function:")
print(optimal_value_function)


Optimal Policy:
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function:
[0.54202581 0.49880303 0.4706955  0.4568515  0.55845085 0.
 0.35834799 0.         0.59179866 0.64307976 0.6152075  0.
 0.         0.7417204  0.86283741 0.        ]


Create a Value Iteration function with the following parameters

a. environment: Initialized OpenAI gym environment object<br>
b. discount_factor: MDP discount factor<br>
c. theta: A threshold on the change in the value function. Once the update to the value function falls below this number, the iteration stops<br>
d. max_iterations: Maximum number of iterations

In [5]:
import numpy as np
import gym

def value_iteration(env, discount_factor, theta, max_iterations):
    """Run tabular value iteration and return the greedy policy with its values.

    Args:
        env: Environment exposing ``observation_space.n``, ``action_space.n`` and
            ``P[s][a]`` as an iterable of ``(prob, next_state, reward, done)`` tuples.
        discount_factor: Discount applied to the value of the next state.
        theta: Sweeping stops once the largest per-state change in a sweep is
            below this threshold.
        max_iterations: Maximum number of sweeps over the state space.

    Returns:
        A ``(policy, value_function)`` tuple: ``policy`` is a one-hot array of
        shape (num_states, num_actions) and ``value_function`` has shape
        (num_states,).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Value estimates start at zero for every state.
    values = np.zeros(n_states)

    def lookahead(state):
        # Expected one-step return of each action from `state`, using the
        # current value estimates.
        returns = np.zeros(n_actions)
        for action in range(n_actions):
            for p, successor, r, _done in env.P[state][action]:
                returns[action] += p * (r + discount_factor * values[successor])
        return returns

    for _ in range(max_iterations):
        largest_change = 0
        for state in range(n_states):
            previous = values[state]
            best = np.max(lookahead(state))
            values[state] = best
            largest_change = max(largest_change, np.abs(previous - best))
        if largest_change < theta:
            break

    # Greedy policy: put probability one on the best action in each state.
    greedy_policy = np.zeros((n_states, n_actions))
    for state in range(n_states):
        greedy_policy[state, np.argmax(lookahead(state))] = 1.0

    return greedy_policy, values


In [6]:
import gym
import numpy as np

# Value-iteration settings: discount factor, convergence threshold, sweep cap
discount_factor, theta, max_iterations = 0.99, 1e-8, 10000

# Build the FrozenLake-v1 environment
env = gym.make('FrozenLake-v1')

# Solve the MDP; the call returns the greedy policy and its value function
policy, value_function = value_iteration(
    env=env,
    discount_factor=discount_factor,
    theta=theta,
    max_iterations=max_iterations,
)

# Show the resulting policy (one row per state) and the state values
print("Optimal Policy:\n", policy)
print("Optimal Value Function:\n", value_function)


Optimal Policy:
 [[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]]
Optimal Value Function:
 [0.54202581 0.49880303 0.47069551 0.4568515  0.55845085 0.
 0.35834799 0.         0.59179866 0.64307976 0.6152075  0.
 0.         0.7417204  0.86283741 0.        ]


Compare the number of wins and the average return after 1000 episodes, and comment on which method performed better.

In [7]:
import gym
import numpy as np

def run_policy(policy, env, num_episodes):
    """Roll out a tabular policy and summarize its performance.

    Works with both Gym API styles: ``reset()`` returning either the
    observation alone or ``(observation, info)``, and ``step()`` returning
    either ``(obs, reward, done, info)`` or
    ``(obs, reward, terminated, truncated, info)``.

    Args:
        policy: Array of shape (num_states, num_actions); row ``s`` holds the
            action probabilities used in state ``s``.
        env: Environment exposing ``action_space.n``, ``reset()`` and ``step()``.
        num_episodes: Number of episodes to run.

    Returns:
        A ``(num_wins, average_return)`` tuple, where an episode counts as a win
        when its total reward is positive. Returns ``(0, 0.0)`` when
        ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        # Nothing to average over; avoid dividing by zero.
        return 0, 0.0

    actions = np.arange(env.action_space.n)
    num_wins = 0
    total_return = 0

    for _ in range(num_episodes):
        reset_result = env.reset()
        # Newer API: (observation, info dict). Older API: the observation alone.
        if (
            isinstance(reset_result, tuple)
            and len(reset_result) == 2
            and isinstance(reset_result[1], dict)
        ):
            state = reset_result[0]
        else:
            state = reset_result

        done = False
        episode_return = 0

        while not done:
            action = np.random.choice(actions, p=policy[state])
            step_result = env.step(action)
            if len(step_result) == 5:
                # Newer API splits the end-of-episode flag in two.
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            episode_return += reward
            state = next_state

        total_return += episode_return
        if episode_return > 0:
            num_wins += 1

    return num_wins, total_return / num_episodes


# Initialize the FrozenLake-v1 environment
env = gym.make('FrozenLake-v1')

# Set the discount factor, threshold, and maximum number of iterations for policy and value iteration
discount_factor = 0.99
theta = 1e-8
max_iterations = 10000

# Run policy iteration from the uniform random policy; only the policy is needed here
policy_pi, _ = policy_iteration(
    np.ones((env.observation_space.n, env.action_space.n)) / env.action_space.n,
    env,
    discount_factor,
    theta,
    max_iterations,
)

# Run value iteration. It already returns the greedy policy for its value
# function, so there is no need to re-derive that policy by hand here.
policy_vi, value_vi = value_iteration(env, discount_factor, theta, max_iterations)

# Run the policies for 1000 episodes and compare the results.
# NOTE(review): the policies printed earlier for the two methods are identical,
# so a gap between the two win counts below presumably reflects randomness in
# the rollouts rather than a difference between the methods. No seed is set, so
# the exact numbers vary from run to run.
num_episodes = 1000
num_wins_pi, avg_return_pi = run_policy(policy_pi, env, num_episodes)
num_wins_vi, avg_return_vi = run_policy(policy_vi, env, num_episodes)

# Print the results
print("Policy Iteration Results:")
print("Number of wins:", num_wins_pi)
print("Average return:", avg_return_pi)

print("Value Iteration Results:")
print("Number of wins:", num_wins_vi)
print("Average return:", avg_return_vi)


Policy Iteration Results:
Number of wins: 725
Average return: 0.725
Value Iteration Results:
Number of wins: 736
Average return: 0.736
