In [1]:
pip install gym



In [2]:
import gym
import numpy as np

# Create the FrozenLake environment. The code below reads its state/action
# counts (observation_space.n / action_space.n) and its transition table (env.P).
env = gym.make("FrozenLake-v1")

# Initialize value function and policy: one value estimate per state (all zero)
# and one integer action index per state (all action 0). Both arrays are
# module-level and are updated in place by evaluate_policy / improve_policy.
V = np.zeros(env.observation_space.n)
policy = np.zeros(env.observation_space.n, dtype=int)

# Discount factor applied to the successor state's value in every backup
gamma = 0.99

# Policy evaluation
def evaluate_policy(theta=1e-6):
    """Evaluate the current global ``policy``, updating the global ``V`` in place.

    Each sweep visits every state and replaces ``V[s]`` with the expected
    one-step return of taking ``policy[s]``, computed from the transition
    entries in ``env.P[s][policy[s]]`` and the global discount ``gamma``.
    Updates are applied in place, so states visited later in a sweep already
    see the values refreshed earlier in that same sweep. Sweeps repeat until
    the largest change within a sweep drops below ``theta``.

    Args:
        theta: Convergence threshold on the largest per-sweep change in ``V``.
            Defaults to 1e-6, the value that was previously hard-coded.

    Returns:
        None. The result is written into the global ``V``.
    """
    n_states = env.observation_space.n  # loop-invariant, so read it once
    while True:
        delta = 0  # largest |old - new| observed during this sweep
        for s in range(n_states):
            v = V[s]
            # Each transition entry unpacks to (probability, next state,
            # reward, <ignored fourth field>).
            V[s] = sum([p * (r + gamma * V[s_]) for p, s_, r, _ in env.P[s][policy[s]]])
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

# Policy improvement
def improve_policy():
    """Make the global ``policy`` greedy with respect to the global ``V``.

    For every state, the expected one-step return of each action is computed
    from ``env.P`` and ``gamma``, and ``policy[state]`` is overwritten with the
    index of the highest-valued action (``np.argmax``, so the lowest index
    wins on exact ties).

    Returns:
        True if no state's action changed during this pass, False otherwise.
    """
    changed = False
    for state in range(env.observation_space.n):
        previous = policy[state]

        # Expected return of each candidate action from this state.
        action_values = []
        for action in range(env.action_space.n):
            expected = sum(
                [prob * (reward + gamma * V[nxt]) for prob, nxt, reward, _ in env.P[state][action]]
            )
            action_values.append(expected)

        policy[state] = np.argmax(action_values)
        if policy[state] != previous:
            changed = True
    return not changed

# Policy iteration: alternate a full policy evaluation with a greedy policy
# improvement, stopping once improve_policy() reports that no action changed.
while True:
    evaluate_policy()
    if improve_policy():
        break

# Print the resulting value function and policy.
# reshape((4, 4)) requires exactly 16 states and displays them as a 4x4 grid.
print("Optimal Value Function:")
print(V.reshape((4, 4)))
print("Optimal Policy (0=Left, 1=Down, 2=Right, 3=Up):")
print(policy.reshape((4, 4)))


Optimal Value Function:
[[0.54201384 0.49878716 0.47067695 0.45683159]
 [0.55844022 0.         0.35833998 0.        ]
 [0.59178998 0.64307352 0.61520205 0.        ]
 [0.         0.7417161  0.86283524 0.        ]]
Optimal Policy (0=Left, 1=Down, 2=Right, 3=Up):
[[0 3 3 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


  deprecation(
  deprecation(


In [3]:
# Dynamic programming on a tabular MDP: policy evaluation, policy improvement, and value iteration
import numpy as np

class MDP:
    """Finite, tabular Markov decision process solved with dynamic programming.

    Attributes:
        num_states: Number of states.
        num_actions: Number of actions.
        transition_probs: Array of shape (num_states, num_actions, num_states);
            entry ``[s, a, s']`` is the probability of moving from ``s`` to
            ``s'`` under action ``a``.
        rewards: Array of shape (num_states, num_actions); entry ``[s, a]`` is
            the reward for taking action ``a`` in state ``s``.
        gamma: Discount factor applied to successor-state values.
    """

    def __init__(self, num_states, num_actions, transition_probs, rewards, gamma=0.9):
        """Store the model.

        Args:
            num_states: Number of states.
            num_actions: Number of actions.
            transition_probs: Array-like of shape
                (num_states, num_actions, num_states).
            rewards: Array-like of shape (num_states, num_actions).
            gamma: Discount factor (default 0.9).

        Raises:
            ValueError: If either table does not have the shape stated above.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        # np.asarray returns ndarrays unchanged and lets nested lists be
        # indexed with [s, a, s'] like the rest of the class expects.
        self.transition_probs = np.asarray(transition_probs)  # shape: (num_states, num_actions, num_states)
        self.rewards = np.asarray(rewards)  # shape: (num_states, num_actions)
        self.gamma = gamma

        expected_p_shape = (num_states, num_actions, num_states)
        if self.transition_probs.shape != expected_p_shape:
            raise ValueError(
                f"transition_probs must have shape {expected_p_shape}, "
                f"got {self.transition_probs.shape}"
            )
        expected_r_shape = (num_states, num_actions)
        if self.rewards.shape != expected_r_shape:
            raise ValueError(
                f"rewards must have shape {expected_r_shape}, got {self.rewards.shape}"
            )

    def policy_evaluation(self, policy, tol=1e-6):
        """Compute the value function of a fixed deterministic policy.

        Starts from an all-zero value function and sweeps the states, updating
        each value in place, until the largest change in a sweep is below
        ``tol``. There is no iteration cap, so termination relies on the
        updates converging.

        Args:
            policy: Sequence of length num_states giving the action index
                taken in each state.
            tol: Convergence threshold on the largest per-sweep change.

        Returns:
            Array of length num_states with the evaluated state values.
        """
        V = np.zeros(self.num_states)
        while True:
            delta = 0
            for s in range(self.num_states):
                v = V[s]
                V[s] = self._expected_return(s, policy[s], V)
                delta = max(delta, abs(v - V[s]))
            if delta < tol:
                break
        return V

    def policy_improvement(self, V):
        """Return the policy that is greedy with respect to ``V``.

        Args:
            V: Sequence of length num_states with state values.

        Returns:
            Integer array of length num_states; entry ``s`` is the action with
            the highest one-step value in state ``s`` (lowest index on ties).
        """
        policy = np.zeros(self.num_states, dtype=int)
        for s in range(self.num_states):
            policy[s] = np.argmax(self._bellman_operator(s, V))
        return policy

    def value_iteration(self, tol=1e-6):
        """Run value iteration and extract the greedy policy.

        Starts from an all-zero value function and repeatedly replaces each
        state's value, in place, with the best one-step action value until the
        largest change in a sweep is below ``tol``. There is no iteration cap,
        so termination relies on the updates converging.

        Args:
            tol: Convergence threshold on the largest per-sweep change.

        Returns:
            Tuple ``(V, policy)``: the converged values and the policy that is
            greedy with respect to them.
        """
        V = np.zeros(self.num_states)  # Initialize value function
        while True:
            delta = 0
            for s in range(self.num_states):
                v = V[s]
                V[s] = self._bellman_operator(s, V).max()
                delta = max(delta, abs(v - V[s]))
            if delta < tol:
                break
        policy = self.policy_improvement(V)
        return V, policy

    def _expected_return(self, state, action, V):
        """One-step backup: sum over s' of P[s, a, s'] * (R[s, a] + gamma * V[s'])."""
        next_state_probs = self.transition_probs[state, action]
        targets = self.rewards[state, action] + self.gamma * np.asarray(V)
        return np.dot(next_state_probs, targets)

    def _bellman_operator(self, state, V):
        """Return the one-step value of every action in ``state`` as a float array."""
        Q = np.zeros(self.num_actions)
        for a in range(self.num_actions):
            Q[a] = self._expected_return(state, a, V)
        return Q

# Example usage: a 3-state, 2-action MDP solved with value iteration.
num_states = 3
num_actions = 2
# transition_probs[s, a, s']: each innermost row lists the next-state
# probabilities for one (state, action) pair.
transition_probs = np.array([[[0.5, 0.5, 0.0], [1.0, 0.0, 0.0]],
                             [[0.0, 0.0, 1.0], [0.0, 0.0, 1.0]],
                             [[0.0, 1.0, 0.0], [0.5, 0.5, 0.0]]])
# rewards[s, a]: one row per state, one column per action.
rewards = np.array([[1.0, 2.0], [0.0, 0.0], [5.0, -1.0]])

# gamma is not passed, so the constructor default (0.9) is used.
mdp = MDP(num_states, num_actions, transition_probs, rewards)
# NOTE: this rebinds the module-level names V and policy, which
# evaluate_policy() / improve_policy() above read as globals.
V, policy = mdp.value_iteration()
print("Optimal value function:", V)
print("Optimal policy:", policy)


Optimal value function: [21.1961682  23.68420728 26.31578656]
Optimal policy: [0 0 0]
