<a href="https://colab.research.google.com/github/premkumar6/Reinforcement_Learning/blob/main/Policy_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pprint
import sys
# Add the parent directory ("../") to the module search path once, so that
# re-running this cell does not append duplicate entries.
if "../" not in sys.path:
  sys.path.append("../") 

In [2]:
import gym
# Create the FrozenLake environment; the cells below read its tabular model
# (env.nS, env.nA, env.P).
env = gym.make('FrozenLake-v0')
# Reset to the start state; in a notebook the returned initial observation is
# echoed as the cell output.
env.reset()

0

In [3]:
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
  """
  Evaluate a policy given an environment and a full description of the
  environment's dynamics (iterative policy evaluation, in-place sweeps).

  Args:
    policy: [S, A] shaped matrix representing the policy; policy[s][a] is the
        probability of taking action a in state s.
    env: OpenAI env. env.P represents the transition probabilities of the
        environment: env.P[s][a] is a list of transition tuples
        (prob, next_state, reward, done).
        env.nS is the number of states in the environment.
    discount_factor: Gamma discount factor.
    theta: We stop evaluation once the value function change is less than
        theta for all states.

  Returns:
    Vector of length env.nS representing the value function.
  """
  V = np.zeros(env.nS)
  while True:
    # Largest absolute change of any state's value during this sweep.
    delta = 0.0

    for s in range(env.nS):
      v = 0.0
      # Look at the possible next actions under the policy...
      for a, action_prob in enumerate(policy[s]):
        # ...and, for each action, at the possible next states.
        for prob, next_state, reward, _done in env.P[s][a]:
          # Accumulate the expected value (Bellman expectation backup).
          v += action_prob * prob * (reward + discount_factor * V[next_state])

      # Only compare and store once the full expectation for state s has
      # been accumulated; partial sums must not leak into V or delta.
      delta = max(delta, abs(v - V[s]))
      V[s] = v

    # Convergence is checked once per full sweep over all states.
    if delta < theta:
      break
  return V
    

In [14]:
def policy_improvement(env, policy_eval_fn=policy_eval, discount_factor=0.1):
  """
  Policy iteration: alternately evaluate the current policy and make it
  greedy with respect to the resulting value function until it is stable.

  Args:
    env: OpenAI env exposing env.nS (number of states), env.nA (number of
        actions) and env.P[s][a], a list of transition tuples
        (prob, next_state, reward, done).
    policy_eval_fn: Policy evaluation function, called as
        policy_eval_fn(policy, env, discount_factor); must return a vector of
        length env.nS.
    discount_factor: Gamma discount factor.
        NOTE(review): the default here is 0.1 - confirm this is intended
        rather than 1.0.

  Returns:
    A tuple (policy, V). policy is an [S, A] shaped matrix with a one-hot row
    per state (the greedy action), and V is the value function returned by
    the last evaluation.
  """
  def one_step_lookahead(state, V):
    """Return a vector of length env.nA with the expected value of each action in `state`."""
    A = np.zeros(env.nA)
    for a in range(env.nA):
      # Index by the `state` argument, not by a variable captured from the
      # enclosing loop.
      for prob, next_state, reward, _done in env.P[state][a]:
        A[a] += prob * (reward + discount_factor * V[next_state])
    return A

  # Start with a uniform random policy.
  policy = np.ones([env.nS, env.nA]) / env.nA

  # Row i is the one-hot distribution that always picks action i.
  one_hot = np.eye(env.nA)

  while True:
    V = policy_eval_fn(policy, env, discount_factor)

    policy_stable = True
    for s in range(env.nS):
      # Action the current policy prefers in this state.
      chosen_a = np.argmax(policy[s])
      # Greedy action under the freshly evaluated value function.
      action_values = one_step_lookahead(s, V)
      best_a = np.argmax(action_values)

      if chosen_a != best_a:
        policy_stable = False
      policy[s] = one_hot[best_a]

    # No state changed its greedy action: the policy has converged.
    if policy_stable:
      return policy, V


In [None]:
# Run policy iteration on the FrozenLake environment and report the result.
policy, v = policy_improvement(env)
print("Policy Probability Distribution:")
print(policy)
print("")

# FrozenLake has no `shape` attribute; its map is stored in `desc`, whose
# shape is (rows, cols). `unwrapped` bypasses any wrapper added by gym.make.
grid_shape = env.unwrapped.desc.shape

# FrozenLake encodes actions as 0=left, 1=down, 2=right, 3=up.
print("Reshaped Grid Policy (0=left, 1=down, 2=right, 3=up):")
print(np.reshape(np.argmax(policy, axis=1), grid_shape))
print("")

print("Value Function:")
print(v)
print("")

print("Reshaped Grid Value Function:")
print(v.reshape(grid_shape))
print("")