In [1]:
!pip install gymnasium



In [2]:
import numpy as np
import matplotlib.pyplot as plt
import random
from gymnasium import Env, spaces, register, make

In [3]:
#Random-Maze Environment Implementation

class RMEnv(Env):
    metadata = {"render_modes": ["human", "rgb_array"], "render_fps": 4}

    def __init__(self, render_mode=None):

        self.P = {
            0: {
                0: [(0.9, 0, -0.04, False),(0.1, 1, -0.04, False)],
                1: [(0.8, 1, -0.04, False),(0.1, 4, -0.04, False),(0.1, 0, -0.04, False)],
                2: [(0.8, 4, -0.04, False),(0.1, 1, -0.04, False),(0.1, 0, -0.04, False)],
                3: [(0.9, 0, -0.04, False),(0.1, 4, -0.04, False)]

            },
            1: {
                0: [(0.8, 1, -0.04, False),(0.1, 0, -0.04, False),(0.1, 2, -0.04, False)],
                1: [(0.8, 2, -0.04, False),(0.2, 1, -0.04, False)],
                2: [(0.8, 1, -0.04, False),(0.1, 0, -0.04, False),(0.1, 2, -0.04, False)],
                3: [(0.8, 0, -0.04, False),(0.2, 1, -0.04, False)]

            },
            2: {
                0: [(0.8, 2, -0.04, False),(0.1, 3, 1, True),(0.1, 1, -0.04, False)],
                1: [(0.8, 3, 1, True),(0.1, 2, -0.04, False),(0.1, 6, -0.04, False)],
                2: [(0.8, 6, -0.04, False),(0.1, 1, -0.04, False),(0.1, 3, 1, True)],
                3: [(0.8, 1, -0.04, False),(0.1, 2, -0.04, False),(0.1, 6, -0.04, False)]

            },
            3: {
                0: [(1.0, 3, 0, True)],
                1: [(1.0, 3, 0, True)],
                2: [(1.0, 3, 0, True)],
                3: [(1.0, 3, 0, True)]

            },
            4: {
                0: [(0.8, 0, -0.04, False),(0.2, 4, -0.04, False)],
                1: [(0.8, 4, -0.04, False),(0.1, 0, -0.04, False),(0.1, 8, -0.04, False)],
                2: [(0.8, 8, -0.04, False),(0.2, 4, -0.04, False)],
                3: [(0.8, 4, -0.04, False),(0.1, 0, -0.04, False),(0.1, 8, -0.04, False)]

            },
            5: {
                0: [(0.0, 5, 0, False)],
                1: [(0.0, 5, 0, False)],
                2: [(0.0, 5, 0, False)],
                3: [(0.0, 5, 0, False)]

            },

            6: {
                0: [(0.8, 2, -0.04, False),(0.1, 7, -1, True),(0.1, 6, -0.04, False)],
                1: [(0.8, 7, -1, True),(0.1, 2, -0.04, False),(0.1, 10, -0.04, False)],
                2: [(0.8, 10, -0.04, False),(0.1, 7, -1, True),(0.1, 6, -0.04, False)],
                3: [(0.8, 6, -0.04, False),(0.1, 2, -0.04, False),(0.1, 10, -0.04, False)]

            },
            7: {
                0: [(1.0, 7, 0, True)],
                1: [(1.0, 7, 0, True)],
                2: [(1.0, 7, 0, True)],
                3: [(1.0, 7, 0, True)]

            },
            8: {
                0: [(0.8, 4, -0.04, False),(0.1, 9, -0.04, False),(0.1, 8, -0.04, False)],
                1: [(0.8, 9, -0.04, False),(0.1, 4, -0.04, False),(0.1, 8, -0.04, False)],
                2: [(0.9, 8, -0.04, False),(0.1, 9, -0.04, False)],
                3: [(0.9, 8, -0.04, False),(0.1, 4, -0.04, False)]

            },
            9: {
              0: [(0.8, 9, -0.04, False),(0.1, 8, -0.04, False),(0.1, 10, -0.04, False)],
              1: [(0.8, 10, -0.04, False),(0.2, 9, -0.04, False)],
              2: [(0.8, 9, -0.04, False),(0.1, 10, -0.04, False),(0.1, 8, -0.04, False)],
              3: [(0.8, 8, -0.04, False),(0.2, 9, -0.04, False)]

            },
            10: {
              0: [(0.8, 6, -0.04, False),(0.1, 11, -0.04, False),(0.1, 9, -0.04, False)],
              1: [(0.8, 11, -0.04, False),(0.1, 6, -0.04, False),(0.1, 10, -0.04, False)],
              2: [(0.8, 10, -0.04, False),(0.1, 11, -0.04, False),(0.1, 9, -0.04, False)],
              3: [(0.8, 9, -0.04, False),(0.1, 10, -0.04, False),(0.1, 6, -0.04, False)]

            },
            11: {
                0: [(0.8, 7, -1, True),(0.1, 11, -0.04, False),(0.1, 10, -0.04, False)],
                1: [(0.9, 11, -0.04, False),(0.1, 7, -1, True)],
                2: [(0.9, 11, -0.04, False),(0.1, 10, -0.04, False)],
                3: [(0.8, 10, -0.04, False),(0.1, 11, -0.04, False),(0.1, 7, -1, True)]

            },
        }
        self.size = 12 # The size of the 1D grid

       #We have 3 observations, corresponding to each position in the 1-D grid
        self.observation_space = spaces.Discrete(self.size)

        #We have 2 actions, corresponding to "left" & "right"
        self.action_space = spaces.Discrete(4)

        assert render_mode is None or render_mode in self.metadata["render_modes"]
        self.render_mode = render_mode

        self.window = None
        self.clock = None

    def _get_obs(self):

        return self._agent_location


    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self._agent_location = 8
        self._target_location = 3
        self._dead_state = 7


        observation = self._get_obs()

        if self.render_mode == "human":
            self._render_frame()

        return observation


    def step(self, action):

        prev_location = self._agent_location
        transitions = self.P[prev_location][action]
        probabilities, next_states, rewards, termination = zip(*transitions)

        # Randomly select a transition based on the probabilities
        index = random.choices(range(len(probabilities)), weights=probabilities, k=1)[0]
        prob,self._agent_location, reward, terminated = probabilities[index],next_states[index], rewards[index], termination[index]

        observation = self._get_obs()
        truncated = False

        # "current_state": prev_location,"action":action,"next_state": self._agent_location,"reward":reward,"done":terminated
        tansition_info={"current_state": prev_location,
                       "action":action,
                        "next_state": self._agent_location,
                        "reward":reward,
                        "done":terminated}

        if self.render_mode == "human":
            self._render_frame()

        # Return the required 5-tuple
        return observation, reward, terminated,truncated,tansition_info

register(id='RMEnv', entry_point=RMEnv)

In [63]:
#implementing enviroment
env1 = make('RMEnv', render_mode="rgb_array")
observation= env1.reset(seed=32)
for i in range(10):
  action = env1.action_space.sample()
  observation, reward, terminated,truncated,info = env1.step(action)

  print(info,"\n")

  if terminated or truncated:
      observation= env1.reset(seed=32)
      if reward==1:
        print("Goal")
      else:
        print("Hole")

{'current_state': 8, 'action': 1, 'next_state': 9, 'reward': -0.04, 'done': False} 

{'current_state': 9, 'action': 0, 'next_state': 9, 'reward': -0.04, 'done': False} 

{'current_state': 9, 'action': 2, 'next_state': 10, 'reward': -0.04, 'done': False} 

{'current_state': 10, 'action': 2, 'next_state': 10, 'reward': -0.04, 'done': False} 

{'current_state': 10, 'action': 0, 'next_state': 6, 'reward': -0.04, 'done': False} 

{'current_state': 6, 'action': 0, 'next_state': 2, 'reward': -0.04, 'done': False} 

{'current_state': 2, 'action': 2, 'next_state': 6, 'reward': -0.04, 'done': False} 

{'current_state': 6, 'action': 1, 'next_state': 7, 'reward': -1, 'done': True} 

Hole
{'current_state': 8, 'action': 2, 'next_state': 8, 'reward': -0.04, 'done': False} 

{'current_state': 8, 'action': 1, 'next_state': 9, 'reward': -0.04, 'done': False} 



question 2


In [61]:
gamma=0.99
#Policy
#towards goal
p_g=[[0.0,1.0,0.0,0.0],[0.0,1.0,0.0,0.0],[0.0,1.0,0.0,0.0],[0.25,0.25,0.25,0.25],[0.0,1.0,0.0,0.0],[0.25,0.25,0.25,0.25],[1.0,0.0,0.0,0.0],[0.25,0.25,0.25,0.25],[1.0,0.0,0.0,0.0],[0.0,1.0,0.0,0.0],[1.0,0.0,0.0,0.0],[1.0,0.0,0.0,0.0]]
#away from hole
p_h=[[0.0,0.0,0.0,1.0],[0.0,0.0,0.0,1.0],[0.0,0.0,0.0,1.0],[0.25,0.25,0.25,0.25],[0.0,0.0,0.0,1.0],[0.25,0.25,0.25,0.25],[0.0,0.0,0.0,1.0],[0.25,0.25,0.25,0.25],[0.0,0.0,0.0,1.0],[0.0,0.0,1.0,0.0],[0.0,0.0,1.0,0.0],[0.0,0.0,1.0,0.0]]
#random
p_r=[[0.1,0.1,0.7,0.1],[0.1,0.7,0.1,0.1],[0.7,0.1,0.1,0.1],[0.25,0.25,0.25,0.25],[0.1,0.7,0.1,0.1],[0.25,0.25,0.25,0.25],[0.1,0.1,0.1,0.7],[0.25,0.25,0.25,0.25],[0.1,0.1,0.7,0.1],[0.1,0.1,0.1,0.7],[0.7,0.1,0.1,0.1],[0.1,0.7,0.1,0.1]]


In [52]:
#1
#policy evaluation
def policy_evaluation(env,pi,gamma,theta):
  v_old=np.zeros(env.observation_space.n)
  i=0
  while 1:
    v_new=np.zeros(env.observation_space.n)
    for s in range(env.observation_space.n):
      for a in range(env.action_space.n):
        temp=0
        for t_prob,n_s,r,_ in env.P[s][a]:
          temp=temp+t_prob*(r+gamma*v_old[n_s])

        v_new[s]+=pi[s][a]*temp
    diff=np.abs(v_new-v_old)
    i=i+1
    if np.max(diff)<theta:
      break
    v_old=v_new


  return v_new,i

#policy improvement
def policy_improvement(env,v,gamma):
  Q=np.zeros((env.observation_space.n,env.action_space.n))
  pi=np.zeros(env.observation_space.n)
  for s in range(env.observation_space.n):
    for a in range(env.action_space.n):
      for t_prob,n_s,r,_ in env.P[s][a]:
        Q[s][a]+=t_prob*(r+gamma*v[n_s])

  for s in range(env.observation_space.n):
    pi[s]=np.argmax(Q[s])

  return pi

#policy iteration
def poilcy_iteration(env,gamma,theta,pi):
  pie=np.zeros(env.observation_space.n)
  for s in range(env.observation_space.n):
    pie[s]=random.choices(range(env.action_space.n), weights=pi[s], k=1)[0]
  i=0
  while 1:
    pi_old=pie
    v,_=policy_evaluation(env,pi,gamma,theta)
    pie=policy_improvement(env,v,gamma)
    i=i+1
    if np.array_equal(pi_old, pie):
      break
  return v,pie,i


In [53]:
#implementing policy iteration
theta=1e-10
#towards goal
V_g,Pi_g,n_g=poilcy_iteration(env1,gamma,theta,p_g)

#away from hole
V_h,P_h,n_h=poilcy_iteration(env1,gamma,theta,p_h)

#random
V_r,P_r,n_r=poilcy_iteration(env1,gamma,theta,p_r)



  logger.warn(


In [54]:
print("policy:towards the goal")
print("value function",V_g,"\n")
print("optimum policy",Pi_g,"\n")
print("num of iterations",n_g,"\n\n")

print("policy:away from Hole")
print("value function",V_h,"\n")
print("optimum policy",P_h,"\n")
print("num of iterations",n_h,"\n\n")

print("policy:Random")
print("value function",V_r,"\n")
print("optimum policy",P_r,"\n")
print("num of iterations",n_r,"\n\n")


policy:towards the goal
value function [ 0.7737106   0.89286374  0.95464233  0.          0.30267844  0.
  0.68820946  0.          0.26626026  0.40585022  0.46147965 -0.84607493] 

optimum policy [1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 3.] 

num of iterations 2 


policy:away from Hole
value function [-3.99999999 -3.99999999 -3.99999999  0.         -3.99999999  0.
 -3.99999999  0.         -3.99999999 -3.99999999 -3.99999999 -3.99999999] 

optimum policy [0. 1. 1. 0. 0. 0. 1. 0. 2. 1. 0. 0.] 

num of iterations 2 


policy:Random
value function [-1.3039768  -0.1752878   0.08592316  0.         -1.5683866   0.
 -0.62675083  0.         -1.67928545 -1.60208747 -0.91212583 -1.06170702] 

optimum policy [1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 3.] 

num of iterations 2 




In [55]:
#value iteration
def value_iteration(env,gamma,theta):
  v=np.zeros(env.observation_space.n)
  pi=np.zeros(env.observation_space.n)
  v_new=np.zeros(env.observation_space.n)
  i=0
  while 1:
    Q=np.zeros((env.observation_space.n,env.action_space.n))
    for s in range(env.observation_space.n):
      for a in range(env.action_space.n):
        for t_prob,n_s,r,_ in env.P[s][a]:
          Q[s][a]+=t_prob*(r+gamma*v[n_s])
    v_new=np.max(Q, axis=1)
    diff=np.abs(v-v_new)
    i=i+1
    if np.max(diff)<theta:
      break
    v=v_new

  for s in range(env.observation_space.n):
    pi[s]=np.argmax(Q[s])

  return v,pi,i

In [56]:
V,Pi,n=value_iteration(env1,gamma,theta)
print("Value Iteration")
print("value function",V,"\n")
print("optimum policy",Pi,"\n")
print("num of iterations",n,"\n\n")

Value Iteration
value function [0.82442985 0.89286374 0.95464233 0.         0.76427487 0.
 0.68820946 0.         0.69763948 0.63906542 0.60613373 0.38186228] 

optimum policy [1. 1. 1. 0. 0. 0. 0. 0. 0. 3. 0. 3.] 

num of iterations 38 


