In [15]:
import gym
import numpy as np

In [2]:
gym.envs.register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.74
)

In [3]:
# Create the gridworld-like environment
env=gym.make('FrozenLakeNotSlippery-v0')
# Let's look at the model of the environment (i.e., P):
env.env.P
# Question: what is the data in this structure saying? Relate this to the course
# presentation of P

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 4, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 5, 0.0, True)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 6, 0.0, False)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 7, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0.0, False)],
  1: [(1.0, 8, 0.0, False)],
  2: [(1.0, 5, 0.0, True)],
  3: [(1.0, 0, 0.0, False)]},
 5: {0: [(1.0, 5, 0, True)],
  1: [(1.0, 5, 0, True)],
  2: [(1.0, 5, 0, True)],
  3: [(1.0, 5, 0, True)]},
 6: {0: [(1.0, 5, 0.0, True)],
  1: [(1.0, 10, 0.0, False)],
  2: [(1.0, 7, 0.0, True)],
  3: [(1.0, 2, 0.0, False)]},
 7: {0: [(1.0, 7, 0, True)],
  1: [(1.0, 7, 0, True)],
  2: [(1.0, 7, 0, True)],
  3: [(1.0, 7, 0, True)]},
 8: {0: [(1.0, 8, 0.0, False)],
  1: [(1.0, 12, 0.0, True)],
  2: [(

Data in this structure represents the dynamics function p. The first number states the current state index, the number from 0 to 3 represents the four actions the agent can take at each state(0 : $\leftarrow$, 1 : $\downarrow$, 2 : $\rightarrow$, 3 : $\uparrow$). The four elements in the list represent to:
1. Probability of transitioning to the next state. Because the environment is deterministic, the probability equals 1 all the time.
2. The index of the next state
3. The reward. You receive a reward of 1 if you reach the goal, and zero otherwise.
4. Whether the state ends the episode. It is `True` if the state is the hole or goal, `False` if the state is the starting point or frozen surface.


In [4]:
# Now let's investigate the observation space (i.e., S using our nomenclature),
# and confirm we see it is a discrete space with 16 locations
print(env.observation_space)

Discrete(16)


In [5]:
stateSpaceSize = env.observation_space.n
print(stateSpaceSize)

16


In [6]:
# Now let's investigate the action space (i.e., A) for the agent->environment
# channel
print(env.action_space)

Discrete(4)


In [7]:
# The gym environment has ...sample() functions that allow us to sample
# from the above spaces:
for g in range(1,10,1):
  print("sample from S:",env.observation_space.sample()," ... ","sample from A:",env.action_space.sample())

sample from S: 15  ...  sample from A: 0
sample from S: 11  ...  sample from A: 2
sample from S: 6  ...  sample from A: 1
sample from S: 1  ...  sample from A: 1
sample from S: 12  ...  sample from A: 0
sample from S: 3  ...  sample from A: 1
sample from S: 15  ...  sample from A: 0
sample from S: 8  ...  sample from A: 2
sample from S: 10  ...  sample from A: 1


In [8]:
# The enviroment also provides a helper to render (visualize) the environment
env.reset()
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [9]:
# We can act as the agent, by selecting actions and stepping the environment
# through time to see its responses to our actions
env.reset()
exitCommand=False
while not(exitCommand):
  env.render()
  print("Enter the action as an integer from 0 to",env.action_space.n," (or exit): ")
  userInput=input()
  if userInput=="exit":
    break
  action=int(userInput)
  (observation, reward, compute, probability) = env.step(action)
  print("--> The result of taking action",action,"is:")
  print("     S=",observation)
  print("     R=",reward)
  print("     p=",probability)

  env.render()



[41mS[0mFFF
FHFH
FFFH
HFFG
Enter the action as an integer from 0 to 4  (or exit): 
1
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
Enter the action as an integer from 0 to 4  (or exit): 
2
--> The result of taking action 2 is:
     S= 5
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG
Enter the action as an integer from 0 to 4  (or exit): 
exit


In [None]:
# Question: draw a table indicating the correspondence between the action
# you input (a number) and the logic action performed.
# Question: draw a table that illustrates what the symbols on the render image
# mean?
# Question: Explain what the objective of the agent is in this environment?

| Number | Action |
| ----------- | ----------- |
| `0` | $\leftarrow$ left |
| `1` | $\downarrow$ down | 
| `2` | $\rightarrow$ right| 
| `3` | $\uparrow$ up|


---
| Symbol | Meaning |
|---------|--------|
| S | starting point, safe |
| F | frozen surface, safe |
| H | hole, fall to your doom |
| G | goal, where the frisbee is located|


---


The object of the agent is to reach the goal(G).









In [None]:
# Practical: Code up an AI that will employ random action selection in order
# to drive the agent. Test this random action selection agent with the
# above environment (i.e., code up a loop as I did above, but instead
# of taking input from a human user, take it from the AI you coded).

In [9]:
#An AI that will employ random action selection
def random_ai(env,policy):
  env.reset()
  exitCommand = False
  env.render()
  while not(exitCommand):
    action = policy(env)
    (observation, reward, compute, probability) = env.step(action)
    print("--> The result of taking action",action,"is:")
    print("     S=",observation)
    print("     R=",reward)
    print("     p=",probability)

    env.render()
    #exit when the agent terminates the episode
    if compute is True:
      exitCommand = True

env.seed(3)
policy = lambda env: env.action_space.sample()
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 3 is:
     S= 0
     R= 0.0
     p= {'prob': 1.0}
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 3 is:
     S= 0
     R= 0.0
     p= {'prob': 1.0}
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 2 is:
     S= 5
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
F[41mH[0mFH
FFFH
HFFG


In [None]:
# Now towards dynamic programming. Note that env.env.P has the model
# of the environment.
#
# Question: How would you represent the agent's policy function and value function?
# Practical: revise the above AI solver to use a policy function in which you
# code the random action selections in the policy function. Test this.
# Practical: Code the C-4 Policy Evaluation (Prediction) algorithm. You may use
# either the inplace or ping-pong buffer (as described in the lecture). Now
# randomly initialize your policy function, and compute its value function.
# Report your results: policy and value function. Ensure your prediction
# algo reports how many iterations it took.
#
# (Optional): Repeat the above for q.
#
# Policy Improvement:
# Question: How would you use P and your value function to improve an arbitrary
# policy, pi, per Chapter 4?
# Practical: Code the policy iteration process, and employ it to arrive at a
# policy that solves this problem. Show your testing results, and ensure
# it reports the number of iterations for each step: (a) overall policy
# iteration steps and (b) evaluation steps.
# Practical: Code the value iteration process, and employ it to arrive at a
# policy that solves this problem. Show your testing results, reporting
# the iteration counts.
# Comment on the difference between the iterations required for policy vs
# value iteration.
#
# Optional: instead of the above environment, use the "slippery" Frozen Lake via
# env = gym.make("FrozenLake-v0")



> Question: How would you represent the agent's policy function and value function?

The agent's policy function can be implemented as an N-element(N is the length of the observation_space, the number of states) array lookup table of integer(0 to 3), representing choosing the action at that state.

The value function can be implemented as an N-element array lookup table of float values.

In [10]:
# Practical: revise the above AI solver to use a policy function in which you
# code the random action selections in the policy function. Test this.
manual_policy = [1,2,1,0,
                 1,1,1,0,
                 2,1,1,0,
                 2,2,2,1]
policy = lambda env: manual_policy[env.env.s]
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 8
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
--> The result of taking action 2 is:
     S= 9
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
--> The result of taking action 1 is:
     S= 13
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
--> The result of taking action 2 is:
     S= 14
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
--> The result of taking action 2 is:
     S= 15
     R= 1.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [62]:
# Practical: Code the C-4 Policy Evaluation (Prediction) algorithm. You may use
# either the inplace or ping-pong buffer (as described in the lecture). Now
# randomly initialize your policy function, and compute its value function.
# Report your results: policy and value function. Ensure your prediction
# algo reports how many iterations it took.
def policy_prediction(policy, threshold, gamma, V, env):
  # Initialize V(s), for all s in S+, arbitrarily except that V(terminal) = 0
  S = env.observation_space.n
  if V is None:
    V = np.zeros(S)
  
  iterations = 0
  converged = False
  while not converged:
    delta = 0
    iterations += 1

    for state in range(S):
      v = V[state]
      #Assumes the policy only returns one action with probability 1.
      action = policy[state]
      sum = 0
      for p, next_state, r, terminate in env.env.P[state][action]: 
        sum += p*(r + gamma*V[next_state])
      V[state] = sum
      delta = max(delta, np.abs(v - V[state]))
    converged = True if delta < threshold else False
  return V, iterations

#randomly initialize your policy function
print("Random policy:")
policy = np.random.RandomState(0).randint(0, 4, size=16)
print(policy.reshape(4, 4))
V, iterations = policy_prediction(policy, 1e-6, 0.9, None, env)
print("policy prediction converged in %d itreations"%iterations)
print("Final values:")
print(V.reshape(4, 4))

Random policy:
[[0 3 1 0]
 [3 3 3 3]
 [1 3 1 2]
 [0 3 2 0]]
policy prediction converged in 3 itreations
Final values:
[[0.  0.  0.  0. ]
 [0.  0.  0.  0. ]
 [0.  0.  0.9 0. ]
 [0.  0.  1.  0. ]]


> Question: How would you use P and your value function to improve an arbitrary policy, pi, per Chapter 4?

By using policy iteration which iteratively improves policies and value functions through implementing policy evaluation and policy improvement until we find the optimal policy and value function.

In [72]:
# Practical: Code the policy iteration process, and employ it to arrive at a
# policy that solves this problem. Show your testing results, and ensure
# it reports the number of iterations for each step: (a) overall policy
# iteration steps and (b) evaluation steps.
def policy_iteration(V, gamma, threshold,env):
  S = env.observation_space.n
  A = env.action_space.n
  #initialize
  if V is None:
    V = np.zeros(S)
  pi = np.random.randint(0, A, size = S)
  print("random policy:")
  print(pi.reshape(4,4))
  policy_step = 0
  while True:
    policy_step += 1
    print('policy iteration',policy_step)
    #policy evaluation
    V, iter = policy_prediction(pi,threshold,gamma,V,env)
    print("policy evaluation converged in %d itreations"%iter)
    policy_stable = True
    for state in range(S):
      old_action = pi[state]
      values = np.zeros(A)
      for action in range(A):
        for p, next_state, r, terminate in env.env.P[state][action]: 
          values[action] += p*(r+gamma*V[next_state])
      pi[state] = np.argmax(values)
      if old_action != pi[state]:
        policy_stable = False
    if policy_stable:
      return V, pi

V_star, pi_star = policy_iteration(None, 0.9, 1e-6, env)
print("Final value:")
print(V_star.reshape(4, 4))
print("Final policy:")
print(pi_star.reshape(4, 4))

random policy:
[[1 2 3 1]
 [2 3 1 3]
 [2 2 3 2]
 [0 1 2 0]]
policy iteration 1
policy evaluation converged in 2 itreations
policy iteration 2
policy evaluation converged in 2 itreations
policy iteration 3
policy evaluation converged in 2 itreations
policy iteration 4
policy evaluation converged in 2 itreations
policy iteration 5
policy evaluation converged in 2 itreations
policy iteration 6
policy evaluation converged in 2 itreations
Final value:
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]
Final policy:
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]


In [73]:
#test results
policy = lambda env: pi_star[env.env.s]
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 8
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
--> The result of taking action 2 is:
     S= 9
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
--> The result of taking action 1 is:
     S= 13
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
--> The result of taking action 2 is:
     S= 14
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
--> The result of taking action 2 is:
     S= 15
     R= 1.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m


In [80]:
# Practical: Code the value iteration process, and employ it to arrive at a
# policy that solves this problem. Show your testing results, reporting
# the iteration counts.
def value_iteration(V, gamma, threshold,env):
  S = env.observation_space.n
  A = env.action_space.n
  #initialize
  if V is None:
    V = np.zeros(S)
  pi = np.random.randint(0, A, size = S)
  print("random policy:")
  print(pi.reshape(4,4))
  value_step = 0
  while True:
    delta = 0
    value_step += 1
    for state in range(S):
      v = V[state]
      values = np.zeros(A)
      for action in range(A):
        for p, next_state, r, terminate in env.env.P[state][action]: 
          values[action] += p*(r+gamma*V[next_state])
      V[state] = max(values)
      delta = max(delta,np.abs(v-V[state]))
    if delta < threshold:
      print("value interation converged in %d itreations"%value_step)
      break
  pi = np.zeros(S,dtype=int)
  for s in range(S):
    values = np.zeros(A)
    for action in range(A):
      for p, next_state, r, terminate in env.env.P[s][action]: 
        values[action] += p*(r+gamma*V[next_state])
    pi[s] = np.argmax(values)

  return V, pi



V_star, pi_star = value_iteration(None, 0.9, 1e-6, env)
print("Final value:")
print(V_star.reshape(4, 4))
print("Final policy:")
print(pi_star.reshape(4, 4))

random policy:
[[2 0 0 2]
 [2 2 2 3]
 [3 1 0 1]
 [0 0 2 0]]
value interation converged in 7 itreations
Final value:
[[0.59049 0.6561  0.729   0.6561 ]
 [0.6561  0.      0.81    0.     ]
 [0.729   0.81    0.9     0.     ]
 [0.      0.9     1.      0.     ]]
Final policy:
[[1 2 1 0]
 [1 0 1 0]
 [2 1 1 0]
 [0 2 2 0]]


In [81]:
#test results
policy = lambda env: pi_star[env.env.s]
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 4
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 1 is:
     S= 8
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
--> The result of taking action 2 is:
     S= 9
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG
--> The result of taking action 1 is:
     S= 13
     R= 0.0
     p= {'prob': 1.0}
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
--> The result of taking action 2 is:
     S= 14
     R= 0.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
--> The result of taking action 2 is:
     S= 15
     R= 1.0
     p= {'prob': 1.0}
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m




> Comment on the difference between the iterations required for policy vs value iteration.

Policy iteration requires 6 iterations and each iteration takes 2 steps of policy evaluation. Value iteration requires 7 iterations. Value iteration converges faster than policy iteration.


In [90]:
# Optional: instead of the above environment, use the "slippery" Frozen Lake via
# env = gym.make("FrozenLake-v0")
env = gym.make("FrozenLake-v0")
#policy iteration
V_star, pi_star = policy_iteration(None, 0.9, 1e-8, env)
print("Final value:")
print(V_star.reshape(4, 4))
print("Final policy:")
print(pi_star.reshape(4, 4))

random policy:
[[3 0 1 3]
 [0 2 0 1]
 [2 0 0 3]
 [3 1 1 3]]
policy iteration 1
policy evaluation converged in 45 itreations
policy iteration 2
policy evaluation converged in 80 itreations
policy iteration 3
policy evaluation converged in 13 itreations
policy iteration 4
policy evaluation converged in 48 itreations
Final value:
[[0.06889086 0.06141454 0.07440974 0.0558073 ]
 [0.09185451 0.         0.1122082  0.        ]
 [0.14543633 0.24749694 0.29961759 0.        ]
 [0.         0.37993589 0.63902014 0.        ]]
Final policy:
[[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


In [91]:
#test results
policy = lambda env: pi_star[env.env.s]
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 4
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 8
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
--> The result of taking action 3 is:
     S= 4
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking ac

In [92]:
#value iteration
V_star, pi_star = value_iteration(None, 0.9, 1e-8, env)
print("Final value:")
print(V_star.reshape(4, 4))
print("Final policy:")
print(pi_star.reshape(4, 4))

random policy:
[[1 0 0 2]
 [0 1 0 2]
 [2 2 3 1]
 [1 0 1 1]]
value interation converged in 85 itreations
Final value:
[[0.06889086 0.06141454 0.07440974 0.0558073 ]
 [0.09185451 0.         0.1122082  0.        ]
 [0.14543633 0.24749694 0.29961758 0.        ]
 [0.         0.37993589 0.63902014 0.        ]]
Final policy:
[[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]


In [94]:
#test results
policy = lambda env: pi_star[env.env.s]
random_ai(env,policy)


[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 4
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 4
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking action 0 is:
     S= 0
     R= 0.0
     p= {'prob': 0.3333333333333333}
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
--> The result of taking 