In [1]:
import numpy as np
import gym

In [20]:
def policy_evaluation(states, policy, env_model, discount, theta=0.0001):
    delta = theta * 2
    state_len = env.nS
    action_len = env.nA
    while delta > theta:
        delta = 0
        for s in range(state_len):
            new_s = 0.
            for a in range(action_len):
                transitions_list = env_model[s][a]
                for i in transitions_list:
                    transition_prob, next_state, reward, done = i
                    if done:
                        new_s += policy[s,a] * transition_prob * reward
                    else:
                        new_s += policy[s,a] * transition_prob * (reward + discount * states[next_state])
                        
            delta = max(delta, np.abs(states[s] - new_s))
            states[s] = new_s
            
    return states

In [27]:
def policy_improvement(states, policy, env_model, discount):
    policy_stable = True
    state_len = env.nS
    action_len = env.nA
    
    for s in range(state_len):
        old_action = np.argmax(policy[s])
        temp_array = np.zeros((action_len))
        
        for a in range(action_len):
            transitions_list = env_model[s][a]
            for i in transitions_list:
                transition_prob, next_state, reward, done = i
                
                if done:
                    temp_array[a] += transition_prob * reward
                else:
                    temp_array[a] += transition_prob * (reward + discount * states[next_state])
        policy[s] = np.zeros((action_len))
        policy[s, np.argmax(temp_array)] = 1.
        
        if old_action != np.argmax(policy[s]):
            policy_stable = False
    
    return policy_stable, states, policy

In [28]:
def policy_iteration(env_model, discount, theta=0.0001):
    policy = np.ones((env.nS, env.nA)) / env.nA
    states = np.zeros(env.nS)
    policy_stable = False
    while not policy_stable:
        states = policy_evaluation(states, policy, env_model, discount, theta)
        policy_stable, states, policy = policy_improvement(states, policy, env_model, discount)
        
    return states, policy

In [23]:
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.78, # optimum = .8196
)

Error: Cannot re-register id: FrozenLakeNotSlippery-v0

In [24]:
env = gym.make('FrozenLakeNotSlippery-v0')

In [25]:
gamma = 0.99

In [29]:
state_value_array, policy_array = policy_iteration(env.P, gamma)

In [30]:
np.round(state_value_array.reshape(4,4), 3)

array([[0.951, 0.961, 0.97 , 0.961],
       [0.961, 0.   , 0.98 , 0.   ],
       [0.97 , 0.98 , 0.99 , 0.   ],
       [0.   , 0.99 , 1.   , 0.   ]])

In [31]:
np.round(policy_array.reshape(16, 4), 3)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.]])

In [33]:
env = gym.make('Taxi-v3')

In [34]:
env.reset()
for i in range(10):
    env.render()
    state, reward, done, info = env.step(env.action_space.sample())
    if done:
        print("Episode is done, resetting the environment")
        env.reset()
env.close()

+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

+---------+
|[34;1mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[34;1mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|[34;1mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Dropoff)
+---------+
|[34;1m

In [35]:
env = gym.make('Taxi-v3')
env = env.unwrapped
gamma = 0.95
state_value_array, policy_array = policy_iteration(env.P, gamma)

In [36]:
episodes = 100
episode_reward_list, episode_len_list = [], []

for i in range(episodes):
    state = env.reset()
    episode_reward = 0
    episode_length = 0
    while True:
        action = np.argmax(policy_array[state])
        state, reward, done, info = env.step(action)
        episode_reward += reward
        episode_length += 1
        if done:
            episode_reward_list.append(episode_reward)
            episode_len_list.append(episode_length)
            break

print("Average Reward {} Average Length {}".format(np.mean(episode_reward_list), np.mean(episode_len_list)))

Average Reward 7.84 Average Length 13.16
