## OpenAI Gym FrozenLake toy example
> Solving it as a Markov decision process (MDP)
* environment: https://github.com/openai/gym/blob/master/gym/envs/toy_text/frozen_lake.py

#### import environment

In [1]:
import gym
# Build the FrozenLake-v0 environment that every later cell reads from.
env = gym.make('FrozenLake-v0')

* env.render() : renders the environment
* env.reset()  : resets the environment
* env.P[state][action] : returns a list of (prob, next_state, reward, done) tuples, one per possible outcome, where prob is the probability of reaching next_state for the given state and action pair
* There are 16 states
* There are 4 actions:
    * LEFT = 0
    * DOWN = 1
    * RIGHT = 2
    * UP = 3

In [2]:
# Example: list the possible outcomes of action 0 (LEFT) taken in state 1.
print(env.P[1][0])
# Take a step: reset the episode, show the grid, attempt action 1 (DOWN),
# print what step() returns, and show the grid again. Each listed outcome has
# probability 1/3, so the agent does not necessarily move the way it was asked.
env.reset()
env.render()
print(env.step(1))
env.render()

[(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True)]

[41mS[0mFFF
FHFH
FFFH
HFFG
(0, 0.0, False, {'prob': 0.3333333333333333})
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG


In [3]:
# Possible outcomes of action 1 (DOWN) taken in state 1,
# as (prob, next_state, reward, done) tuples.
env.P[1][1]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 5, 0.0, True),
 (0.3333333333333333, 2, 0.0, False)]

In [4]:
# Render the grid again; no step has been taken since the previous render,
# so the picture is unchanged.
env.render()

  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG


In [5]:
# Start a fresh episode and attempt action 1 (DOWN).
# step() returns a 4-tuple; presumably (next_state, reward, done, info) — confirm against the gym API.
env.reset()
env.step(1)

(0, 0.0, False, {'prob': 0.3333333333333333})

In [6]:
# Continue the same episode with action 2 (RIGHT).
env.step(2)

(1, 0.0, False, {'prob': 0.3333333333333333})

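#### Calculate state values
Without discounting or randomness, the value of a state is its reward plus the value of the state that follows it:

$$v(s) = r(s) + v(s_{\text{next}})$$

With a discount factor $\gamma$ and stochastic transitions this becomes

$$v(s) = r(s) + \gamma \sum_i p_i \, v(s_i)$$

where the $s_i$ are the potential next states from state $s$ and $p_i$ is the probability of moving from $s$ to $s_i$. The code below additionally averages over the four actions with equal weight $1/4$, i.e. it evaluates a uniform random policy.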

In [7]:
# Number of states in the environment.
env.nS

16

In [8]:
# Outcomes of action 3 (UP) in state 12: a single outcome that stays in
# state 12 with done=True.
env.P[12][3]

[(1.0, 12, 0, True)]

In [9]:
import numpy as np

# Index arrays used to iterate over the MDP: states 0..15 and actions 0..3.
states = np.arange(16)
actions = np.arange(4)

In [10]:
# Display both index arrays as a tuple.
states,actions

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]),
 array([0, 1, 2, 3]))

In [117]:
# Iterative policy evaluation for the uniform random policy.
# Every action gets weight 1/len(actions); values are updated in place, so a
# sweep immediately reuses the estimates it refreshed earlier in that sweep.
env.reset()
gamma = 0.9                        # discount factor
v_states = np.zeros(len(states))   # start from v(s) = 0 for every state
action_weight = 1/len(actions)

# k ends up as the number of sweeps completed *before* the one that met the
# tolerance (or 1000 if the tolerance was never met).
k = 0
for k in range(1000):
    delta = 0  # largest change of any state value during this sweep
    for state in states:
        backup = 0
        for action in actions:
            # slippery ice: one (state, action) pair has several possible outcomes
            for p, s_next, r, _done in env.P[state][action]:
                backup += action_weight*p*(r + gamma*v_states[s_next])
        delta = max(delta, np.abs(v_states[state]-backup))
        v_states[state] = backup

    if delta <= 1e-9:
        break
else:
    k = 1000  # sweep budget exhausted without reaching the tolerance
print('number of updates = {}'.format(k))

number of updates = 43


In [118]:
# Estimated value of each state after the evaluation loop, indexed by state number.
v_states

array([0.00447726, 0.00422246, 0.01006676, 0.00411822, 0.00672196,
       0.        , 0.02633371, 0.        , 0.01867615, 0.05760701,
       0.10697195, 0.        , 0.        , 0.13038305, 0.39149016,
       0.        ])

In [68]:
# Value of state 0 (the S cell of the grid).
# NOTE(review): the recorded output for this cell comes from an earlier,
# out-of-order run (In [68]) and does not match the v_states array shown by
# In [118]; re-run the notebook top to bottom to refresh it.
v_states[0]

0.0

In [126]:
# For every state, collect the successor state of each (action, outcome) pair.
# The result maps state -> flat list of next states (duplicates kept, ordered
# by action and then by outcome).
values = {}
for state in range(len(v_states)):
    # next_states stays a notebook-level name: a later cell displays the
    # list built for the last state.
    next_states = [
        successor
        for action in actions
        for _p, successor, _r, _done in env.P[state][action]
    ]
    values[state] = next_states

In [127]:
# Leftover variable from the loop in the previous cell: it holds the successor
# list of the last state processed (state 15).
next_states

[15, 15, 15, 15]

In [128]:
# Mapping state -> successor states, one list entry per (action, outcome) pair.
values

{0: [0, 0, 4, 0, 4, 1, 4, 1, 0, 1, 0, 0],
 1: [1, 0, 5, 0, 5, 2, 5, 2, 1, 2, 1, 0],
 2: [2, 1, 6, 1, 6, 3, 6, 3, 2, 3, 2, 1],
 3: [3, 2, 7, 2, 7, 3, 7, 3, 3, 3, 3, 2],
 4: [0, 4, 8, 4, 8, 5, 8, 5, 0, 5, 0, 4],
 5: [5, 5, 5, 5],
 6: [2, 5, 10, 5, 10, 7, 10, 7, 2, 7, 2, 5],
 7: [7, 7, 7, 7],
 8: [4, 8, 12, 8, 12, 9, 12, 9, 4, 9, 4, 8],
 9: [5, 8, 13, 8, 13, 10, 13, 10, 5, 10, 5, 8],
 10: [6, 9, 14, 9, 14, 11, 14, 11, 6, 11, 6, 9],
 11: [11, 11, 11, 11],
 12: [12, 12, 12, 12],
 13: [9, 12, 13, 12, 13, 14, 13, 14, 9, 14, 9, 12],
 14: [10, 13, 14, 13, 14, 15, 14, 15, 10, 15, 10, 13],
 15: [15, 15, 15, 15]}