In [3]:
import gym
import numpy as np
import math

In [4]:
# Build the 'FrozenLake-v0' environment that the rest of the notebook interacts with.
env = gym.make('FrozenLake-v0')

In [5]:
# Check the number of states and actions
print(env.observation_space)
print(env.action_space)

# Take the sizes from the public space API (`Discrete.n`) rather than reaching
# through the wrapper into the underlying environment's `nS` / `nA` attributes,
# which only works when there is exactly one wrapper layer and the wrapped
# environment happens to expose those attributes.
nS = env.observation_space.n  # number of discrete states
nA = env.action_space.n       # number of discrete actions

Discrete(16)
Discrete(4)


In [16]:
# Training hyperparameters
num_episodes = 5000  # how many episodes the training loop samples
epsilon = 0.4        # probability mass spread uniformly over all actions
gamma = 0.9          # discount factor applied to future rewards

# Action-value estimates, one row per state and one column per action
Q = np.zeros((nS, nA))

# Epsilon-soft policy: every action starts with epsilon/nA ...
pi = np.full((nS, nA), epsilon / nA)

# ... and the greedy action of each state receives the remaining 1 - epsilon
greedy_actions = Q.argmax(axis=1)
for state, best_action in enumerate(greedy_actions):
    pi[state, best_action] += 1.0 - epsilon

# Per (state, action) counter used as the denominator of the running average
num_visits = np.zeros((nS, nA))

In [17]:
# On-policy first-visit Monte Carlo control with an epsilon-soft policy.
# Every iteration
#   1. samples one episode by following `pi`,
#   2. computes the discounted return that follows the FIRST occurrence of each
#      (state, action) pair in that episode,
#   3. moves Q towards those returns with a running average and re-derives the
#      epsilon-greedy policy for the states that were touched.
for ep in range(num_episodes):
    if ep % 1000 == 0:
        print("Episode : " + str(ep))

    # 1. Roll out one episode under the current policy.
    done = False
    state = env.reset()
    episode_buffer = []  # one (state, action, reward) tuple per time step
    while not done:
        probs = pi[state, :]
        action = np.random.choice(np.arange(len(probs)), p=probs)
        next_state, reward, done, _ = env.step(action)
        episode_buffer.append((state, action, reward))
        state = next_state

    # 2. First-visit returns in a single backward pass.
    # `ret` is the discounted return from the current step onwards. Because we
    # walk from the last step to the first, the value that finally remains
    # stored for a pair is the one written at its earliest occurrence, i.e.
    # the first-visit return.
    ret = 0.0
    first_visit_return = {}
    for state, action, reward in reversed(episode_buffer):
        ret = gamma * ret + reward
        first_visit_return[(state, action)] = ret

    # 3a. Incremental mean update, applied exactly once per distinct pair.
    # Looping over every step of the episode instead would apply the same
    # (return, count) update several times to pairs that recur, which breaks
    # the running average because the count only grows once per episode.
    for (state, action), ret in first_visit_return.items():
        num_visits[state][action] += 1
        Q[state][action] += (ret - Q[state][action]) / num_visits[state][action]

    # 3b. Policy improvement: make pi epsilon-greedy w.r.t. the updated Q for
    # every state that appeared in this episode.
    for state in {s for s, _ in first_visit_return}:
        pi[state, :] = epsilon / nA
        best_action = np.argmax(Q[state, :])
        pi[state, best_action] += (1.0 - epsilon)

Episode : 0
Episode : 1000
Episode : 2000
Episode : 3000
Episode : 4000


In [18]:
# Show the learned action-value table (one row per state, one column per action).
print(Q)

[[0.04719301 0.06118635 0.04859932 0.0507526 ]
 [0.03654624 0.04695839 0.0466121  0.07370083]
 [0.09893041 0.07310237 0.07782264 0.07621305]
 [0.01616046 0.06456213 0.05861156 0.03892767]
 [0.04757973 0.04869966 0.04103533 0.02703451]
 [0.         0.         0.         0.        ]
 [0.16014758 0.07868203 0.14813655 0.04137687]
 [0.         0.         0.         0.        ]
 [0.04380191 0.11842072 0.05596702 0.08567153]
 [0.11443505 0.23973862 0.14871172 0.12467976]
 [0.37532809 0.2538884  0.30973546 0.1264216 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.13912079 0.26752539 0.45348154 0.25762773]
 [0.36061056 0.61036987 0.76353768 0.56610513]
 [0.         0.         0.         0.        ]]


In [106]:
# Play one episode, always choosing the action with the highest Q-value
state = env.reset()

while True:
    # Re-read the state from the wrapped environment's `s` attribute
    # (presumably its current state index, since it is used to index Q).
    state = env.env.s
    q_row = Q[state]
    print(q_row)
    action = np.argmax(q_row)
    env.render()  # draw the grid before the chosen action is applied
    state, reward, done, _ = env.step(action)
    if done:
        break

[0.01502387 0.01503342 0.01504843 0.01494322]

[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.02338613 0.02338266 0.0220476  0.01354953]
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.02338613 0.02338266 0.0220476  0.01354953]
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
[0.02564389 0.05190918 0.04554654 0.05217292]
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
[0.02564389 0.05190918 0.04554654 0.05217292]
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
[0.08641257 0.12125193 0.12088778 0.07193664]
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
[0.10802547 0.21916296 0.22121595 0.19327382]
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
[0.08641257 0.12125193 0.12088778 0.07193664]
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFF