In [3]:
import gym
import numpy as np
import math

In [4]:
env = gym.make('FrozenLake-v0')

In [5]:
#Check number of states and actions
print(env.observation_space)
print(env.action_space)

nS = env.env.nS
nA = env.env.nA

Discrete(16)
Discrete(4)


In [19]:
num_episodes = 5000
epsilon = 0.4
gamma = 0.9

Q = np.zeros((nS,nA))
pi = np.ones((nS,nA))*(epsilon/nA) #epsilon-soft policy initialization

for state in range(nS):
    best_action = np.argmax(Q[state,:])
    pi[state,best_action]+=(1.0 - epsilon)
    
num_visits = np.zeros((nS,nA)) #Returns

In [20]:
for ep in range(num_episodes):
    if ep%1000 == 0:
        print("Episode : " + str(ep))
        # print(Q)
    done = False
    state = env.reset()
    r = 0.0
    episode_buffer = []
    while not done:
        probs = pi[state,:]
        action = np.random.choice(np.arange(len(probs)),p=probs)
        next_state,reward,done,_ = env.step(action)
        r+=reward
        episode_buffer.append((state,action,reward))
        state = next_state

    visited = np.zeros((nS,nA))
    visited_tuple = []
    gamma_power_count = np.zeros((nS,nA))
    G = np.zeros((nS,nA))
    for state,action,reward in episode_buffer:
        for s,a in visited_tuple:
            G[s][a]+=math.pow(gamma,gamma_power_count[s][a])*(reward)
            gamma_power_count[s][a]+=1
            
        if visited[state][action] == 0:
            visited[state][action] = 1
            G[state][action] += reward
            num_visits[state][action]+=1
            visited_tuple.append((state,action))
            gamma_power_count[state][action]+=1
#     print(gamma_power_count)
    for state,action,_ in episode_buffer:
        #Make an incremental update
        Q[state][action] += (G[state][action] - Q[state][action])/(1.0*num_visits[state][action])
        
        pi[state,:] = epsilon/nA
        best_action = np.argmax(Q[state,:])
        pi[state,best_action]+=(1.0 - epsilon)

Episode : 0
Episode : 1000
Episode : 2000
Episode : 3000
Episode : 4000


In [21]:
print(Q)

[[0.01277381 0.01308898 0.01362146 0.01355179]
 [0.0085388  0.01230004 0.0112641  0.01356675]
 [0.02680476 0.01904971 0.0219625  0.01280737]
 [0.0088186  0.00853259 0.00842733 0.00860397]
 [0.02260974 0.01558602 0.01657201 0.00795008]
 [0.         0.         0.         0.        ]
 [0.05089498 0.05484866 0.05884548 0.009546  ]
 [0.         0.         0.         0.        ]
 [0.01660255 0.03924209 0.0401247  0.0427431 ]
 [0.07370651 0.10804032 0.08803687 0.05449507]
 [0.15603511 0.14856261 0.14950874 0.02521516]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.16162299 0.14933116 0.2169039  0.17037641]
 [0.22248361 0.45006711 0.44863225 0.42804713]
 [0.         0.         0.         0.        ]]


In [106]:
#Running an episode
state = env.reset()
done = False

while not done:
    state = env.env.s
    print(Q[state])
    action = np.argmax(Q[state,:])
    env.render()
    state,reward,done,_= env.step(action)

[0.01502387 0.01503342 0.01504843 0.01494322]

[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.02338613 0.02338266 0.0220476  0.01354953]
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.01502387 0.01503342 0.01504843 0.01494322]
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
[0.02338613 0.02338266 0.0220476  0.01354953]
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
[0.02564389 0.05190918 0.04554654 0.05217292]
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
[0.02564389 0.05190918 0.04554654 0.05217292]
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
[0.08641257 0.12125193 0.12088778 0.07193664]
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
[0.10802547 0.21916296 0.22121595 0.19327382]
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
[0.08641257 0.12125193 0.12088778 0.07193664]
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFF