In [11]:
#Citation: https://gist.github.com/jojonki/6291f8c3b19799bc2f6d5279232553d7
import numpy as np
import gym
from gym import wrappers

# Q learning params
ALPHA = 0.2 # learning rate
GAMMA = 0.99 # reward discount
LEARNING_COUNT = 100000 # number of training episodes run by main()
TEST_COUNT = 10000 # number of greedy evaluation episodes run by main()

TURN_LIMIT = 100 # maximum number of steps taken in a single episode
IS_MONITOR = False # when True, main() wraps the env in gym's Monitor (output dir './FrozenLake-v0')

class Agent:
    """Tabular Q-learning agent for an env with discrete states and actions.

    The agent gathers experience with uniformly random actions (``learn``)
    and is evaluated with the greedy policy read off its Q-table (``test``).
    The environment must follow the classic gym interface used below:
    ``reset()`` returns the initial state index and ``step(action)`` returns
    ``(next_state, reward, done, info)``.

    Args:
        env: Environment to interact with.
        alpha: Learning rate; defaults to the module-level ``ALPHA``.
        gamma: Reward discount; defaults to the module-level ``GAMMA``.
        turn_limit: Maximum steps per episode; defaults to the module-level
            ``TURN_LIMIT``.

    Attributes:
        env: The wrapped environment.
        episode_reward: Initialised to 0.0; not updated by this class.
        alpha, gamma, turn_limit: Resolved hyper-parameters (see Args).
        q_val: ``float32`` array of shape ``(n_states, n_actions)`` holding
            the current action-value estimates.
    """

    def __init__(self, env, *, alpha=None, gamma=None, turn_limit=None):
        self.env = env
        self.episode_reward = 0.0

        # Fall back to the module-level defaults only when no override is
        # given, so the class stays usable without them when all are passed.
        self.alpha = ALPHA if alpha is None else alpha
        self.gamma = GAMMA if gamma is None else gamma
        self.turn_limit = TURN_LIMIT if turn_limit is None else turn_limit

        # Size the table from the env's discrete spaces when they expose
        # ``n``; otherwise keep the original 16x4 layout (4x4 FrozenLake).
        n_states = getattr(getattr(env, "observation_space", None), "n", 16)
        n_actions = getattr(getattr(env, "action_space", None), "n", 4)
        self.q_val = np.zeros((n_states, n_actions), dtype=np.float32)

    def learn(self):
        """Run one training episode with random actions and update ``q_val``.

        Returns:
            The reward of the final step if the episode terminated within
            ``turn_limit`` steps, otherwise 0.0 (same convention as ``test``).
        """
        state = self.env.reset()

        for _ in range(self.turn_limit):
            act = self.env.action_space.sample()  # pure random exploration
            next_state, reward, done, info = self.env.step(act)
            q_next_max = np.max(self.q_val[next_state])
            # Q <- Q + a(target - Q)  <=>  Q <- (1-a)Q + a*target,
            # with target = reward + gamma * max_a' Q(next_state, a')
            self.q_val[state][act] = (
                (1 - self.alpha) * self.q_val[state][act]
                + self.alpha * (reward + self.gamma * q_next_max)
            )

            if done:
                return reward
            state = next_state

        # Turn limit reached without termination: report zero reward instead
        # of implicitly returning None, which callers cannot accumulate.
        return 0.0

    def test(self):
        """Run one evaluation episode following the greedy policy.

        The Q-table is not modified.

        Returns:
            The reward of the final step if the episode terminated within
            ``turn_limit`` steps, otherwise 0.0.
        """
        state = self.env.reset()
        for _ in range(self.turn_limit):
            act = np.argmax(self.q_val[state])
            next_state, reward, done, info = self.env.step(act)
            if done:
                return reward
            state = next_state
        return 0.0  # over limit

def main():
    """Train an Agent on FrozenLake-v0, then evaluate it, printing summaries.

    Runs ``LEARNING_COUNT`` training episodes followed by ``TEST_COUNT``
    greedy evaluation episodes and prints the episode count, total reward and
    average reward for each phase (plus the learned Q-table after training).
    The environment is always closed on exit, including when an error occurs.
    """
    env = gym.make("FrozenLake-v0")
    try:
        if IS_MONITOR:
            env = wrappers.Monitor(env, './FrozenLake-v0')
        agent = Agent(env)

        print("###### LEARNING #####")
        # Start from 0.0 so the total is a float even if rewards are ints.
        reward_total = sum((agent.learn() for _ in range(LEARNING_COUNT)), 0.0)
        print("episodes      : {}".format(LEARNING_COUNT))
        print("total reward  : {}".format(reward_total))
        print("average reward: {:.2f}".format(reward_total / LEARNING_COUNT))
        print("Q Value       :{}".format(agent.q_val))

        print("###### TEST #####")
        reward_total = sum((agent.test() for _ in range(TEST_COUNT)), 0.0)
        print("episodes      : {}".format(TEST_COUNT))
        print("total reward  : {}".format(reward_total))
        print("average reward: {:.2f}".format(reward_total / TEST_COUNT))
    finally:
        # Release env resources; NOTE(review): with the Monitor wrapper this
        # is presumably also what finalises its output files -- confirm.
        env.close()
    
# Run training and evaluation as soon as this script/cell is executed.
main()

###### LEARNING #####
episodes      : 100000
total reward  : 1403.0
average reward: 0.01
Q Value       :[[0.640426   0.62866944 0.6163429  0.6104189 ]
 [0.40500814 0.42207444 0.46481207 0.58121634]
 [0.5156474  0.5197279  0.4990199  0.5472102 ]
 [0.2575971  0.2353905  0.3371065  0.52059215]
 [0.6480759  0.43968114 0.37216946 0.31229   ]
 [0.         0.         0.         0.        ]
 [0.502186   0.29844293 0.32352242 0.18075205]
 [0.         0.         0.         0.        ]
 [0.50992465 0.44389325 0.47928143 0.68002367]
 [0.35597792 0.73284745 0.49358422 0.33331195]
 [0.7262751  0.51253504 0.39673144 0.39644498]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.42843625 0.669475   0.7885045  0.45064753]
 [0.77582973 0.939305   0.93235344 0.8547679 ]
 [0.         0.         0.         0.        ]]
###### TEST #####
episodes      : 10000
total reward  : 7449.0
average reward: 0.74
