# Q-Learning

# Import

In [None]:
import gym
import numpy as np
from collections import deque
from time import sleep
from IPython.display import clear_output

# Environment

In [2]:
# Build the slippery FrozenLake environment. `.env` takes the inner environment
# object (the help() output in the next cell reports it as a FrozenLakeEnv).
# NOTE(review): presumably this bypasses gym's outer wrapper (e.g. an episode
# step limit) — confirm against the installed gym version.
env = gym.make('FrozenLake-v0', is_slippery=True).env

In [3]:
# Print the environment's built-in documentation (map layout and tile legend).
help(env)

Help on FrozenLakeEnv in module gym.envs.toy_text.frozen_lake object:

class FrozenLakeEnv(gym.envs.toy_text.discrete.DiscreteEnv)
 |  FrozenLakeEnv(desc=None, map_name='4x4', is_slippery=True)
 |  
 |  Winter is here. You and your friends were tossing around a frisbee at the park
 |  when you made a wild throw that left the frisbee out in the middle of the lake.
 |  The water is mostly frozen, but there are a few holes where the ice has melted.
 |  If you step into one of those holes, you'll fall into the freezing water.
 |  At this time, there's an international frisbee shortage, so it's absolutely imperative that
 |  you navigate across the lake and retrieve the disc.
 |  However, the ice is slippery, so you won't always move in the direction you intend.
 |  The surface is described using a grid like the following
 |  
 |      SFFF
 |      FHFH
 |      FFFH
 |      HFFG
 |  
 |  S : starting point, safe
 |  F : frozen surface, safe
 |  H : hole, fall to your doom
 |  G : goal, where

In [4]:
# Inspect the action space; its size becomes the number of Q-table columns.
env.action_space

Discrete(4)

In [5]:
# Inspect the observation space; its size becomes the number of Q-table rows.
env.observation_space

Discrete(16)

# Q-Learning

In [6]:
class Agent:
    """Tabular Q-learning agent for an environment with discrete spaces.

    The agent keeps one Q-value per (state, action) pair in ``self._q_table``,
    a NumPy array of shape ``(observation_space.n, action_space.n)``
    initialised to zeros.
    """

    def __init__(self, env):
        """Create the agent for ``env``.

        Args:
            env: Environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
        """
        # Keep a reference to the environment so that random action sampling
        # uses the environment this agent was built for, rather than whatever
        # notebook-level global happens to be called `env`.
        self._env = env
        self._q_table = np.zeros([env.observation_space.n, env.action_space.n])

    def choose_action(self, state, *, epsilon):
        """Pick an action for ``state``.

        NOTE: ``epsilon`` here is the probability of acting *greedily*
        (exploiting), which is the opposite of the usual epsilon-greedy
        convention. ``epsilon=1`` always exploits; ``epsilon=0`` (almost)
        always explores. Existing callers rely on this meaning.

        Args:
            state: Index of the current state (row of the Q-table).
            epsilon: Probability of choosing the greedy action.

        Returns:
            A randomly sampled action when exploring, otherwise the index of
            the highest Q-value for ``state`` (first index on ties).
        """
        if np.random.uniform() > epsilon:
            # exploration
            return self._env.action_space.sample()
        # exploitation
        return np.argmax(self._q_table[state])

In [7]:
class Agent(Agent):
    def play(self, *, n_steps=100, render=False):
        """Play one episode with the current Q-table, without learning.

        Actions are chosen with ``epsilon=1``, which in ``choose_action``
        always selects the greedy (argmax) action.

        Args:
            n_steps: Maximum number of steps to take in the episode.
            render: If true, redraw the environment after every step (with a
                short pause) and print a summary at the end.

        Returns:
            The total reward collected when ``render`` is false; ``None``
            when ``render`` is true (the summary is printed instead).
        """
        state = env.reset()
        rewards = 0
        # Tracked separately from the loop variable so the summary below is
        # still valid when the loop body never runs (n_steps <= 0).
        steps_taken = 0
        for steps_taken in range(1, n_steps+1):
            action = self.choose_action(state, epsilon=1)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            if render:
                clear_output(wait=True)
                env.render()
                sleep(0.2)
            if done:
                break
            state = next_state
        if render:
            print(f'\nSteps taken: {steps_taken}, rewards earned: {rewards}')
        else:
            return rewards

In [8]:
class Agent(Agent):
    def train(self, state, action, reward, next_state, done, *, alpha=0.8, gamma=0.99):
        """Apply one Q-learning update for a single observed transition.

        Update rule:
            Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (r + gamma * max_a' Q[s', a'])
        where the bootstrap term ``max_a' Q[s', a']`` is taken as 0 when the
        transition ended the episode.

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Index of the resulting state.
            done: Whether the transition ended the episode.
            alpha: Learning rate.
            gamma: Discount factor.
        """
        old_value = self._q_table[state, action]
        # A terminal transition has no future return to bootstrap from, so the
        # target is just the immediate reward.
        next_max = 0.0 if done else np.max(self._q_table[next_state])
        td_target = reward + gamma*next_max
        self._q_table[state, action] = (1-alpha)*old_value + alpha*td_target

In [9]:
class Agent(Agent):
    def run(self, *, n_eps=15000, n_steps=100):
        """Train the agent for ``n_eps`` episodes and print progress.

        The value passed as ``epsilon`` to ``choose_action`` ramps linearly
        from ``1/n_eps`` to 1 over training; in ``choose_action`` that value
        is the probability of acting greedily, so the agent moves from mostly
        random actions to fully greedy ones.

        Every 10th episode a greedy evaluation episode is played and its
        reward is stored in a rolling window of the last 100 evaluations.
        A ``#`` is printed every 20 episodes and the window's mean every
        1000 episodes.

        Args:
            n_eps: Number of training episodes.
            n_steps: Maximum number of steps per training episode.
        """
        scores = deque(maxlen=100)
        for i_eps in range(1, n_eps+1):
            state = env.reset()
            greedy_prob = i_eps/n_eps
            for _ in range(n_steps):
                action = self.choose_action(state, epsilon=greedy_prob)
                next_state, reward, done, _info = env.step(action)
                self.train(state, action, reward, next_state, done)
                if done:
                    # The episode is over: stop stepping the environment
                    # instead of spending the remaining step budget updating
                    # from a terminal state.
                    break
                state = next_state
            if i_eps%10==0:
                scores.append(self.play())
            if i_eps%20==0:
                print('#', end='')
            if i_eps%1000==0:
                mean_score = sum(scores)/len(scores)
                print(f' | Episode {i_eps:>5d} | mean rewards: {mean_score:.2f}')

In [10]:
# Create the agent; its Q-table starts as all zeros, sized from the
# environment's observation and action spaces.
agent = Agent(env)

In [11]:
# Train with the defaults (15000 episodes, at most 100 steps each). Prints one
# '#' per 20 episodes and the mean evaluation reward every 1000 episodes.
agent.run()

################################################## | Episode  1000 | mean rewards: 0.15
################################################## | Episode  2000 | mean rewards: 0.17
################################################## | Episode  3000 | mean rewards: 0.15
################################################## | Episode  4000 | mean rewards: 0.21
################################################## | Episode  5000 | mean rewards: 0.29
################################################## | Episode  6000 | mean rewards: 0.17
################################################## | Episode  7000 | mean rewards: 0.23
################################################## | Episode  8000 | mean rewards: 0.23
################################################## | Episode  9000 | mean rewards: 0.22
################################################## | Episode 10000 | mean rewards: 0.22
################################################## | Episode 11000 | mean rewards: 0.20
################################

# Evaluation

In [14]:
# Play one greedy episode with rendering; prints the number of steps taken
# and the reward earned.
agent.play(render=True)

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m

Steps taken: 9, rewards earned: 1.0


# Comment

* The environment description says "the ice is slippery, so you won't always move in the direction you intend". If you set `is_slippery` to `False`, this problem is easily solved within seconds using a Q-table.
* The environment's source code adds some randomness to the action actually taken, so the outcome of an action is not always predictable, which increases the overall difficulty of training.
* Q-Learning may still be able to solve some cases, but it is not clear whether it can effectively learn the random behavior introduced by `is_slippery`.
* DQN or another more advanced method could be used to solve this.
* Replay memory could perhaps be used to let the agent learn the randomness pattern, if there is one.