# Q-Learning
- 탐험(Exploration)을 하면서도 최적의 정책을 학습이 가능함(상대적으로 SARSA에 비해서)
- Off-Policy
    - SARSA와 같은 on-policy는 이전 정책으로부터 얻은 샘플을 재사용할 수 없고, 자신의 샘플로 자신을 업데이트하므로 문제점이 존재할 수 있음
    - 정책이 복수개이므로 behavior policy(현재의 예시에선 입실론 그리디)로 샘플을 수집하고 target policy로 최적의 정책을 학습
- 학습하는 정책(TD) : 탐욕정책 -> exploitation

$$q(s, a) \leftarrow q(s, a) + \alpha \big( r + \gamma \max_{a'} q(s', a') - q(s, a) \big)$$

- 행동하는 정책 : $\epsilon$-greedy -> exploration
    - 종류 : $\epsilon$-greedy, Boltzmann, Bayesian, ...
    - 아래의 예는 SARSA와 같은 입실론 그리디
    
$$\pi(s) = \begin{cases} 
    a^* = \arg\max_{a \in A} q(s, a), & \text{with probability } 1 - \epsilon \\
    \text{random action}, & \text{with probability } \epsilon 
\end{cases}$$

In [1]:
import matplotlib
matplotlib.use("Agg")
from matplotlib import pyplot as plt
from matplotlib.pyplot import ylim
import os
import numpy as np
import random
from collections import defaultdict
import gym
import gym_maze
np.random.seed(1)

In [2]:
# Plot appearance: ggplot theme, y-axis fixed to the range [-2, 1].
plt.style.use('ggplot')
plt.ylim((-2, 1))

# Maze environment the agent is trained on.
env = gym.make('maze-sample-10x10-v0')

# Per-dimension (low, high) bounds of the observation space.
STATE_BOUNDS = [
    (low, high)
    for low, high in zip(env.observation_space.low, env.observation_space.high)
]
# Number of cells per dimension: highest observable coordinate plus one
# ((10, 10) for this maze, per the original note).
NUM_GRID = tuple(
    (env.observation_space.high + np.ones(env.observation_space.shape)).astype(int)
)
# Short display labels for the actions, indexed by action id.
ACTION = ['up', 'dw', 'ri', 'le']

# Whether to render the GUI environment while stepping.
RENDER = False

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behavior policy.

    Q-values are kept in ``q_table``, a mapping from a hashable state key to
    a list holding one value per action. States that have never been seen
    start with all-zero values.

    Attributes:
        actions: Sequence of action ids the agent may choose from.
        learning_rate: Step size (alpha) of the Q-learning update.
        discount_factor: Discount (gamma) applied to the bootstrapped value.
        epsilon: Probability of choosing a random action in ``get_action``.
        e_step: Decrement that moves epsilon from 1.0 to 0.01 in 100 steps.
        lr_step: Decrement that moves the learning rate from 0.9 to 0.2 in
            100 steps.
        q_table: ``defaultdict`` mapping state key -> list of Q-values.
    """

    def __init__(self, actions):
        self.actions = actions
        self.learning_rate = 0.9
        self.discount_factor = 0.9
        self.epsilon = 1.0
        # The agent only stores the step sizes; the caller applies them.
        self.e_step = (1.0 - 0.01) / 100
        self.lr_step = (0.9 - 0.2) / 100
        # One zero-initialised Q-value per action (not a hard-coded 4).
        self.q_table = defaultdict(self._zero_q_row)

    def _zero_q_row(self):
        """Return a fresh all-zero Q-value row, one entry per action."""
        return [0.0] * len(self.actions)

    def learn(self, state, action, reward, next_state):
        """Update Q(state, action) from one <s, a, r, s'> sample.

        Args:
            state: Key of the state the action was taken in.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: Key of the resulting state.
        """
        current_q = self.q_table[state][action]
        # Bellman optimality target: bootstrap from the best next action.
        td_target = reward + self.discount_factor * max(self.q_table[next_state])
        self.q_table[state][action] += self.learning_rate * (td_target - current_q)

    def get_action(self, state):
        """Pick an action epsilon-greedily with respect to the Q-table.

        Args:
            state: Key of the current state.

        Returns:
            The chosen action as a plain ``int``.
        """
        if np.random.rand() < self.epsilon:
            # Explore: uniformly random action.
            action = np.random.choice(self.actions)
        else:
            # Exploit: best known action for this state.
            action = self.arg_max(self.q_table[state])
        return int(action)

    @staticmethod
    def arg_max(state_action):
        """Return the index of the largest value, breaking ties at random.

        Ties are broken with ``np.random`` (the same generator used for
        exploration) so that ``np.random.seed`` makes the agent's choices
        reproducible.

        Args:
            state_action: Non-empty sequence of Q-values.

        Returns:
            Index of a maximal entry as a plain ``int``.
        """
        best_value = max(state_action)
        best_indices = [
            index for index, value in enumerate(state_action)
            if value == best_value
        ]
        return int(np.random.choice(best_indices))

    def print_policy(self, num_grid=None, action_names=None):
        """Print the greedy action of every grid cell, one row per line.

        State keys are ``str((x, y))``, so ``x`` is bounded by
        ``num_grid[0]`` and ``y`` by ``num_grid[1]``. The Q-table is read
        without inserting entries for cells that were never visited.

        Args:
            num_grid: Grid size as ``(cells_in_x, cells_in_y)``; defaults to
                the module-level ``NUM_GRID``.
            action_names: Display label per action index; defaults to the
                module-level ``ACTION``.
        """
        if num_grid is None:
            num_grid = NUM_GRID
        if action_names is None:
            action_names = ACTION

        unvisited = self._zero_q_row()
        for y in range(num_grid[1]):
            for x in range(num_grid[0]):
                # .get() avoids the defaultdict side effect of creating rows.
                q_values = self.q_table.get(str((x, y)), unvisited)
                print(action_names[self.arg_max(q_values)], end=" ")
            print("")
            
def state_to_bucket(state, state_bounds=None, num_grid=None):
    """Map a raw observation to a tuple of discrete grid indices.

    Each component is clamped to its bounds and then scaled linearly so that
    the lower bound maps to index 0 and the upper bound to the last index.
    Every returned index is a plain Python ``int``, so ``str()`` of the
    result is a stable dictionary key (NumPy integer scalars would otherwise
    leak into the tuple and change its string form on newer NumPy versions).

    Args:
        state: Sequence of observation components.
        state_bounds: Per-component ``(low, high)`` pairs; defaults to the
            module-level ``STATE_BOUNDS``.
        num_grid: Number of buckets per component; defaults to the
            module-level ``NUM_GRID``.

    Returns:
        Tuple of ``int`` bucket indices, one per component of ``state``.
    """
    if state_bounds is None:
        state_bounds = STATE_BOUNDS
    if num_grid is None:
        num_grid = NUM_GRID

    bucket_indices = []
    for i, value in enumerate(state):
        low, high = state_bounds[i]
        last_index = int(num_grid[i]) - 1
        if value <= low:
            bucket_index = 0
        elif value >= high:
            bucket_index = last_index
        else:
            # Linear map from [low, high] onto [0, last_index].
            bound_width = high - low
            offset = last_index * low / bound_width
            scaling = last_index / bound_width
            bucket_index = int(round(scaling * value - offset))
        bucket_indices.append(bucket_index)
    return tuple(bucket_indices)

if __name__ == "__main__":
    # Train the agent for 250 episodes, logging each episode's total reward
    # and saving the reward curve every 50 episodes.
    env.reset()
    agent = QLearningAgent(actions=list(range(env.action_space.n)))
    scores = []
    episodes = []

    # try/finally guarantees the environment is closed even when training
    # is interrupted or an error is raised mid-run.
    try:
        for episode in range(250):
            state = state_to_bucket(env.reset())
            total_reward = 0

            while True:
                if RENDER:
                    env.render()

                # Every step both explores (epsilon-greedy action) and learns.
                action = agent.get_action(str(state))

                next_state, reward, done, _ = env.step(action)
                next_state = state_to_bucket(next_state)

                # Q-learning update from the sample <s, a, r, s'>.
                agent.learn(str(state), action, reward, str(next_state))
                total_reward += reward
                state = next_state

                if done:
                    print("Episodes : %d, Total reward : %f ."%(episode, total_reward))
                    print(agent.learning_rate, agent.epsilon)

                    # Decay epsilon and the learning rate once per episode,
                    # never going below their floors (0.01 and 0.2).
                    agent.epsilon = max(agent.epsilon - agent.e_step, 0.01)
                    agent.learning_rate = max(
                        agent.learning_rate - agent.lr_step, 0.2
                    )
                    episodes.append(episode)
                    scores.append(total_reward)

                    if episode % 50 == 0:
                        os.makedirs('./save_graph', exist_ok=True)
                        # NOTE(review): each call adds another line to the
                        # same figure; earlier curves are not cleared.
                        plt.plot(episodes, scores)
                        plt.savefig('./save_graph/q_learning_both_decay.png')
                    break

            # Once the mean of the last (up to) 10 scores exceeds 0.93, turn
            # rendering on and print the current greedy policy.
            if np.mean(scores[-10:]) > 0.93:
                RENDER = True
                agent.print_policy()
            else:
                RENDER = False
    finally:
        env.close()

[2018-05-07 16:59:11,226] Making new env: maze-sample-10x10-v0


Episodes : 0, Total reward : -4.801000 .
0.9 1.0
Episodes : 1, Total reward : -3.961000 .
0.893 0.9901
Episodes : 2, Total reward : -4.462000 .
0.886 0.9802
Episodes : 3, Total reward : -3.812000 .
0.879 0.9702999999999999
Episodes : 4, Total reward : -0.062000 .
0.872 0.9603999999999999
Episodes : 5, Total reward : -3.665000 .
0.865 0.9504999999999999
Episodes : 6, Total reward : -0.894000 .
0.858 0.9405999999999999
Episodes : 7, Total reward : -1.933000 .
0.851 0.9306999999999999
Episodes : 8, Total reward : -0.608000 .
0.844 0.9207999999999998
Episodes : 9, Total reward : 0.019000 .
0.837 0.9108999999999998
Episodes : 10, Total reward : 0.158000 .
0.83 0.9009999999999998
Episodes : 11, Total reward : 0.141000 .
0.823 0.8910999999999998
Episodes : 12, Total reward : 0.025000 .
0.816 0.8811999999999998
Episodes : 13, Total reward : 0.637000 .
0.8089999999999999 0.8712999999999997
Episodes : 14, Total reward : 0.444000 .
0.8019999999999999 0.8613999999999997
Episodes : 15, Total reward

Episodes : 97, Total reward : 0.936000 .
0.22099999999999942 0.03969999999999866
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
up dw ri dw up le dw up dw le 
le dw ri ri ri up ri up dw ri 
dw ri up ri ri ri ri ri ri up 
Episodes : 98, Total reward : 0.937000 .
0.2139999999999994 0.02979999999999866
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
up dw ri dw up le dw up dw le 
ri dw ri ri ri up ri up dw up 
up ri up ri ri ri ri ri ri up 
Episodes : 99, Total reward : 0.938000 .
0.2069999999999994 0.019899999999998662
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri

Episodes : 120, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri dw up le dw up dw le 
ri dw ri ri ri up ri up dw up 
dw ri up ri ri ri ri ri ri dw 
Episodes : 121, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri dw up le dw up dw le 
dw dw dw ri ri up ri up dw le 
up ri up ri ri ri ri ri ri dw 
Episodes : 122, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri d

Episodes : 143, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
dw dw ri dw up le dw up dw le 
le dw ri ri ri up ri up dw ri 
dw ri up ri ri ri ri ri ri ri 
Episodes : 144, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
dw dw ri dw up le dw up dw le 
ri dw dw ri ri up ri up dw ri 
dw ri up ri ri ri ri ri ri up 
Episodes : 145, Total reward : 0.937000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri d

Episodes : 166, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
dw dw ri dw up le dw up dw le 
le dw dw ri ri up ri up dw le 
dw ri up ri ri ri ri ri ri up 
Episodes : 167, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri dw up le dw up dw le 
up dw dw ri ri up ri up dw le 
up ri up ri ri ri ri ri ri le 
Episodes : 168, Total reward : 0.938000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri d

Episodes : 189, Total reward : 0.938000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri dw up le dw up dw le 
ri dw dw ri ri up ri up dw dw 
up ri up ri ri ri ri ri ri up 
Episodes : 190, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
dw dw ri dw up le dw up dw le 
up dw dw ri ri up ri up dw le 
dw ri up ri ri ri ri ri ri le 
Episodes : 191, Total reward : 0.938000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
dw dw ri d

Episodes : 212, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
le dw ri dw up le dw up dw le 
ri dw ri ri ri up ri up dw ri 
dw ri up ri ri ri ri ri ri up 
Episodes : 213, Total reward : 0.937000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
ri dw ri dw up le dw up dw le 
ri dw dw ri ri up ri up dw ri 
up ri up ri ri ri ri ri ri dw 
Episodes : 214, Total reward : 0.938000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
ri dw ri d

Episodes : 235, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
up dw ri dw up le dw up dw le 
dw dw ri ri ri up ri up dw ri 
up ri up ri ri ri ri ri ri dw 
Episodes : 236, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
ri dw ri dw up le dw up dw le 
le dw dw ri ri up ri up dw le 
dw ri up ri ri ri ri ri ri dw 
Episodes : 237, Total reward : 0.939000 .
0.2 0.01
ri dw le ri ri ri ri ri dw dw 
dw le le le up le le le ri dw 
dw le ri ri up ri ri up dw le 
dw ri ri ri ri up dw le ri dw 
dw up up le ri dw dw le le le 
dw up ri up le dw dw ri ri dw 
ri ri ri dw up ri dw ri ri dw 
up dw ri d