# Q-Learning
- 탐험(Exploration)을 하면서도 최적의 정책을 학습이 가능함(상대적으로 SARSA에 비해서)
- Off-Policy
    - SARSA와 같은 on-policy는 이전 정책으로부터 얻은 샘플을 재사용할 수 없고, 자신의 샘플로 자신을 업데이트하므로 문제점이 존재할 수 있음
    - 정책이 복수개이므로 behavior policy(현재의 예시에선 입실론 그리디)로 샘플을 수집하고 target policy로 최적의 정책을 학습
- 학습하는 정책(TD) : 탐욕정책 -> exploitation

$$q(s, a) = q(s, a) + \alpha \big( r + \gamma\ max_{a'} q(s', a') - q(s, a)  \big )$$

- 행동하는 정책 : $\epsilon$-greedy -> exploration
    - 종류 : $\epsilon$-greedy, BoltMann, Bayesian, ...
    - 아래의 예는 SARSA와 같은 입실론 그리디
    
$$\pi(s) = \begin{cases} 
    a^* = argmax_{a \in A} q(s, a), 1 - \epsilon \\
    \text{random action}, \ \epsilon 
\end{cases}$$

In [1]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.pyplot import ylim
import os
import numpy as np
import random
from collections import defaultdict
import gym
import gym_maze
np.random.seed(1)

In [2]:
plt.style.use('ggplot')
ylim((-2, 1))
env = gym.make('maze-sample-10x10-v0')

# State 의 boundary
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
print(STATE_BOUNDS)
# Maze의 size (10, 10)
NUM_GRID = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int))
# action space
ACTION = ['up', 'dw', 'ri', 'le']

# gui환경의 Render 여부
RENDER = False

class QLearningAgent:
    def __init__(self, actions):
        self.actions = actions
        self.learning_rate = 0.2
        self.discount_factor = 0.9
        self.epsilon = 0.1
        self.q_table = defaultdict(lambda : [0.0, 0.0, 0.0, 0.0])
    
    # 학습하는 정책(벨만최적방정식) : 큐함수 업데이트 
    def learn(self, state, action, reward, next_state):
        q_1 = self.q_table[state][action]
        q_2 = reward + self.discount_factor * max(self.q_table[next_state])
        self.q_table[state][action] += self.learning_rate * (q_2 - q_1)
        
    # 입실론 탐욕 정책에 따라 행동을 반환 : sampling
    def get_action(self, state):
        if np.random.rand() > self.epsilon:
            # 무작위 행동 반환
            action = np.random.choice(self.actions)
        else:
            # 큐함수에 따른 행동 반환
            state_action = self.q_table[state]
            action = self.arg_max(state_action)
        return int(action)
    
    @staticmethod
    def arg_max(state_action):
        max_index_list = []
        max_value = state_action[0]
        for index, value in enumerate(state_action):
            if value > max_value:
                max_index_list.clear()
                max_value = value
                max_index_list.append(index)
            elif value == max_value:
                max_index_list.append(index)
        return random.choice(max_index_list)
    
    
    # greedy 정책을 출력
    def print_policy(self):
        for y in range(NUM_GRID[0]):
            for x in range(NUM_GRID[1]):
                print("%s"%ACTION[self.arg_max(self.q_table[str((x, y))])], end=" ")
                
def state_to_bucket(state):
    bucket_indice = []
    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            bucket_index = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            bucket_index = NUM_GRID[i] - 1
        else:
            # Mapping the state bounds to the bucket array
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_GRID[i] - 1) * STATE_BOUNDS[i][0] / bound_width
            scaling = (NUM_GRID[i] - 1) / bound_width
            bucket_index = int(round(scaling * state[i] - offset))
        bucket_indice.append(bucket_index)
    return tuple(bucket_indice)


# 모형 학습
if __name__ == "__main__":
    env.reset()
    agent = QLearningAgent(actions=list(range(env.action_space.n)))
    scores = []
    episodes = []
    
    for episode in range(250):
        state = env.reset()
        state = state_to_bucket(state)
        total_reward = 0
        
        while True:
            if RENDER:
                env.render()
                
            action = agent.get_action(str(state))
            
            # 행동을 수행하고 다음 상태, 보상, 에피소드의 종료여부를 받아옴
            next_state, reward, done, _ = env.step(action)
            next_state = state_to_bucket(next_state)
            
            # 큐함수를 업데이트
            agent.learn(str(state), action, reward, str(next_state))
            total_reward += reward
            state = next_state
            
            # 모든 큐함수를 화면에 표시
            #
            
            if done:
                print("Episode : %d total reward = %f ."%(episode, total_reward))
                episodes.append(episode)
                scores.append(total_reward)
                
                if episode % 50 == 0:
                    if not os.path.isdir('./save_graph'):
                        os.mkdir('./save_graph')
                    plt.plot(episodes, scores)
                    plt.savefig('./save_graph/q_learning_basic.png')
                break
                
            if np.mean(scores[-min(10, len(scores)):]) > 0.93:
                RENDER = True
                agent.print_policy()
            else:
                RENDER = False
        env.close()

[2018-05-07 15:24:24,745] Making new env: maze-sample-10x10-v0
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


[(0, 9), (0, 9)]
Episode : 0 total reward = -3.510000 .
Episode : 1 total reward = -4.667000 .
Episode : 2 total reward = -1.321000 .
Episode : 3 total reward = -2.862000 .
Episode : 4 total reward = -4.743000 .
Episode : 5 total reward = 0.184000 .
Episode : 6 total reward = -0.275000 .
Episode : 7 total reward = 0.157000 .
Episode : 8 total reward = -1.425000 .
Episode : 9 total reward = -0.368000 .
Episode : 10 total reward = -0.192000 .
Episode : 11 total reward = -1.051000 .
Episode : 12 total reward = -1.129000 .
Episode : 13 total reward = 0.460000 .
Episode : 14 total reward = -0.067000 .
Episode : 15 total reward = -0.244000 .
Episode : 16 total reward = -0.095000 .
Episode : 17 total reward = -0.092000 .
Episode : 18 total reward = -0.112000 .
Episode : 19 total reward = 0.338000 .
Episode : 20 total reward = -0.186000 .
Episode : 21 total reward = -0.177000 .
Episode : 22 total reward = 0.062000 .
Episode : 23 total reward = -0.977000 .
Episode : 24 total reward = -0.176000 

Episode : 215 total reward = 0.333000 .
Episode : 216 total reward = 0.688000 .
Episode : 217 total reward = -0.143000 .
Episode : 218 total reward = 0.190000 .
Episode : 219 total reward = 0.093000 .
Episode : 220 total reward = 0.024000 .
Episode : 221 total reward = 0.392000 .
Episode : 222 total reward = 0.125000 .
Episode : 223 total reward = 0.355000 .
Episode : 224 total reward = 0.285000 .
Episode : 225 total reward = 0.496000 .
Episode : 226 total reward = 0.228000 .
Episode : 227 total reward = 0.151000 .
Episode : 228 total reward = 0.586000 .
Episode : 229 total reward = -0.198000 .
Episode : 230 total reward = -0.034000 .
Episode : 231 total reward = 0.449000 .
Episode : 232 total reward = 0.120000 .
Episode : 233 total reward = 0.362000 .
Episode : 234 total reward = 0.399000 .
Episode : 235 total reward = 0.236000 .
Episode : 236 total reward = 0.147000 .
Episode : 237 total reward = 0.339000 .
Episode : 238 total reward = 0.235000 .
Episode : 239 total reward = 0.556000