# FrozenLake-v1 환경을 통한 SRASA, Q-Learning 실습

Q Function을 통해 optimal policy를 찾아보자

## Library Import

In [1]:
import gym
import numpy as np
import random
from IPython.display import clear_output

## SARSA

- On Policy: episode를 진행할때 사용하는 policy와 학습이 진행되는 policy가 동일하다.
- $\varepsilon$-greedy

In [2]:
env = gym.make('FrozenLake-v1', is_slippery=True)
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [3]:
Q = np.zeros([env.nS, env.nA])
alpha = 0.1
gamma = 0.99
epsilon = 0.1

for i in range(1, 100001):
    state = env.reset()
    done = False
    
    if random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(Q[state])
        
    while not done:
        next_state, reward, done, info = env.step(action)
        
        if random.uniform(0, 1) < epsilon:
            next_action = env.action_space.sample()
        else:
            next_action = np.argmax(Q[next_state])
        
        if done:
            Q[state, action] = (1 - alpha) * Q[state, action] + alpha * reward
        else:
            Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * Q[next_state, next_action]) # SARSA
        
        state = next_state
        
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}'.format(i))
        
print('Q function')
print(Q)

Episode: 100000
Q function
[[0.08898676 0.08738025 0.09274236 0.09018776]
 [0.         0.05352371 0.06364552 0.08303412]
 [0.         0.09683771 0.0813251  0.07892875]
 [0.         0.05401344 0.04806951 0.08563002]
 [0.10544947 0.07553272 0.06647727 0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.10812075 0.07257274 0.        ]
 [0.         0.         0.         0.        ]
 [0.0744904  0.07785073 0.15016906 0.        ]
 [0.         0.24023207 0.35382864 0.        ]
 [0.         0.27828566 0.20314127 0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.43222687 0.57631904 0.        ]
 [0.         0.81611428 0.72589466 0.        ]
 [0.         0.         0.         0.        ]]


In [4]:
dic = {0:'Left', 1:'Down', 2:'Right', 3:'Up'}
for i in range(4):
    for j in range(4):
        print(dic[np.argmax(Q[i*4 + j])], end="\t")
    print()

Right	Up	Down	Up	
Left	Left	Down	Left	
Right	Right	Down	Left	
Left	Right	Down	Left	


In [5]:
state = env.reset()
done = False
env.render()

while not done:
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Up)
SFF[41mF[0m

## Q-Learning

Simulation, 즉 sampling 과정에서 Agent는 exploration과 exploitation사이에서 고민해야 한다. 따라서 여기선 입실론 그리디를 사용한다.  

하지만 Agent의 목표는 Greedy Policy가 되어야 한다. 그동안 구해놓은 Q function을 최대한 활용해야 최적의 선택을 내릴 수 있기 때문이다.  

즉, update할 t 시점의 action을 $\varepsilon$-greedy를 통해 선택하고, 해당 action을 update하기 위해선 next state의 action들 중에 argmax한 action을 가져와 update한다.  

이렇게 하면 update할 Q값을 $\varepsilon$-greedy policy를 통해 선택하고, 해당 action을 update하기 위해 greedy policy 를 통해 Q값을 선택하게 되어, 두 policy를 분리하여 학습할 수 있다.

In [6]:
env = gym.make('FrozenLake-v1', is_slippery=True)
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [7]:
Q = np.zeros([env.nS, env.nA])
alpha = 0.1
gamma = 0.99
epsilon = 0.1

for i in range(1, 100001):
    state = env.reset()
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state])
        
        next_state, reward, done, info = env.step(action)
        
        if done:
            Q[state, action] = (1 - alpha) * Q[state, action] + alpha * reward
        else:
            Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * np.max(Q[next_state])) # Q-learning
        
        state = next_state
        
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}'.format(i))
        
print('Q function')
print(Q)

Episode: 100000
Q function
[[0.6017734  0.53165547 0.54492431 0.52997627]
 [0.26346164 0.21185392 0.27272474 0.5060863 ]
 [0.36397113 0.39971974 0.38200507 0.44239921]
 [0.30229425 0.28615977 0.30436636 0.42947739]
 [0.62288959 0.3111455  0.33394229 0.35055972]
 [0.         0.         0.         0.        ]
 [0.47671444 0.1425748  0.23539872 0.1646763 ]
 [0.         0.         0.         0.        ]
 [0.36357353 0.38960229 0.42820631 0.68804471]
 [0.46068349 0.72295062 0.35878658 0.43334272]
 [0.71642237 0.53582882 0.44655791 0.34295921]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.41262428 0.57652819 0.79317279 0.38272836]
 [0.76838933 0.89114921 0.81152204 0.82410398]
 [0.         0.         0.         0.        ]]


In [8]:
dic = {0:'Left', 1:'Down', 2:'Right', 3:'Up'}
for i in range(4):
    for j in range(4):
        print(dic[np.argmax(Q[i*4 + j])], end="\t")
    print()

Left	Up	Up	Up	
Left	Left	Left	Left	
Up	Down	Left	Left	
Left	Right	Down	Left	


In [9]:
state = env.reset()
done = False
env.render()

while not done:
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
F[41mH[0mFH
FFFH
HFFG
