# Cliff Walking Example을 통한 SARSA와 Q-Learning 비교

## Library Import

In [1]:
import numpy as np
import random
from IPython.display import clear_output

## Make Environment

절벽 환경 구현하기: 맨 아래 행(row 0)의 왼쪽 끝 칸이 시작점, 오른쪽 끝 칸이 목표 지점이며, 그 사이의 칸은 모두 절벽이다.


In [2]:
class Cliff:
    """Grid-world "cliff walking" environment.

    The agent position is stored as ``[r, c]``. Row 0 holds the start cell
    ``(0, 0)``, the goal cell ``(0, col - 1)`` and, between them, the cliff
    cells. States are numbered ``r * col + c``.

    Actions are encoded as ``[Left, Down, Right, Up] = [0, 1, 2, 3]`` where
    "Down" decreases the row index and "Up" increases it.
    """

    def __init__(self, row, col):
        """Create a ``row`` x ``col`` grid with the agent on the start cell.

        A grid needs at least two rows and two columns; if either argument
        is ``<= 1`` the environment falls back to a 2 x 2 grid, as the
        printed message announces.
        """
        if (row <= 1) or (col <= 1):
            # Actually apply the announced fallback. With col <= 1 the goal
            # would coincide with the start and no episode could ever end.
            row, col = 2, 2
            print('make env (row, col) = (2, 2)')
        else:
            print('make env (row, col) = ({}, {})'.format(row, col))
        self.row = row
        self.col = col
        self.pos = [0, 0]

    def reset(self):
        """Move the agent back to the start cell and return its state (0)."""
        self.pos = [0, 0]
        return 0

    def render(self):
        """Print the grid as text, highest row index first (row 0 last).

        Legend: ``A`` agent, ``S`` start, ``C`` cliff, ``G`` goal,
        ``.`` ordinary cell. The agent marker takes precedence.
        """
        for r in range(self.row - 1, -1, -1):
            cells = []
            for c in range(self.col):
                if [r, c] == self.pos:
                    cells.append('A')
                elif r == 0 and c == 0:
                    cells.append('S')
                elif r == 0 and c == self.col - 1:
                    cells.append('G')
                elif r == 0:
                    cells.append('C')
                else:
                    cells.append('.')
            print(' '.join(cells))

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        A move that would leave the grid keeps the position unchanged and
        yields reward -1; any other move yields reward 0. An action outside
        0..3 leaves the position unchanged with reward 0. Ending up on row 0
        in any column other than 0 terminates the episode: reward 1 on the
        last column (goal), reward -100 otherwise (cliff).
        """
        # [Left, Down, Right, Up] = [0, 1, 2, 3]

        reward = 0

        if action == 0:
            if self.pos[1] > 0:
                self.pos[1] -= 1
            else:
                reward = -1

        elif action == 1:
            if self.pos[0] > 0:
                self.pos[0] -= 1
            else:
                reward = -1

        elif action == 2:
            if self.pos[1] + 1 < self.col:
                self.pos[1] += 1
            else:
                reward = -1

        elif action == 3:
            if self.pos[0] + 1 < self.row:
                self.pos[0] += 1
            else:
                reward = -1

        state = self.pos[0] * self.col + self.pos[1]
        done = False

        # Row 0, column > 0 is terminal: either the goal or a cliff cell.
        # The terminal reward overrides any wall penalty set above.
        if self.pos[0] == 0 and self.pos[1] > 0:
            done = True
            if self.pos[1] == self.col - 1:
                reward = 1
            else:
                reward = -100

        return state, reward, done

    def env_info(self):
        """Return ``(number of states, number of actions)``."""
        state_n = self.col * self.row
        action_n = 4

        return state_n, action_n

## SARSA

In [3]:
# SARSA (on-policy TD control) on a 4 x 6 cliff grid.
row = 4
col = 6
env = Cliff(row, col)

state_n, action_n = env.env_info()

# Fix the RNG so that re-running this cell reproduces the same Q table.
random.seed(0)

Q = np.zeros([state_n, action_n])
alpha = 0.1    # learning rate
gamma = 0.99   # discount factor
epsilon = 0.3  # probability of taking a random (exploratory) action


def epsilon_greedy(q_values, eps):
    """Pick an action index for one state.

    With probability ``eps`` a uniformly random index into ``q_values`` is
    returned; otherwise the index of the largest value (first one on ties).
    """
    if random.uniform(0, 1) < eps:
        # Derive the action count from the Q row instead of hard-coding it.
        return random.randrange(len(q_values))
    return np.argmax(q_values)


for i in range(1, 100001):
    state = env.reset()
    done = False

    action = epsilon_greedy(Q[state], epsilon)

    while not done:
        next_state, reward, done = env.step(action)

        # On-policy: the action used in the update target is the one that
        # will actually be executed on the next step.
        next_action = epsilon_greedy(Q[next_state], epsilon)

        # A terminal transition has no bootstrap term.
        if done:
            target = reward
        else:
            target = reward + gamma * Q[next_state, next_action]
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * target

        state, action = next_state, next_action

    # Progress indicator: overwrite the previous line every 100 episodes.
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}'.format(i))

print('Q function')
print(Q)

Episode: 100000
Q function
[[-1.63991780e+01 -9.23186425e+00 -1.00000000e+02 -2.14987140e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-3.57415465e+00 -1.14470928e+01 -1.17226209e+01 -1.71265696e+00]
 [-3.84840093e+00 -1.00000000e+02 -6.85983933e+00 -2.13800846e+00]
 [-9.54546195e+00 -1.00000000e+02 -5.21455683e+00 -1.54936780e+00]
 [-4.62336157e+00 -1.00000000e+02 -8.29312251e+00 -1.43179830e+00]
 [-2.73426595e+01 -1.00000000e+02  2.72895192e-01 -1.57602716e+00]
 [-2.90691025e+00  1.00000000e+00 -1.26700193e+00 -4.23065702e-01]
 [-2.61616379e+00 -3.76626796e+00 -1.69376549e+00 -1.81858196e+00]
 [-1.89275717e+00 -2.03254949e+01 -1.68691193e+00 -1.69164077e+00]
 [-2.12407825e+00 -2.37187427e+00 -

In [4]:
# Show the greedy action (argmax over the Q row) of every cell as a text grid.
# Rows are printed from the highest row index down to row 0, so row 0 is the
# last printed line.
dic = {0:'Left', 1:'Down', 2:'Right', 3:'Up'}
for r in reversed(range(row)):
    labels = [dic[np.argmax(Q[r * col + c])] for c in range(col)]
    # Every label is followed by a tab, including the last one in the row.
    print("".join(label + "\t" for label in labels))

Right	Right	Right	Right	Right	Down	
Right	Right	Up	Right	Right	Down	
Up	Up	Up	Up	Right	Down	
Up	Left	Left	Left	Left	Left	


## Q-Learning

- epsilon을 조절하며 `pass_cnt`(목표 지점에 도달한 에피소드 수)가 얼마나 늘어나는지 확인해보자.

In [5]:
# Q-learning (off-policy TD control) on a 4 x 6 cliff grid.
row = 4
col = 6
env = Cliff(row, col)

state_n, action_n = env.env_info()

# Fix the RNG so that re-running this cell reproduces the same result.
random.seed(0)

Q = np.zeros([state_n, action_n])
alpha = 0.1      # learning rate
gamma = 0.99     # discount factor
epsilon = 0.5    # probability of taking a random (exploratory) action
max_steps = 100  # an episode is abandoned after this many steps
pass_cnt = 0     # number of episodes that ended on the goal cell

for i in range(1, 50001):
    state = env.reset()
    done = False

    for _ in range(max_steps):
        # Epsilon-greedy behaviour policy; the action count comes from the
        # environment instead of a hard-coded list.
        if random.uniform(0, 1) < epsilon:
            action = random.randrange(action_n)
        else:
            action = np.argmax(Q[state])

        next_state, reward, done = env.step(action)

        if done:
            # The environment gives reward 1 only when the goal is reached.
            if reward == 1:
                pass_cnt += 1
            # A terminal transition has no bootstrap term.
            target = reward
        else:
            # Off-policy: bootstrap from the best action in the next state,
            # regardless of which action will actually be taken there.
            target = reward + gamma * np.max(Q[next_state])
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * target

        state = next_state

        if done:
            break

    # Progress indicator: overwrite the previous line every 100 episodes.
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}, Goal: {}'.format(i, pass_cnt))

print('Q function')
print(Q)

Episode: 50000, Goal: 5506
Q function
[[-6.79346521e-02 -6.79346521e-02 -1.00000000e+02  9.41480149e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-5.85198506e-02  9.32065348e-01  9.50990050e-01  9.32065348e-01]
 [ 9.41480149e-01 -1.00000000e+02  9.60596010e-01  9.41480149e-01]
 [ 9.50990050e-01 -1.00000000e+02  9.70299000e-01  9.50990050e-01]
 [ 9.60596010e-01 -1.00000000e+02  9.80100000e-01  9.60596010e-01]
 [ 9.70299000e-01 -1.00000000e+02  9.90000000e-01  9.70299000e-01]
 [ 9.80100000e-01  1.00000000e+00 -1.00000000e-02  9.80100000e-01]
 [-6.79346521e-02  9.41480149e-01  9.41480149e-01  9.22744694e-01]
 [ 9.32065348e-01  9.50990050e-01  9.50990050e-01  9.32065348e-01]
 [ 9.41480149e-01  9.605

In [6]:
# Show the greedy action (argmax over the Q row) of every cell as a text grid.
# Rows are printed from the highest row index down to row 0, so row 0 is the
# last printed line.
dic = {0:'Left', 1:'Down', 2:'Right', 3:'Up'}
for r in reversed(range(row)):
    labels = [dic[np.argmax(Q[r * col + c])] for c in range(col)]
    # Every label is followed by a tab, including the last one in the row.
    print("".join(label + "\t" for label in labels))

Down	Down	Down	Down	Down	Down	
Down	Down	Down	Down	Down	Down	
Right	Right	Right	Right	Right	Down	
Up	Left	Left	Left	Left	Left	
