In [26]:
import numpy as np
import pandas as pd
import time

In [27]:
class Agent:
    def __init__(self, maze, epsilon, gamma, alpha, 
                 state, q_table, actions=['up', 'down', 'left', 'right']):
        self.maze=maze
        self.epsilon = epsilon 
        self.gamma = gamma  
        self.alpha = alpha
        self.state = state 
        self.actions = actions #  ['up', 'down', 'left', 'right']
        self.q_table = q_table 
    
    def choose_action(self):
        state_actions = self.q_table[self.state]
        randomRate = np.random.uniform()
        if randomRate > self.epsilon:
            return self.actions.index(np.random.choice(self.actions)) 

        else:
            return state_actions.argmax()
    
    def update_q_table(self, reward, action, nxt_state):
        q_predict = self.q_table[self.state][action]
        if nxt_state in ['win', 'ghost'] :
            q_target = reward
        else:
            q_target = reward + self.gamma * self.q_table[nxt_state][action]
        self.q_table[self.state][action] += self.alpha * (q_target - q_predict) 
        self.state = nxt_state

In [28]:
class Maze:
    def __init__(self, actions):
        self.actions = actions
        
    def build_map(self, size, target, t_reward):
        self.size = size
        self.target = target
        self.map = -1+np.zeros((size))
        self.all_positions=[(i,j) for i in range(size[0]) for j in range(size[1])]
        for item in target:
            x,y=item
            self.map[x,y] = t_reward
        return
    
    def env_feedback(self, state, action):
        nxt_state = self.cal_coordinate(state, action)
        
        reward = self.map[nxt_state]
        if nxt_state in self.target:
            nxt_state = 'win'
            
        return nxt_state, reward
    
    def cal_coordinate(self, state, action):
        next_state = ()
        if action == 0:
            next_state = (state[0] - 1, state[1])
        elif action == 1:
            next_state = (state[0] + 1, state[1])
        elif action == 2:
            next_state = (state[0], state[1] + 1)
        elif action == 3:
            next_state = (state[0], state[1] - 1)
        if next_state not in self.all_positions:
            next_state=state
        return next_state
    
    def create_q_table(self):
        q_table = np.zeros(self.size + (len(self.actions),))
        print('Q_table.shape :')
        print(q_table.shape)
        return np.array(q_table)

In [29]:
EPISODES = 20
ACTIONS = ['up', 'down', 'left', 'right']
initSTATE = (0,0)
SIZE = (5,5) # maze size

EPSILON = 0.9
GAMMA = 0.9
ALPHA = 0.1

In [30]:
target = [(3,1), (4,4)]
t_reward = 100

f_reward_list = -1+np.zeros(SIZE)
Maze = Maze(ACTIONS)
Maze.build_map(SIZE, target, t_reward)

In [31]:
Q_table = Maze.create_q_table()

Q_table.shape :
(5, 5, 4)


In [32]:
Agent = Agent(Maze,epsilon= EPSILON, gamma= GAMMA, alpha= ALPHA,
             state = initSTATE, actions= ACTIONS,q_table= Q_table.copy())

In [33]:
def path(state, is_terminated):
    print(state, end='')
    if not is_terminated: print(' > ', end='')

# main process RL - Q_Learning
for episode in range(EPISODES):
    Agent.state = initSTATE
    is_terminated = False
    count = 0
    while not is_terminated :
        action = Agent.choose_action()
        nxt_state, reward = Maze.env_feedback(state=Agent.state, action=action)
        if Agent.state == nxt_state: 
            reward = -10
        ###########################################
        Agent.update_q_table(reward=reward, action=action, nxt_state=nxt_state)
        
        if nxt_state in ['win']:
            is_terminated = True
        path(Agent.state, is_terminated)
        Agent.state = nxt_state
        count +=1
        time.sleep(0.05)
        
    print('\n Episode. '+str(episode)+' finished ... ,total step : '+str(count))
    time.sleep(2)

(0, 0) > (1, 0) > (0, 0) > (0, 1) > (0, 1) > (1, 1) > (0, 1) > (0, 2) > (0, 2) > (1, 2) > (0, 2) > (0, 3) > (0, 3) > (1, 3) > (0, 3) > (0, 4) > (0, 4) > (1, 4) > (0, 4) > (0, 4) > (0, 3) > (0, 2) > (0, 1) > (0, 0) > (0, 0) > (0, 0) > (1, 0) > (2, 0) > (1, 0) > (1, 1) > (1, 2) > (2, 2) > (1, 2) > (1, 3) > (2, 3) > (1, 3) > (1, 4) > (2, 4) > (1, 4) > (1, 4) > (1, 3) > (1, 2) > (1, 1) > (2, 1) > (1, 1) > (1, 0) > (1, 0) > (2, 0) > (3, 0) > (2, 0) > (2, 1) > win
 Episode. 0 finished ... ,total step : 52
(0, 1) > (1, 1) > (2, 1) > win
 Episode. 1 finished ... ,total step : 4
(0, 1) > (0, 2) > (1, 2) > (2, 2) > (3, 2) > (2, 2) > (2, 3) > (3, 3) > (2, 3) > (2, 4) > (3, 4) > (2, 4) > (2, 4) > (2, 3) > (2, 2) > (2, 1) > win
 Episode. 2 finished ... ,total step : 17
(1, 0) > (1, 1) > (2, 1) > win
 Episode. 3 finished ... ,total step : 4
(1, 0) > (0, 0) > (0, 1) > (0, 0) > (1, 0) > (2, 0) > (2, 0) > (3, 0) > (4, 0) > (3, 0) > win
 Episode. 4 finished ... ,total step : 11


KeyboardInterrupt: 

In [None]:
Agent.q_table

array([[[-1.99      , -1.23532666, -1.99      , -1.25116878],
        [-1.        , -0.87210106, -0.9793621 , -0.91463446],
        [-1.        , -0.60089642, -0.59368384, -0.63343621],
        [-1.        ,  1.92466942, -0.34117063, -0.361     ],
        [-1.        , -0.2881    , -0.20890217, -1.        ]],

       [[-0.9793621 , -0.85439372, -1.99      , -0.92629492],
        [-0.778069  , -0.83099521, -0.947359  , -0.80238078],
        [-0.5878    , -0.54407539, -0.56804491, -0.56387341],
        [-0.28      , 18.83384477, -0.39955041, -0.1       ],
        [-0.28      , -0.19      , -0.10994851, -1.        ]],

       [[-0.58246453, -0.5757067 , -1.        , -0.57537865],
        [-0.60490315, -0.5273659 , -0.5878    , -0.469702  ],
        [-0.34075   , -0.2881    , -0.34075   , -0.271     ],
        [-0.2233    , 61.2579511 ,  0.        ,  0.        ],
        [-0.109     , 10.        ,  0.        ,  0.        ]],

       [[-0.31167748, -0.442     , -1.        ,  1.75248254],
  