In [1]:
import numpy as np
import pandas as pd
import time
import tkinter as tk
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pylab as plt

In [2]:
# --- Maze geometry -----------------------------------------------------
unit = 40                # side length of one grid cell, in canvas pixels
maze_h, maze_w = 8, 8    # grid height and width, in cells

# --- Rewards handed out by the environment -----------------------------
reward_goal = 100        # entering the goal square
reward_black = -100      # entering a black (hole) square
reward_normal = 1        # any other move

# --- Episode bookkeeping -----------------------------------------------
episode_count = 200
episodes = range(episode_count)
rewards, movements = [], []   # per-episode totals, appended by run_experiment

In [3]:
class Maze:
    """Tkinter grid-world environment used by the Q-learning agent.

    The window shows a ``maze_w`` x ``maze_h`` grid of ``unit``-pixel cells
    holding black "hole" squares, one yellow goal square and a red sprite that
    starts in the top-left cell. The class reads the module-level settings
    ``unit``, ``maze_w``, ``maze_h``, ``reward_black``, ``reward_goal`` and
    ``reward_normal``.

    Attributes:
        window: the Tk root window.
        canvas: the canvas everything is drawn on (created by ``build_maze``).
        action_space: set of action labels; only its size is used here.
        n_action: number of available actions.
        holes: bounding boxes (``canvas.coords`` lists) of every hole square.
        goal: canvas item id of the goal square.
        sprite: canvas item id of the agent square.
    """

    # Half the side length, in pixels, of every square drawn inside a cell.
    _HALF_SIDE = 15

    # (row, column) grid positions of the hole squares.
    _HOLE_CELLS = (
        (0, 1),
        (1, 1), (1, 2), (1, 4), (1, 5), (1, 6), (1, 7),
        (2, 1), (2, 6),
        (3, 3), (3, 4), (3, 6),
        (4, 1), (4, 3),
        (5, 0), (5, 1), (5, 3), (5, 5), (5, 7),
        (6, 1), (6, 2), (6, 3), (6, 5),
        (7, 5), (7, 6),
    )

    def __init__(self):
        """Create the Tk window and draw the maze."""
        self.window = tk.Tk()
        self.window.title("Maze with Q-Learning")
        self.window.geometry('{0}x{1}'.format(maze_w*unit, maze_h*unit))
        self.action_space = {'u', 'd', 'r', 'l'}
        self.n_action = len(self.action_space)
        self.holes = []
        self.build_maze()

    def _draw_square(self, row, col, fill):
        """Draw a filled square centred in grid cell (row, col); return its canvas id."""
        center_x = unit/2 + unit*col
        center_y = unit/2 + unit*row
        half = self._HALF_SIDE
        return self.canvas.create_rectangle(
            center_x - half, center_y - half,
            center_x + half, center_y + half,
            fill=fill
        )

    def build_maze(self):
        """Create the canvas and draw grid lines, holes, goal and sprite."""
        width, height = maze_w*unit, maze_h*unit
        self.canvas = tk.Canvas(self.window, bg='white', width=width, height=height)

        # Vertical lines run the full canvas *height*, horizontal lines the
        # full canvas *width*; the two extents only coincide for a square maze.
        for x in range(0, width, unit):
            self.canvas.create_line(x, 0, x, height)
        for y in range(0, height, unit):
            self.canvas.create_line(0, y, width, y)

        # Remember each hole's bounding box so get_state_reward can detect
        # a fall by comparing coordinates.
        for row, col in self._HOLE_CELLS:
            hole_id = self._draw_square(row, col, "black")
            self.holes.append(self.canvas.coords(hole_id))

        self.goal = self._draw_square(7, 7, "yellow")
        self.sprite = self._draw_square(0, 0, "red")

        self.canvas.pack()

    def render(self):
        """Pause briefly, then redraw the window."""
        time.sleep(0.1)
        self.window.update()

    def reset(self):
        """Move the sprite back to the top-left cell and return its coordinates."""
        self.window.update()
        time.sleep(0.5)
        self.canvas.delete(self.sprite)
        self.sprite = self._draw_square(0, 0, "red")
        return self.canvas.coords(self.sprite)

    def get_state_reward(self, action):
        """Apply ``action`` to the sprite and report the outcome.

        Args:
            action: 0 = up, 1 = down, 2 = right, 3 = left. A move that would
                leave the grid (or any other action value) keeps the sprite
                where it is.

        Returns:
            A ``(state, reward, done)`` tuple. ``state`` is the sprite's
            ``canvas.coords`` list, or the string ``'terminal'`` once the goal
            or a hole has been entered; ``reward`` is ``reward_goal``,
            ``reward_black`` or ``reward_normal``; ``done`` is True on a
            terminal state.
        """
        left, top = self.canvas.coords(self.sprite)[:2]
        dx = dy = 0
        # The comparisons use the sprite's top-left corner, which sits a few
        # pixels inside its cell, so "> unit" means "not in the first row/column".
        if action == 0:  # up
            if top > unit:
                dy = -unit
        elif action == 1:  # down
            if top < (maze_h - 1)*unit:
                dy = unit
        elif action == 2:  # right
            if left < (maze_w - 1)*unit:
                dx = unit
        elif action == 3:  # left
            if left > unit:
                dx = -unit

        self.canvas.move(self.sprite, dx, dy)
        s_ = self.canvas.coords(self.sprite)
        if s_ == self.canvas.coords(self.goal):
            return 'terminal', reward_goal, True
        if s_ in self.holes:
            return 'terminal', reward_black, True
        return s_, reward_normal, False

In [4]:
class QLearningTable:
    """Tabular Q-learning agent persisted to ``RL_Maze_Q_Table.csv``.

    The table is a DataFrame with one row per visited state (indexed by the
    state's string form) and one column per action.

    Args:
        actions: list of action labels; these become the table's columns.
        learning_rate: step size applied to each temporal-difference update.
        reward_decay: discount factor applied to the best next-state value.
        e_greedy: probability of picking a uniformly random action instead of
            the greedy one.
        train: learning updates (and CSV writes) happen only when this is 1.
    """

    def __init__(self, actions, learning_rate=0.03, reward_decay=0.9, e_greedy=0.1, train=1):
        self.actions = actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon = e_greedy
        self.train = train
        try:
            saved = pd.read_csv(r'RL_Maze_Q_Table.csv', index_col=0)
            # CSV headers are read back as strings; map them to the action
            # labels so column lookups by action keep working.
            self.q_table = saved.rename(
                index=str,
                columns={str(action): action for action in self.actions},
            )
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No usable saved table: start from an empty one.
            self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)

    def choose_action(self, observation):
        """Return an action for ``observation`` using an epsilon-greedy policy.

        The state is added to the table if it has not been seen before. Ties
        between equally valued actions are broken at random.
        """
        self.add_state(observation)
        if np.random.uniform() < self.epsilon:
            return np.random.choice(self.actions)
        state_action = self.q_table.loc[observation, :]
        # Shuffle before idxmax so equal Q-values do not always favour the
        # first column.
        state_action = state_action.reindex(np.random.permutation(state_action.index))
        return state_action.idxmax()

    def learn(self, s, a, r, s_):
        """Apply one Q-learning update for the transition (s, a, r, s_).

        Does nothing unless ``train`` is 1. When ``s_`` is the string
        ``'terminal'`` the target is just ``r``; otherwise it is ``r`` plus the
        discounted best value of ``s_``. The table is written to
        ``RL_Maze_Q_Table.csv`` after every update.
        """
        if self.train != 1:
            return
        # Register both ends of the transition so the lookups below cannot
        # raise KeyError for a state that was never passed to choose_action.
        self.add_state(s)
        self.add_state(s_)
        q_predict = self.q_table.loc[s, a]
        if s_ != 'terminal':
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()
        else:
            q_target = r
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)
        self.q_table.to_csv(r'RL_Maze_Q_Table.csv', index=True)

    def add_state(self, state):
        """Add an all-zero row for ``state`` if the table does not have one."""
        if state not in self.q_table.index:
            # DataFrame.append was removed in pandas 2.0; assigning through
            # .loc with a new label enlarges the frame in place instead.
            self.q_table.loc[state] = [0.0] * len(self.q_table.columns)

In [5]:
def run_experiment():
    """Run every episode in ``episodes`` and then plot the per-episode results.

    Uses the module-level ``env`` (environment) and ``q_learning_agent``.
    For each episode the number of moves and the total shaped reward are
    appended to the module-level ``movements`` and ``rewards`` lists.

    Reward shaping: reaching a state for the first time in the episode earns
    the environment's reward; landing on an already visited state (including
    a move blocked by the grid edge) costs five times that reward.
    """
    # Start from empty histories: leftovers from an earlier run in the same
    # kernel would make the lists longer than `episodes`.
    rewards.clear()
    movements.clear()

    for episode in episodes:
        print("Episode {0}/{1}".format(episode, episode_count))
        observation = env.reset()
        moves = 0
        reward = 0  # running total for this episode, used for reporting only
        reached = [observation]

        while True:
            env.render()
            action = q_learning_agent.choose_action(str(observation))
            observation_, reward_, done = env.get_state_reward(action)
            moves += 1
            if observation_ not in reached:
                step_reward = reward_
                reached.append(observation_)
                if reward_ == reward_goal:
                    print("Goal reached")
            else:
                step_reward = -5 * reward_
            reward += step_reward

            # The Q-learning update needs the reward of this single
            # transition; the episode total would count earlier rewards again
            # on every step.
            q_learning_agent.learn(str(observation), action, step_reward, str(observation_))
            observation = observation_

            if done:
                movements.append(moves)
                rewards.append(reward)
                print("Reward: {0}, Moves: {1}".format(reward, moves))
                break
    print("game over!")
    plot_reward_movements()

def plot_reward_movements():
    """Plot moves per episode (top) and reward per episode (bottom).

    Reads the module-level ``movements`` and ``rewards`` lists. The x-axis is
    built from the length of each list rather than from ``episodes``, so the
    plot still works when the recorded history and ``episodes`` differ in
    length.
    """
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(range(len(movements)), movements)
    plt.xlabel("Episode")
    plt.ylabel("#Movements")

    plt.subplot(2, 1, 2)
    plt.step(range(len(rewards)), rewards)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.show()

In [None]:
def _read_int(prompt):
    """Return the integer typed at ``prompt``, or None if the input is not an integer."""
    try:
        return int(input(prompt))
    except ValueError:
        return None


if __name__=="__main__":
    choice = _read_int("Enter 0 for training and 1 for evaluation => ")

    # Keyword arguments for QLearningTable; stays None when nothing should run.
    agent_options = None
    if choice == 0:
        requested = _read_int("Enter number of training epochs")
        if requested is None or requested < 1:
            print("Number of training epochs must be a positive integer")
        else:
            episode_count = requested
            agent_options = {}
    elif choice == 1:
        episode_count = 10
        # Evaluation: always act greedily and leave the saved table untouched.
        agent_options = {'e_greedy': 0.0, 'train': 0}
    else:
        print("Wrong choice")

    if agent_options is not None:
        # run_experiment reads these module-level names.
        episodes = range(episode_count)
        env = Maze()
        q_learning_agent = QLearningTable(actions=list(range(env.n_action)), **agent_options)
        # Schedule the experiment on the Tk event loop, then start the loop.
        env.window.after(10, run_experiment)
        env.window.mainloop()

Enter 0 for training and 1 for evaluation => 0
Enter number of training epochs20
Episode 0/20
Goal reached
Reward: 105, Moves: 18
Episode 1/20
Goal reached
Reward: 115, Moves: 16
Episode 2/20
Reward: -100, Moves: 1
Episode 3/20
Reward: -89, Moves: 12
Episode 4/20
Goal reached
Reward: 105, Moves: 18
Episode 5/20
Goal reached
Reward: 115, Moves: 16
Episode 6/20
Reward: -88, Moves: 13
Episode 7/20
Goal reached
Reward: 105, Moves: 18
Episode 8/20
Goal reached
Reward: 115, Moves: 16
Episode 9/20
Goal reached
Reward: 115, Moves: 16
Episode 10/20
Goal reached
Reward: 105, Moves: 18
Episode 11/20
Reward: -94, Moves: 7
Episode 12/20
Goal reached
Reward: 115, Moves: 16
Episode 13/20
Reward: -96, Moves: 11
Episode 14/20
Goal reached
Reward: 115, Moves: 16
Episode 15/20
Goal reached
Reward: 111, Moves: 18
Episode 16/20
Goal reached
Reward: 115, Moves: 16
Episode 17/20
Goal reached
Reward: 115, Moves: 16
Episode 18/20
Reward: -91, Moves: 10
Episode 19/20
Goal reached
Reward: 115, Moves: 16
game ove