In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import random

In [57]:
# Train a tabular Q-learning agent on Taxi-v3 with an epsilon-greedy policy.
env = gym.make("Taxi-v3")

# Taxi-v3 has 500 discrete states = 25 taxi positions x 4 destinations
# x 5 passenger locations (4 pickup spots + "inside the taxi").
# Rewards: -1 per action, +20 for dropping the passenger at the destination,
# -10 for an illegal pickup or drop-off.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

training_episodes = 9000 # Amount of times to run environment while training.
display_episodes = 10 # Amount of times to run environment after training.

# Hyperparameters
alpha = 0.1 # Learning Rate
gamma = 0.6 # Discount Rate
epsilon = 0.1 # Chance of selecting a random action instead of maximising reward.

# For plotting metrics: one entry per training episode.
all_epochs = []     # number of steps taken in each episode
all_penalties = []  # number of -10 penalties received in each episode

for i in range(training_episodes):
    state = env.reset()
    done = False
    epochs, penalties, reward = 0, 0, 0

    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore: pick a random action.
        else:
            action = np.argmax(q_table[state]) # Exploit: pick the best-known action for this state.

        next_state, reward, done, info = env.step(action)

        old_value = q_table[state, action] # Retrieve old value from the q-table.
        next_max = np.max(q_table[next_state])

        # Q-learning update: blend the old estimate with the bootstrapped target.
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10: # Checks if agent attempted to do an illegal action.
            penalties += 1

        state = next_state
        epochs += 1

    # Record the per-episode metrics so they can actually be plotted later.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    if i % 200 == 0: # Report progress every 200 episodes.
        print(f"Episode: {i}   {penalties}")

print("Training finished.\n")

Episode: 0   39
Episode: 200   5
Episode: 400   5
Episode: 600   2
Episode: 800   6
Episode: 1000   1
Episode: 1200   0
Episode: 1400   0
Episode: 1600   1
Episode: 1800   1
Episode: 2000   0
Episode: 2200   3
Episode: 2400   0
Episode: 2600   0
Episode: 2800   1
Episode: 3000   0
Episode: 3200   1
Episode: 3400   0
Episode: 3600   0
Episode: 3800   0
Episode: 4000   2
Episode: 4200   0
Episode: 4400   0
Episode: 4600   1
Episode: 4800   0
Episode: 5000   1
Episode: 5200   1
Episode: 5400   0
Episode: 5600   0
Episode: 5800   0
Episode: 6000   0
Episode: 6200   0
Episode: 6400   1
Episode: 6600   0
Episode: 6800   1
Episode: 7000   0
Episode: 7200   0
Episode: 7400   2
Episode: 7600   0
Episode: 7800   0
Episode: 8000   1
Episode: 8200   0
Episode: 8400   0
Episode: 8600   3
Episode: 8800   0
Training finished.



In [25]:
# Roll out a single episode with the learned Q-table, always taking the
# highest-valued action, rendering every step and counting the steps taken.
env = gym.make('Taxi-v3')

state = env.reset()
done = False
epochs = 0

while True:
    # Purely greedy: no exploration during this demonstration run.
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    env.render()
    epochs += 1
    if done:
        break

print(epochs)
env.close()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :G|
| : | : : |
|[42

# Maze

In [None]:
# Environment setup for a hosted notebook (system packages + Python packages).
!pip install gym pyvirtualdisplay
# Refresh the package index BEFORE installing, so apt resolves current packages.
!apt-get update
!apt-get install -y xvfb python-opengl ffmpeg
# -y: there is no interactive stdin in a notebook, so the confirmation prompt would abort the install.
!apt-get install -y cmake
!pip install --upgrade setuptools
!pip install ez_setup
!pip install gym[atari]
!pip install box2d-py
!pip install gym[Box_2D]
!apt install -y imagemagick
# Reinstall pygame from scratch.
!pip uninstall -y pygame
!pip install pygame

In [None]:
from pyvirtualdisplay import Display
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython import display as ipythondisplay
from IPython.display import HTML

def wrap_env(env):
    """Return `env` wrapped in a `Monitor` that writes into './video' (force=True)."""
    return Monitor(env, './video', force=True)
 
def show_video():
    """Embed the first recorded MP4 from the 'video' directory in the notebook.

    Looks for files matching 'video/*.mp4' (relative to the current working
    directory). If at least one exists, the first match is base64-encoded and
    displayed inline as an HTML <video> element; otherwise the message
    "Could not find video" is printed. Returns nothing.
    """
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only binary mode inside a context manager: the file is only
        # read, and the handle is closed deterministically.
        with open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

# Create a pyvirtualdisplay Display (visible=0, size 1400x900) and start it.
# Presumably this gives the renderer an off-screen display so video recording
# works on a headless machine such as a hosted notebook -- confirm for your setup.
v_display = Display(visible=0, size=(1400, 900))
v_display.start()

In [None]:
# Clone the gym-maze repository and install it from source.
!git clone https://github.com/tuzzer/gym-maze.git
!pwd
# NOTE: %cd changes the kernel's working directory, so relative paths used by
# later cells (e.g. './video') are resolved inside gym-maze/ from here on.
%cd gym-maze/
!pwd
!python setup.py install  

In [52]:
import gym_maze

# Record a short random walk in the 5x5 sample maze and show the video.
env = wrap_env(gym.make('maze-sample-5x5-v0'))

s = env.reset()
for i in range(100):
    _, _, done, _ = env.step(env.action_space.sample())
    # Stop once the episode ends: stepping a finished episode without a
    # reset is not valid, and the Monitor wrapper is expected to reject it.
    # This matches the trained-agent replay cell, which also breaks on `done`.
    if done:
        break
env.close()
show_video()

In [None]:
import math

def simulate():
    """Train the module-level `q_table` on `env` with tabular Q-learning.

    Runs NUM_EPISODES episodes of at most MAX_T steps each. Actions are chosen
    by `select_action`; the exploration and learning rates are refreshed from
    `get_explore_rate` / `get_learning_rate` after every episode. One progress
    line plus the current rates are printed per episode, and the renderer is
    closed at the end. Returns nothing; `q_table` is updated in place.
    """
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99

    for episode in range(NUM_EPISODES):
        # Initial observation (expected to be [0.0, 0.0] for this maze).
        observation = env.reset()
        current_cell = (int(observation[0]), int(observation[1]))
        total_reward = 0

        for t in range(MAX_T):
            action = select_action(current_cell, explore_rate)
            observation, reward, done, _ = env.step(action)

            next_cell = (int(observation[0]), int(observation[1]))
            total_reward += reward

            # Temporal-difference update towards reward + discounted best next value.
            entry = current_cell + (action,)
            td_target = reward + discount_factor * np.amax(q_table[next_cell])
            q_table[entry] += learning_rate * (td_target - q_table[entry])

            current_cell = next_cell
            if done:
                break

        print("Episode %d finished after %f time steps with total reward = %f (done %d)." % (episode, t, total_reward, done))

        # Decay both rates based on the index of the episode just finished.
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

        print(explore_rate, learning_rate)

    env.render(close=True)


def select_action(state, explore_rate):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `explore_rate` a random action is sampled from
    `env.action_space`; otherwise the index of the largest entry of
    `q_table[state]` is returned as a plain int.
    """
    explore = random.random() < explore_rate
    if explore:
        return env.action_space.sample()
    return int(np.argmax(q_table[state]))

def get_explore_rate(t):
    """Exploration rate for episode index `t`.

    Computes 1 - log10((t + 1) / DECAY_FACTOR) and clamps it to the range
    [MIN_EXPLORE_RATE, 0.8].
    """
    decayed = 1.0 - math.log10((t+1)/DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_EXPLORE_RATE, capped)

def get_learning_rate(t):
    """Learning rate for episode index `t`.

    Computes 1 - log10((t + 1) / DECAY_FACTOR) and clamps it to the range
    [MIN_LEARNING_RATE, 0.8].
    """
    decayed = 1.0 - math.log10((t+1)/DECAY_FACTOR)
    capped = min(0.8, decayed)
    return max(MIN_LEARNING_RATE, capped)


# Configuration for the maze Q-learning run; simulate(), select_action() and
# the rate helpers read these module-level names.
env = gym.make("maze-sample-5x5-v0")

# Grid dimensions: highest observation value per axis plus one, as ints.
MAZE_SIZE = tuple((env.observation_space.high + np.ones(env.observation_space.shape)).astype(int))   # expected (5, 5) for this maze
NUM_BUCKETS = MAZE_SIZE  # one bucket per grid cell

NUM_ACTIONS = env.action_space.n  # expected 4: ["N", "S", "E", "W"]
# (low, high) pair per observation dimension; not referenced by the code visible in this notebook.
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))

# Lower bounds and decay scale used by get_explore_rate() / get_learning_rate().
MIN_EXPLORE_RATE = 0.001
MIN_LEARNING_RATE = 0.2
DECAY_FACTOR = np.prod(MAZE_SIZE, dtype=float) / 10.0

NUM_EPISODES = 100  # number of episodes simulate() runs
MAX_T = np.prod(MAZE_SIZE, dtype=int) * 100  # step cap per episode: 100 x number of cells

# NOTE: this rebinds the module-level `q_table` that the Taxi cells above also use.
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,), dtype=float)    # expected shape (5, 5, 4)

# Run training with the configuration above.
simulate()

In [55]:
# Replay the learned maze policy (with a 1% exploration rate) for at most
# 100 steps while recording, then show the recorded video.
env = wrap_env(gym.make('maze-sample-5x5-v0'))

obv = env.reset()
state = (int(obv[0]), int(obv[1]))

for i in range(100):
    action = select_action(state, 0.01)
    obv, reward, done, _ = env.step(action)
    state = (int(obv[0]), int(obv[1]))
    # Stop as soon as the episode is over.
    if done:
        break

env.close()
show_video()