# 0. Import dependencies

Import Libraries:
- Numpy for the Qtable
- OpenAI gym for the environment
- Random to generate random numbers

In [3]:
import numpy as np
import gym
import random

# Agent

In [4]:
# Build the batch of containers to be stacked: each gets a sequential id and a
# randomly drawn priority label ('H' or 'L').
container_amount = 18
prio = ['H', 'L']

container_id = list(range(container_amount))
# One random draw per container, in id order.
container_prio = [prio[random.randrange(len(prio))] for _ in range(container_amount)]

# Two columns: id, priority. Stacking ints next to strings yields a string-typed array.
container_data = np.column_stack((container_id, container_prio))

# 1. Environment

- Create the environment: a 3 × 3 × 2 container lot (X × Y × Z) in which every cell starts empty (`'0'`)

In [5]:
# Lot dimensions as (X, Y, Z); Z is the stacking height checked by move().
size = (3,3,2)
# Every cell starts as the string '0' (empty). np.full gives exactly the shape in
# `size`; the previous nested-list expression had one extra wrapping list and
# produced a 4-D array of shape (1, 3, 3, 2). dtype=object keeps cells as plain
# Python strings.
environment = np.full(size, '0', dtype=object)
# `env` is an alias of the same array object, not a copy.
env = environment

def scoreEnvironment(env):
    """Score a lot layout from the priorities stored in rows X=0 and X=2.

    Each 'L' in row X=0 and each 'H' in row X=2 adds one point; each 'H' in
    row X=0 and each 'L' in row X=2 subtracts one point. All other cells are
    ignored.

    Parameters
    ----------
    env : numpy.ndarray
        3-D array of cell labels ('0', 'H' or 'L') indexed as [X, Y, Z].

    Returns
    -------
    int
        The score; 0 for an array with no cells.
    """
    # The score is a pure function of the array, so no per-cell loop is needed.
    # The original looped over every cell, recomputed the same value each time,
    # and read an undefined global `Environment` instead of the `env` argument.
    if env.size == 0:
        return 0

    first_row = env[0, :, :]
    third_row = env[2, :, :]

    result = (first_row == 'L').sum()
    result -= (first_row == 'H').sum()
    result -= (third_row == 'L').sum()
    result += (third_row == 'H').sum()
    return int(result)


#return cleared environment
def clearEnvironment(env):
    """Return a new, empty lot with the same shape as `env`.

    The input is not modified. Every cell of the result is the string '0',
    stored with dtype=object.

    Parameters
    ----------
    env : array-like
        Existing lot; only its shape is used.

    Returns
    -------
    numpy.ndarray
        Freshly allocated array of '0' cells.
    """
    # The original indexed an undefined global `Size` and raised NameError.
    # The shape now comes from the argument itself.
    return np.full(np.shape(env), '0', dtype=object)

#generate status of board as a flat float32 feature vector
def GenerateStatusDataset(env,dataType):
    """Encode the incoming container's priority and the board as one float32 vector.

    Layout of the result: one flag that is 1.0 when `dataType` is 'H', followed
    by the flattened 'L' mask, the flattened 'H' mask and the flattened '0'
    (empty) mask of `env`, in that order.
    """
    prio_flag = np.array([dataType == 'H'], dtype=np.float32)
    cell_masks = [
        np.array(env == symbol, dtype=np.float32).flatten()
        for symbol in ('L', 'H', '0')
    ]
    return np.concatenate([prio_flag, *cell_masks])

In [6]:
def move(env, X, Y, Z):
    """Return True when a container may be placed at cell (X, Y, Z) of `env`.

    A placement is rejected when any of these holds:
      * Z is outside the stacking height of the lot (including negative Z),
      * X or Y is outside the lot,
      * the target cell is already occupied (not '0'),
      * any cell below the target in the same stack is empty (floating),
      * Y is an interior position and both ground-level neighbours
        (Y-1 and Y+1 at Z=0) in the same row are already occupied.

    Parameters
    ----------
    env : numpy.ndarray
        3-D array of cell labels indexed as [X, Y, Z]; '0' marks an empty cell.
    X, Y, Z : int
        Target coordinates.

    Returns
    -------
    bool
    """
    # Bounds come from the array that is actually being checked.
    max_x, max_y, max_z = env.shape

    #if max height reached, place container somewhere else
    # (negative Z is rejected too; it would otherwise wrap to the top of the stack)
    if Z < 0 or Z >= max_z:
        return False

    #container can not be placed outside the lot
    if X < 0 or X >= max_x or Y < 0 or Y >= max_y:
        return False

    #container can not be placed when there already is one
    if env[X][Y][Z] != '0':
        return False

    #container can not be floating: every cell below must be occupied.
    # Compare element-wise first, then reduce. Calling .all() on the raw labels
    # tests string truthiness, and '0' is a truthy string.
    if (env[X][Y][:Z] == '0').any():
        return False

    #container in the middle: blocked when both ground-level neighbours are taken
    if Y-1 != -1 and Y+1 != max_y:
        if env[X][Y-1][0] != '0' and env[X][Y+1][0] != '0':
            return False

    return True

In [7]:
def action_XYZ(action, shape=None):
    """Decode a 1-based flat action number into (X, Y, Z) lot coordinates.

    Actions enumerate cells with Z varying fastest, then Y, then X, so for a
    (3, 3, 2) lot action 1 is (0, 0, 0), action 2 is (0, 0, 1) and action 18
    is (2, 2, 1).

    Actions outside 1..(number of cells) no longer raise UnboundLocalError;
    they decode to coordinates outside the lot (for example X equal to
    shape[0], or a negative X), which move() rejects.

    Parameters
    ----------
    action : int
        1-based index of the target cell.
    shape : tuple of int, optional
        Lot dimensions (X, Y, Z). Defaults to the module-level `size`.

    Returns
    -------
    tuple
        (X, Y, Z)
    """
    if shape is None:
        shape = size

    cells_per_row = shape[1] * shape[2]
    # Shift to 0-based, then peel off X, Y and Z with integer division.
    X, offset = divmod(action - 1, cells_per_row)
    Y, Z = divmod(offset, shape[2])
    return X,Y,Z

def place_container(action, prio, env):
    """Try to place a container with label `prio` at the cell encoded by `action`.

    The action is decoded with action_XYZ() and validated with move(). On
    success the cell of `env` is overwritten in place with `prio`.

    Returns
    -------
    tuple
        (placed, env): `placed` is True when the container was stored and
        False otherwise; `env` is the same object that was passed in.
    """
    X, Y, Z = action_XYZ(action)

    placed = bool(move(env, X, Y, Z))
    if placed:
        env[X][Y][Z] = prio
    return placed, env

# 2. Create Qtable and initialize

- Create the Qtable
- The action size and state size are calculated to determine how many rows (states) and columns (actions) the Q-table needs

In [8]:
# Read the number of discrete actions and states from the environment; these
# become the column and row counts of the Q-table.
# NOTE(review): `env` is bound to the NumPy array built in the environment cell,
# not to a gym environment, so `env.action_space` raises AttributeError (see the
# recorded output of this cell). This cell needs an object exposing
# `action_space.n` and `observation_space.n` -- TODO: wrap the container lot in
# a gym-style environment or derive both sizes from the lot directly.
action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

AttributeError: 'numpy.ndarray' object has no attribute 'action_space'

In [4]:
# Q-table: one row per state and one column per action, all estimates start at zero.
q_shape = (state_size, action_size)
qtable = np.zeros(q_shape)
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


# 3. Hyperparameters

- Specify the required hyperparameters

In [5]:
# --- Episode budget ---
total_episodes = 50_000      # training episodes
total_test_episodes = 100    # evaluation episodes
max_steps = 99               # step limit per episode

# --- Q-learning update ---
learning_rate = 0.7          # weight given to the new estimate in each update
gamma = 0.618                # discount applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0            # exploration probability at start
min_epsilon = 0.01           # floor the schedule decays towards
decay_rate = 0.01            # exponential decay rate per episode
epsilon = max_epsilon        # current exploration rate, starts at the maximum

# 4. Qlearning algorithm

In [6]:
# Tabular Q-learning with an epsilon-greedy policy: run `total_episodes`
# episodes of at most `max_steps` steps each, updating `qtable` after every step.
# NOTE(review): this loop calls `env.reset()`, `env.step()` and
# `env.action_space.sample()`, but `env` is bound to a NumPy array in the
# environment cell, which has none of these -- TODO: provide a gym-style
# environment before running this cell.
for episode in range(total_episodes):
    # Start a fresh episode and take its initial state
    state = env.reset()
    step = 0
    done = False
    
    for step in range(max_steps):
        # Choose an action for the current state
        ## Draw a uniform random number in [0, 1]
        exp_exp_tradeoff = random.uniform(0,1)
        
        ## If the draw is greater than epsilon --> exploitation (take the action with the largest Q-value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])
        
        # Otherwise sample a random action --> exploration
        else:
            action = env.action_space.sample()
        
        # Take the action and observe the resulting state and reward
        new_state, reward, done, info = env.step(action)
        
        # Move Q(s, a) towards reward + gamma * max_a' Q(s', a') by a fraction `learning_rate`
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        # The new state becomes the current state
        state = new_state
        
        # Stop the episode as soon as the environment reports it is done
        if done == True:
            break
    
    # This increment only affects the epsilon formula below (it sees episode + 1);
    # the `for` statement rebinds `episode` on the next iteration.
    episode += 1
    
    # Decay epsilon exponentially from max_epsilon towards min_epsilon: less exploration over time
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
        

# 5. Use Qtable to play

In [7]:
# Greedy evaluation: play `total_test_episodes` episodes, always taking the
# action with the highest Q-value, and record each episode's total reward.
# NOTE(review): `env` must be a gym-style environment (reset/step/render/close);
# the NumPy array bound to `env` in the environment cell does not provide these
# -- TODO: confirm which environment this cell is meant to run against.
env.reset()
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    print("****************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        env.render()
        #Take the action that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])

        new_state, reward, done, info = env.step(action)

        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode, including ones that hit max_steps without finishing.
    # Previously only finished episodes were appended, while the average printed
    # afterwards still divides by total_test_episodes.
    rewards.append(total_rewards)
env.close()

****************
EPISODE  0
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |[35mB[0m: |
+---------+

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1mR[0m:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R:[42m_[0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : 

In [8]:
# Total reward of the most recent test episode
print("Score", total_rewards)

# Sum of the recorded episode rewards divided by the number of test episodes
mean_score = sum(rewards)/total_test_episodes
print ("Score over time: " + str(mean_score))

Score 10
Score over time: 7.77
