# Importing Dependancies

In [1]:
import numpy as np
import random

# Save and Load Q-Table

In [None]:
# can load AI's Q-Table from CSV file as follows:
q_table = np.loadtxt("AI_5x5.csv", delimiter = ",")

In [None]:
# save AI's Q-Table to CSV file for easy use
np.savetxt("AI_5x5.csv", q_table, delimiter = ",")

# Creating the Field (States and Actions)

In [2]:
class Field():
    # Initialise variables
    def __init__(self, size, item_pickup, item_dropoff, start_pos):
        self.size = size # size of square grid
        self.item_pickup = item_pickup # position of item to be picked up
        self.item_dropoff = item_dropoff # position of dropoff point
        self.pos = start_pos # where player spawns
        
        self.item_in_car = False # whether player has the item
        
    def get_no_states(self):
        # There are two player conditions, has item and not has item
        dimensions = (self.size ** 6) * 2 # there are 5x5 places for player, 5x5 places for item, 5x5 for dropoff, so size^6
        return dimensions
    
    def get_state(self):
        # get which state we are in, each state is unique
        state = self.pos[0] * 2 * (self.size ** 5) # gets us index in the 5x5x5x5x5x2 = 6250 range
        state += self.pos[1] * 2 * (self.size ** 4) # gets us index in the 5x5x5x5x2 = 1250 range
        state += self.item_pickup[0] * 2 * (self.size ** 3) # gets us index in the 5x5x5x2 = 250 range
        state += self.item_pickup[1] * 2 * (self.size ** 2) # etc.
        state += self.item_pickup[0] * 2 * self.size
        state += self.item_pickup[1] * 2
        
        if self.item_in_car:
            state += 1
            
        return state
    
    # movement and actions
    def make_action(self, action):
        (x, y) = self.pos # current player pos
        
        if action == 0: # Go South
            if y == self.size - 1: # if at bottom of grid, cant go further and serious -ve reward
                return -10, False # this false means the game is still not over
            else: # everywhere else, can travel down and small -ve reward to discourage endless walking
                self.pos = (x, y + 1) # increase y pos, note that y-coords increase downwards
                return -1, False
        elif action == 1: # Go North
            if y == 0: # if at top
                return -10, False
            else:
                self.pos = (x, y - 1)
                return -1, False
        elif action == 2: # Go West
            if x == 0: #if at far left
                return -10, False
            else:
                self.pos = (x - 1, y)
                return -1, False
        elif action == 3: # Go East
            if x == self.size - 1: # if at far right
                return -10, False
            else:
                self.pos = (x + 1, y)
                return -1, False
        elif action == 4: # pickup
            if self.item_in_car or self.item_pickup != (x, y): # if player has item already or not at item pos
                return -10, False
            else: # if has item
                self.item_in_car = True # now has item
                return 20, False # +ve reward
        elif action == 5: # dropoff
            if not self.item_in_car: # if does not have item
                return -10, False
            elif self.item_dropoff != (x, y): # if not at dropoff, drop item anyway but punish
                self.item_pickup = (x, y)
                self.item_in_car = False
                return -10, False
            else: # dropped item off at end, wins game
                return 20, True # return True since game is over

# Testing Field

In [3]:
# show most optimal solution maually
field = Field(10, (0, 0), (9, 9), (9, 0)) # 10x10 grid, item at topleft, dropoff at bottomright, player spawn at topright

field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)
field.make_action(2)

print("Player picks up: ", field.make_action(4))

field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)
field.make_action(0)
field.make_action(3)

print("Player now at: ", field.pos)

print("Player drops off: ", field.make_action(5))

Player picks up:  (20, False)
Player now at:  (9, 9)
Player drops off:  (20, True)


# Naive Solution (Random Walk)

In [4]:
# solution by random walk
def naive_solution():
    # field initialise
    size = 5
    item_start = (4, 3)
    item_dropoff = (2, 2)
    start_pos = (1, 3)
    
    field = Field(size, item_start, item_dropoff, start_pos)
    
    # loop parameters
    done = False
    steps = 0
    
    action_list = []
    
    while not done:
        action = random.randint(0, 5) # take a random action
        action_list.append(action)
        
        reward, done = field.make_action(action) # done will be set to True if game finishes
        steps += 1
        
    return steps, action_list # when finished, return total amount of steps taken

print(naive_solution()) # show how many steps one random solution takes

many_runs = [naive_solution()[0] for i in range(100)] # amount of steps in a list for 100 runs
average_steps = sum(many_runs) / len(many_runs) # finds average amount of steps over the 100 runs
print("Average amount of steps taken to complete by random walking: ", average_steps)

(17357, [2, 1, 5, 0, 3, 2, 0, 1, 0, 2, 1, 3, 0, 2, 3, 3, 3, 4, 4, 0, 5, 4, 2, 2, 5, 1, 3, 3, 4, 1, 3, 4, 1, 3, 2, 0, 5, 5, 3, 0, 5, 4, 1, 3, 1, 3, 1, 5, 2, 1, 3, 1, 1, 2, 5, 4, 0, 1, 0, 2, 3, 0, 4, 2, 2, 4, 3, 1, 0, 3, 3, 5, 1, 0, 1, 3, 4, 0, 2, 2, 1, 0, 2, 3, 3, 2, 3, 3, 0, 2, 4, 2, 3, 4, 3, 4, 1, 4, 4, 1, 4, 2, 1, 4, 3, 1, 0, 2, 4, 3, 0, 0, 5, 1, 2, 0, 3, 2, 2, 0, 0, 5, 5, 2, 5, 1, 0, 5, 2, 1, 0, 5, 2, 2, 2, 5, 1, 3, 3, 1, 3, 2, 3, 3, 0, 4, 4, 3, 1, 4, 3, 1, 4, 1, 3, 2, 2, 3, 4, 3, 4, 5, 3, 5, 1, 2, 1, 3, 3, 0, 4, 0, 4, 3, 0, 0, 4, 5, 3, 0, 0, 4, 2, 4, 0, 5, 3, 0, 0, 0, 3, 1, 4, 3, 1, 2, 3, 3, 2, 3, 1, 1, 3, 1, 2, 1, 2, 2, 5, 0, 3, 2, 5, 2, 2, 3, 3, 1, 2, 5, 3, 2, 3, 0, 2, 4, 2, 1, 1, 0, 2, 2, 5, 1, 3, 0, 3, 5, 5, 2, 5, 1, 4, 5, 2, 0, 0, 3, 1, 1, 4, 2, 0, 3, 0, 1, 4, 3, 5, 5, 2, 5, 2, 4, 1, 0, 1, 3, 2, 0, 0, 1, 1, 5, 3, 2, 3, 3, 2, 0, 3, 0, 4, 4, 1, 1, 5, 3, 2, 5, 4, 0, 1, 2, 2, 0, 0, 1, 4, 2, 1, 5, 1, 5, 1, 4, 3, 0, 2, 2, 4, 2, 1, 5, 3, 0, 0, 4, 4, 4, 2, 3, 4, 3, 4, 0, 0, 0, 0, 3, 1

Average amount of steps taken to complete by random walking:  5930.45


# Training

In [5]:
# field initialise
size = 5
item_start = (0, 0)
item_dropoff = (4, 4)
start_pos = (4, 0)

field = Field(size, item_start, item_dropoff, start_pos)

# states and actions, initialise q-table
no_states = field.get_no_states()
no_actions = 6
# Q-table is states x actions (rows x columns)
q_table = np.zeros((no_states, no_actions))

# initialise training variables
epsilon = 0.1
alpha = 0.1
gamma = 0.6

# Train the AI over 10000 steps
for i in range(10000):
    item_start = (random.randint(0, 4), random.randint(0, 4))
    item_dropoff = (random.randint(0, 4), random.randint(0, 4))
    start_pos = (random.randint(0, 4), random.randint(0, 4))
    field = Field(size, item_start, item_dropoff, start_pos) # new field each time
    done = False
    
    while not done: # evaluate each run
        state = field.get_state() # what state we are in
        # whether to explore or exploit with probability
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 5) # explore
        else:
            action = np.argmax(q_table[state]) # exploit
            
        reward, done = field.make_action(action) # evaluate run
        
        new_state = field.get_state() # get current state
        new_state_max = np.max(q_table[new_state]) # get current action
        
        # Q-Learning algorithm
        q_table[state, action] = q_table[state, action] + alpha * (reward + gamma * new_state_max - q_table[state, action])

AI is now trained for most scenarios, it might not work if epsilon is zero so to fix, start with epsilon != 0 so that it can give the course a try, then let epsilon = 0 and see what it thinks the optimal path is. It should be good over most scenarios though.

# Testing the AI

In [8]:
def reinforcement_learning():
    # initialise variables
    epsilon = 0.1 # if epsilon is zero, we want it to use its most optimal solution that its found (exploit)
    alpha = 0.1
    gamma = 0.6

    # initialise field
    size = 5
    item_start = (4, 3)
    item_dropoff = (2, 2)
    start_pos = (1, 3)
    
    field = Field(size, item_start, item_dropoff, start_pos)
    done = False
    steps = 0
    
    action_list = []

    # run AI through field
    while not done:
        state = field.get_state()
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 5)
        else:
            action = np.argmax(q_table[state])

        reward, done = field.make_action(action)
        action_list.append(action)

        new_state = field.get_state()
        new_state_max = np.max(q_table[new_state])

        q_table[state, action] = q_table[state, action] + alpha * (reward + gamma * new_state_max - q_table[state, action])
        
        steps += 1
        
    return steps, action_list # amount of steps until completion

In [18]:
print(reinforcement_learning()) # show how many steps one AI run takes

many_runs = [reinforcement_learning()[0] for i in range(1000)] # amount of steps in a list for 1000 runs
average_steps = sum(many_runs) / len(many_runs) # finds average amount of steps over the 1000 runs
print("Average amount of steps taken to complete by AI: ", average_steps)

(10, [3, 3, 3, 4, 1, 0, 1, 2, 2, 5])
Average amount of steps taken to complete by AI:  13.886
