## Setting up the game field and rules

In [1]:
class Field:
    """Square grid world with one item to pick up and deliver.

    Positions are ``(x, y)`` tuples. Moving "down" increases ``y``, moving
    "right" increases ``x``; valid coordinates are ``0 .. size - 1``.

    Action codes understood by :meth:`make_action`:
    0 = down, 1 = up, 2 = left, 3 = right, 4 = pick-up, 5 = drop-off.
    """

    def __init__(self, size, item_pickup, item_dropoff, start_position):
        self.size = size  # side length of the board (the field is size x size)
        self.item_pickup = item_pickup  # (x, y) cell where the item currently lies
        self.item_dropoff = item_dropoff  # (x, y) cell where the item must be delivered
        self.position = start_position  # current (x, y) of the agent
        self.item_in_car = False  # True while the agent is carrying the item

    def get_number_of_states(self):
        """Return the number of distinct values :meth:`get_state` can produce.

        The state combines the agent position (size * size cells), the item
        location (size * size cells) and the carry flag (2 values). The
        drop-off cell is fixed for the lifetime of the field and therefore is
        not part of the state.
        """
        return 2 * self.size ** 4

    def get_state(self):
        """Encode the current situation as a single Q-table row index.

        The index is a mixed-radix number built from, most significant first:
        agent x, agent y, item x, item y (each base ``size``) and finally the
        carry flag (base 2), so every combination maps to a unique integer in
        ``0 .. get_number_of_states() - 1``.
        """
        x, y = self.position
        item_x, item_y = self.item_pickup
        n = self.size

        state = ((x * n + y) * n + item_x) * n + item_y
        return state * 2 + (1 if self.item_in_car else 0)

    def make_action(self, action):
        """Apply ``action`` and return ``(reward, done)`` according to the rules.

        Rewards:
            * -1  for a legal move,
            * -10 for bumping into a wall, an impossible pick-up, or a
              drop-off that is impossible or on the wrong cell,
            * +20 for a successful pick-up or a successful delivery.

        ``done`` is True only after the item is dropped on ``item_dropoff``.

        Raises:
            ValueError: if ``action`` is not one of the codes 0..5. (Falling
                through silently would return ``None`` and break callers that
                unpack ``reward, done``.)
        """
        (x, y) = self.position

        if action == 0:  # down
            if y == self.size - 1:
                return -10, False
            self.position = (x, y + 1)
            return -1, False

        if action == 1:  # up
            if y == 0:
                return -10, False
            self.position = (x, y - 1)
            return -1, False

        if action == 2:  # left
            if x == 0:
                return -10, False
            self.position = (x - 1, y)
            return -1, False

        if action == 3:  # right
            if x == self.size - 1:
                return -10, False
            self.position = (x + 1, y)
            return -1, False

        if action == 4:  # pick-up
            # Only legal when the car is empty and standing on the item.
            if self.item_in_car or self.item_pickup != (x, y):
                return -10, False
            self.item_in_car = True
            return 20, False

        if action == 5:  # drop-off
            if not self.item_in_car:
                return -10, False
            if self.item_dropoff != (x, y):
                # Dropping on the wrong cell leaves the item there: it has to
                # be picked up again from this new location.
                self.item_pickup = (x, y)
                self.item_in_car = False
                return -10, False
            self.item_in_car = False
            return 20, True

        raise ValueError(f"unknown action {action!r}; expected an integer in 0..5")

## Play a game manually

In [2]:
# Board configuration for the manual walkthrough: a 10x10 field with the item
# in one corner, the drop-off in the opposite corner and the agent at (9, 0).
size = 10
item_pickup, item_dropoff = (0, 0), (9, 9)
start_position = (9, 0)

field = Field(size, item_pickup, item_dropoff, start_position)

In [3]:
# Inspect the agent's current (x, y) position.
field.position

(9, 0)

In [4]:
# Action 2 = left. Take nine steps in total; the last call is kept as the
# cell's final expression so that its (reward, done) result is displayed.
for _step in range(8):
    field.make_action(2)
field.make_action(2)

(-1, False)

In [5]:
# Action 4 = pick-up: rewarded with 20 when the car is empty and stands on the item's cell.
field.make_action(4)

(20, False)

In [6]:
# Action 0 = down. Take nine steps in total; the last call is kept as the
# cell's final expression so that its (reward, done) result is displayed.
for _step in range(8):
    field.make_action(0)
field.make_action(0)

(-1, False)

In [7]:
# Action 3 = right. Take nine steps in total; the last call is kept as the
# cell's final expression so that its (reward, done) result is displayed.
for _step in range(8):
    field.make_action(3)
field.make_action(3)

(-1, False)

In [8]:
# Action 5 = drop-off: returns (20, True) when the carried item is released on the drop-off cell.
field.make_action(5)

(20, True)

## Play a game automatically

### Random plan with Naive Random Solution

In [9]:
import random

In [10]:
# 1 episode
def random_solution(size=10, item_pickup=(0, 0), item_dropoff=(9, 9), start_position=(9, 0)):
    """Play one episode with uniformly random actions and count the steps.

    A fresh ``Field`` is created for the episode, then actions 0..5 are drawn
    at random until the item has been delivered.

    Args:
        size: side length of the square board.
        item_pickup: ``(x, y)`` cell where the item starts.
        item_dropoff: ``(x, y)`` cell where the item must be delivered.
        start_position: ``(x, y)`` cell where the agent starts.

    Returns:
        The number of actions taken until the episode finished.
    """
    field = Field(size, item_pickup, item_dropoff, start_position)

    steps = 0
    done = False
    while not done:
        # The reward is irrelevant for this baseline; only termination matters.
        _, done = field.make_action(random.randint(0, 5))
        steps += 1

    return steps


In [11]:
# Number of steps (actions taken) until one random episode finished
random_solution()

31585

- Average over 100 runs

In [12]:
# Step counts of 100 independent random episodes.
run = [random_solution() for _ in range(100)]

In [13]:
# Mean number of steps per random episode.
sum(run)/len(run)

142920.57

### Q-Learning algorithm plan

In [14]:
import numpy as np

In [29]:
import numpy as np
#                             alpha    ,       gamma         ,          epsilon
def q_learning(field, learning_rate=0.1, discount_factor=0.9, exploration_rate=0.1, max_steps=1000, q_table=None):
    """Run one epsilon-greedy Q-learning episode on ``field``.

    Args:
        field: environment exposing ``get_number_of_states()``, ``get_state()``
            and ``make_action(action) -> (reward, done)``. It is mutated by
            the episode.
        learning_rate: step size (alpha) of the Q-value update.
        discount_factor: discount (gamma) applied to the next state's value.
        exploration_rate: probability (epsilon) of taking a random action.
        max_steps: maximum number of actions before the episode is abandoned.
        q_table: optional array of shape ``(num_states, 6)`` that is updated
            in place. Pass the same array to successive calls to keep what
            was learned in earlier episodes; when omitted, a fresh all-zero
            table is used and discarded at the end of the call.

    Returns:
        The number of steps needed to finish the episode, or ``max_steps`` if
        the goal was not reached within that many steps.
    """
    num_actions = 6  # down, up, left, right, pick-up, drop-off

    if q_table is None:
        q_table = np.zeros((field.get_number_of_states(), num_actions))

    def select_action(state):
        if np.random.rand() < exploration_rate:
            return np.random.choice(num_actions)  # Explore: choose a random action
        return np.argmax(q_table[state, :])  # Exploit: choose the action with the highest Q-value

    def update_q_table(state, action, reward, next_state, done):
        # A terminal transition has no future: bootstrapping from the state
        # observed after the episode ended would leak unrelated values into
        # the target.
        future_value = 0.0 if done else np.max(q_table[next_state, :])
        td_target = reward + discount_factor * future_value
        q_table[state, action] += learning_rate * (td_target - q_table[state, action])

    state = field.get_state()

    for step in range(1, max_steps + 1):
        action = select_action(state)
        reward, done = field.make_action(action)

        next_state = field.get_state()
        update_q_table(state, action, reward, next_state, done)

        if done:
            return step

        state = next_state

    return max_steps  # Return max_steps if the goal is not reached within the specified maximum steps

# Example usage: a 5x5 board with the agent starting in the top-left corner.
size = 5
item_pickup, item_dropoff = (1, 1), (4, 4)
start_position = (0, 0)

field = Field(size, item_pickup, item_dropoff, start_position)

# Run Q-learning for a specific number of episodes and collect the step
# count of each one. Called like this, q_learning starts every call from an
# all-zero Q-table.
num_episodes = 1
episode_steps = [q_learning(field) for _ in range(num_episodes)]

total_steps = sum(episode_steps)
average_steps = total_steps / num_episodes
print(f"Average Steps to Goal over {num_episodes} episodes: \nTotal step {total_steps} \nAverage  step {average_steps}")


Average Steps to Goal over 1 episodes: 
Total step 1000 
Average  step 1000.0


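Compared with the random solution, Q-learning needs far fewer steps even when the number of episodes is only 1. Note, however, that each run is capped at `max_steps = 1000`, so a result of exactly 1000 means the goal was not reached within the cap, and that this example uses a 5×5 board whereas the random baseline above used a 10×10 board.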

In [30]:
num_episodes = 10
total_steps = 0

for episode in range(num_episodes):
    # Build a fresh field for every episode. Reusing one Field object would
    # start each episode wherever the previous one (or an earlier cell) left
    # the agent and the item, so the step counts would not be comparable.
    field = Field(size, item_pickup, item_dropoff, start_position)
    steps = q_learning(field)
    total_steps += steps

average_steps = total_steps / num_episodes
print(f"Average Steps to Goal over {num_episodes} episodes: \nTotal step {total_steps} \nAverage  step {average_steps}")

Average Steps to Goal over 10 episodes: 
Total step 8127 
Average  step 812.7
