<a href="https://colab.research.google.com/github/sevendaystoglory/temp/blob/main/q-learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import numpy as np
import random

# Landmark locations as lists
def get_rgby():
    """Return the four landmark coordinates in R, G, B, Y order.

    Every landmark is a two-element list. New list objects are created on
    each call, so mutating the result does not affect later calls.
    """
    return [
        [0, 0],  # R
        [0, 4],  # G
        [4, 3],  # B
        [4, 0],  # Y
    ]

# Landmark coordinates in list form, in the same R, G, B, Y order that
# get_rgby() returns. A landmark's position in this list is the index used
# for it on the passenger/destination axes of the Q-table.
LANDMARKS = [
    [0, 0],  # R
    [0, 4],  # G
    [4, 3],  # B
    [4, 0],  # Y
]

def generate_state():
    """Sample a random initial state ``[taxi, passenger, destination]``.

    The taxi coordinate is drawn uniformly from 0..4 on each axis. The
    passenger starts on a randomly chosen landmark, and the destination is
    a randomly chosen landmark different from the passenger's.
    """
    taxi_pos = [random.randint(0, 4), random.randint(0, 4)]
    spots = get_rgby()
    pickup = random.choice(spots)
    # The destination must differ from the pickup landmark.
    candidates = [spot for spot in spots if spot != pickup]
    dropoff = random.choice(candidates)
    return [taxi_pos, pickup, dropoff]

# Q-table creation
def Q_table():
    """Create a zero-filled Q-table.

    Axes, in order: action, first taxi coordinate, second taxi coordinate,
    passenger index, destination index.
    """
    n_actions = 6
    grid_size = 5
    n_passenger_slots = 5  # landmark indices 0-3 plus 4 for 'in_taxi'
    n_destinations = 4
    shape = (n_actions, grid_size, grid_size, n_passenger_slots, n_destinations)
    return np.zeros(shape)

def coord_to_landmark_index(coord):
    """Return the position of ``coord`` within ``LANDMARKS``.

    The lookup uses ``list.index``, which raises ``ValueError`` when
    ``coord`` is not one of the landmark coordinates.
    """
    landmark_idx = LANDMARKS.index(coord)
    return landmark_idx

def passenger_to_index(passenger):
    """Map the passenger component of a state to its Q-table index.

    The marker ``'in_taxi'`` maps to 4. Any other value is treated as a
    landmark coordinate and resolved with ``coord_to_landmark_index``.
    """
    riding = passenger == 'in_taxi'
    return 4 if riding else coord_to_landmark_index(passenger)

def destination_to_index(destination):
    """Map a destination coordinate to its landmark index.

    Delegates to ``coord_to_landmark_index``; the result is used on the
    destination axis of the Q-table.
    """
    dest_idx = coord_to_landmark_index(destination)
    return dest_idx

# Q-learning update
def q_update(Q, state, action_str, next_state, reward, alpha=0.1, gamma=0.99):
    """Apply one tabular Q-learning update to ``Q`` in place.

    Args:
        Q: array indexed as ``Q[action, x_t, y_t, p_idx, d_idx]``.
        state: ``[taxi, passenger, destination]`` before the action.
        action_str: one of ``'S'``, ``'N'``, ``'E'``, ``'W'``, ``'P'``, ``'D'``.
        next_state: state reached after the action, same layout as ``state``.
            Its passenger entry may be ``'delivered'`` (terminal state).
        reward: reward received for the transition.
        alpha: learning rate.
        gamma: discount factor.

    Raises:
        KeyError: if ``action_str`` is not one of the six known actions.
    """
    action_map = {'S': 0, 'N': 1, 'E': 2, 'W': 3, 'P': 4, 'D': 5}
    a = action_map[action_str]

    x_t, y_t = state[0]
    p_idx = passenger_to_index(state[1])
    d_idx = destination_to_index(state[2])

    if next_state[1] == 'delivered':
        # Terminal state: it has no Q-table entry (looking it up would raise
        # ValueError) and no future return, so the bootstrap value is zero.
        max_next = 0.0
    else:
        x_next, y_next = next_state[0]
        p_next_idx = passenger_to_index(next_state[1])
        d_next_idx = destination_to_index(next_state[2])
        max_next = np.max(Q[:, x_next, y_next, p_next_idx, d_next_idx])

    td_target = reward + gamma * max_next
    td_error = td_target - Q[a, x_t, y_t, p_idx, d_idx]
    Q[a, x_t, y_t, p_idx, d_idx] += alpha * td_error

# Transition function
def transition(s, a):
    """Apply action ``a`` to state ``s`` and return ``(new_state, reward)``.

    ``s`` is ``[taxi, passenger, destination]``. The taxi position is copied,
    so the caller's state is not modified.

    Rewards:
        * -1 for any move (moves are clamped to the 0..4 grid), for a
          successful pickup, and for an unrecognised action.
        * -10 for a pickup or dropoff attempted in the wrong situation.
        * 20 for a successful dropoff, which sets the passenger entry to
          ``'delivered'``.
    """
    position = s[0][:]
    rider = s[1]
    goal = s[2]

    # (action, axis index, clamped step) for the four movement actions.
    moves = (
        ('N', 0, lambda v: max(v - 1, 0)),
        ('S', 0, lambda v: min(v + 1, 4)),
        ('E', 1, lambda v: min(v + 1, 4)),
        ('W', 1, lambda v: max(v - 1, 0)),
    )
    for name, axis, step in moves:
        if a == name:
            position[axis] = step(position[axis])
            return [position, rider, goal], -1

    if a == 'P':
        # Pickup succeeds only when the passenger is waiting on the taxi's cell.
        if rider != 'in_taxi' and position == rider:
            return [position, 'in_taxi', goal], -1
        return [position, rider, goal], -10

    if a == 'D':
        # Dropoff succeeds only with the passenger aboard at the destination.
        if rider == 'in_taxi' and position == goal:
            return [position, 'delivered', goal], 20
        return [position, rider, goal], -10

    # Any other action leaves the state unchanged at the default step cost.
    return [position, rider, goal], -1


In [44]:
# Train the Q-table with ε-greedy tabular Q-learning.
Q = Q_table()  # initialize Q table
epsilon = 0.1  # probability of taking a uniformly random action
alpha = 0.1  # learning rate for the terminal update (same as q_update's default)
actions = ['S', 'N', 'E', 'W', 'P', 'D']  # order matches q_update's action_map
for episode in range(30000):  # 30,000 training episodes
    state = generate_state()  # random taxi, passenger and destination
    while True:
        x_t, y_t = state[0]
        p_idx = passenger_to_index(state[1])
        d_idx = destination_to_index(state[2])
        # ε-greedy action selection
        if np.random.uniform(0, 1) < epsilon:
            a_idx = np.random.randint(6)
        else:
            a_idx = np.argmax(Q[:, x_t, y_t, p_idx, d_idx])
        a = actions[a_idx]
        next_state, reward = transition(state, a)
        if next_state[1] == 'delivered':
            # Terminal transition: there is no successor state to bootstrap
            # from, so the TD target is just the reward. Without this update
            # the +20 delivery reward would never enter the Q-table.
            Q[a_idx, x_t, y_t, p_idx, d_idx] += alpha * (
                reward - Q[a_idx, x_t, y_t, p_idx, d_idx]
            )
            break
        q_update(Q, state, a, next_state, reward)
        state = next_state


In [45]:
def run_episode(Q, max_steps=200):
    """Run one greedy episode from a random start and print every step.

    At each step the action with the highest Q-value for the current state
    is taken (no exploration). The episode ends when the passenger is
    delivered or after ``max_steps`` actions, whichever comes first.

    Args:
        Q: Q-table indexed as ``Q[action, x_t, y_t, p_idx, d_idx]``.
        max_steps: upper bound on the number of actions. The greedy policy
            is deterministic, so with an under-trained table it can repeat
            the same non-progressing action forever; the cap guarantees
            termination.

    Returns:
        None. All results are printed.
    """
    state = generate_state()
    steps = 0
    total_reward = 0
    print(f"\n🚕 Starting episode")
    print(f"Initial Taxi Pos: {state[0]}, Passenger: {state[1]}, Destination: {state[2]}\n")

    while steps < max_steps:
        x_t, y_t = state[0]
        p_idx = passenger_to_index(state[1])
        d_idx = destination_to_index(state[2])

        # Greedy choice over the six actions for this state.
        a_idx = np.argmax(Q[:, x_t, y_t, p_idx, d_idx])
        action = ['S', 'N', 'E', 'W', 'P', 'D'][a_idx]

        next_state, reward = transition(state, action)

        steps += 1
        total_reward += reward

        print(f"Step {steps:2}: Taxi at {state[0]}, Action: {action}, Reward: {reward}")

        if next_state[1] == 'delivered':
            print(f"\n🎉 Episode complete in {steps} steps! Total reward: {total_reward}\n")
            return

        state = next_state

    # Reached only when the step budget ran out before a delivery.
    print(f"\n⚠️ Stopped after {steps} steps without delivering the passenger. "
          f"Total reward: {total_reward}\n")


In [54]:
# Roll out one greedy episode using the Q-table trained in the cell above;
# run_episode prints the start state, every step and the total reward.
run_episode(Q)


🚕 Starting episode
Initial Taxi Pos: [3, 2], Passenger: [4, 3], Destination: [0, 4]

Step  1: Taxi at [3, 2], Action: E, Reward: -1
Step  2: Taxi at [3, 3], Action: S, Reward: -1
Step  3: Taxi at [4, 3], Action: P, Reward: -1
Step  4: Taxi at [4, 3], Action: N, Reward: -1
Step  5: Taxi at [3, 3], Action: N, Reward: -1
Step  6: Taxi at [2, 3], Action: N, Reward: -1
Step  7: Taxi at [1, 3], Action: N, Reward: -1
Step  8: Taxi at [0, 3], Action: E, Reward: -1
Step  9: Taxi at [0, 4], Action: D, Reward: 20

🎉 Episode complete in 9 steps! Total reward: 12

