In [None]:
import numpy as np

In [None]:
# Cell codes stored in the grid: 0 = invalid, 1 = valid track, 2 = start, 3 = finish
track_1 = np.zeros((32, 17), dtype=np.int8)

# Drivable area, listed as (row band, column band) pairs from row 0 to row 31.
for _row_band, _col_band in (
    (slice(0, 3), slice(3, 9)),
    (slice(3, 10), slice(2, 9)),
    (slice(10, 18), slice(1, 9)),
    (slice(18, 25), slice(0, 9)),
    (slice(25, 26), slice(0, 10)),
    (slice(26, 28), slice(0, 17)),
    (slice(28, 29), slice(1, 17)),
    (slice(29, 31), slice(2, 17)),
    (slice(31, 32), slice(3, 17)),
):
    track_1[_row_band, _col_band] = 1
del _row_band, _col_band  # keep the notebook namespace free of loop temporaries

# Starting line: the drivable cells of row 0 (replaces the 1s written above).
track_1[:1, 3:9] = 2

# Finish line: last column of rows 26..31 (replaces the 1s written above).
track_1[26:, -1] = 3

# Current velocity as [v_horizontal, v_vertical]; the car starts at rest.
velocity = np.zeros(2, dtype=int)

# The nine velocity increments: every combination of -1 / 0 / +1 for the two
# components, in the same order as `velocity`.
actions = np.array([[d0, d1] for d0 in (-1, 0, 1) for d1 in (-1, 0, 1)])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

def visualize_track(track):
    """Draw a track grid as colored unit squares and show the figure.

    Each cell becomes a 1x1 rectangle whose lower-left corner sits at
    (column, row), so row 0 of the array is drawn at the bottom of the plot.

    Args:
        track: 2-D array of cell codes. Every value must be one of
            0 (invalid), 1 (valid track), 2 (start), 3 (finish); any other
            value raises KeyError on the color lookup.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    n_rows, n_cols = track.shape

    # Fill color for each cell code.
    palette = {
        0: 'lightgray',   # invalid
        1: 'white',       # valid track
        2: '#f08080',     # start (light red)
        3: "#4EB84E",     # finish (light green)
    }

    fig, ax = plt.subplots(figsize=(4, 6))

    # ndenumerate walks the grid row by row, column by column.
    for (r, c), code in np.ndenumerate(track):
        ax.add_patch(
            patches.Rectangle(
                (c, r), 1, 1,
                linewidth=0.5,
                edgecolor='gray',
                facecolor=palette[code],
            )
        )

    # Fit the view exactly to the grid and hide the axes decorations.
    ax.set_xlim(0, n_cols)
    ax.set_ylim(0, n_rows)
    ax.set_aspect('equal')
    ax.axis('off')

    plt.tight_layout()
    plt.show()

# Render track_1 as a visual check of the grid layout.
visualize_track(track_1)

In [None]:
def get_valid_actions(velocity, action_set=None, max_speed=4):
    """Return the indices of the actions that keep the car moving.

    Each action is added to ``velocity`` and the result is clipped
    component-wise to ``[0, max_speed]``. An action is excluded when the
    clipped velocity is (0, 0), i.e. when it would bring the car to a stop.

    Args:
        velocity: Current velocity as a length-2 sequence/array.
        action_set: Optional array-like of shape (n_actions, 2) holding the
            velocity increments. Defaults to the module-level ``actions``
            table, which preserves the original call ``get_valid_actions(v)``.
        max_speed: Upper bound applied to each velocity component
            (default 4, the value that used to be hard-coded).

    Returns:
        list[int]: Indices into the action table, in ascending order.
    """
    if action_set is None:
        action_set = actions  # fall back to the notebook's global action table

    increments = np.asarray(action_set).reshape(-1, 2)
    # Broadcast the current velocity against all actions at once.
    clipped = np.clip(np.asarray(velocity) + increments, 0, max_speed)
    # Keep an action when at least one clipped component is non-zero.
    return np.flatnonzero(clipped.any(axis=1)).tolist()

In [None]:
def bresenham_supercover(x0, y0, x1, y1):
    """List the grid cells touched when moving from (x0, y0) to (x1, y1).

    The walk is a Bresenham line along the major axis (x when |dx| >= |dy|,
    otherwise y). Whenever the line also steps along the minor axis, the two
    cells adjacent to that diagonal step are emitted first (major-axis
    neighbor, then minor-axis neighbor) before the diagonal cell itself.

    Args:
        x0, y0: Integer coordinates of the starting cell.
        x1, y1: Integer coordinates of the destination cell.

    Returns:
        list[tuple]: (x, y) cells in visiting order, beginning with the start
        cell and ending with the destination cell. When start and destination
        coincide, the list holds that single cell.
    """
    span_x = abs(x1 - x0)
    span_y = abs(y1 - y0)
    step_x = 1 if x1 > x0 else -1
    step_y = 1 if y1 > y0 else -1

    # Work in (major, minor) coordinates so one loop serves both orientations.
    y_is_major = span_y > span_x
    if y_is_major:
        major, minor = y0, x0
        major_step, minor_step = step_y, step_x
        major_span, minor_span = span_y, span_x
    else:
        major, minor = x0, y0
        major_step, minor_step = step_x, step_y
        major_span, minor_span = span_x, span_y

    def as_cell(along_major, along_minor):
        # Convert back to (x, y) ordering.
        if y_is_major:
            return (along_minor, along_major)
        return (along_major, along_minor)

    path = [as_cell(major, minor)]
    error = 2 * minor_span - major_span

    for _ in range(major_span):
        if error >= 0:
            # Diagonal step: record both neighboring cells before moving.
            path.append(as_cell(major + major_step, minor))
            path.append(as_cell(major, minor + minor_step))
            minor += minor_step
            error -= 2 * major_span
        major += major_step
        error += 2 * minor_span
        path.append(as_cell(major, minor))

    return path

In [None]:
def is_finish(cell, track):
    """Tell whether ``cell`` lies on the finish line of ``track``.

    Args:
        cell: (x, y) pair, where x is the column index and y the row index.
        track: 2-D grid of cell codes indexed as ``track[row, column]``.

    Returns:
        False when the cell is outside the grid; otherwise the result of
        comparing the cell's code with 3 (the finish code).
    """
    col, row = cell
    n_rows, n_cols = track.shape
    inside = 0 <= col < n_cols and 0 <= row < n_rows
    if not inside:
        return False
    return track[row, col] == 3

def is_invalid(cell, track):
    """Tell whether ``cell`` is off the grid or marked as invalid.

    Args:
        cell: (x, y) pair, where x is the column index and y the row index.
        track: 2-D grid of cell codes indexed as ``track[row, column]``.

    Returns:
        True when the cell is outside the grid; otherwise the result of
        comparing the cell's code with 0 (the invalid code).
    """
    col, row = cell
    n_rows, n_cols = track.shape
    on_grid = 0 <= col < n_cols and 0 <= row < n_rows
    if not on_grid:
        return True
    return track[row, col] == 0

def check_path(x0, y0, x1, y1, track):
    """Classify a move from (x0, y0) to (x1, y1) on ``track``.

    The cells returned by ``bresenham_supercover`` are examined in order,
    starting with the origin cell. The first cell that is a finish cell or
    an invalid cell decides the outcome; the finish test is applied first.

    Args:
        x0, y0: Column and row of the starting cell.
        x1, y1: Column and row of the intended destination.
        track: 2-D grid of cell codes.

    Returns:
        tuple: ("finish", cell) or ("crash", cell) for the deciding cell, or
        ("valid", (x1, y1)) when no cell on the path triggers either case.
    """
    for visited in bresenham_supercover(x0, y0, x1, y1):
        if is_finish(visited, track):
            return "finish", visited
        if is_invalid(visited, track):
            return "crash", visited
    return "valid", (x1, y1)

In [None]:
# Algorithm parameter: small ε > 0
# NOTE(review): presumably the ε of the ε-soft policy; it is not read by any of
# the visible cells, so confirm where (and with what magnitude) it is used.
epsilon = 1e-6

In [None]:
# Initialize

# NOTE: A state is a grid position plus a velocity.
# The grid has 32 x 17 positions, and each velocity component (horizontal and
# vertical) takes one of 5 values, i.e. 5 x 5 velocities.
# Total: 32 x 17 x 5 x 5 states.

# π: an arbitrary starting policy, stored as one action index per state and
# laid out as policy[row, col, vy, vx].
policy = np.zeros((32, 17, 5, 5), dtype=np.int8)

# For every velocity, draw one random action among those allowed for it and
# assign that same action to all grid positions.
for vy, vx in np.ndindex(5, 5):
    valid = get_valid_actions(np.array([vx, vy]))
    policy[:, :, vy, vx] = np.random.choice(valid)

In [None]:
# Q(s, a) ∈ R (arbitrarily), for all s ∈ S, a ∈ A(s)
# All estimates start at 0. The first four axes have the same sizes as `policy`
# (32 x 17 x 5 x 5); the last axis has one slot per entry of the 9-row action table.
# NOTE(review): presumably indexed as [row, col, vy, vx, action] — confirm against the update code.
action_value = np.zeros((32, 17, 5, 5, 9), dtype=np.float32)

In [None]:
# N(s, a) = 0, for all s ∈ S, a ∈ A(s)
# Visit counters, one per (state, action) pair, with the same shape as the
# action-value table. Stored as int64: an 8-bit counter tops out at 127, which a
# frequently visited pair would exceed, silently corrupting (or, on newer
# NumPy, raising on) any count-based update.
number_of_times_visited = np.zeros((32, 17, 5, 5, 9), dtype=np.int64)