In [4]:
%run RaceCar.py
import numpy as np
import pygame

In [5]:
def generate_map(w_cells, h_cells, min_lim, max_lim):
    """Build a random race track on a ``(w_cells, h_cells)`` grid.

    Cell codes written by this function:

    * ``-1`` -- wall / off-track cell
    * ``0``  -- free track cell
    * ``1``  -- start line (free cells of the last row)
    * ``2``  -- finish line (free cells of the last column)

    The quadrant ``[w_cells // 2:, h_cells // 2:]`` is blocked entirely.
    Four wall bands are then carved along the track edges; the thickness
    of each band performs a -1/0/+1 random walk clamped to
    ``[min_lim, max_lim]`` as the band advances cell by cell.

    Parameters
    ----------
    w_cells : int
        Size of the first axis of the returned array.
    h_cells : int
        Size of the second axis of the returned array.
    min_lim, max_lim : int
        Inclusive bounds for the band thickness. The initial thicknesses
        are drawn with ``np.random.randint(min_lim, max_lim)``, so
        ``min_lim < max_lim`` is required.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(w_cells, h_cells)``.
    """
    n_rows, n_cols = w_cells, h_cells
    row_mid, col_mid = n_rows // 2, n_cols // 2

    def _step(limit):
        # One random-walk step of a band thickness, clamped to the bounds.
        return max(min_lim, min(max_lim, limit + np.random.choice([-1, 0, 1])))

    race_map = np.zeros((n_rows, n_cols), dtype=np.int32)
    # Block the quadrant that is never part of the track.
    race_map[row_mid:, col_mid:] = -1
    lims = np.random.randint(min_lim, max_lim, size=4)

    # Bands that run along the first axis: iterate over the rows of the
    # array (not over h_cells), so non-square maps stay in bounds.
    for row in range(n_rows):
        lims[0] = _step(lims[0])
        race_map[row, :lims[0]] = -1
        if row > row_mid:
            lims[1] = _step(lims[1])
            race_map[row, col_mid - lims[1]:] = -1

    # Bands that run along the second axis: iterate over the columns.
    for col in range(n_cols):
        lims[2] = _step(lims[2])
        race_map[:lims[2], col] = -1
        if col > col_mid:
            lims[3] = _step(lims[3])
            race_map[row_mid - lims[3]:, col] = -1

    # Start line: every cell of the last row that is still free.
    race_map[-1, race_map[-1, :] == 0] = 1
    # Finish line: every cell of the last column that is still free.
    race_map[race_map[:, -1] == 0, -1] = 2
    return race_map

# Display size handed to RaceCar (presumably the pygame window size in
# pixels -- confirm against RaceCar.py).
width, height = 850, 850
# Grid dimensions of the track, passed to generate_map.
w_cells, h_cells = 30, 30
# Values passed to generate_map as its min_lim / max_lim arguments.
min_lim, max_lim = 1, 6

race_map = generate_map(w_cells, h_cells, min_lim, max_lim)


def func():
    """Show ``race_map`` in the pygame window until the window is closed.

    Every iteration redraws the map with ``draw_map(screen, race_map)``,
    drains the pygame event queue and refreshes the display. On the first
    ``pygame.QUIT`` event pygame is shut down and ``sys.exit()`` raises
    ``SystemExit``, which ends the loop.

    NOTE(review): ``draw_map`` and ``screen`` are not defined in this
    cell; presumably they come from ``%run RaceCar.py`` -- confirm.
    """
    # Imported locally so the exit path does not depend on ``sys`` having
    # been brought into the notebook namespace by another cell or by %run.
    import sys

    while True:
        draw_map(screen, race_map)
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()
        pygame.display.update()

In [6]:
# --- Problem size -----------------------------------------------------
# One state per grid cell (flat index over the 2-D map).
S = race_map.shape[0] * race_map.shape[1]
# Grid coordinates of the start-line (code 1) and finish-line (code 2) cells.
init_states = np.argwhere(race_map == 1)
terminal_states = np.argwhere(race_map == 2)
print(init_states)
# Number of actions (considering only "positive" movements).
actions_x = 6
actions_y = 6
A = actions_x * actions_y

# --- Learning set-up --------------------------------------------------
# Start from a random policy: one action index per state.
pi = np.random.randint(A, size=S)
# Discount factor.
gamma = 1.0
# Exploration degree (passed through to RaceCar).
epsilon = 0.1
# Number of episodes.
episodes = 100
# Initialization: visit counters and randomly initialised action values.
N = np.zeros((S, A), dtype=np.int32)
Q = np.random.normal(loc=10.0, size=(S, A))
# Action values of the finish-line cells are pinned to zero.
for T in terminal_states:
    Q[np.ravel_multi_index((T[0], T[1]), race_map.shape)] = 0.0

# --- Episodes ---------------------------------------------------------
for e in range(episodes):
    print("NEW") #! DEBUG
    # Pick one of the start-line cells uniformly at random.
    start_row = np.random.randint(init_states.shape[0])
    init_coords = init_states[start_row].copy()
    print(init_coords) #! DEBUG
    # Play one episode with the current policy.
    car = RaceCar(race_map, init_coords, pi, epsilon, width, height, actions_x, actions_y)
    car.run()
    states, actions, rewards = car.get_states(), car.get_actions(), car.get_rewards()
    # Walk the episode backwards, accumulating the return G and updating
    # every visited (state, action) pair with an incremental mean.
    G = 0.0
    for t, s_t in list(enumerate(states))[::-1]:
        a_t = actions[t]
        G = rewards[t] + gamma * G
        N[s_t, a_t] += 1
        step_size = 1.0 / N[s_t, a_t]
        Q[s_t, a_t] += step_size * (G - Q[s_t, a_t])
        # Make the policy greedy with respect to the updated values.
        pi[s_t] = np.argmax(Q[s_t])

[[29  4]
 [29  5]
 [29  6]
 [29  7]
 [29  8]
 [29  9]
 [29 10]
 [29 11]
 [29 12]
 [29 13]]
NEW
[29  6]
876
[29  6]
Action: 1, [0 1].
Action: 22, [3 4].
Action: 17, [2 5].
Action: 16, [2 4].
NEW
[29 13]
883
[29 13]
Action: 23, [3 5].
NEW
[29  6]
876
[29  6]
Action: 9, [1 3].
Action: 24, [4 0].
Action: 17, [2 5].
NEW
[29 13]
883
[29 13]
Action: 25, [4 1].
NEW
[29  5]
875
[29  5]
Action: 33, [5 3].
Action: 18, [3 0].
Action: 24, [4 0].
NEW
[29  7]
877
[29  7]
Action: 13, [2 1].
Action: 13, [2 1].
Action: 30, [5 0].
NEW
[29  4]
874
[29  4]
Action: 22, [3 4].
Action: 17, [2 5].
Action: 25, [4 1].
NEW
[29  7]
877
[29  7]
Action: 20, [3 2].
Action: 22, [3 4].
NEW
[29  4]
874
[29  4]
Action: 31, [5 1].
Action: 34, [5 4].
NEW
[29  8]
878
[29  8]
Action: 35, [5 5].
NEW
[29  4]
874
[29  4]
Action: 14, [2 2].
Action: 32, [5 2].
Action: 7, [1 1].
NEW
[29 13]
883
[29 13]
Action: 9, [1 3].
NEW
[29  7]
877
[29  7]
Action: 35, [5 5].
NEW
[29  9]
879
[29  9]
Action: 3, [0 3].
Action: 22, [3 4].
NEW
[29 