# Monte Carlo Race Car
**Authors: Alessandro Tenaglia, Roberto Masocco, Giacomo Solfizi**

**Date: May 24, 2021**

In [None]:
%run RaceCar.py
import numpy as np
import pygame

## Environment Creation
Some notes:
- The map is randomly generated inside a window of fixed size (and actually quite large).
- A reward of 0 is given when the finish line is reached.
- A reward of around -900 is given when an obstacle is hit.
- A reward of -1 is given for each regular step made.

In [None]:
def generate_map(w_cells, h_cells, min_lim, max_lim):
    """Generate a random L-shaped race track as a grid of integer flags.

    The grid is indexed as ``race_map[row, column]`` and has shape
    ``(h_cells, w_cells)``: rows span the height, columns span the width.
    Cell flags are:

    * ``-1``: boundary or obstacle,
    * ``0``: free track,
    * ``1``: start line (free cells of the last row),
    * ``2``: finish line (free cells of the last column).

    The lower-right quadrant is always blocked. The four borders of the track
    (left, inner-right, top, inner-bottom) have a thickness that performs a
    random walk with steps in ``{-1, 0, 1}``, clamped to
    ``[min_lim, max_lim]``.

    Args:
        w_cells: Number of columns of the grid.
        h_cells: Number of rows of the grid.
        min_lim: Minimum border thickness, in cells.
        max_lim: Maximum border thickness, in cells.

    Returns:
        A ``numpy.int32`` array of shape ``(h_cells, w_cells)`` holding the
        flags described above.
    """

    def _drift(thickness):
        # One step of the bounded random walk followed by a border thickness.
        step = np.random.choice([-1, 0, 1])
        return min(max_lim, max(min_lim, thickness + step))

    # Rows are indexed by h and columns by w everywhere below, so the array
    # must be (h_cells, w_cells); allocating it the other way round made every
    # non-square map fail with an IndexError.
    race_map = np.zeros((h_cells, w_cells), dtype=np.int32)
    half_h = h_cells // 2
    half_w = w_cells // 2
    # Set boundaries and obstacles: -1.
    race_map[half_h:, half_w:] = -1
    left, right, top, bottom = np.random.randint(min_lim, max_lim, size=4)
    for h in range(h_cells):
        # Left border, present on every row.
        left = _drift(left)
        race_map[h, :left] = -1
        if h > half_h:
            # Inner border of the vertical corridor, lower half only.
            right = _drift(right)
            # Clamp at 0 so a thick border on a tiny map cannot turn into a
            # negative (wrap-around) slice start.
            race_map[h, max(0, half_w - right):] = -1
    for w in range(w_cells):
        # Top border, present on every column.
        top = _drift(top)
        race_map[:top, w] = -1
        if w > half_w:
            # Inner border of the horizontal corridor, right half only.
            bottom = _drift(bottom)
            race_map[max(0, half_h - bottom):, w] = -1
    # Set start line: 1 (free cells of the last row).
    race_map[-1, race_map[-1, :] == 0] = 1
    # Set finish line: 2 (free cells of the last column).
    race_map[race_map[:, -1] == 0, -1] = 2
    return race_map

# Map configuration: window size (presumably in pixels -- confirm against the
# renderer), grid size in cells, and the bounds of the track border thickness.
width, height = 850, 850
w_cells, h_cells = 30, 30
min_lim, max_lim = 1, 6
# Build a random map, then pickle it to disk.
race_map = generate_map(
    w_cells,
    h_cells,
    min_lim,
    max_lim,
)
race_map.dump("RaceMap.dat")

## Monte Carlo algorithm execution
Some notes:
- Actions are (X, Y) movements, namely "horizontal" and "vertical", inside the map, named "velocities".
- Along Y, the car can move from 0 up to 5 tiles upwards.
- Along X, the car can move from 0 to 5 tiles to the right.
- The null action, i.e. (0, 0), is allowed, but the optimal estimated policy should never take it.
- Maps are usually large, so no fewer than 500000 episodes should be run. In any case, if the finish line is never reached, the results are meaningless, since the estimated policy will be suicidal.

In [None]:
# Count the states (one per map cell), then identify the initial states
# (flag 1) and the terminal states (flag 2).
grid_shape = race_map.shape
S = race_map.size
init_states = np.argwhere(race_map == 1)
terminal_states = np.argwhere(race_map == 2)
wins = 0
print("Starting line has {} state(s).".format(len(init_states)))

# Action space: every combination of an X and a Y velocity.
actions_x, actions_y = 6, 6
A = actions_x * actions_y

# Start from a random policy: one action index per state.
pi = np.random.randint(A, size=S)

# Hyper-parameters: discount factor, exploration degree, number of episodes.
gamma = 1.0
epsilon = 0.1
episodes = 500000

# Initialize the MC data: visit counters N and a random estimate of q.
N = np.zeros((S, A), dtype=np.int32)
Q = np.random.normal(loc=10.0, size=(S, A))
for T in terminal_states:
    # Terminal states have null value for every action.
    terminal_index = np.ravel_multi_index(tuple(T), grid_shape)
    Q[terminal_index, :] = 0.0

# Loop on episodes.
for e in range(episodes):
    if (e % 10000) == 0:
        # Progress report: print the episode index every 10000 episodes.
        print(e)
    # Random start: pick one cell of the starting line uniformly. The copy
    # keeps init_states untouched in case the car modifies its coordinates.
    init_coords = np.copy(init_states[np.random.randint(init_states.shape[0])])
    # Run a game with the current policy.
    car = RaceCar(race_map, init_coords, pi, epsilon, actions_x, actions_y)
    if car.run() == True:
        wins += 1
    # Perform the MC update on the trajectory recorded by the car.
    states = car.get_states()
    actions = car.get_actions()
    rewards = car.get_rewards()
    # Walk the episode backwards so the return G can be accumulated
    # incrementally: G_t = r_t + gamma * G_{t+1}.
    G = 0.0
    for t, s_t in reversed(list(enumerate(states))):
        a_t = actions[t]
        G = rewards[t] + gamma * G
        # Every-visit update: running average of the returns observed for
        # the pair (s_t, a_t).
        N[s_t, a_t] += 1
        Q[s_t, a_t] += (1.0 / N[s_t, a_t]) * (G - Q[s_t, a_t])
        # Policy improvement: act greedily w.r.t. the updated estimate.
        pi[s_t] = np.argmax(Q[s_t, :])
if wins == 0:
    print("ERROR: Never got to the finish line!")
    # A bare `raise` here has no active exception and would only surface as
    # "No active exception to re-raise"; raise the same exception type with a
    # meaningful message instead.
    raise RuntimeError("Never got to the finish line: the estimated policy is meaningless.")
print("Visited {} (state, action) pairs.".format(np.count_nonzero(N)))
print("Reached the finish line {} time(s).".format(wins))

# Save results.
N.dump("N.dat")
Q.dump("Q.dat")
pi.dump("pi.dat")