# Aufgabe 1
Die Darstellung der Gridworld funktioniert recht gut mit einem Dictionary, das als Keys die Koordinaten der vorhandenen Positionen enthält und als Values entweder eine reelle Zahl, die eine konstante Belohnung anzeigt, oder ein Dictionary, das Richtungen auf Bewertungen abbildet, die sich aus der aktuellen Strategie und den vorherigen Bewertungen ergeben.

In [1]:
def trivial_constant_map(position, value):
    """Build a gridworld consisting of a single constant-value position.

    Returns a new dictionary whose only key is ``position`` and whose
    value is ``value``.
    """
    single_cell_world = {}
    single_cell_world[position] = value
    return single_cell_world

def make_open_field():
    """Create a fresh open field (one that can be moved through).

    The field is a new dictionary mapping each of the four directions
    (1, 0), (-1, 0), (0, 1) and (0, -1) to an initial score of 0.
    """
    directions = ((1, 0), (-1, 0), (0, 1), (0, -1))
    return {direction: 0 for direction in directions}

def trivial_open_map(position):
    """Build a gridworld consisting of a single open position.

    Returns a new dictionary whose only key is ``position``, mapped to a
    freshly created open field.
    """
    open_field = make_open_field()
    return {position: open_field}

def add_constant_position(gridworld, position, value):
    """Store a constant-value field at ``position`` and return the gridworld.

    The gridworld is modified in place; an existing entry at ``position``
    is overwritten. The same gridworld object is returned.
    """
    gridworld[position] = value
    return gridworld

def add_open_position(gridworld, position):
    """Store a fresh open field at ``position`` and return the gridworld.

    The gridworld is modified in place; an existing entry at ``position``
    is overwritten. The same gridworld object is returned.
    """
    fresh_field = make_open_field()
    gridworld[position] = fresh_field
    return gridworld

Es ist sinnvoll, zu einer gegebenen Karte, Position und Richtung die Bewertungen des Nachbarn in dieser Richtung zu finden. Nach den Regeln des Spieles ist es dabei auch sinnvoll, die Bewertung der aktuellen Position auszugeben, wenn der entsprechende Nachbar nicht existiert.

In [2]:
def apply_direction(position, direction):
    """Return position+direction.

    Both arguments are (x, y) pairs; the result is their component-wise sum.
    """
    x, y = position
    dx, dy = direction
    return (x + dx, y + dy)


def get_neighbor(gridworld, position, direction):
    """Return the content of the neighbor of position in gridworld in direction.

    Return the content of the position itself if the neighbor is not in
    gridworld. Raises KeyError if neither the neighbor nor ``position``
    is present in ``gridworld``.
    """
    target = apply_direction(position, direction)
    # Test membership explicitly instead of relying on truthiness: a
    # neighbor holding the constant value 0 is falsy but still exists and
    # must not be replaced by the content of the current position.
    if target in gridworld:
        return gridworld[target]
    return gridworld[position]

Das Folgende dient dazu, die Strategie tatsächlich zu extrahieren; für die Iteration selbst ist diese Funktion nicht nötig.

In [3]:
def best_action(field):
    """Return the best actions for the given field.

    ``field`` maps directions to scores. The result is the list of all
    directions that share the maximum score, in the iteration order of
    ``field``. An empty field yields an empty list.
    """
    # directions that reach the best score seen so far
    best_directions = []
    # None means "no score seen yet" (interpreted as -inf)
    max_score = None
    for direction, score in field.items():
        # Compare against None explicitly: a maximum of 0 is a legitimate
        # score and must not be treated as "nothing found yet", otherwise
        # any later (even lower) score would replace it.
        if max_score is None or score > max_score:
            # strictly better score: it replaces everything found so far
            max_score = score
            best_directions = [direction]
        elif score == max_score:
            # tie with the current maximum: record this direction as well
            best_directions.append(direction)
        # scores below the maximum are of no interest
    return best_directions

def extract_strategy(gridworld):
    """Return a dictionary mapping each open position to its best actions.

    Only entries whose field is a plain dict (open fields) are included;
    each is mapped to the result of ``best_action`` for that field.
    Constant fields are skipped.
    """
    return {
        position: best_action(field)
        for position, field in gridworld.items()
        if type(field) is dict
    }

Diese `best_action`-Funktion ist bereits ausreichend, um eine Strategie aus einem \(Q\) herzuleiten. Es besteht kein Bedarf, die Strategie explizit darzustellen.

Der Iterationsschritt lässt sich am einfachsten funktional implementieren: Wir konstruieren die Gridworld vollständig neu, indem wir über die Einträge der alten Gridworld iterieren und die neu berechneten Bewertungen in eine neue Gridworld einfügen.

In [4]:
# calculating the directions rotated left and right from a given direction
def rot_left(direction):
    """Return ``direction`` rotated a quarter turn to the left.

    Maps (dx, dy) to (-dy, dx), e.g. (1, 0) -> (0, 1).
    """
    step_x, step_y = direction
    rotated = (-step_y, step_x)
    return rotated

def rot_right(direction):
    """Return ``direction`` rotated a quarter turn to the right.

    Maps (dx, dy) to (dy, -dx), e.g. (1, 0) -> (0, -1).
    """
    step_x, step_y = direction
    rotated = (step_y, -step_x)
    return rotated

def field_score(field):
    """Return the score of a target field.

    A constant field (anything that is not a plain dict) scores as its own
    value. An open field scores as the maximum over the scores of all its
    actions.
    """
    if type(field) is not dict:
        return field
    return max(field.values())

def calculate_field(gridworld, position):
    """Compute the updated field for ``position`` from the given gridworld.

    The position is assumed to exist in ``gridworld``. A constant field is
    returned unchanged. For an open field a new field is built in which
    every action is scored as 0.8 times the score of the neighbor in the
    action's direction, plus 0.1 times the score of each of the neighbors
    to the left and right of that direction, minus 0.04.
    """
    current = gridworld[position]
    # constant fields are never recalculated
    if type(current) is not dict:
        return current

    updated = make_open_field()
    for action in ((1, 0), (-1, 0), (0, 1), (0, -1)):
        ahead = field_score(get_neighbor(gridworld, position, action))
        to_left = field_score(get_neighbor(gridworld, position, rot_left(action)))
        to_right = field_score(get_neighbor(gridworld, position, rot_right(action)))
        updated[action] = 0.8 * ahead + 0.1 * to_left + 0.1 * to_right - 0.04
    return updated

def update_gridworld(gridworld):
    """Return a new gridworld with every field recalculated.

    The given gridworld is not modified: all new fields are computed from
    the old gridworld and stored in a copy of it.
    """
    refreshed = gridworld.copy()
    refreshed.update(
        (position, calculate_field(gridworld, position))
        for position in gridworld
    )
    return refreshed

Zuletzt testen wir die ganze Sache:

In [5]:
# Build the test gridworld; positions are inserted in the same order as
# they should appear when the dictionary is printed.
gridworld = trivial_open_map((0, 0))
for open_position in [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1), (2, 2)]:
    add_open_position(gridworld, open_position)
add_constant_position(gridworld, (3, 0), 1)
add_constant_position(gridworld, (3, 1), -1)
add_open_position(gridworld, (3, 2))

# initial state
print(gridworld)

# a single recalculated field, next to the constant -1 field
print(calculate_field(gridworld, (3, 2)))

# state after one iteration step
gridworld = update_gridworld(gridworld)
print(gridworld)

# state after 1000 further iteration steps
for _ in range(1000):
    gridworld = update_gridworld(gridworld)
print(gridworld)

# best actions for every open position
print(extract_strategy(gridworld))

{(0, 0): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (0, 1): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (0, 2): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (1, 0): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (1, 2): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (2, 0): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (2, 1): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (2, 2): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}, (3, 0): 1, (3, 1): -1, (3, 2): {(1, 0): 0, (-1, 0): 0, (0, 1): 0, (0, -1): 0}}
{(1, 0): -0.14, (-1, 0): -0.14, (0, 1): -0.04, (0, -1): -0.8400000000000001}
{(0, 0): {(1, 0): -0.04, (-1, 0): -0.04, (0, 1): -0.04, (0, -1): -0.04}, (0, 1): {(1, 0): -0.04, (-1, 0): -0.04, (0, 1): -0.04, (0, -1): -0.04}, (0, 2): {(1, 0): -0.04, (-1, 0): -0.04, (0, 1): -0.04, (0, -1): -0.04}, (1, 0): {(1, 0): -0.04, (-1, 0): -0.04, (0, 1): -0.04, (0, -1): -0.04}, (1, 2): {(1, 0): -0.04, (-1, 0): -0.04, (0, 1): -0.04, (0, -1): -0.04}, (2, 0): {(1, 0): 0.76, (-1, 0): -0