# Inhaltsverzeichnis
Die allgemeinen Tic Tac Toe Methoden sind in der beiliegenden Datei `TTT_functions.py` geschrieben um dieses Notebook etwas zu kürzen.
### 1. Q-Learning Implementierung wie für Blatt 3

### 2. Neuronales Netz

### 3. Tests

In [12]:
from TTT_functions import *

# Q-Learning

Der Gegenspieler wird immer als Umwelt interpretiert. Das heißt wird von einem Zustand $S$ aus eine Aktion $A$ gewählt, so ist der Folgezustand $S'$ dadurch noch nicht eindeutig bestimmt. Dieser wird erst durch die nächste Aktion des Gegners festgelegt.

So kann auch während dem Spiel gelernt werden, da die Wertung vom Zustands-Aktionspaar $(S,A)$ von der Übergangswahrscheinlichkeit $S \to S'$ abhängt.

Indem dies für beide Spieler gleichzeitig gemacht wird kann ein Spiel also die Q-Matrix mit Informationen über alle im Spiel vorkommenden Zustands-Aktionspaare aktualisiert werden, sodass die KI gleichzeitig das Verhalten als beginnender und als zweiter Spieler lernt.

## Trainingsalgorithmus

In [30]:
def train_q_learning(learning_rate, discount_factor, base_exploration_rate, num_episodes=1e4, reward_dict={"win":1, "loss":-1, "draw":0, "move":-0.05}):
    """
    play Tic Tac Toe [num_episodes] times to learn using Q-Learning with the given learning rate and discount_factor.
    inputs:
        learning_rate - (float) in range [0,1] - alpha
        discount_factor - (float) in range [0,1] - gamma
        base_exploration_rate - (float) - the starting exploration rate
        num_episodes - (int) - number of episodes for training
        reward_dict - (dict) - a dictionary specifying the rewards for winning, losing and draw
            -> must have keys "win", "loss", "draw"
    returns:
        (dict) - the Q-table after training with the given parameters
    """
    Q_table = dict()    # assign values to every visited state-action pair
    N_table = dict()    # counting how often each state-action pair was visited
    action_dict = dict()    # save the possible actions for each state

    games = []
    exploration_rate = base_exploration_rate
    for n in range(num_episodes):
        # play episode
        state_hist = play_episode(Q_table, action_dict, exploration_rate, discount_factor, learning_rate, reward_dict, N_table)
        exploration_rate *= base_exploration_rate
        games.append(state_hist)

    print("final exploration rate:", exploration_rate)
    
    return Q_table, N_table, games
        

## Simulieren einer Episode

In [31]:
def play_episode(Q_table, action_dict, exploration_rate, discount_factor=0.95, learning_rate=0.1, reward_dict={"win":1, "loss":-1, "draw":0, "move":-0.05}, N_table=dict()):
    """
    self-play an entire episode
    returns:
        (list) - state history
        (list) - action history
    
    action_dict is changed in-place
    """ 
    field = [0 for _ in range(9)]
    sign = 1
    action_history = []
    state_history = []
    while True:
        state = tuple(field)
        state_history.append(state)
        # get possible actions
        try:
            actions = action_dict[state]
        except KeyError:
            actions = get_actions(field)
            action_dict[state] = actions

        if len(state_history) > 2: 
            # we know the state that resulted from the last action
            update_q_table(Q_table, state_history, action_history, actions, discount_factor, learning_rate, reward_dict, N_table)

        if len(actions) == 0:
            break # game has ended

        action = choose_Q_action(state, actions, Q_table, exploration_rate=exploration_rate)
        action_history.append(action)
        field[action] = sign
        sign = sign%2 + 1 # toggle sign between 1 and 2

    last_state = state_history[-2]
    last_action = action_history[-1]
    if not (last_state, last_action) in Q_table.keys():
        Q_table[(last_state, last_action)] = 0
        N_table[(last_state, last_action)] = 0
    # print("before", Q_table[(last_state, last_action)])
    Q_table[(last_state, last_action)] += learning_rate*(reward_dict["win"] - Q_table[(last_state, last_action)])
    N_table[(last_state, last_action)] += 1
    
    return state_history


def update_q_table(Q_table, state_history, action_history, actions, discount_factor, learning_rate, reward_dict, N_table):
    """
    update the second to last state in the Q-table
    returns:
        None
    """
    prev_state = state_history[-3] # S = state
    prev_action = action_history[-2] # A = action
    state = state_history[-1] # S' = next state after action A

    reward = get_reward(list(state), actions, reward_dict) # R = Reward
    next_rewards = [] # Q(S', a') for all actions a'
    for action in actions:
        try:
            next_rewards.append(Q_table[(state, action)])
        except KeyError:
            Q_table[(state, action)] = 0
            N_table[(state, action)] = 0
            next_rewards.append(0)

    if not (prev_state, prev_action) in Q_table.keys():
        Q_table[(prev_state, prev_action)] = 0
        N_table[(prev_state, prev_action)] = 0
    if ((1,2,0,0,1,2,0,0,0),8) in Q_table.keys():
        test_value = str(Q_table[((1,2,0,0,1,2,0,0,0),8)])
    # Q(S,A) += alpha*(R + gamma * max(S', a') - Q(S,A))
    Q_table[(prev_state, prev_action)] += learning_rate*(reward + discount_factor * max(next_rewards, default=0) - Q_table[(prev_state, prev_action)])
    N_table[(prev_state, prev_action)] += 1
    if ((1,2,0,0,1,2,0,0,0),8) in Q_table.keys():
        if test_value != str(Q_table[((1,2,0,0,1,2,0,0,0),8)]):
            print("old value:", test_value)
            print("reward was", reward)
            print("new value:", Q_table[((1,2,0,0,1,2,0,0,0),8)])


def get_reward(field, actions, reward_dict):
    """
    return the reward for the given field and possible actions
    """
    if len(actions) > 0:
        reward = reward_dict["move"]
    else:
        winner = game_ended(field, get_winner=True)
        if winner == 0: #draw
            reward = reward_dict["draw"]
        else:
            reward = reward_dict["loss"]
    return reward

## Auswählen einer Aktion
Die Aktionen werden nach einer $\varepsilon$-greedy Strategie ausgewählt. $\varepsilon$ ist dabei die Wahrscheinlichkeit, dass Exploration, also eine zufällige Aktion gewählt wird.

In [32]:
import random
def choose_Q_action(state, actions, Q_table, exploration_rate=0):
    """
    choose an action based on the possible actions, the current Q-table and the current exploration rate
    inputs:
    -------
        state - (tuple) or (list) - the state as a tuple or list
        actions (tuple) or (list) - all possible actions in the given state
        Q_table - (dict) - dictionary storing all known Q-values
        exploration_rate - (float) in [0,1] - probability of choosing exploration rather than exploitation
    """
    r = random.random()
    if r > exploration_rate:
        # print("exploit", r, exploration_rate)
        # exploit knowledge
        action_values = []
        for action in actions:
            try:
                action_values.append(Q_table[(state,action)])
            except KeyError:
                action_values.append(0)
        max_value = max(action_values)
        best_actions = []
        for action, value in zip(actions, action_values):
            if value == max_value:
                best_actions.append(action)
        # return random action with maximum expected reward
        return random.choice(best_actions)
    # explore environment through random move
    return random.choice(actions)

## Anwenden des Q-Learning Algorithmus

In [47]:
learning_rate = 0.01
discount_factor = 0.95
num_episodes = int(1e4)
exploration_rate = 1-(5/num_episodes)
reward_dict = {"win":1,      # reward for win
               "loss":-1,    # reward for loss
               "draw":0,     # reward for draw
               "move":-0.05} # reward per non-terminal move

%time Q_table, N_table, games = train_q_learning(learning_rate, discount_factor, exploration_rate, num_episodes=num_episodes, reward_dict=reward_dict)

final exploration rate: 0.006726162258635551
Wall time: 1.25 s


## Speichern der Q-Matrix

In [36]:
def export_Q_table(Q_table, filename="Q_table.txt"):
    """
    write the given Q-table into a file
    """
    with open(filename, "w") as file:
        file.write("Q_table = {\n")
        for key, value in Q_table.items():
            file.write(str(key) + ":" + str(value) + ",\n")
        file.write("}")

In [38]:
export_Q_table(Q_table)

# Neuronales Netz
### Idee:
Wir nutzen die zuvor erzeugt Q-Matrix um ein Neuronales Netzwerk zu trainieren, welches die Q-Funktion approximiert.

Dazu wird das Netzwerk ein Zustands-Aktions Paar als Eingabe bekommen und eine Wertung ($\in \mathbb{R}$) zurückgeben.

## Trainingsdaten vorbereiten

In [1]:
import numpy as np
import random
def prepare_data(Q_table):
    """
    prepare the data given in a Q-table for training the neural network
    returns:
    --------
        (np.ndarray) - array of lists of 10 inputs each for the neural network. Every input is in range [0,1]
            the first value in each sublist is the action, the other 9 are the state
        (np.ndarray) - array of output values. Every output is in range [0,1]
    """
    training_inputs = []
    training_outputs = []
    for state_action, value in Q_table.items():
        # input_data   =      state_info       +    action_info
        training_inputs.append( prepare_state_action(*state_action) )
        training_outputs.append( (value+1)/3 ) #make sure the target value is in range [0,1]
        # training_outputs.append(random.random()) # choose random outputs
    return np.array(training_inputs), np.array(training_outputs)

def prepare_state_action(state, action):
    return [action/8] + [square/2 for square in state]

## Netzwerk initialisieren

Bei `activation='relu'` und `activation='selu'` scheint das Neuronale Netz zumindest die Aktionen nicht nur nach Index der Aktion zu bewerten. Dennoch lernt das Netz auch damit scheinbar gar nicht.

Wir haben auch zahlreichen Kombinationen von `optimizer` und `loss` ausprobiert und kamen immer zum Ergebnis, dass sich der `loss` Wert im Training nach spätestens zwei Epochen nicht mehr wirklich ändert.

In [2]:
from tensorflow.keras import layers
import tensorflow.keras as keras

def create_ttt_network(hidden_layers):
    model = keras.Sequential()
    model.add( keras.Input(shape=(10,)) ) # input layer - 10 Nodes
    for size in hidden_layers:
        model.add( layers.Dense(size, activation='relu') )
    model.add( layers.Dense(1) )   # output layer - 1 Node

    model.compile(optimizer='Adam', loss='BinaryCrossentropy')
    return model

# def create_ttt_network(hidden_layers):
#     x = keras.Input(shape=(10,))
#     inputs = x
#     for size in hidden_layers:
#         x = layers.Dense(size)(x)
#     outputs = layers.Dense(1)(x)   # output layer - 1 Node
#     model = keras.Model(inputs=inputs, outputs=outputs)
#     model.compile(optimizer='RMSprop', loss='Huber')
#     return model

In [3]:
keras.losses.BinaryCrossentropy

tensorflow.python.keras.losses.BinaryCrossentropy

In [4]:
keras.activations.sigmoid

<function tensorflow.python.keras.activations.sigmoid(x)>

In [5]:
keras.optimizers.RMSprop

tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop

## Netzwerk trainieren

Hier wird das Netzwerk mit den oben generierten Daten trainiert. Anstatt oben eine neue Q-Matrix zu berechnen kann auch eine bekannte aus einer Datei geladen werden.

In [6]:
def train_network(model, samples, labels, epochs=50, batch_size=32):
    return model.fit(samples, labels, epochs=epochs, batch_size=batch_size, use_multiprocessing=True, validation_split=0.1)

def import_Q_table(filename="Q_table.txt"):
    """
    import Q_table as dictionary:
    keys are state-action pairs as a tuple of a tuple (9 integers: 0/1/2) and an integer (0-8)
    values are the corresponding Q-values

    Example:
        Q_table[((0,0,0,0,1,0,0,0,0),2)] -> 0.3
    """
    Q_table = dict()
    with open(filename, "r") as file:
        for line in file.readlines():
            if line == "Q_table = {\n" or line == "}":
                continue
            state_action, value = line[:-2].split(":")
            state = tuple([int(x.strip(" ")) for x in state_action[2:-5].split(",")])
            action = int(state_action[-2])
            Q_table[(state, action)] = float(value)
    return Q_table

#### Load Q-table from File

In [7]:
# optionally generate data
Q_table = import_Q_table(filename="Q_table.txt")

#### actually train the network 

Wir haben verschiedene Größen des Netzwerks ausprobiert:
  - mehrere Hidden Layers 2-5 Ebenen, insgesamt bis zu ca. 100 Neuronen
  - keine Hidden Layers
  - und vieles dazwischen

In allen Fälen haben wir verschieden Werte für `batch_size` (im Breeich 5 bis 50) und `epochs` (im Bereich 1-50) ausprobiert und immer das gleiche Ergebnis festgestellt:
Das Neuronale Netz scheint nicht zu lernen. Der `loss` ändert sich ab dem zweiten Durchlauf quasi gar nicht mehr.

In [20]:
# prepare data
inputs, outputs = prepare_data(Q_table)
# inputs = np.array(list(zip(*inputs)))
# inputs = np.array([[i//10 for i in range(1,11)]])
# print(inputs)
# outputs = np.array([1])
print(f"train network using {inputs.shape} test values")
# create model
my_model = create_ttt_network([30,25,20,20])
my_model.summary()
# train network with given data
history = my_model.fit(inputs, outputs, epochs=5000, batch_size=50, use_multiprocessing=True, validation_split=0.1, verbose=1)
# history = train_network(my_model, inputs, outputs, epochs=20, batch_size=25)

] - 1s 2ms/step - loss: 0.6364 - val_loss: 0.6367
Epoch 4811/5000
Epoch 4812/5000
Epoch 4813/5000
Epoch 4814/5000
Epoch 4815/5000
Epoch 4816/5000
Epoch 4817/5000
Epoch 4818/5000
Epoch 4819/5000
Epoch 4820/5000
Epoch 4821/5000
Epoch 4822/5000
Epoch 4823/5000
Epoch 4824/5000
Epoch 4825/5000
Epoch 4826/5000
Epoch 4827/5000
Epoch 4828/5000
Epoch 4829/5000
Epoch 4830/5000
Epoch 4831/5000
Epoch 4832/5000
Epoch 4833/5000
Epoch 4834/5000
Epoch 4835/5000
Epoch 4836/5000
Epoch 4837/5000
Epoch 4838/5000
Epoch 4839/5000
Epoch 4840/5000
Epoch 4841/5000
Epoch 4842/5000
Epoch 4843/5000
Epoch 4844/5000
Epoch 4845/5000
Epoch 4846/5000
Epoch 4847/5000
Epoch 4848/5000
Epoch 4849/5000
Epoch 4850/5000
Epoch 4851/5000
Epoch 4852/5000
Epoch 4853/5000
Epoch 4854/5000
Epoch 4855/5000
Epoch 4856/5000
Epoch 4857/5000
Epoch 4858/5000
Epoch 4859/5000
Epoch 4860/5000
Epoch 4861/5000
Epoch 4862/5000
Epoch 4863/5000
Epoch 4864/5000
Epoch 4865/5000
Epoch 4866/5000
Epoch 4867/5000
Epoch 4868/5000
Epoch 4869/5000
Epoch 

In [21]:
print(my_model(np.array([np.array([0,0,0,0,0,0,0,0,0,0.25])]), training=False))

tf.Tensor([[0.3210916]], shape=(1, 1), dtype=float32)


# Aktion auswählen

In [22]:
import random
def choose_NN_action(state, actions, model, exploration_rate=0):
    """
    choose an action based on the possible actions, the given neural network (model) and the current exploration rate
    """
    r = random.random()
    if r > exploration_rate:
        # exploit knowledge
        action_values = []
        for action in actions:
            action_values.append( model.predict([prepare_state_action(state, action)])[0][0] )
        print(list(zip(actions, action_values)))
        max_value = max(action_values)
        best_actions = []
        for action, value in zip(actions, action_values):
            if value == max_value:
                best_actions.append(action)
        # return random action with maximum expected reward
        return random.choice(best_actions)
    # explore environment through random move
    return random.choice(actions)

# Spielen: Mensch vs. KI
Bevor gespielt werden kann muss dies natürlich mit einigen funktionen vorbereitet werden:

In [26]:
def play_AI(Q_table, network):
    choose_action = get_ai_function(Q_table, network)
    start_player = ""
    while not start_player.lower() in ["me", "ai"]:
        start_player = input("Who starts? (me/ ai)\n")
    
    field = [0 for _ in range(9)]
    sign = 1
    print_field(field)

    playing = True
    while playing:
        actions = get_actions(field)
        if len(actions) == 0:
            break
        if start_player == "ai":
            action = choose_action(tuple(field), actions, exploration_rate=0)
        else:
            action = get_human_action(actions)
            if action == "end":
                print("game interrupted")
                return
        field[action] = sign
        sign = sign%2 + 1
        print_field(field)
        start_player = "ai" if start_player == "me" else "me"
        playing = print_winner(field)


def print_winner(field):
        """
        if the given field is in a terminal state, print the winner.
        """
        winner = game_ended(field, get_winner=True)
        playing = True
        if winner != None:
            playing = False
            if winner == 0:
                print("draw!")
            elif winner == 1:
                print("'o' won!")
            else:
                print("'x' won!")
        return playing


def get_human_action(actions):
    """
    get user input for the next action
    """
    action = -5
    while not action in actions:
        action = input(f"Choose your action ({str(actions)[1:-1]})\n")
        try:
            action = int(action)
            return action
        except:
            if action.lower() == "end":
                return "end"


def get_ai_function(Q_table, network):
    """
    returns a function that chooses an AI-move based on a given field
    inputs:
        Q_table - (dict) - dictionary with Q_values
        network - (keras.model) - keras model
    returns:
        (function) - a function that chooses an action.
            arguments: (state, actions, exploration_rate=0)
    """
    user_input = ""
    while not user_input.upper() in ["NN", "Q"]:
        user_input = input("What AI shall be the opponent?\n(Neural Network -> NN or Q-Learning -> Q)\n")

    if user_input.upper() == "NN":
        def choose_ai_action(state, actions, exploration_rate=0):
            return choose_NN_action(state, actions, network, exploration_rate=exploration_rate)
    else:
        def choose_ai_action(state, actions, exploration_rate=0):
            return choose_Q_action(state, actions, Q_table, exploration_rate=exploration_rate)
    return choose_ai_action

In [27]:
play_AI(Q_table, my_model)

-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
[(0, 0.3165288), (1, 0.3159535), (2, 0.3156817), (3, 0.31544566), (4, 0.3152082), (5, 0.31495264), (6, 0.31469184), (7, 0.314431), (8, 0.3141702)]
-------------
| o |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
-------------
| o |   |   |
-------------
|   |   |   |
-------------
|   |   | x |
-------------
[(1, 0.3301552), (2, 0.33049333), (3, 0.33030254), (4, 0.3302731), (5, 0.3300371), (6, 0.329782), (7, 0.32949033)]
-------------
| o |   | o |
-------------
|   |   |   |
-------------
|   |   | x |
-------------
-------------
| o |   | o |
-------------
|   |   |   |
-------------
|   | x | x |
-------------
[(1, 0.33474383), (3, 0.33593133), (4, 0.3358034), (5, 0.33553803), (6, 0.33551237)]
-------------
| o |   | o |
-------------
| o |   |   |
-------------
|   | x | x |
-------------
-------------
| o |   | o |
-------------
| o | x |   |
---------

In [33]:
def display_Q_function(state, Q_table, n=10):
    """
    print the given field with Q-values for the availiable actions (instead of empty squares)
    """
    lines = ["-"*(7+3*n)]
    for r in range(3):
        line = "|"
        for c, sign in enumerate(state[3*r:3*r+3]):
            if sign == 1:
                line += " "*(n//2) + "o" + " "*(n//2) + "|"
            elif sign == 2:
                line += " "*(n//2) + "x" + " "*(n//2) + "|"
            else:
                line += f"{Q_table[(state, 3*r + c)]:{n}.3} |"
        lines.append(line)
        lines.append("-"*(7+3*n))
    for line in lines:
        print(line)

def display_NN_function(state, model, n=10):
    lines = ["-"*(7+3*n)]
    for r in range(3):
        line = "|"
        for c, sign in enumerate(state[3*r:3*r+3]):
            if sign == 1:
                line += " "*(n//2) + "o" + " "*(n//2) + "|"
            elif sign == 2:
                line += " "*(n//2) + "x" + " "*(n//2) + "|"
            else:
                action = model.predict([prepare_state_action(state, 3*r + c)])[0][0]
                line += f"{action:{n}.4} |"
        lines.append(line)
        lines.append("-"*(7+3*n))
    for line in lines:
        print(line)

In [38]:
field = (1,1,0,0,1,2,2,0,0)
field = (0,2,0,0,1,0,1,0,0)
field = (0,1,2,0,1,2,0,0,0)
field = (0,2,2,0,1,0,1,0,0)
display_Q_function(field, Q_table)
print("chosen action:", choose_Q_action(field, get_actions(list(field)), Q_table))
display_NN_function(field, my_model)
print("chosen action:", choose_NN_action(field, get_actions(list(field)), my_model))

-------------------------------------
|  -0.00255 |     x     |     x     |
-------------------------------------
|   -0.0207 |     o     |   -0.0109 |
-------------------------------------
|     o     |   -0.0199 |   -0.0105 |
-------------------------------------
chosen action: 0
-------------------------------------
|    0.3301 |     x     |     x     |
-------------------------------------
|    0.3282 |     o     |      0.33 |
-------------------------------------
|     o     |    0.3294 |    0.3293 |
-------------------------------------
[(0, 0.33009592), (3, 0.32821727), (5, 0.33002704), (7, 0.32936642), (8, 0.32925484)]
chosen action: 0


In [112]:
prep_state = [s/2 for s in field]
inputs, outputs = prepare_data(Q_table)
for state_action, value in zip(inputs, outputs):
    if list(state_action[:9]) == prep_state:
        print(int(state_action[-1]*8), "has value", value*3-1)

2 has value -0.010000000000000009
5 has value 0.15451061114415987
8 has value 0.0011816731690843518
