# <center>Домашняя работа по лекции 16</center>
### Задание:
Опираясь на табличное обучение с подкреплением, реализовать и обучить нейронную Q-функцию.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Copied from https://github.com/neilslater/game_playing_scripts

'''
   Copyright 2017 Neil Slater

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
'''

import numpy as np
import random

class TicTacToeGame():
    """Minimal tic-tac-toe engine whose board is a 9-character string.

    Cells are stored row by row; every cell is 'X', 'O' or ' ' (empty).
    A move is expressed as the complete board string that results from it.

    Attributes:
        state:  current board string.
        player: mark of the side to move ('X' or 'O'); None once somebody won.
        winner: 'X' or 'O' after a win, otherwise None.
    """

    def __init__(self):
        self.state = ' ' * 9
        self.player = 'X'
        self.winner = None

    def allowed_moves(self):
        """Return every board reachable by placing the current player's mark on an empty cell."""
        board = self.state
        return [
            board[:pos] + self.player + board[pos+1:]
            for pos, cell in enumerate(board)
            if cell == ' '
        ]

    def make_move(self, next_state):
        """Apply a move given as the resulting board string.

        The move either produces a winner (then nobody is to move any more)
        or hands the turn over to the other player.

        Raises:
            Exception: if the game already has a winner, or if ``next_state``
                is not one of ``allowed_moves()``.
        """
        if self.winner:
            raise Exception("Game already completed, cannot make another move!")
        if not self.__valid_move(next_state):
            message = "Cannot make move {} to {} for player {}".format(
                self.state, next_state, self.player)
            raise Exception(message)

        self.state = next_state
        self.winner = self.predict_winner(next_state)
        if self.winner:
            self.player = None
        else:
            self.player = 'O' if self.player == 'X' else 'X'

    def playable(self):
        """Return True while there is no winner yet and at least one move is available."""
        if self.winner:
            return False
        return any(self.allowed_moves())

    def predict_winner(self, state):
        """Return 'X' or 'O' if ``state`` contains a completed line for that mark, else None.

        All eight lines are scanned in a fixed order and the last completed
        one decides the result.
        """
        lines = [(0,1,2), (3,4,5), (6,7,8), (0,3,6), (1,4,7), (2,5,8), (0,4,8), (2,4,6)]
        found = None
        for a, b, c in lines:
            triple = state[a] + state[b] + state[c]
            if triple in ('XXX', 'OOO'):
                found = triple[0]
        return found

    def __valid_move(self, next_state):
        """Return True if ``next_state`` is one of the currently allowed moves."""
        return next_state in self.allowed_moves()

    def print_board(self):
        """Print the board as three rows separated by horizontal rules."""
        cells = self.state
        for row_start in (0, 3, 6):
            if row_start:
                print('    -----------')
            print('     {} | {} | {} '.format(
                cells[row_start], cells[row_start + 1], cells[row_start + 2]))


In [117]:
class Agent():
    """TD(0) self-play agent whose state-value function is a neural network.

    The network scores board states from the point of view of ``player_mark``:
    on its own turn the agent moves to the highest-valued successor state, and
    on the opponent's turn it assumes the lowest-valued successor is chosen.

    Args:
        game_class:  zero-argument callable that creates a new game exposing
            ``state``, ``player``, ``winner``, ``allowed_moves()``,
            ``make_move(state)``, ``playable()`` and ``print_board()``.
        model:       ``torch.nn.Module`` mapping a ``(1, len(state))`` float
            tensor to a single value.
        criterion:   loss between the predicted value and the TD target.
        epsilon:     probability of replacing the greedy move by a random one
            during training.
        lr:          learning rate of the Adam optimizer.
        gamma:       discount applied to the value of the next state.
        player_mark: mark ('X' or 'O') whose win is rewarded with +1.
    """

    def __init__(self, game_class, model, criterion, epsilon=0.1, lr=0.001, gamma=0.3, player_mark='X'):
        self.V = model
        self.NewGame = game_class
        self.epsilon = epsilon
        self.criterion = criterion
        self.optimizer = torch.optim.Adam(self.V.parameters(), lr=lr)
        self.player_mark = player_mark
        self.gamma = gamma

    def state_value(self, game_state):
        """Return the network output for ``game_state`` as a tensor.

        The board string is encoded cell by cell as X -> 1, O -> -1, empty -> 0
        and fed to the model as a single-row batch.
        """
        sign_to_value = {"X": 1.0, "O": -1.0, " ": 0.0}
        # The input is plain data: only the model parameters need gradients.
        encoded = torch.tensor([[sign_to_value[cell] for cell in game_state]],
                               dtype=torch.float32)
        return self.V(encoded)

    def learn_game(self, num_episodes=1000):
        """Train on ``num_episodes`` self-play games."""
        for _ in range(num_episodes):
            self.learn_from_episode()

    def learn_from_episode(self):
        """Play one self-play game, updating the network after every move."""
        game = self.NewGame()
        # Pick the opening move (epsilon-greedy), then keep going until the
        # update step reports that no further move exists.
        _, move = self.learn_select_move(game)
        while move:
            move = self.learn_from_move(game, move)

    def learn_from_move(self, game, move):
        """Apply ``move``, run one TD(0) update and return the next move to play.

        The value of the state reached by ``move`` is regressed towards
        ``reward + gamma * V(best next state)``; for a finished game the
        target is just the reward.

        Returns:
            The epsilon-greedy move to play next, or None when the game is over.
        """
        game.make_move(move)  # sets game.winner or hands the turn over
        r = self.__reward(game)
        next_state_value = 0.0
        selected_next_move = None
        if game.playable():  # no winner yet and moves are still available
            best_next_move, selected_next_move = self.learn_select_move(game)
            # The bootstrap value is a constant target: no gradient flows through it.
            with torch.no_grad():
                next_state_value = self.state_value(best_next_move)
        target = torch.as_tensor(r + self.gamma * next_state_value,
                                 dtype=torch.float32).reshape(1, 1)

        # One gradient step moving V(move) towards the TD target.
        self.V.zero_grad()
        loss = self.criterion(self.state_value(move), target)
        loss.backward()
        self.optimizer.step()

        return selected_next_move

    def learn_select_move(self, game):
        """Return ``(best_move, selected_move)`` for training.

        ``best_move`` is the greedy choice for the side to move;
        ``selected_move`` equals it except with probability ``epsilon``,
        when a uniformly random allowed move is taken instead.
        """
        allowed_state_values = self.__state_values(game.allowed_moves())
        best_move = self.__greedy_move(game, allowed_state_values)

        selected_move = best_move
        if random.random() < self.epsilon:  # exploration step
            selected_move = self.__random_V(allowed_state_values)

        return (best_move, selected_move)

    def play_select_move(self, game):
        """Return the greedy move for the side to move (no exploration)."""
        allowed_state_values = self.__state_values(game.allowed_moves())
        return self.__greedy_move(game, allowed_state_values)

    def demo_game(self, verbose=False):
        """Play one greedy self-play game.

        Args:
            verbose: when True, print the board before every turn and the result.

        Returns:
            The winner's mark, or '-' for a draw.
        """
        game = self.NewGame()
        t = 0
        while game.playable():  # until somebody wins or no moves are left
            if verbose:
                print(" \nTurn {}\n".format(t))
                game.print_board()
            move = self.play_select_move(game)
            game.make_move(move)
            t += 1
        if verbose:
            print(" \nTurn {}\n".format(t))
            game.print_board()
        if game.winner:
            if verbose:
                print("\n{} is the winner!".format(game.winner))
            return game.winner
        if verbose:
            print("\nIt's a draw!")
        return '-'

    def interactive_game(self, agent_player='X'):
        """Play a game against a human on the console.

        Args:
            agent_player: mark played by the agent; the human plays the other one.

        Returns:
            The winner's mark, or '-' for a draw.
        """
        game = self.NewGame()
        t = 0
        while game.playable():
            print(" \nTurn {}\n".format(t))
            game.print_board()
            if game.player == agent_player:
                move = self.play_select_move(game)
            else:
                move = self.__request_human_move(game)
            game.make_move(move)
            t += 1

        print(" \nTurn {}\n".format(t))
        game.print_board()

        if game.winner:
            print("\n{} is the winner!".format(game.winner))
            return game.winner
        print("\nIt's a draw!")
        return '-'

    def __state_values(self, game_states):
        """Map every candidate state to its predicted value as a plain float."""
        # Values are only compared during move selection, so no autograd graph is needed.
        with torch.no_grad():
            return {state: self.state_value(state).item() for state in game_states}

    def __greedy_move(self, game, state_values):
        """Pick the best successor for the side to move.

        ``player_mark`` maximizes the value; the other side minimizes it,
        i.e. plays what is worst for ``player_mark``.
        """
        if game.player == self.player_mark:
            return self.__argmax_V(state_values)
        return self.__argmin_V(state_values)

    def __argmax_V(self, state_values):
        """Return a state with the highest value, breaking ties at random."""
        max_V = max(state_values.values())
        return random.choice([state for state, v in state_values.items() if v == max_V])

    def __argmin_V(self, state_values):
        """Return a state with the lowest value, breaking ties at random."""
        min_V = min(state_values.values())
        return random.choice([state for state, v in state_values.items() if v == min_V])

    def __random_V(self, state_values):
        """Return a uniformly random candidate state."""
        return random.choice(list(state_values))

    def __reward(self, game):
        """Return +1 if ``player_mark`` has won, -1 if the other side has won, else 0."""
        if game.winner == self.player_mark:
            return 1.0
        elif game.winner:
            return -1.0
        else:
            return 0.0

    def __request_human_move(self, game):
        """Prompt until the human enters the 1-based index of an empty cell.

        Input that is not an integer, or that is not an empty cell, is
        ignored and the prompt is shown again.

        Returns:
            The board string that results from the chosen move.
        """
        allowed_moves = [i + 1 for i, cell in enumerate(game.state) if cell == ' ']
        while True:
            answer = input('Choose move for {}, from {} : '.format(game.player, allowed_moves))
            try:
                idx = int(answer)
            except ValueError:
                continue  # not a number: ask again instead of crashing
            if idx in allowed_moves:
                return game.state[:idx-1] + game.player + game.state[idx:]

In [118]:
class VNNModel(nn.Module):
    """Fully connected value network: 9 inputs -> 50 -> 50 -> 1 output.

    ReLU is applied after each of the two hidden layers; the output layer is linear.
    """

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden = 9, 50
        self.fc1 = nn.Linear(n_inputs, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

In [119]:
def demo_game_stats(agent, games_num=1000):
    """Play demo games with ``agent`` and print the share of each outcome.

    Args:
        agent:     object whose ``demo_game()`` returns 'X', 'O' or '-' (draw).
        games_num: number of games to play; must be at least 1.

    Raises:
        ValueError: if ``games_num`` is smaller than 1 (no percentage can be computed).
    """
    if games_num < 1:
        raise ValueError("games_num must be a positive integer")
    results = [agent.demo_game() for _ in range(games_num)]
    game_stats = {k: results.count(k) / games_num * 100 for k in ['X', 'O', '-']}
    print("    percentage results: {}".format(game_stats))

In [122]:
%%time

# Create an untrained agent and record its self-play results as a baseline.
agent = Agent(TicTacToeGame, model=VNNModel(), criterion=nn.MSELoss(), epsilon=0.1, lr=0.001)
print("Before learning:")
demo_game_stats(agent)

# Staged training: (additional episodes to run, headline printed afterwards).
training_stages = [
    (100, "After 100 learning games:"),
    (400, "After 500 learning games:"),
    (500, "After 1000 learning games:"),
    (1000, "After 2000 learning games:"),
    (1000, "After 3000 learning games:"),
]
for extra_episodes, headline in training_stages:
    agent.learn_game(extra_episodes)
    print(headline)
    demo_game_stats(agent)

Before learning:
    percentage results: {'X': 100.0, 'O': 0.0, '-': 0.0}
After 100 learning games:
    percentage results: {'X': 100.0, 'O': 0.0, '-': 0.0}
After 500 learning games:
    percentage results: {'X': 0.0, 'O': 100.0, '-': 0.0}
After 1000 learning games:
    percentage results: {'X': 0.0, 'O': 100.0, '-': 0.0}
After 2000 learning games:
    percentage results: {'X': 0.0, 'O': 0.0, '-': 100.0}
After 3000 learning games:
    percentage results: {'X': 100.0, 'O': 0.0, '-': 0.0}
Wall time: 1min 51s


In [124]:
# Play one greedy self-play game with verbose output: the board is printed
# before every turn, followed by the final position and the result.
agent.demo_game(True)

 
Turn 0

       |   |   
    -----------
       |   |   
    -----------
       |   |   
 
Turn 1

       |   |   
    -----------
     X |   |   
    -----------
       |   |   
 
Turn 2

     O |   |   
    -----------
     X |   |   
    -----------
       |   |   
 
Turn 3

     O |   |   
    -----------
     X |   |   
    -----------
       |   | X 
 
Turn 4

     O | O |   
    -----------
     X |   |   
    -----------
       |   | X 
 
Turn 5

     O | O | X 
    -----------
     X |   |   
    -----------
       |   | X 
 
Turn 6

     O | O | X 
    -----------
     X |   | O 
    -----------
       |   | X 
 
Turn 7

     O | O | X 
    -----------
     X |   | O 
    -----------
     X |   | X 
 
Turn 8

     O | O | X 
    -----------
     X |   | O 
    -----------
     X | O | X 
 
Turn 9

     O | O | X 
    -----------
     X | X | O 
    -----------
     X | O | X 

X is the winner!


'X'

In [115]:
# Play against the agent on the console: with the default agent_player='X'
# the agent moves first and the human is prompted for the 'O' moves.
agent.interactive_game()

 
Turn 0

       |   |   
    -----------
       |   |   
    -----------
       |   |   
 
Turn 1

       |   |   
    -----------
       |   |   
    -----------
     X |   |   


Choose move for O, from [1, 2, 3, 4, 5, 6, 8, 9] :  5


 
Turn 2

       |   |   
    -----------
       | O |   
    -----------
     X |   |   
 
Turn 3

       |   |   
    -----------
       | O |   
    -----------
     X | X |   


Choose move for O, from [1, 2, 3, 4, 6, 9] :  3


 
Turn 4

       |   | O 
    -----------
       | O |   
    -----------
     X | X |   
 
Turn 5

       |   | O 
    -----------
       | O |   
    -----------
     X | X | X 

X is the winner!


'X'

In [116]:
# Another console game against the same agent (agent plays 'X', human plays 'O').
agent.interactive_game()

 
Turn 0

       |   |   
    -----------
       |   |   
    -----------
       |   |   
 
Turn 1

       |   |   
    -----------
       |   |   
    -----------
     X |   |   


Choose move for O, from [1, 2, 3, 4, 5, 6, 8, 9] :  5


 
Turn 2

       |   |   
    -----------
       | O |   
    -----------
     X |   |   
 
Turn 3

       |   |   
    -----------
       | O |   
    -----------
     X | X |   


Choose move for O, from [1, 2, 3, 4, 6, 9] :  9


 
Turn 4

       |   |   
    -----------
       | O |   
    -----------
     X | X | O 
 
Turn 5

     X |   |   
    -----------
       | O |   
    -----------
     X | X | O 


Choose move for O, from [2, 3, 4, 6] :  6


 
Turn 6

     X |   |   
    -----------
       | O | O 
    -----------
     X | X | O 
 
Turn 7

     X |   |   
    -----------
     X | O | O 
    -----------
     X | X | O 

X is the winner!


'X'