In [1]:
import random
import os.path
import pickle
from game import Game, Move, Player
#from copy import deepcopy
import numpy as np
import itertools

In [2]:
from copy import deepcopy

# Every (position, slide) pair the players choose from. Positions are used to
# index the board directly (``state[pos]``), so they are (row, col) pairs.
# Each group pairs a set of border cells with the slides listed for them:
# three slides for the non-corner border cells, two for each corner.
possible_moves = [
    (pos, slide)
    for cells, slides in (
        ([(0, col) for col in (1, 2, 3)], (Move.BOTTOM, Move.LEFT, Move.RIGHT)),
        ([(4, col) for col in (1, 2, 3)], (Move.LEFT, Move.RIGHT, Move.TOP)),
        ([(row, 0) for row in (1, 2, 3)], (Move.BOTTOM, Move.TOP, Move.RIGHT)),
        ([(row, 4) for row in (1, 2, 3)], (Move.BOTTOM, Move.LEFT, Move.TOP)),
        ([(0, 0)], (Move.BOTTOM, Move.RIGHT)),
        ([(0, 4)], (Move.BOTTOM, Move.LEFT)),
        ([(4, 0)], (Move.TOP, Move.RIGHT)),
        ([(4, 4)], (Move.TOP, Move.LEFT)),
    )
    for pos in cells
    for slide in slides
]

class RandomPlayer(Player):
    """Baseline opponent that picks uniformly among its currently valid moves."""

    def __init__(self) -> None:
        super().__init__()
        # Slot filled in by the training loop with the opponent's latest
        # (state, move, next_state) triple.
        self.last_state_action = None
        # Player id this opponent uses on the board.
        self.number = 1

    def get_valid_moves(self, state):
        """Return the entries of ``possible_moves`` whose cell is empty (-1) or owned by this player."""
        playable = []
        for candidate in possible_moves:
            cell = state[candidate[0]]
            if cell == -1 or cell == self.number:
                playable.append(candidate)
        return playable

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:        
        """Pick one valid (position, slide) pair at random for the current board."""
        candidates = self.get_valid_moves(game._board)
        position, direction = random.choice(candidates)
        return position, direction

class MyPlayer(Player):
    def __init__(self, learning_rate=.8, discount_factor=.2, exploration_prob=0.1) -> None:
        """Create an untrained Q-learning agent.

        Args:
            learning_rate: weight of the new estimate in each Q-value update.
            discount_factor: weight of the best next-state Q-value in the update target.
            exploration_prob: probability that ``train_move`` plays a random valid move.
        """
        super().__init__()

        # Player id and the slide directions known to the agent.
        self.number = 0
        self.slides = [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]

        # Q-learning hyper-parameters.
        self.learning_rate, self.discount_factor = learning_rate, discount_factor
        self.exploration_prob = exploration_prob

        # Learned values keyed by (str(board), move), plus the per-game list of
        # (state, move, next_state, bonus) tuples recorded by ``train_move``.
        self.q_table = {}
        self.history = []

        # Positional bonus tables: an all-zero 5x5 starting table and a slot
        # for the active table, which starts unset.
        self.positioning_bonuses_matrix_start = [[0] * 5 for _ in range(5)]
        self.positioning_bonuses_matrix = None
    
    def opponent_about_to_win(self, game:Game, opponent):
        count = 0
        # Check rows
        for row in game._board:
            if np.sum(row == opponent) == 4:
                count +=1

        # Check columns
        for col in game._board.T:
            if np.sum(col == opponent) == 4:
                count +=1

        # Check main diagonal
        if np.sum(np.diag(game._board) == opponent) == 4:
            count +=1

        # Check anti-diagonal
        if np.sum(np.diag(np.fliplr(game._board)) == opponent) == 4:
            count +=1
    
        return  count
    
    def bonus_positioning(self, board):
        bonus = 0
        for i in range(0,5):
            for j in range(0,5):
                if board[i][j]==self.number:
                    bonus += self.positioning_bonuses_matrix[i][j]

        return  bonus

    def get_q_value(self, state, action):
        state_key = str(state)
        return self.q_table.get((state_key, action), .0)

    def update_q_value(self, state, action, new_value):
        state_key = str(state)
        self.q_table[(state_key, action)] = new_value

    def make_move(self, game: Game) -> tuple[tuple[int, int], Move]: 
        q_values = [self.get_q_value(game._board, action) for action in self.get_valid_moves(game._board)]
        max_q_value = max(q_values, default=0.0)
        best_actions = [action for action, q_value in zip(self.get_valid_moves(game._board), q_values) if q_value == max_q_value]
        (from_pos, slide) =  random.choice(best_actions)

        return from_pos, slide
    
    def get_valid_moves(self, state):
        """Return the entries of ``possible_moves`` whose cell is empty (-1) or owned by this player."""
        playable = []
        for candidate in possible_moves:
            cell = state[candidate[0]]
            if cell == -1 or cell == self.number:
                playable.append(candidate)
        return playable

    def train_move(self, game: Game) -> tuple[tuple[int, int], Move]:
        # Epsilon-greedy strategy for action selection
        original = game._board
        valid_moves = self.get_valid_moves(game._board)

        assert(all([game._board[m[0]]==self.number or game._board[m[0]]==-1 for m in valid_moves]))

        valid_move = False

        #count number of "4-near-pieces" of the opponent
        opp_about_to_win_count_b = self.opponent_about_to_win(game, 1-self.number)

        #make a move
        while not valid_move:
            if random.uniform(0, 1) <= self.exploration_prob:
                (from_pos, slide) = random.choice(self.get_valid_moves(game._board))
            else:
                # Choose the action with the highest Q-value
                q_values = [self.get_q_value(game._board, action) for action in self.get_valid_moves(game._board)]
                max_q_value = max(q_values, default=0.0)
                best_actions = [action for action, q_value in zip(self.get_valid_moves(game._board), q_values) if q_value == max_q_value]
                (from_pos, slide) =  random.choice(best_actions)

                assert(game._board[from_pos]==self.number or game._board[from_pos]==-1)  
                
            valid_move = game.moove(tuple(reversed(from_pos)), slide, self.number)

        #count number of "4-near-pieces" of the opponent
        opp_about_to_win_count_a = self.opponent_about_to_win(game, 1-self.number)

        #formula for additional bonus
        final_bonus = (opp_about_to_win_count_b-opp_about_to_win_count_a)*5
        move = (from_pos, slide)

        self.history.append((original, move, game._board, final_bonus))

        return from_pos, slide

    def rotate_move(self, move, rotation):
        """Rotate a move clockwise by ``rotation`` quarter turns on the 5x5 board.

        The position is rotated with ``(row, col) -> (col, 4 - row)`` per quarter
        turn. The slide is advanced through the clockwise cycle
        TOP -> RIGHT -> BOTTOM -> LEFT, so it follows the same rotation as the
        position. The cycle is spelled out by member name rather than derived
        from ``slide.value``, so it does not depend on how ``Move`` numbers its
        members.

        Args:
            move: ``(from_pos, slide)`` with ``from_pos`` a ``(row, col)`` pair.
            rotation: number of clockwise quarter turns; reduced modulo 4.

        Returns:
            The rotated ``(from_pos, slide)`` pair.
        """
        from_pos, slide = move
        row, col = from_pos

        # Four quarter turns are the identity, so only the remainder matters.
        quarter_turns = rotation % 4

        # Rotate the coordinates clockwise, one quarter turn at a time.
        for _ in range(quarter_turns):
            row, col = col, 5 - 1 - row

        # Rotate the slide direction through the same clockwise turns.
        clockwise_cycle = [Move.TOP, Move.RIGHT, Move.BOTTOM, Move.LEFT]
        new_slide = clockwise_cycle[(clockwise_cycle.index(slide) + quarter_turns) % 4]

        return (row, col), new_slide
    
    def flip_move_vertically(self, move):
        """Mirror a move left-to-right: the column is reflected and LEFT/RIGHT swap.

        Args:
            move: ``(from_pos, slide)`` with ``from_pos`` a ``(row, col)`` pair.

        Returns:
            The mirrored ``(from_pos, slide)`` pair; TOP and BOTTOM slides are unchanged.
        """
        (row, col), slide = move

        # Only horizontal slides change under a left-right mirror.
        mirrored_slide = (
            Move.RIGHT if slide == Move.LEFT
            else Move.LEFT if slide == Move.RIGHT
            else slide
        )

        return (row, 5 - 1 - col), mirrored_slide
    
    def flip_move_horizontally(self, move):
        """Mirror a move top-to-bottom: the row is reflected and TOP/BOTTOM swap.

        Args:
            move: ``(from_pos, slide)`` with ``from_pos`` a ``(row, col)`` pair.

        Returns:
            The mirrored ``(from_pos, slide)`` pair; LEFT and RIGHT slides are unchanged.
        """
        (row, col), slide = move

        # Only vertical slides change under a top-bottom mirror.
        mirrored_slide = (
            Move.BOTTOM if slide == Move.TOP
            else Move.TOP if slide == Move.BOTTOM
            else slide
        )

        return (5 - 1 - row, col), mirrored_slide
    
    def trainInverse(self, state, action, reward, next_state):
        for i in range(5):
            for j in range(5):
                if state[i][j] != -1:
                    state[i][j] = 1 - state[i][j]
                if next_state[i][j] != -1:
                    next_state[i][j] = 1 - next_state[i][j]

        self.train(state, action, reward, next_state)
                    
    def train(self, state, action, reward, next_state):

        cur_qv = self.get_q_value(state, action)
        max_qv_next = max([self.get_q_value(next_state, next_action) for next_action in self.get_valid_moves(next_state)], default=0.0)

        new_qv = (1-self.learning_rate)*cur_qv + self.learning_rate * (reward + self.discount_factor * max_qv_next)

        #update the state and all its equivalents
        self.update_q_value(state, action, new_qv)
        
        for r in [90, 180, 270]:
            self.update_q_value(np.rot90(state, k=r), self.rotate_move(action, int(r/90)), new_qv)
        
        self.update_q_value(np.fliplr(state), self.flip_move_vertically(action), new_qv)
        self.update_q_value(np.flipud(state), self.flip_move_horizontally(action), new_qv)

    def trainModelVS(self, opponent: Player, num_games: int = 100):
        """Train the Q-table by playing ``num_games`` games against ``opponent``.

        A previously saved ``q_table.pkl`` is loaded first, if present, so that
        training continues from earlier experience. In each game the agent
        moves first via ``train_move``. When the game ends, every recorded
        transition is trained with a reward that grows along the game: up to
        ``2**7 - 1`` after a win and up to ``2**2 - 1`` otherwise. After a game
        the agent did not win, the opponent's last move is additionally learned
        from the opponent's point of view via ``trainInverse``.

        Args:
            opponent: player exposing ``number`` and ``make_move(game)``; its
                ``last_state_action`` attribute is updated after each of its moves.
            num_games: number of training games (default 100).
        """
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # q_table.pkl that this notebook produced.
        if os.path.isfile("./q_table.pkl"):
            with open('q_table.pkl', 'rb') as fp:
                self.q_table = pickle.load(fp)

        # A game is over as soon as check_winner reports either player's number.
        # The previous ``winner > 0`` test never stopped on a win of this agent
        # (number 0), which made the win branch below unreachable.
        player_ids = (self.number, opponent.number)

        for ep in range(num_games):
            # Progress logging.
            if ep < num_games - 1:
                print(f"TRAINING: {round((ep+1)/num_games*100, 2)}%", end="\r")

            # Start a new game.
            game = Game()
            # Opponent transition of THIS game only; a value left over from an
            # earlier game must not be trained on.
            opponent_transition = None

            while True:
                # RL agent move.
                self.train_move(game)

                winner = game.check_winner()
                if winner in player_ids: break

                # Opponent move, recorded as (state, move, next_state).
                before = game.get_board()
                from_pos, slide = opponent.make_move(game)
                game.moove(tuple(reversed(from_pos)), slide, opponent.number)
                opponent_transition = (before, (from_pos, slide), game.get_board())
                opponent.last_state_action = opponent_transition

                winner = game.check_winner()
                if winner in player_ids: break

            n = len(self.history)
            agent_won = winner == self.number

            # Exponentially growing rewards along the game; computed once per
            # game instead of once per history entry.
            scores = np.power(2, np.linspace(1, 7 if agent_won else 2, num=n)) - 1
            for score, (state, action, next_state, bonus) in zip(scores, self.history):
                self.train(state, action, score + bonus, next_state)

            # Also learn the opponent's last move as if the agent had played it.
            if not agent_won and opponent_transition is not None:
                opp_state, opp_action, opp_next_state = opponent_transition
                self.trainInverse(opp_state, opp_action, 2**7-1, opp_next_state)

            self.history=[]

        # Reported once all games have actually been played.
        print(f"TRAINING COMPLETED!\n")
        
def test(agent, opponent, num_episodes=50):
    """Play ``num_episodes`` games of ``agent`` against ``opponent`` and print the win rate.

    A game counts as a win when ``Game.play(agent, opponent)`` returns 0.

    Args:
        agent: player passed as the first argument to ``Game.play``.
        opponent: player passed as the second argument to ``Game.play``.
        num_episodes: number of games to play (default 50).
    """
    winsF = 0

    for ep in range(num_episodes):
        # Progress logging.
        print(f"TESTING: {round((ep)/num_episodes*100, 2)}%", end="\r")
        game = Game()
        if game.play(agent, opponent) == 0: winsF +=1

    print(f"TESTING COMPLETED!\n")

    # Scale to percent BEFORE rounding; multiplying a rounded float by 100
    # brings back floating-point noise such as 56.00000000000001.
    win_rate = round(winsF / num_episodes * 100, 2) if num_episodes else 0.0
    print(f"wins F:{win_rate}%")

In [3]:
# Create the Q-learning agent (default hyper-parameters) and the random opponent.
myAgent = MyPlayer()
rndPlayer = RandomPlayer()

In [4]:
# Train against the random opponent; trainModelVS first loads ./q_table.pkl if it exists.
myAgent.trainModelVS(rndPlayer)

TRAINING COMPLETED!



In [5]:
# Persist the learned Q-table; trainModelVS reloads this file on its next run.
with open('q_table.pkl', 'wb') as fp:
    pickle.dump(myAgent.q_table, fp)

In [6]:
# Evaluate the trained agent against the random opponent.
# NOTE(review): test() builds its own Game for every episode, so the `game`
# created here is not used by that call.
game = Game()
test(myAgent, rndPlayer)

0 (2, 0) BOTTOM
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟩⬜⬜⬜⬜

1 (0, 0) RIGHT
⬜⬜⬜⬜🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟩⬜⬜⬜⬜

0 (4, 3) LEFT
⬜⬜⬜⬜🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟩🟩⬜⬜⬜

1 (3, 0) BOTTOM
⬜⬜⬜⬜🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟩⬜⬜⬜⬜
🟥🟩⬜⬜⬜

0 (3, 0) RIGHT
⬜⬜⬜⬜🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜🟩
🟥🟩⬜⬜⬜

1 (0, 0) RIGHT
⬜⬜⬜🟥🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜🟩
🟥🟩⬜⬜⬜

0 (1, 0) BOTTOM
⬜⬜⬜🟥🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟩⬜⬜⬜

1 (2, 4) LEFT
⬜⬜⬜🟥🟥
⬜⬜⬜⬜⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟩⬜⬜⬜

0 (4, 1) TOP
⬜🟩⬜🟥🟥
⬜⬜⬜⬜⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩⬜⬜⬜⬜

1 (4, 4) LEFT
⬜🟩⬜🟥🟥
⬜⬜⬜⬜⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟥🟩⬜⬜⬜

0 (4, 4) LEFT
⬜🟩⬜🟥🟥
⬜⬜⬜⬜⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟥🟩⬜⬜

1 (0, 0) RIGHT
🟩⬜🟥🟥🟥
⬜⬜⬜⬜⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟥🟩⬜⬜

0 (4, 3) TOP
🟩⬜🟥🟩🟥
⬜⬜⬜🟥⬜
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟥🟩⬜⬜

1 (1, 0) RIGHT
🟩⬜🟥🟩🟥
⬜⬜🟥⬜🟥
🟥⬜⬜⬜⬜
🟥⬜⬜⬜🟩
🟩🟥🟩⬜⬜

0 (4, 4) TOP
🟩⬜🟥🟩🟩
⬜⬜🟥⬜🟥
🟥⬜⬜⬜🟥
🟥⬜⬜⬜⬜
🟩🟥🟩⬜🟩

1 (0, 2) BOTTOM
🟩⬜🟥🟩🟩
⬜⬜⬜⬜🟥
🟥⬜⬜⬜🟥
🟥⬜🟩⬜⬜
🟩🟥🟥⬜🟩

0 (1, 0) BOTTOM
🟩⬜🟥🟩🟩
🟥⬜⬜⬜🟥
🟥⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟩🟥🟥⬜🟩

1 (1, 4) LEFT
🟩⬜🟥🟩🟩
🟥🟥⬜⬜⬜
🟥⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟩🟥🟥⬜🟩

0 (3, 0) BOTTOM
🟩⬜🟥🟩🟩
🟥🟥⬜⬜⬜
🟥⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟩🟥🟥⬜🟩

1 (1, 4) TOP
🟩⬜🟥🟩🟥
🟥🟥⬜⬜🟩
🟥⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟩🟥🟥⬜🟩

0 (4, 3) TOP
🟩⬜🟥🟩🟥
🟥🟥⬜🟩🟩
🟥⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟩🟥🟥⬜🟩

1 (2, 0) BOTTOM
🟩⬜🟥🟩🟥
🟥🟥⬜🟩🟩
🟩⬜⬜⬜🟥
🟩⬜🟩⬜⬜
🟥🟥🟥

KeyboardInterrupt: 