In [1]:
import random
import os.path
import pickle
from game import Game, Move, Player
from copy import deepcopy
import numpy as np
import itertools

In [2]:
possible_moves = [
    *itertools.product([(0,1),(0,2),(0,3)], [Move.BOTTOM, Move.LEFT, Move.RIGHT]), 
    *itertools.product([(4,1),(4,2),(4,3)], [Move.LEFT, Move.RIGHT, Move.TOP]), 
    *itertools.product( [(1,0),(2,0),(3,0)],  [Move.BOTTOM, Move.TOP, Move.RIGHT]), 
    *itertools.product([(1,4),(2,4),(3,4)], [Move.BOTTOM, Move.LEFT, Move.TOP]), 
    ((0,0),Move.BOTTOM), ((0,0),Move.RIGHT), ((0,4),Move.BOTTOM), ((0,4),Move.LEFT), ((4,0),Move.TOP), ((4,0),Move.RIGHT), ((4,4),Move.TOP), ((4,4),Move.LEFT)
]

class RandomPlayer(Player):
    def __init__(self) -> None:
        super().__init__()
        self.number = 1
    
    def get_valid_moves(self, state):
        valid_moves = [ x for x in possible_moves if state[x[0]] == -1 or state[x[0]] == self.number ]
        assert(all(valid_moves))
        return valid_moves

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        from_pos, slide = random.choice(self.get_valid_moves(game._board))

        return from_pos, slide


class MyPlayer(Player):
    def __init__(self, learning_rate=0.2, discount_factor=0.6, exploration_prob=0.1) -> None:
        super().__init__()
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.history = []
        self.slides = [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]
        self.number = 0
        self.positioning_bonuses_matrix = [[3,1,1,1,3],[1,2,2,2,1],[1,2,3,2,1],[1,2,2,2,1],[3,1,1,1,3]]

        self.q_table = {}
    
    def opponent_about_to_win(self, game:Game, opponent):
        count = 0
        # Check rows
        for row in game._board:
            if np.sum(row == opponent) == 4:
                count +=1

        # Check columns
        for col in game._board.T:
            if np.sum(col == opponent) == 4:
                count +=1

        # Check main diagonal
        if np.sum(np.diag(game._board) == opponent) == 4:
            count +=1

        # Check anti-diagonal
        if np.sum(np.diag(np.fliplr(game._board)) == opponent) == 4:
            count +=1
    
        return  count
    
    def bonus_positioning(self, game:Game):
        bonus = 0
        for i in range(0,5):
            for j in range(0,5):
                if game._board[i][j]==self.number:
                    bonus += game._board[i][j]*self.positioning_bonuses_matrix[i][j]

        return  bonus

    def get_q_value(self, state, action):
        state_key = str(state)
        return self.q_table.get((state_key, action), .0)

    def update_q_value(self, state, action, new_value):
        state_key = str(state)
        self.q_table[(state_key, action)] = new_value

    def make_move(self, game: Game) -> tuple[tuple[int, int], Move]: 
        # Choose the action with the highest Q-value
        q_values = [self.get_q_value(game._board, action) for action in self.get_valid_moves(game._board)]
        max_q_value = max(q_values, default=0.0)
        best_actions = [action for action, q_value in zip(self.get_valid_moves(game._board), q_values) if q_value == max_q_value]
        (from_pos, slide) = random.choice(best_actions)

        return from_pos, slide
    
    def get_valid_moves(self, state):
        return [ x for x in possible_moves if state[x[0]] == -1 or state[x[0]] == self.number ]

    def train_move(self, game: Game) -> tuple[tuple[int, int], Move]:
        # Epsilon-greedy strategy for action selection
        original = deepcopy(game._board)
        valid_moves = self.get_valid_moves(game._board)

        assert(all([game._board[m[0]]==self.number or game._board[m[0]]==-1 for m in valid_moves]))

        valid_move = False

        about_to_win_count_b = self.opponent_about_to_win(game, 1-self.number)
        about_to_win_count_b_me = self.opponent_about_to_win(game, self.number)

        while not valid_move:
            if random.uniform(0, 1) <= self.exploration_prob:
                (from_pos, slide) = random.choice(valid_moves)
            else:
                # Choose the action with the highest Q-value
                q_values = [self.get_q_value(game._board, action) for action in self.get_valid_moves(game._board)]
                max_q_value = max(q_values, default=0.0)
                best_actions = [action for action, q_value in zip(self.get_valid_moves(game._board), q_values) if q_value == max_q_value]
                (from_pos, slide) =  random.choice(best_actions)

                assert(game._board[from_pos]==self.number or game._board[from_pos]==-1)  
                
            valid_move = game.moove(from_pos, slide, self.number)

        about_to_win_count_a = self.opponent_about_to_win(game, 1-self.number)
        about_to_win_count_a_me = self.opponent_about_to_win(game, self.number)

        final_bonus = (about_to_win_count_b-about_to_win_count_a)*10 + (about_to_win_count_a_me-about_to_win_count_b_me)*10+self.bonus_positioning(game)

        self.history.append((original, (from_pos, slide), deepcopy(game._board), final_bonus))
        return from_pos, slide

    def train(self, state, action, reward, next_state):
        cur_qv = self.get_q_value(state, action)
        max_qv_next = max([self.get_q_value(next_state, next_action) for next_action in self.get_valid_moves(next_state)], default=0.0)

        new_qv = (1-self.learning_rate)*cur_qv + self.learning_rate * (reward + self.discount_factor * max_qv_next)
        self.update_q_value(state, action, new_qv)

    def trainModelVS(self, opponent: Player):
        num_games = 10_000
        if os.path("./q_table.pkl"):
            with open('q_table.pkl', 'rb') as fp:
                self.q_table = pickle.load(fp)
                self.exploration_prob*=.2
        for ep in range(num_games):
            #logging
            if ep <num_games-1:
                print(f"TRAINING: {round((ep+1)/num_games*100, 2)}%", end="\r")
            else:
                print(f"TRAINING COMPLETED!\n")

            #start new game
            game = Game()

            while True:
                # Player 1 (RL agent) move
                self.train_move(game)

                winner = game.check_winner()
                if winner > 0: break

                from_pos, slide = opponent.make_move(game)
                game.moove(from_pos, slide, opponent.number)

                winner = game.check_winner()
                if winner > 0: break


            n=len(self.history)
            if winner == 0:
                for i, h in enumerate(self.history):
                    scores = np.power(2, np.linspace(1,7,num=n))-1
                    self.train(h[0], h[1], scores[i]+h[3], h[2])
            else:
                for i, h in enumerate(self.history):
                    scores = -np.power(2, np.linspace(1,5,num=n))+1
                    self.train(h[0], h[1], scores[i]+h[3], h[2])

            self.history=[]
        
def test(agent, opponent):
    num_episodes=1000
    winsF = 0
    #winsS = 0

    for ep in range(num_episodes):
        if ep <num_episodes-1:
            print(f"TESTING: {round((ep+1)/num_episodes*100, 2)}%", end="\r")
        else:
            print(f"TESTING COMPLETED!\n")
        game = Game()
        if game.play(agent, opponent) == 0: winsF +=1
        #winsS += game.play_with_agent(agent, 2)


    print(f"wins F:{round(winsF/num_episodes*100, 2)}%")
    #print(f"wins S:{round(winsS/num_episodes*100, 2)}%")

def watchGame(agent, opponent):
    game=Game()
    game.watchPlay(agent, opponent)

In [3]:
myAgent = MyPlayer()
rndPlayer = RandomPlayer()
myAgent.trainModelVS(rndPlayer)

TRAINING COMPLETED!



In [None]:
with open('q_table.pkl', 'wb') as fp:
    pickle.dump(myAgent.q_table, fp)

In [5]:
test(myAgent, rndPlayer)
watchGame(myAgent, rndPlayer)

TESTING COMPLETED!

wins F:55.3%
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜

0 (0, 1) Move.RIGHT
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜

1 (0, 2) Move.RIGHT
⬜⬜⬜🟩🟥
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜

0 (0, 0) Move.RIGHT
⬜⬜🟩🟥🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜

1 (1, 4) Move.BOTTOM
⬜⬜🟩🟥🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜🟥

0 (0, 2) Move.LEFT
🟩⬜⬜🟥🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜🟥

1 (4, 4) Move.TOP
🟩⬜⬜🟥🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜

0 (1, 0) Move.BOTTOM
🟩⬜⬜🟥🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
⬜⬜⬜⬜⬜
🟩⬜⬜⬜⬜

1 (2, 0) Move.BOTTOM
🟩⬜⬜🟥🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
🟩⬜⬜⬜⬜
🟥⬜⬜⬜⬜

0 (4, 4) Move.TOP
🟩⬜⬜🟥🟩
⬜⬜⬜⬜🟥
⬜⬜⬜⬜🟩
🟩⬜⬜⬜⬜
🟥⬜⬜⬜⬜

1 (4, 4) Move.TOP
🟩⬜⬜🟥🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜🟥
🟩⬜⬜⬜🟩
🟥⬜⬜⬜⬜

0 (3, 0) Move.TOP
🟩⬜⬜🟥🟥
🟩⬜⬜⬜🟩
⬜⬜⬜⬜🟥
⬜⬜⬜⬜🟩
🟥⬜⬜⬜⬜

1 (0, 3) Move.RIGHT
🟩⬜⬜🟥🟥
🟩⬜⬜⬜🟩
⬜⬜⬜⬜🟥
⬜⬜⬜⬜🟩
🟥⬜⬜⬜⬜

0 (1, 4) Move.BOTTOM
🟩⬜⬜🟥🟥
🟩⬜⬜⬜🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
🟥⬜⬜⬜🟩

1 (0, 2) Move.BOTTOM
🟩⬜⬜🟥🟥
🟩⬜⬜⬜🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
🟥⬜🟥⬜🟩

0 (0, 0) Move.RIGHT
⬜⬜🟥🟥🟩
🟩⬜⬜⬜🟥
⬜⬜⬜⬜🟩
⬜⬜⬜⬜⬜
🟥⬜🟥⬜🟩

1 (2, 0) Move.TOP
🟥⬜🟥🟥🟩
⬜⬜⬜⬜🟥
🟩⬜⬜⬜🟩
⬜⬜⬜⬜⬜
🟥⬜🟥⬜🟩

0 (3, 0) Move.BOTTOM
🟥⬜🟥🟥🟩
⬜⬜⬜⬜🟥
🟩⬜⬜⬜🟩
🟥⬜⬜⬜⬜
🟩⬜🟥⬜🟩

1 (1, 0) Move.RIGHT
🟥⬜🟥🟥🟩
⬜⬜⬜🟥🟥
🟩⬜⬜⬜🟩
🟥⬜⬜⬜⬜
🟩⬜🟥⬜🟩

0 (2, 4) Move.TOP
🟥⬜🟥🟥🟩