# state
Initialise a 4x4 board with a few cells already filled in.

# Action
Which empty position to fill and which number to place there, chosen from the moves allowed by the current board state.

# Reward
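A value between 0 and 1, given at the end of the game. (Note: the current `State.winner` implementation returns -1 for an invalid fill and 1 for a completed board.)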

In [4]:
import numpy as np
def generate_sudoku(size):  # to be written later
    """Return the hard-coded 4x4 starting board.

    The board is a float array in which 0 marks an empty cell. Rows 1 and 2
    are pre-filled; rows 0 and 3 are left empty.

    Args:
        size: Accepted for the eventual generator but currently unused; the
            returned board is always 4x4.

    Returns:
        A new ``numpy.ndarray`` of shape (4, 4).
    """
    grid = np.zeros((4, 4))
    # Fill both middle rows in one slice assignment.
    grid[1:3] = np.array([[2, 3, 4, 1],
                          [3, 4, 1, 2]])
    return grid

In [11]:
# Preview the starting board returned for size 4.
generate_sudoku(4)

array([[0., 0., 0., 0.],
       [2., 3., 4., 1.],
       [3., 4., 1., 2.],
       [0., 0., 0., 0.]])

In [None]:
def is_fill_valid(board, position_filled):
    """Check whether the most recently filled cell respects the Sudoku rules.

    Only the cell at ``position_filled`` is examined: its value must lie in
    ``1..n`` (``n`` being the board's side length) and must not appear
    anywhere else in the same row, the same column or, when ``n`` is a
    perfect square, the same sqrt(n) x sqrt(n) box.

    Args:
        board: Square 2-D numpy array; 0 marks an empty cell.
        position_filled: ``(row, col)`` of the last filled cell, or ``None``
            when nothing has been filled yet.

    Returns:
        ``True`` if the fill is valid (or there is nothing to check),
        ``False`` otherwise.
    """
    # Nothing has been placed yet, so nothing can be wrong.
    if position_filled is None:
        return True

    row, col = position_filled
    value = board[row, col]
    size = board.shape[0]

    # A zero leaves the cell empty, so it cannot clash with anything.
    if value == 0:
        return True
    if not 1 <= value <= size:
        return False

    # The filled cell itself counts once; any second occurrence is a clash.
    if np.count_nonzero(board[row, :] == value) > 1:
        return False
    if np.count_nonzero(board[:, col] == value) > 1:
        return False

    # Box rule applies only when the board splits into square boxes.
    box = int(round(size ** 0.5))
    if box * box == size:
        top = row - row % box
        left = col - col % box
        if np.count_nonzero(board[top:top + box, left:left + box] == value) > 1:
            return False

    return True

In [1]:
# Side length of the square board; passed to generate_sudoku and used for hashing/printing.
BOARD_SIZE = 4

In [1]:
class State:
    """Sudoku environment that drives one agent through training games.

    Attributes are kept as plain fields:
      board                 -- current grid (0 marks an empty cell)
      isEnd                 -- True once ``winner`` has detected a finished game
      agent                 -- the player that chooses actions and receives rewards
      boardHash             -- last value computed by ``get_hash`` (or None)
      last_position_filled  -- (row, col) of the most recent fill (or None)
    """

    def __init__(self, agent):
        """Create a fresh board and attach ``agent`` to it."""
        self.board = generate_sudoku(BOARD_SIZE)
        self.isEnd = False
        self.agent = agent
        self.boardHash = None
        self.last_position_filled = None

    def get_hash(self):
        """Return (and cache in ``boardHash``) a string key for the current board."""
        # Flatten first so the key is a single-line rendering of all cells.
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def available_positions(self):
        """Return the ``(row, col)`` tuples of all empty cells in row-major order."""
        # Use the board's own shape rather than separate row/column constants.
        rows, cols = self.board.shape
        return [
            (i, j)  # must be a tuple so it can index the board directly
            for i in range(rows)
            for j in range(cols)
            if self.board[i, j] == 0
        ]

    def update_state(self, position, value):
        """Write ``value`` at ``position`` and remember it as the last fill."""
        self.board[position] = value
        self.last_position_filled = position

    def winner(self):
        """Evaluate the board after the latest fill.

        Returns:
            -1 if the last fill is invalid, 1 if no empty cell remains,
            ``None`` if the game is still in progress. ``isEnd`` is updated
            to match.
        """
        if not is_fill_valid(self.board, self.last_position_filled):
            self.isEnd = True
            return -1

        if not self.available_positions():
            self.isEnd = True
            return 1

        self.isEnd = False
        return None

    def give_reward(self):
        """Pass the game result to the agent; call only when the game has ended."""
        result = self.winner()
        # The agent backpropagates the result through the states it visited.
        self.agent.feed_reward(result)

    def reset(self):
        """Start a new game: new board, cleared hash, flags and last position."""
        self.board = generate_sudoku(BOARD_SIZE)
        self.boardHash = None
        self.isEnd = False
        self.last_position_filled = None

    def play(self, rounds=100):
        """Run ``rounds`` training games, rewarding the agent after each one."""
        for i in range(rounds):
            if i % 1000 == 0:
                print("Rounds {}".format(i))

            while not self.isEnd:
                positions = self.available_positions()
                agent_action = self.agent.choose_action(positions, self.board)
                # agent_action is a (position, value) pair.
                self.update_state(*agent_action)
                board_hash = self.get_hash()
                self.agent.add_state(board_hash)

                # Stop the round as soon as the board is complete or a fill is wrong.
                win = self.winner()
                if win is not None:
                    self.give_reward()
                    self.agent.reset()
                    # reset() clears isEnd, so the next round's loop can start.
                    self.reset()
                    break

    def show_board(self):
        """Print the board as a grid, leaving empty cells blank."""
        rows, cols = self.board.shape
        # Each cell takes four characters ("x | "), plus the leading bar.
        separator = '-' * (4 * cols + 1)
        for i in range(rows):
            print(separator)
            out = '| '
            for j in range(cols):
                cell = self.board[i, j]
                # The board stores floats; show whole numbers, not "2.0".
                token = ' ' if cell == 0 else str(int(cell))
                out += token + ' | '
            print(out)
        print(separator)

In [2]:
class Player:
    """Epsilon-greedy agent that learns a value for every board state it visits."""

    def __init__(self, exp_rate=0.3):
        """Set up an untrained agent.

        Args:
            exp_rate: Probability (epsilon) of taking a random action.
        """
        self.states = []  # hashes of the boards reached in the current game
        self.lr = 0.2  # learning rate used in feed_reward
        self.exp_rate = exp_rate  # Epsilon
        self.decay_gamma = 0.9  # discount applied to the propagated reward
        self.states_value = {}  # state hash -> learned value

    @staticmethod
    def get_hash(board):
        """Return the string key for ``board`` (flattened, then stringified)."""
        return str(board.reshape(-1))

    def choose_action(self, positions, current_board):
        """Pick the next ``(position, value)`` to play.

        With probability ``exp_rate`` a random empty position and a random
        value are chosen; otherwise the pair leading to the highest-valued
        known next state is returned (unknown states count as 0, and on ties
        the last candidate examined wins).

        Args:
            positions: Non-empty list of ``(row, col)`` tuples that may be filled.
            current_board: Square 2-D numpy array holding the current grid.

        Returns:
            A ``(position, fill_value)`` tuple with ``fill_value`` in ``1..n``,
            where ``n`` is the side length of ``current_board``.

        Raises:
            ValueError: If ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("no empty positions left to fill")

        # Legal cell values are 1..n; 0 is reserved for "empty".
        size = current_board.shape[0]

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Explore: random position, random value.
            idx = np.random.choice(len(positions))
            fill_val = int(np.random.randint(1, size + 1))
            return (positions[idx], fill_val)

        # Exploit: evaluate every candidate fill by the value of the board it produces.
        value_max = float('-inf')
        action = None
        for p in positions:
            for fill_val in range(1, size + 1):
                next_board = current_board.copy()
                next_board[p] = fill_val
                value = self.states_value.get(self.get_hash(next_board), 0)
                if value >= value_max:
                    value_max = value
                    action = (p, fill_val)
        return action

    def feed_reward(self, reward):
        """Backpropagate the end-of-game ``reward`` through the visited states.

        States are processed from last to first; each state's value moves
        towards ``decay_gamma * reward`` by a fraction ``lr``, and the updated
        value becomes the reward for the preceding state.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st, 0)
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def add_state(self, state):
        """Record a board hash reached during the current game."""
        self.states.append(state)

    def reset(self):
        """Forget the states of the finished game (learned values are kept)."""
        self.states = []

    def save_policy(self, file='policy_agent'):
        """Pickle the learned state values to ``file`` (default ``'policy_agent'``)."""
        import pickle  # local import: the module is not imported at file level

        with open(file, 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file):
        """Replace the learned state values with those pickled in ``file``.

        NOTE: unpickling can execute arbitrary code; only load policy files
        that come from a trusted source.
        """
        import pickle  # local import: the module is not imported at file level

        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


In [None]:
# Training: create an agent with the default exploration rate, attach it to a
# fresh board state, and run a single round of play.
agent = Player()
st = State(agent)
print("training...")
st.play(1)  # one round only