In [None]:
!pip install 'kaggle-environments>=0.1.6'

In [None]:
from kaggle_environments import make, utils

# Create a "connectx" environment with debug=True and render its current state.
env = make("connectx", debug=True)
env.render()

In [None]:
import numpy as np
from random import choice
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Q-table

class QTable():
    """Tabular action-value store keyed by an encoded (board, mark) state.

    Each entry of ``Q`` maps a state key (see ``get_state_key``) to a list
    holding one Q-value per action, in the order of ``actions``.
    """

    def __init__(self, actions):
        """Create an empty table.

        Args:
            actions: sequence of available actions; only its length is used
                to size the per-state value lists.
        """
        # state key (hex string) -> list of Q-values, one per action
        self.Q = {}
        self.actions = actions

    def get_state_key(self, state):
        """Encode ``state.board`` followed by ``state.mark`` as a hex string.

        The cell values and the mark are concatenated as digits, parsed as a
        base-3 number and rendered in hexadecimal without the ``0x`` prefix.
        ``state.board`` itself is not modified.
        """
        digits = list(state.board)  # copy so the caller's board is untouched
        digits.append(state.mark)
        return hex(int(''.join(map(str, digits)), 3))[2:]

    def get_q_values(self, state):
        """Return the Q-value list for ``state``, creating a zero row if unseen."""
        state_key = self.get_state_key(state)
        if state_key not in self.Q:
            self.Q[state_key] = [0] * len(self.actions)
        return self.Q[state_key]

    def update(self, state, action, add_q):
        """Add ``add_q`` to the Q-value of ``action`` in ``state``.

        A state that has never been queried is treated as an all-zero row
        instead of raising ``KeyError``. The stored row is replaced by a new
        list, so lists returned earlier by ``get_q_values`` are not mutated.
        """
        state_key = self.get_state_key(state)
        current = self.Q.get(state_key)
        if current is None:
            current = [0] * len(self.actions)
        self.Q[state_key] = [q + add_q if idx == action else q
                             for idx, q in enumerate(current)]
        
        
# Agent
        
# Environment used for training, plus a trainer built from it.
# NOTE(review): presumably None marks the seat played by the learning agent and
# "random" is the built-in opponent -- confirm against the kaggle-environments docs.
env = make("connectx", debug=True)
trainer = env.train([None, "random"])

class QLearningAgent():
    """Epsilon-greedy tabular Q-learning agent.

    The agent keeps one Q-value per board column for every state it has
    seen (stored in a ``QTable``) and logs the shaped reward of the last
    step of each training episode in ``reward_log``.
    """

    def __init__(self, env, epsilon=0.4):
        """Set up the action list, the Q-table and the exploration rate.

        Args:
            env: environment exposing ``configuration.columns``.
            epsilon: probability of picking a random playable column.
        """
        self.env = env
        # One action per board column.
        self.actions = list(range(self.env.configuration.columns))
        self.q_table = QTable(self.actions)
        self.epsilon = epsilon
        # Shaped reward of the final step of each training episode.
        self.reward_log = []

    def policy(self, state):
        """Choose a column for ``state`` using an epsilon-greedy rule.

        A column ``c`` is considered playable when ``state.board[c] == 0``
        (presumably the first ``columns`` cells are the top row of the
        board -- confirm against the environment spec).

        Returns:
            int: index of the chosen column.
        """
        if np.random.random() < self.epsilon:
            # Explore: uniform choice among playable columns.
            return choice([c for c in range(len(self.actions)) if state.board[c] == 0])
        # Exploit: mask unplayable columns with a very low value so that
        # argmax can never select them while a playable column exists.
        q_values = self.q_table.get_q_values(state)
        masked = [q if state.board[idx] == 0 else -1e7 for idx, q in enumerate(q_values)]
        return int(np.argmax(masked))

    def custom_reward(self, reward, done):
        """Map the environment reward to the shaped reward used for learning.

        Non-terminal steps cost -10. On a terminal step a reward of 1
        yields 20, a reward of 0 yields -30 and anything else yields 10.

        NOTE(review): which outcome (loss / draw) the environment reports
        as 0 depends on the installed kaggle-environments version --
        confirm that this mapping matches the intended outcomes.
        """
        if not done:
            return -10
        if reward == 1:
            return 20
        elif reward == 0:
            return -30
        else:
            return 10

    def learn(self, trainer, episode_cnt=50000, gamma=0.6,
              learn_rate=0.3):
        """Run Q-learning for ``episode_cnt`` episodes against ``trainer``.

        Args:
            trainer: object with ``reset()`` returning the initial state and
                ``step(action)`` returning ``(next_state, reward, done, info)``.
            episode_cnt: number of episodes to play.
            gamma: discount factor applied to the best next-state value.
            learn_rate: step size of the Q-value update.

        The exploration rate is the ``epsilon`` given to the constructor; it
        is no longer overwritten with a hard-coded value. Episode termination
        is taken from the ``done`` flag returned by ``trainer.step`` rather
        than from a notebook-level ``env`` global.
        """
        for _ in tqdm(range(episode_cnt)):
            state = trainer.reset()
            done = False
            while not done:
                action = self.policy(state)
                next_state, reward, done, info = trainer.step(action)
                reward = self.custom_reward(reward, done)
                # No future value after a terminal step; this also avoids
                # filling the table with all-zero rows for terminal boards.
                future = 0 if done else max(self.q_table.get_q_values(next_state))
                gain = reward + gamma * future
                estimate = self.q_table.get_q_values(state)[action]
                self.q_table.update(state, action, learn_rate * (gain - estimate))
                state = next_state

            self.reward_log.append(reward)
        
# Build the agent from the environment and train it against the trainer,
# using learn()'s default episode count, discount factor and learning rate.
qa = QLearningAgent(env)
qa.learn(trainer)


# Average reward

sns.set(style='darkgrid')
# Smooth the per-episode reward log with a 500-episode rolling mean before plotting.
reward_frame = pd.DataFrame({'Average Reward': qa.reward_log})
smoothed_rewards = reward_frame.rolling(500).mean()
smoothed_rewards.plot(figsize=(10,5))
plt.show()


# Export to a Python file

# Work on a shallow copy of the learned table and keep, for every state that
# has at least one non-zero Q-value, only the index of its highest-valued action.
tmp_dict_q_table = qa.q_table.Q.copy()
dict_q_table = {
    state_key: int(np.argmax(q_values))
    for state_key, q_values in tmp_dict_q_table.items()
    if np.count_nonzero(q_values) > 0
}

# Source text of the submission agent: a function header, the reduced Q-table
# embedded as a dict literal (spaces stripped), and the lookup logic that falls
# back to a random playable column for unknown states or a full column.
agent_source_head = '''def my_agent(observation, configuration):
    from random import choice
    q_table = '''
agent_source_tail = '''
    board = observation.board[:]
    board.append(observation.mark)
    state_key = list(map(str, board))
    state_key = hex(int(''.join(state_key), 3))[2:]
    if state_key not in q_table.keys():
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])
    action = q_table[state_key]
    if observation.board[action] != 0:
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])
    return action
    '''
embedded_q_table = str(dict_q_table).replace(' ', '')
my_agent = ''.join((agent_source_head, embedded_q_table, agent_source_tail))

# Write the generated agent source to submission.py (overwriting any existing file).
with open('submission.py', 'w') as f:
    f.write(my_agent)
    
# Reset the environment after training.
env.reset()

In [None]:
# Match

# Run one game with the generated agent source against the "random" agent,
# then render the result in "ipython" mode.
env.run([my_agent, "random"])
env.render(mode="ipython", width=500, height=450)

In [None]:
# Win rate vs. random and vs. negamax

from kaggle_environments import evaluate

def mean_reward(rewards):
    """Return the mean of the first element of every entry in ``rewards``.

    Args:
        rewards: sequence of indexable per-episode results; only index 0
            of each entry is used.

    Raises:
        ZeroDivisionError: if ``rewards`` is empty.
    """
    total = 0
    for episode_rewards in rewards:
        total += episode_rewards[0]
    return total / float(len(rewards))

# Evaluate the generated agent over 10 episodes against each opponent and
# print the mean of its own reward.
for label, opponent in (
    ("My Agent vs Random Agent:", "random"),
    ("My Agent vs Negamax Agent:", "negamax"),
):
    episode_rewards = evaluate("connectx", [my_agent, opponent], num_episodes=10)
    print(label, mean_reward(episode_rewards))