# Install kaggle-environments

In [None]:
!pip install 'kaggle-environments>=0.1.6'

In [None]:
import numpy as np
import pandas as pd
import gym  # gym: OpenAI Gymのこと。OpenAIは人工知能系の非営利企業で有名なところ。強化学習のシミュレーション用PF
import random
import matplotlib.pyplot as plt
from random import choice
from tqdm.notebook import tqdm
from kaggle_environments import evaluate, make, utils

# Debug/Train your Agent
## Implement the Q-Learning Model

### QTable

In [None]:
class QTable():
    """Tabular store of Q values keyed by an encoded (board, mark) state.

    Instance fields:
      Q       -- dict mapping a state key (hex string) to a list holding one
                 Q value per action.
      actions -- the available actions; only the number of actions is used
                 by this class.
    """

    def __init__(self, actions):
        self.Q = {}
        self.actions = actions

    def get_state_key(self, state):
        """Encode ``state.board`` followed by ``state.mark`` as a hex string.

        The values are concatenated as digits, parsed as a base-3 number and
        rendered in hexadecimal without the ``0x`` prefix. Every value must
        therefore print as 0, 1 or 2; ``int`` raises ``ValueError`` otherwise.
        The board held by ``state`` is not modified.
        """
        cells = [*state.board, state.mark]
        return hex(int(''.join(map(str, cells)), 3))[2:]

    def get_q_values(self, state):
        """Return the list of Q values (one per action) for ``state``.

        A state that has not been seen before is registered with all-zero
        Q values before being returned.
        """
        state_key = self.get_state_key(state)
        if state_key not in self.Q:
            self.Q[state_key] = [0] * len(self.actions)
        return self.Q[state_key]

    def update(self, state, action, add_q):
        """Add ``add_q`` to the Q value of ``action`` in ``state``.

        A state that has not been seen before starts from all-zero Q values
        instead of raising ``KeyError``. The stored list is replaced by a new
        list; an ``action`` index outside the list leaves the values as is.
        """
        state_key = self.get_state_key(state)
        current = self.Q.get(state_key)
        if current is None:
            current = [0] * len(self.actions)
        self.Q[state_key] = [
            q + add_q if idx == action else q
            for idx, q in enumerate(current)
        ]

### The Q-Learning Agent

In [None]:
class QLearningAgent():
    """Epsilon-greedy tabular Q-learning agent.

    Instance fields:
      env        -- the environment; ``env.configuration.columns`` gives the
                    number of actions and ``env.done`` ends an episode.
      actions    -- list of column indices ``0 .. columns - 1``.
      q_table    -- ``QTable`` holding the learned Q values.
      epsilon    -- current probability of picking a random action.
      reward_log -- the last shaped reward of every training episode.
    """

    def __init__(self, env, epsilon=0.99):
        self.env = env
        self.actions = list(range(self.env.configuration.columns))
        self.q_table = QTable(self.actions)
        self.epsilon = epsilon
        self.reward_log = []

    def policy(self, state):
        """Choose a column for ``state`` with an epsilon-greedy rule.

        A column ``c`` counts as playable when ``state.board[c] == 0``.
        With probability ``epsilon`` a playable column is drawn at random;
        otherwise the playable column with the highest Q value is returned.
        """
        if np.random.random() < self.epsilon:
            return choice([c for c in range(len(self.actions)) if state.board[c] == 0])
        q_values = self.q_table.get_q_values(state)
        # Push unplayable columns far below any real Q value so argmax skips them.
        masked = [q if state.board[idx] == 0 else -1e7 for idx, q in enumerate(q_values)]
        return int(np.argmax(masked))

    def custom_reward(self, reward, done):
        """Map the environment reward to the shaped reward used for learning.

        While the game is undecided every step costs -0.05. Once ``done`` is
        true, a reward of 1 is scored as a win (20), a reward of 0 as a loss
        (-20) and any other value as a draw (10).
        """
        # NOTE(review): this assumes the environment reports a loss as 0;
        # confirm against the installed kaggle-environments version.
        if not done:
            return -0.05  # game not decided yet
        if reward == 1:  # win
            return 20
        if reward == 0:  # loss
            return -20
        return 10  # draw

    def train(self, trainer, battle_cnt=100, gamma=0.5,
              learning_rate=0.1, epsilon_decay_rate=0.99, min_epsilon=0.1):
        """Run Q-learning for ``battle_cnt`` episodes.

        Args:
            trainer: object providing ``reset()`` and ``step(action)``; it
                must drive the same environment as ``self.env``, because
                ``self.env.done`` decides when an episode ends.
            battle_cnt: number of episodes to play.
            gamma: discount factor applied to the best next-state Q value.
            learning_rate: step size of each Q value update.
            epsilon_decay_rate: factor applied to ``epsilon`` every episode.
            min_epsilon: lower bound for ``epsilon``.
        """
        for _ in tqdm(range(battle_cnt)):
            state = trainer.reset()
            # Decay exploration once per episode, never below min_epsilon.
            self.epsilon = max(min_epsilon, self.epsilon * epsilon_decay_rate)
            # Use the agent's own environment; the original relied on a
            # module-level variable that happened to be named ``env``.
            while not self.env.done:
                action = self.policy(state)
                next_state, reward, done, info = trainer.step(action)
                reward = self.custom_reward(reward, done)
                # Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
                gain = reward + gamma * max(self.q_table.get_q_values(next_state))
                estimate = self.q_table.get_q_values(state)[action]
                self.q_table.update(state, action, learning_rate * (gain - estimate))
                state = next_state

            self.reward_log.append(reward)

### Configs

In [None]:
# Hyper-parameters handed to QLearningAgent.train.
episode_cnt = 20_000          # number of reinforcement-learning episodes to run

# Q value update
learn_rate = 0.4
gamma = 0.5

# Exploration schedule
min_epsilon = 0.1
epsilon_decay_rate = 0.9999

### train

In [None]:
# Create the ConnectX environment and a trainer that pits our seat against
# the "random" opponent (the None slot is presumably the seat we control).
env = make("connectx", debug=True)
trainer = env.train([None, "random"])

# Training
qa = QLearningAgent(env)
qa.train(
    trainer,
    battle_cnt=episode_cnt,
    gamma=gamma,
    learning_rate=learn_rate,
    epsilon_decay_rate=epsilon_decay_rate,
    min_epsilon=min_epsilon,
)

observation = trainer.reset()

# Moving average (window of 500 episodes) of the reward logged at the end
# of each game.
import seaborn as sns
sns.set(style='darkgrid')
(
    pd.DataFrame({'Average Reward': qa.reward_log})
    .rolling(500)
    .mean()
    .plot(figsize=(10, 5))
)
plt.show()

# Evaluate your Agent

# Play your Agent
Click on any column to place a checker there ("manually select action").

# Write Submission File



In [None]:
# Work on a shallow copy of the learned table and keep, for every state with
# at least one non-zero Q value, only the index of its best action.
tmp_dict_q_table = dict(qa.q_table.Q)
dict_q_table = {
    state_key: int(np.argmax(q_values))
    for state_key, q_values in tmp_dict_q_table.items()
    if np.count_nonzero(q_values) > 0
}

# The method below is what gets written to the .py file
"""
def my_agent(observation, configuration):
    from random import choice
    q_table = str(dict_q_table).replace(' ', '')
    q_table =  '''{}'''.join(q_table)
    
    board = observation.board[:]
    board.append(observation.mark)
    state_key = list(map(str, board))
    state_key = hex(int(''.join(state_key), 3))[2:]

    if state_key not in q_table.keys():
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])

    action = q_table[state_key]

    if observation.board[action] != 0:
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])
    return action
"""

# Assemble the agent source: fixed header, the table embedded as a dict
# literal with spaces stripped, then the fixed lookup logic.
my_agent = ''.join([
    '''def my_agent(observation, configuration):
    from random import choice
    q_table = ''',
    str(dict_q_table).replace(' ', ''),
    '''
    board = observation.board[:]
    board.append(observation.mark)
    state_key = list(map(str, board))
    state_key = hex(int(''.join(state_key), 3))[2:]

    if state_key not in q_table.keys():
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])

    action = q_table[state_key]

    if observation.board[action] != 0:
        return choice([c for c in range(configuration.columns) if observation.board[c] == 0])
    return action
    ''',
])

with open('submission.py', 'w') as submission_file:
    submission_file.write(my_agent)

In [None]:
# Run the environment with the generated agent (my_agent is the source-code
# string assembled for submission.py) against "negamax" (presumably a
# built-in opponent; confirm), then render the result with the "ipython" mode.
env.run([my_agent, "negamax"])
env.render(mode="ipython")

In [None]:
def mean_reward(rewards):
    """Return the average of the first entry of every item in ``rewards``.

    Each item is indexed with ``[0]``; the sum of those entries is divided by
    the number of items, so an empty ``rewards`` raises ``ZeroDivisionError``.
    """
    first_entry_total = 0
    for episode_rewards in rewards:
        first_entry_total += episode_rewards[0]
    return first_entry_total / float(len(rewards))

# Play 10 episodes against each opponent and report the mean reward of our
# agent (the first entry of every episode's reward pair).
for opponent, label in (
    ("random", "My Agent vs Random Agent:"),
    ("negamax", "My Agent vs Negamax Agent:"),
):
    episode_rewards = evaluate("connectx", [my_agent, opponent], num_episodes=10)
    print(label, mean_reward(episode_rewards))