<a href="https://colab.research.google.com/github/ruforavishnu/Project_Machine_Learning/blob/master/Project21_Reinforcement_Learning_Self_learning_Tic_Tac_Toe_Agent_using_Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install numpy matplotlib



In [2]:
import numpy as np

class TicTacToe:
  def __init__(self):
    self.reset()

  def reset(self):
    self.board = np.zeros((3,3), dtype=int)
    self.done = False
    self.winner = None
    return self.get_state()

  def get_state(self):
    return tuple(self.board.reshape(9))

  def available_actions(self):
    return [i for i in range(9) if self.board[i//3 , i%3] == 0]

  def step(self, action , player):
    if self.board[action//3, action%3] != 0:
      return self.get_state(), -10, True

    self.board[action//3, action%3] = player
    reward, done = self.check_game(player)
    return self.get_state(), reward, done

  def check_game(self, player):
    for i in range(3):
      if all(self.board[i, :] == player) or all(self.board[:, i] == player):
        return i, True

      if self.board.trace() == player*3 or np.fliplr(self.board).trace() == player*3:
        return 1, True

      if not any(0 in row for row in self.board):
        return 0.5, True

      return 0, False


  def render(self):
    print(self.board)




 Implement the Q-Learning Agent

In [7]:
import random

class QLearningAgent:
  def __init__(self, alpha=0.1, gamma=0.9, epsilon=1.0, epsilon_decay=0.995):
    self.q_table = {}
    self.alpha = alpha
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_decay = epsilon_decay


  def get_qs(self, state):
    if state not in self.q_table:
      self.q_table[state] = np.zeros(9)

    return self.q_table[state]


  def choose_action(self, state, available_actions):
    if random.random() < self.epsilon:
      return random.choice(available_actions)

    else:
      qs = self.get_qs(state)

      masked_qs = np.full(9, -np.inf)
      for a in available_actions:
        masked_qs[a] = qs[a]

      return np.argmax(masked_qs)

  def learn(self, s, a, r, s_, done, available_actions):
    current_q = self.get_qs(s)[a]
    max_future_q = 0 if done else max(self.get_qs(s_)[a_] for a_ in available_actions)
    new_q = current_q + self.alpha * (r + self.gamma * max_future_q - current_q)
    self.q_table[s][a] = new_q






Train the Agent (Self-Play)

In [8]:
env = TicTacToe()
agent = QLearningAgent()


episodes = 100_000
for ep in range(episodes):
  state = env.reset()
  done = False

  while not done:
    actions = env.available_actions()
    action = agent.choose_action(state, actions)
    next_state , reward, done = env.step(action, player=1)


    #Opponent random move
    if not done:
      opp_actions = env.available_actions()
      if opp_actions:
        opp_action = random.choice(opp_actions)
        next_state2, opp_reward,done = env.step(opp_action, player=2)
        reward =  -opp_reward # lets penalize if opponent wins
        next_state = next_state2


    next_actions = env.available_actions()
    agent.learn(state, action, reward, next_state, done, next_actions)
    state = next_state


  agent.epsilon *= agent.epsilon_decay

print('Training complete')



Training complete


Evaluate Agent’s Performance

In [9]:
def evaluate_agent(agent, games=1000):
    win, loss, draw = 0, 0, 0
    for _ in range(games):
        state = env.reset()
        done = False
        while not done:
            actions = env.available_actions()
            action = agent.choose_action(state, actions)
            next_state, reward, done = env.step(action, player=1)
            if done:
                if reward == 1:
                    win += 1
                elif reward == 0.5:
                    draw += 1
                else:
                    loss += 1
                break
            # Random opponent
            opp_actions = env.available_actions()
            if opp_actions:
                opp_action = random.choice(opp_actions)
                state, reward, done = env.step(opp_action, player=2)
                if done:
                    if reward == 1:
                        loss += 1
                    elif reward == 0.5:
                        draw += 1
                    else:
                        win += 1
        state = next_state
    print(f"Win: {win}, Loss: {loss}, Draw: {draw}")

evaluate_agent(agent)


Win: 861, Loss: 100, Draw: 39


Save Trained Q-Table

In [10]:
import pickle

with open('q_table.pkl', 'wb') as f:
  pickle.dump(agent.q_table, f)

