In [11]:
import numpy as np
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

import random

import gym
import adaptive_tutor

In [12]:
import numpy as np
# Load the saved action space into a notebook-level variable.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so this file must come from a trusted source.
# NOTE(review): TutorAgent.__init__ loads this same file again on its own
# rather than reusing this variable.
action_space = np.load('adaptive_tutor/action_space.npy', allow_pickle=True)

In [13]:
 
class TutorAgent:
    """Epsilon-greedy agent that keeps one Q-value per action.

    The value table (``q_values``) is a flat array with one entry per action
    in ``action_space``; it is not keyed by observation. ``update`` accepts
    observations for interface compatibility but does not use them.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
        action_space=None,
    ):
        """Initialize the agent with zeroed per-action Q-values.

        Args:
            learning_rate: Step size applied to each temporal-difference update.
            initial_epsilon: The initial exploration probability.
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``.
            final_epsilon: Lower bound that epsilon never drops below.
            discount_factor: The discount factor for computing the Q-value.
            action_space: Optional iterable of hashable actions. When omitted,
                the actions are loaded from 'adaptive_tutor/action_space.npy'
                (the original behaviour).
        """
        if action_space is None:
            # NOTE(review): allow_pickle=True unpickles arbitrary objects;
            # only load this file from a trusted source.
            action_space = np.load('adaptive_tutor/action_space.npy', allow_pickle=True)
        self.action_space = list(action_space)
        # Maps each action to its position in `action_space` / `q_values`.
        self.action_space_dict = {action: i for i, action in enumerate(self.action_space)}
        self.q_values = np.zeros(len(self.action_space))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        # Temporal-difference error recorded by every call to `update`.
        self.training_error = []

    def _get_q_table_index(self, obs):
        """Return ``(theme, elo)`` taken from the last row of ``obs``.

        Each row is read as ``(theme, success, elo)``; the success flag at
        position 1 is not part of the returned key.
        """
        last_puzzle = obs[-1]
        return (last_puzzle[0], last_puzzle[2])

    def _action_index(self, action) -> int:
        """Return the position of ``action`` in ``q_values``.

        Raises:
            ValueError: If ``action`` is not part of the action space.
        """
        try:
            return self.action_space_dict[action]
        except (KeyError, TypeError):
            # Unknown or unhashable action: defer to list.index so callers
            # still see the ValueError the linear search used to raise.
            return self.action_space.index(action)

    def get_action(self):
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.

        The returned value is an element of ``self.action_space`` (not an
        integer index).
        """
        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            return random.choice(self.action_space)

        # with probability (1 - epsilon) act greedily (exploit)
        return self.action_space[np.argmax(self.q_values)]

    def update(
        self,
        obs,
        action,
        reward: float,
        terminated: bool,
        next_obs,
    ):
        """Updates the Q-value of an action.

        Args:
            obs: Observation before the step (currently unused).
            action: The action taken; must be an element of ``action_space``.
            reward: Reward received for the action.
            terminated: Whether the episode terminated on this step; if so the
                bootstrapped future value is zeroed.
            next_obs: Observation after the step (currently unused).

        Raises:
            ValueError: If ``action`` is not part of the action space.
        """
        # Resolve the index once instead of scanning the action list for
        # every read and write of the Q-value.
        idx = self._action_index(action)

        future_q_value = (not terminated) * np.max(self.q_values)
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[idx]
        )

        self.q_values[idx] = self.q_values[idx] + self.lr * temporal_difference
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Lower epsilon by ``epsilon_decay``, never going below ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [14]:
# Training configuration
n_episodes = 1000       # number of training episodes
learning_rate = 0.01    # step size for each Q-value update

# Exploration schedule: epsilon starts at `start_epsilon`, is lowered by
# `epsilon_decay` on every agent.decay_epsilon() call, and is floored at
# `final_epsilon`.
start_epsilon = 1.0
final_epsilon = 0.1
epsilon_decay = start_epsilon / (n_episodes / 2)

# discount_factor is left at the TutorAgent default.
agent = TutorAgent(
    final_epsilon=final_epsilon,
    epsilon_decay=epsilon_decay,
    initial_epsilon=start_epsilon,
    learning_rate=learning_rate,
)

In [15]:
# Load custom environment we created
env = gym.make('adaptive_tutor/PuzzleTutorEnv-v0', render_mode=None)

# Set to initial state
env.reset()

for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # Play one episode. The step count stays capped at n_episodes (the bound
    # the loop has always used) so an environment that never signals the end
    # of an episode cannot hang the notebook.
    for _step in range(n_episodes):
        action = agent.get_action()
        next_obs, reward, terminated, truncated, info = env.step(action)
        print(action, reward)  # per-step debug trace

        # Update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # Update the current observation, and whether the environment is done
        done = terminated or truncated
        obs = next_obs

        # Stop stepping once the episode has ended: calling env.step() after
        # terminated/truncated without a reset is undefined behaviour in Gym.
        if done:
            break

    # Anneal exploration once per episode.
    agent.decay_epsilon()

  logger.warn(f"{pre} is not within the observation space.")
  0%|          | 0/1000 [00:00<?, ?it/s]<UciProtocol (pid=87584)>: stderr >> [1m[31m       _
<UciProtocol (pid=87584)>: stderr >> |   _ | |
<UciProtocol (pid=87584)>: stderr >> |_ |_ |_|[0m v0.30.0+git.dirty built Jul 22 2023
<UciProtocol (pid=87584)>: stderr >> Loading weights file from: /Users/shikharrastogi/AdaptiveChessTutorRL/maia_weights/maia_1100.pb
<UciProtocol (pid=87584)>: stderr >> Creating backend [metal]...
<UciProtocol (pid=87584)>: stderr >> Initialized metal backend on device Apple M1


1300-1400 strategic_concepts


  if not isinstance(terminated, (bool, np.bool8)):
  logger.warn(f"{pre} is not within the observation space.")


[['strategic_concepts' '0' '1300-1400']]
('1300-1400', 'strategic_concepts') 0.0
1700-1800 checkmate_patterns
[['strategic_concepts' '0' '1300-1400']
 ['checkmate_patterns' '0' '1700-1800']]
('1700-1800', 'checkmate_patterns') 0.0
lt_900 strategic_concepts
[['strategic_concepts' '0' '1300-1400']
 ['checkmate_patterns' '0' '1700-1800']
 ['strategic_concepts' '0' 'lt_900']]
('lt_900', 'strategic_concepts') -1.0
1200-1300 checkmate_patterns
[['strategic_concepts' '0' '1300-1400']
 ['checkmate_patterns' '0' '1700-1800']
 ['strategic_concepts' '0' 'lt_900']
 ['checkmate_patterns' '1' '1200-1300']]
('1200-1300', 'checkmate_patterns') 298.0
1500-1600 tactical_themes
[['strategic_concepts' '0' '1300-1400']
 ['checkmate_patterns' '0' '1700-1800']
 ['strategic_concepts' '0' 'lt_900']
 ['checkmate_patterns' '1' '1200-1300']
 ['tactical_themes' '0' '1500-1600']]
('1500-1600', 'tactical_themes') 238.0
1300-1400 checkmating_tactics
[['strategic_concepts' '0' '1300-1400']
 ['checkmate_patterns' '0' '