In [1]:
import numpy as np
import pandas as pd

from collections import defaultdict
from tqdm import tqdm

import random

import gym
import adaptive_tutor

In [2]:
import numpy as np

# Notebook-level copy of the tutor's action table; the training loop uses it
# to translate an action index into a dictionary key for per-action totals.
# allow_pickle=True unpickles objects stored in the file, so only load an
# .npy file that comes from a trusted source.
ACTION_SPACE_PATH = 'adaptive_tutor/action_space.npy'
action_space = np.load(ACTION_SPACE_PATH, allow_pickle=True)

In [3]:
class TutorStatefulAgent:
    """Tabular epsilon-greedy Q-learning agent for the puzzle-tutor environment.

    The agent state is derived from ``obs['themes_covered']``: every value in
    it is mapped to a coarse rating bracket and the resulting tuple of bracket
    labels is used as the key into the Q-table.
    """

    def __init__(
        self,
        learning_rate: float,
        initial_epsilon: float,
        epsilon_decay: float,
        final_epsilon: float,
        discount_factor: float = 0.95,
        action_space_path: str = 'adaptive_tutor/action_space.npy',
        verbose: bool = False,
    ):
        """Initialize a Reinforcement Learning agent with an empty dictionary
        of state-action values (q_values), a learning rate and an epsilon.

        Args:
            learning_rate: The learning rate
            initial_epsilon: The initial epsilon value
            epsilon_decay: Amount subtracted from epsilon by ``decay_epsilon``
            final_epsilon: The final (minimum) epsilon value
            discount_factor: The discount factor for computing the Q-value
            action_space_path: Path of the ``.npy`` file holding the action
                table. Defaults to the location the notebook has always used.
            verbose: When True, print "Exploring" each time a random action
                is taken. Off by default so training output stays readable.
        """
        # NOTE(review): allow_pickle=True unpickles arbitrary objects from the
        # file; only point this at an action table from a trusted source.
        self.action_space = list(np.load(action_space_path, allow_pickle=True))
        # Unseen states start with a zero Q-value for every action.
        self.q_values = defaultdict(lambda: np.zeros(len(self.action_space)))

        self.lr = learning_rate
        self.discount_factor = discount_factor

        self.epsilon = initial_epsilon
        self.epsilon_decay = epsilon_decay
        self.final_epsilon = final_epsilon

        self.verbose = verbose

        # Temporal-difference error of every update, in call order.
        self.training_error = []

    def _create_rating_bracket(self, row):
        """Map a numeric rating to one of three coarse bracket labels."""
        if row < 1300:
            return 'lt_1300'
        if row < 1700:
            # Label now matches the actual bounds (was mislabelled '1300-1800').
            return '1300-1700'
        # Everything from 1700 upwards (inclusive) lands here.
        return 'gt_1700'

    def _get_agent_state(self, obs):
        """Convert an environment observation into a hashable Q-table key.

        Assumes ``obs['themes_covered']`` is an iterable of numeric ratings,
        one per theme -- TODO confirm against the environment.
        """
        return tuple(self._create_rating_bracket(val) for val in obs['themes_covered'])

    def get_action(self, obs: dict) -> int:
        """
        Returns the best action with probability (1 - epsilon)
        otherwise a random action with probability epsilon to ensure exploration.

        Args:
            obs: Environment observation containing a ``'themes_covered'`` entry.

        Returns:
            Index into ``self.action_space`` as a plain Python ``int``.
        """
        state = self._get_agent_state(obs)

        # with probability epsilon return a random action to explore the environment
        if np.random.random() < self.epsilon:
            if self.verbose:
                print("Exploring")
            return random.randrange(len(self.action_space))

        # with probability (1 - epsilon) act greedily (exploit)
        return int(np.argmax(self.q_values[state]))

    def update(
        self,
        obs: dict,
        action: int,
        reward: float,
        terminated: bool,
        next_obs: dict,
    ):
        """Updates the Q-value of an action with a one-step Q-learning rule.

        Args:
            obs: Observation the action was taken in.
            action: Index of the action that was taken.
            reward: Reward returned by the environment for that action.
            terminated: Whether the episode terminated; if so the future
                value is treated as zero.
            next_obs: Observation returned after the action.
        """
        state = self._get_agent_state(obs)
        next_state = self._get_agent_state(next_obs)

        # No bootstrapping from the next state once the episode has terminated.
        future_q_value = (not terminated) * np.max(self.q_values[next_state])
        temporal_difference = (
            reward + self.discount_factor * future_q_value - self.q_values[state][action]
        )

        self.q_values[state][action] = (
            self.q_values[state][action] + self.lr * temporal_difference
        )
        self.training_error.append(temporal_difference)

    def decay_epsilon(self):
        """Subtract ``epsilon_decay`` from epsilon, never going below ``final_epsilon``."""
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_decay)

In [4]:
# --- Training configuration -------------------------------------------------
n_episodes = 100
learning_rate = 0.01

# Exploration schedule: the agent subtracts `epsilon_decay` from epsilon once
# per episode and never lets it drop below `final_epsilon`. With this step
# size an unfloored epsilon would hit zero after half of the episodes.
start_epsilon = 0.5
final_epsilon = 0.1
epsilon_decay = start_epsilon / (n_episodes / 2)

agent_kwargs = {
    "learning_rate": learning_rate,
    "initial_epsilon": start_epsilon,
    "epsilon_decay": epsilon_decay,
    "final_epsilon": final_epsilon,
}
agent = TutorStatefulAgent(**agent_kwargs)

In [5]:
# Load the custom environment registered by the `adaptive_tutor` package.
env = gym.make('adaptive_tutor/PuzzleTutorEnv-v0', render_mode=None)

# Per-episode statistics, appended once at the end of every episode.
episode_rewards = []
episode_lengths = []
# Cumulative reward credited to each action across all episodes.
action_rewards = {action: 0 for action in action_space}

for episode in tqdm(range(n_episodes)):
    # Every episode starts from a fresh reset, so no separate reset is needed
    # before the loop.
    obs, info = env.reset()
    done = False
    episode_reward = 0
    step_counter = 0

    # Play one episode. The loop must stop on truncation as well as on
    # termination; checking only `terminated` never exits a truncated episode.
    while not done:
        step_counter += 1
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # The agent receives `terminated` (not `done`) so that a truncated
        # episode still bootstraps from the next state's value.
        agent.update(obs, action, reward, terminated, next_obs)

        episode_reward += reward
        action_rewards[action_space[action]] += reward

        # Update the current observation, and whether the environment is done
        done = terminated or truncated
        obs = next_obs

    episode_rewards.append(episode_reward)
    episode_lengths.append(step_counter)
    print(f"Episode {episode + 1}: Reward = {episode_reward}, Length = {step_counter}")

    agent.decay_epsilon()

../maia_weights/maia_1100.pb
1


  0%|          | 0/100 [00:00<?, ?it/s]<UciProtocol (pid=56771)>: stderr >> [1m[31m       _
<UciProtocol (pid=56771)>: stderr >> |   _ | |
<UciProtocol (pid=56771)>: stderr >> |_ |_ |_|[0m v0.30.0+git.dirty built Jul 22 2023
<UciProtocol (pid=56771)>: stderr >> Loading weights file from: ../maia_weights/maia_1100.pb
<UciProtocol (pid=56771)>: stderr >> Creating backend [metal]...
<UciProtocol (pid=56771)>: stderr >> Initialized metal backend on device Apple M2


[['checkmating_tactics' '1' 'gt_1900']]
[['advanced_tactical_themes' '1' '1000-1100']]


  if not isinstance(terminated, (bool, np.bool8)):
  logger.warn(f"{pre} is not within the observation space.")


[['checkmate_patterns' '1' '1000-1100']]
[['checkmating_tactics' '1' '1000-1100']]
[['defensive_tactics' '1' '1400-1500']]
[['advanced_tactical_themes' '1' '1000-1100']]
[['checkmate_patterns' '1' '1300-1400']]
[['special_moves' '0' 'gt_1900']]
[['defensive_tactics' '1' '1400-1500']]
[['checkmate_patterns' '1' '1000-1100']]
[['tactical_themes' '0' '1400-1500']]
[['tactical_themes' '0' '1300-1400']]
[['checkmating_tactics' '1' '1000-1100']]
[['piece_specific_endgames' '1' '1500-1600']]
[['tactical_themes' '1' '1800-1900']]
../maia_Weights/maia_1300.pb
2
Bot_Upgraded
[['advanced_tactical_themes' '1' '1000-1100']]
../maia_Weights/maia_1500.pb
3
Bot_Upgraded
[['defensive_tactics' '1' '1400-1500']]
[['king_safety_and_attack' '1' '1000-1100']]
[['checkmate_patterns' '1' '1000-1100']]
[['checkmating_tactics' '1' '1000-1100']]
[['advanced_tactical_themes' '0' '1000-1100']]
[['checkmate_patterns' '1' '1000-1100']]
[['special_moves' '1' '1100-1200']]
[['checkmating_tactics' '1' '1700-1800']]
[['

  0%|          | 0/100 [00:24<?, ?it/s]

[['pawn_related_themes' '0' '1800-1900']]





KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell has no execution count and loads the same file that
# is already loaded into `action_space` near the top of the notebook. If
# nothing further down reads `a`, the cell can be deleted -- confirm first.
import numpy as np
a = np.load('adaptive_tutor/action_space.npy', allow_pickle=True)

In [6]:
# Show the cumulative reward credited to each action by the training loop.
action_rewards

{('1000-1100', 'advanced_tactical_themes'): -75.26578947368421,
 ('1000-1100', 'checkmate_patterns'): -70.3736842105263,
 ('1000-1100', 'checkmating_tactics'): -58.37368421052631,
 ('1000-1100', 'defensive_tactics'): -56.77421052631578,
 ('1000-1100', 'king_safety_and_attack'): -52.38421052631578,
 ('1000-1100', 'pawn_related_themes'): -41.73947368421052,
 ('1000-1100', 'piece_specific_endgames'): -43.34473684210526,
 ('1000-1100', 'special_moves'): -30.21052631578947,
 ('1000-1100', 'strategic_concepts'): -29.002631578947366,
 ('1000-1100', 'tactical_themes'): -26.623684210526314,
 ('1100-1200', 'advanced_tactical_themes'): -22.307894736842105,
 ('1100-1200', 'checkmate_patterns'): -17.571052631578944,
 ('1100-1200', 'checkmating_tactics'): -18.728947368421053,
 ('1100-1200', 'defensive_tactics'): -17.559473684210523,
 ('1100-1200', 'king_safety_and_attack'): -18.105263157894736,
 ('1100-1200', 'pawn_related_themes'): -15.684210526315788,
 ('1100-1200', 'piece_specific_endgames'): -14