### Run in Colab
<a href="https://colab.research.google.com/github/racousin/data_science_practice/blob/master/website/public/modules/data-science-practice/module9/exercise/module9_exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install swig==4.2.1
!pip install gymnasium==1.2.0

In [3]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

# module9_exercise2 : ML - Arena <a href="https://ml-arena.com/viewcompetition/5" target="_blank"> FrozenLake Competition</a> 

### Objective
Get at least one agent running on the ML-Arena <a href="https://ml-arena.com/viewcompetition/5" target="_blank">FrozenLake Competition</a> with a mean reward higher than 0.35 (i.e. 35%).


You should submit an agent file named `agent.py` with a class `Agent` that includes at least the following attributes:

In [4]:
class Agent:
    """Minimal ML-Arena agent template: stores the environment and plays random actions."""

    def __init__(self, env):
        # Keep a handle on the environment; its action space is sampled in choose_action.
        self.env = env

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """Return the action to play for ``observation``.

        The template ignores every argument and samples uniformly from the
        environment's action space.
        """
        # your logic here
        return self.env.action_space.sample()

### Description

The game starts with the player at location [0,0] of the frozen lake grid world, with the goal located at the far extent of the world, [7,7].

Holes in the ice are distributed in set locations.

The player makes moves until they reach the goal or fall in a hole.

Each run will consist of 10 attempts to cross the ice. The reward will be the total amount accumulated during those trips. For example, if your agent reaches the goal 3 times out of 10, its reward will be 3.

The environment is based on :

In [5]:
# Build the FrozenLake-v1 environment on the 8x8 map (all other options left at their defaults).
env = gym.make('FrozenLake-v1', map_name="8x8")

In [6]:
import numpy as np
import math
from collections import defaultdict
from typing import Optional

class Agent:
    """
    Tabular Q-learning agent for FrozenLake-v1 8x8 (stochastic, is_slippery=True).

    - Trains in ``__init__`` on the provided env with linearly decaying
      epsilon (exploration) and alpha (learning rate).
    - Plays the greedy policy at evaluation time (``choose_action``).
    - Applies a small online Q update on every evaluation transition.
    """

    def __init__(self, env):
        """Create the Q-table, train it on ``env`` and reset ``env`` for evaluation.

        Args:
            env: environment with discrete ``observation_space.n`` and
                ``action_space.n`` following the gymnasium ``reset``/``step`` API.
        """
        self.env = env
        self.nS = int(getattr(env.observation_space, "n"))
        self.nA = int(getattr(env.action_space, "n"))

        # Q-table: one row per state, one column per action.
        self.Q = np.zeros((self.nS, self.nA), dtype=np.float32)

        # Hyperparameters chosen for the slippery 8x8 map.
        self.gamma = 0.99

        # Exploration (epsilon): decays slowly, then stays at 0.1.
        self.eps_start = 1.0
        self.eps_end   = 0.10
        self.eps_decay_episodes = 40000  # episodes needed to go from start to end

        # Learning rate (alpha): decays down to 0.10.
        self.alpha_start = 1.0
        self.alpha_end   = 0.10
        self.alpha_decay_episodes = 40000

        # Training budget.
        self.max_steps_per_ep = 200
        self.train_episodes   = 60000

        # Memory of the previous evaluation step, used for the online update.
        self._last_state: Optional[int] = None
        self._last_action: Optional[int] = None
        self._eval_alpha = 0.05  # small learning rate used during evaluation

        self._train_q_learning()
        # Start evaluation from a clean environment and an empty online memory.
        self.env.reset()
        self._last_state, self._last_action = None, None

    # --- Scheduling helper ---
    def _linear_sched(self, start, end, step, total_steps):
        """Linearly interpolate from ``start`` to ``end`` over ``total_steps``.

        ``step`` is clamped to ``[0, total_steps]``; a non-positive
        ``total_steps`` returns ``end`` directly.
        """
        if total_steps <= 0:
            return end
        frac = min(1.0, max(0.0, step / total_steps))
        return start + (end - start) * frac

    # --- Offline Q-learning on the provided env ---
    def _train_q_learning(self):
        """Run ``train_episodes`` epsilon-greedy episodes and update ``self.Q`` in place."""
        for ep in range(self.train_episodes):
            state, _ = self.env.reset()

            # Per-episode schedules.
            eps   = self._linear_sched(self.eps_start,   self.eps_end,   ep, self.eps_decay_episodes)
            alpha = self._linear_sched(self.alpha_start, self.alpha_end, ep, self.alpha_decay_episodes)

            for _ in range(self.max_steps_per_ep):
                # Epsilon-greedy action selection.
                if np.random.random() < eps:
                    action = self.env.action_space.sample()
                else:
                    action = int(np.argmax(self.Q[state]))

                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
                # Only a true terminal state removes the bootstrap term.
                best_next = float(np.max(self.Q[next_state]))
                td_target = float(reward) + self.gamma * best_next * (0.0 if terminated else 1.0)
                self.Q[state, action] += alpha * (td_target - float(self.Q[state, action]))

                state = next_state
                if terminated or truncated:
                    break

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """
        Update Q online from the previous transition (if any), then act greedily.

        Args:
            observation: current state index.
            reward: reward obtained by the previous action; ``None`` marks the
                first step of an episode and clears the online memory.
            terminated: True when ``observation`` is a terminal state.
            truncated: True when the episode was cut by a time limit.
            info: unused.

        Returns:
            The greedy action (int) for ``observation``.
        """
        # reward=None signals the start of an episode: forget the previous transition.
        if reward is None:
            self._last_state, self._last_action = None, None
            r_prev = 0.0
        else:
            r_prev = float(reward)

        state = int(observation)

        # With a stored (s_prev, a_prev) we know the full transition and can update.
        if self._last_state is not None and self._last_action is not None:
            s_prev = int(self._last_state)
            a_prev = int(self._last_action)
            # Same rule as in training: bootstrap unless the new state is terminal.
            # A truncated episode stopped on a non-terminal state, so its value still counts.
            bootstrap = 0.0 if terminated else self.gamma * float(np.max(self.Q[state]))
            td_target = r_prev + bootstrap
            self.Q[s_prev, a_prev] += self._eval_alpha * (td_target - float(self.Q[s_prev, a_prev]))

        # Greedy policy on the current state.
        action = int(np.argmax(self.Q[state]))

        if terminated or truncated:
            # The episode is over: do not let this transition leak into the next one.
            self._last_state, self._last_action = None, None
        else:
            self._last_state = state
            self._last_action = action

        return action


### Glttr3

In [7]:
import numpy as np
import time

class Agent:
    """
    FrozenLake-v1 8x8 (is_slippery=True).

    - Tabular Expected-SARSA training inside ``__init__`` (time-boxed, with early stop).
    - Tiny per-step penalty during training to favour short trajectories.
    - Fully greedy evaluation (no update in ``choose_action``).
    - Follows the ML-Arena template (``self.env = env``, ``choose_action`` signature).
    """

    def __init__(self, env):
        """Set up tables and hyperparameters, train on ``env``, then reset it.

        Args:
            env: environment with discrete ``observation_space.n`` and
                ``action_space.n`` following the gymnasium ``reset``/``step`` API.
        """
        self.env = env  # required by the template

        # Table dimensions.
        self.nS = int(env.observation_space.n)
        self.nA = int(env.action_space.n)

        # Optimistic initial Q values encourage early exploration.
        self.Q = np.full((self.nS, self.nA), 0.5, dtype=np.float32)
        # Visit counters driving the per-(state, action) learning rate.
        self.N = np.zeros((self.nS, self.nA), dtype=np.int32)

        # Hyperparameters.
        self.gamma = 0.99
        self.eps_start = 1.0
        self.eps_end   = 0.05
        self.train_episodes   = 60000
        self.max_steps_per_ep = 200

        # Deployment safeguards.
        self.time_budget_s = 15.0   # stop training after 15 seconds
        self.early_window  = 500    # number of episodes per early-stop window
        self.early_thresh  = 0.45   # stop once >= 45% of a window's episodes succeed

        self._train_expected_sarsa_timeboxed()

        # Best-effort reset before evaluation; a failure here is deliberately ignored.
        try:
            self.env.reset()
        except Exception:
            pass

    # -------- helpers --------
    def _linear_sched(self, start, end, step, total_steps):
        """Linearly interpolate from ``start`` to ``end``; ``step/total_steps`` is clamped to [0, 1]."""
        if total_steps <= 0:
            return end
        frac = step / float(total_steps)
        if frac < 0.0: frac = 0.0
        if frac > 1.0: frac = 1.0
        return start + (end - start) * frac

    def _greedy_action(self, state):
        """Return an action with maximal Q value in ``state``, ties broken uniformly at random."""
        row = self.Q[state]
        m = np.max(row)
        idxs = np.flatnonzero(row == m)
        return int(np.random.choice(idxs))

    def _eps_greedy(self, state, eps):
        """Sample a random action with probability ``eps``, otherwise act greedily."""
        if np.random.random() < eps:
            return int(self.env.action_space.sample())
        return self._greedy_action(state)

    # -------- training (time box + early stop + step penalty) --------
    def _train_expected_sarsa_timeboxed(self):
        """Train ``self.Q`` with Expected SARSA until a stop condition is met.

        Training stops when ``train_episodes`` episodes have run, when
        ``time_budget_s`` seconds have elapsed, or when the success rate over
        the last full window of ``early_window`` episodes reaches ``early_thresh``.
        """
        t0 = time.perf_counter()
        recent_success = 0
        window = self.early_window

        for ep in range(self.train_episodes):
            # Time budget check, once per episode.
            if time.perf_counter() - t0 > self.time_budget_s:
                break

            state, _ = self.env.reset()
            eps = self._linear_sched(self.eps_start, self.eps_end, ep, self.train_episodes)

            for _ in range(self.max_steps_per_ep):
                action = self._eps_greedy(state, eps)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                # Very light shaping: a small cost per step shortens episodes.
                shaped_r = float(reward) - 0.001

                # Learning rate decays with the visit count of (state, action).
                self.N[state, action] += 1
                alpha = 1.0 / np.sqrt(1 + self.N[state, action])

                # Expected SARSA: expectation of Q(s', .) under the epsilon-greedy policy.
                # Only a true terminal state has zero future value; a truncated
                # episode stopped on a non-terminal state and must still bootstrap.
                if terminated:
                    expected_next = 0.0
                else:
                    q_next = self.Q[next_state]
                    max_q = np.max(q_next)
                    greedy_mask = (q_next == max_q)
                    n_greedy = int(np.count_nonzero(greedy_mask))
                    pi = np.full(self.nA, eps / self.nA, dtype=np.float32)
                    pi[greedy_mask] += (1.0 - eps) / n_greedy
                    expected_next = float(np.dot(pi, q_next))

                td_target = shaped_r + self.gamma * expected_next
                self.Q[state, action] += alpha * (td_target - float(self.Q[state, action]))

                state = next_state
                if done:
                    if reward > 0.0:  # real success (goal reached)
                        recent_success += 1
                    break

            # Early stop: evaluate after every full window of exactly `window` episodes.
            if window > 0 and (ep + 1) % window == 0:
                mean_recent = recent_success / float(window)
                if mean_recent >= self.early_thresh:
                    break
                recent_success = 0  # start a new window

    # -------- evaluation policy (greedy, no update) --------
    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """Return the greedy action for ``observation``; the other arguments are ignored."""
        state = int(observation)
        return self._greedy_action(state)


### Nouvelle tentative

In [8]:
import numpy as np
import math
from collections import defaultdict
from typing import Optional

class Agent:
    """
    Tabular Q-learning agent for FrozenLake-v1 8x8 (stochastic, is_slippery=True).

    - The Q-table is learned inside ``__init__`` on the supplied env, with
      linearly decaying epsilon and alpha.
    - ``choose_action`` is a static greedy policy with random tie-breaking;
      it never updates the Q-table.
    """

    def __init__(self, env):
        """Configure hyperparameters, train on ``env``, then reset it for evaluation."""
        self.env = env
        self.nS = int(env.observation_space.n)
        self.nA = int(env.action_space.n)

        # One row per state, one column per action.
        self.Q = np.zeros((self.nS, self.nA), dtype=np.float32)

        # Discount factor.
        self.gamma = 0.99

        # Epsilon goes linearly from 1.0 to 0.1 over 40k episodes, then stays there.
        self.eps_start, self.eps_end = 1.0, 0.10
        self.eps_decay_episodes = 40000

        # Alpha follows the same kind of schedule.
        self.alpha_start, self.alpha_end = 1.0, 0.10
        self.alpha_decay_episodes = 40000

        # Training budget.
        self.max_steps_per_ep = 200
        self.train_episodes = 60000

        # Leftover online-update state; evaluation keeps it cleared.
        self._last_state: Optional[int] = None
        self._last_action: Optional[int] = None
        self._eval_alpha = 0.05

        self._train_q_learning()

        # Leave the env freshly reset and the online memory empty.
        self.env.reset()
        self._last_state = None
        self._last_action = None

    def _linear_sched(self, start, end, step, total_steps):
        """Linear interpolation between ``start`` and ``end``, clamped to both ends."""
        if total_steps <= 0:
            return end
        progress = max(0.0, step / total_steps)
        if progress > 1.0:
            progress = 1.0
        return start + (end - start) * progress

    def _train_q_learning(self):
        """Offline epsilon-greedy Q-learning on ``self.env``; updates ``self.Q`` in place."""
        env = self.env
        q_table = self.Q

        for episode in range(self.train_episodes):
            state, _ = env.reset()
            eps = self._linear_sched(self.eps_start, self.eps_end, episode, self.eps_decay_episodes)
            alpha = self._linear_sched(self.alpha_start, self.alpha_end, episode, self.alpha_decay_episodes)

            for _step in range(self.max_steps_per_ep):
                explore = np.random.random() < eps
                action = env.action_space.sample() if explore else int(np.argmax(q_table[state]))

                next_state, reward, terminated, truncated, _ = env.step(action)

                # Target r + gamma * max_a' Q(s', a'); no future value past a terminal state.
                future = 0.0 if terminated else self.gamma * float(np.max(q_table[next_state]))
                current = float(q_table[state, action])
                q_table[state, action] += alpha * ((float(reward) + future) - current)

                state = next_state
                if terminated or truncated:
                    break

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """
        Static, fast evaluation policy.

        Picks uniformly at random among the actions with the highest Q value
        for ``observation`` and performs no learning.
        """
        q_row = self.Q[int(observation)]
        best_actions = np.flatnonzero(q_row == q_row.max())
        chosen = int(np.random.choice(best_actions))

        # Keep the online memory cleared so no update can ever be triggered.
        self._last_state = None
        self._last_action = None
        return chosen

### Glttr 4 

In [9]:
import numpy as np
import math
from collections import defaultdict
from typing import Optional

class Agent:
    """
    Tabular Q-learning agent for FrozenLake-v1 8x8 (stochastic, is_slippery=True).

    - Trains in ``__init__`` on the provided env with linearly decaying epsilon/alpha.
    - Evaluation is a static greedy policy (no online update) whose tie-break
      avoids undoing the previous move and prefers RIGHT, then DOWN.
    """

    # Action codes: 0=LEFT, 1=DOWN, 2=RIGHT, 3=UP; each maps to its opposite move.
    _INVERSE = {0: 2, 2: 0, 1: 3, 3: 1}

    # Last action returned by choose_action. The class-level default guarantees
    # the attribute exists even before the first call.
    _prev_action: Optional[int] = None

    def __init__(self, env):
        """Configure hyperparameters, train on ``env``, then reset it for evaluation.

        Args:
            env: environment with discrete ``observation_space.n`` and
                ``action_space.n`` following the gymnasium ``reset``/``step`` API.
        """
        self.env = env
        self.nS = int(getattr(env.observation_space, "n"))
        self.nA = int(getattr(env.action_space, "n"))

        # Q-table: one row per state, one column per action.
        self.Q = np.zeros((self.nS, self.nA), dtype=np.float32)

        # Hyperparameters chosen for the slippery 8x8 map.
        self.gamma = 0.99

        # Exploration (epsilon): decays slowly, then stays at 0.1.
        self.eps_start = 1.0
        self.eps_end   = 0.10
        self.eps_decay_episodes = 40000  # episodes needed to go from start to end

        # Learning rate (alpha): decays down to 0.10.
        self.alpha_start = 1.0
        self.alpha_end   = 0.10
        self.alpha_decay_episodes = 40000

        # Training budget.
        self.max_steps_per_ep = 200
        self.train_episodes   = 60000

        # Evaluation state used by the tie-break in choose_action.
        self._prev_action = None

        self._train_q_learning()
        # Start evaluation from a clean environment.
        self.env.reset()

    # --- Scheduling helper ---
    def _linear_sched(self, start, end, step, total_steps):
        """Linearly interpolate from ``start`` to ``end``; ``step/total_steps`` is clamped to [0, 1]."""
        if total_steps <= 0:
            return end
        frac = min(1.0, max(0.0, step / total_steps))
        return start + (end - start) * frac

    # --- Offline Q-learning on the provided env ---
    def _train_q_learning(self):
        """Run ``train_episodes`` epsilon-greedy episodes and update ``self.Q`` in place."""
        for ep in range(self.train_episodes):
            state, _ = self.env.reset()

            # Per-episode schedules.
            eps   = self._linear_sched(self.eps_start,   self.eps_end,   ep, self.eps_decay_episodes)
            alpha = self._linear_sched(self.alpha_start, self.alpha_end, ep, self.alpha_decay_episodes)

            for _ in range(self.max_steps_per_ep):
                # Epsilon-greedy action selection.
                if np.random.random() < eps:
                    action = self.env.action_space.sample()
                else:
                    action = int(np.argmax(self.Q[state]))

                next_state, reward, terminated, truncated, _ = self.env.step(action)

                # Q(s,a) <- Q(s,a) + alpha * [r + gamma * max_a' Q(s',a') - Q(s,a)]
                # Only a true terminal state removes the bootstrap term.
                best_next = float(np.max(self.Q[next_state]))
                td_target = float(reward) + self.gamma * best_next * (0.0 if terminated else 1.0)
                self.Q[state, action] += alpha * (td_target - float(self.Q[state, action]))

                state = next_state
                if terminated or truncated:
                    break

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """
        Fast, static evaluation policy (no online update).

        Among the actions with the highest Q value for ``observation``:
        1. drop the inverse of the previous action when another best action remains
           (breaks back-and-forth cycles);
        2. prefer RIGHT (2), then DOWN (1);
        3. otherwise pick uniformly at random among what is left.

        The remembered previous action is cleared when the agent is on state 0
        or when ``terminated``/``truncated`` is set.
        """
        state = int(observation)

        # Back on the start tile (or episode over): forget the previous action.
        if state == 0 or terminated or truncated:
            self._prev_action = None

        row = self.Q[state]
        candidates = [int(a) for a in np.flatnonzero(row == np.max(row))]  # best actions

        # 1) With several tied actions, avoid undoing the last move when possible.
        if self._prev_action is not None and len(candidates) > 1:
            inv = self._INVERSE[self._prev_action]
            remaining = [a for a in candidates if a != inv]
            if remaining:
                candidates = remaining

        # 2) Directional preference towards the goal: RIGHT, then DOWN.
        for pref in (2, 1):
            if pref in candidates:
                action = pref
                break
        else:
            # 3) Otherwise draw among the remaining best actions.
            action = int(np.random.choice(candidates))

        self._prev_action = action
        return action


### Glttr 5

In [None]:
import numpy as np
from typing import Optional

class Agent:
    """
    Tabular Q-learning agent for FrozenLake-v1 8x8 (stochastic, is_slippery=True).

    - Trains in ``__init__`` with shortened epsilon/alpha decays.
    - Greedy evaluation policy with a deterministic tie-break
      (RIGHT > DOWN > LEFT > UP) that avoids undoing the previous move.
    """

    def __init__(self, env):
        """Configure hyperparameters, train on ``env``, then reset it for evaluation.

        Args:
            env: environment with discrete ``observation_space.n`` and
                ``action_space.n`` following the gymnasium ``reset``/``step`` API.
        """
        self.env = env
        self.nS = int(getattr(env.observation_space, "n"))
        self.nA = int(getattr(env.action_space, "n"))

        # Q-table: one row per state, one column per action.
        self.Q = np.zeros((self.nS, self.nA), dtype=np.float32)

        # Discount factor.
        self.gamma = 0.99

        # Exploration / learning-rate schedules (shorter decays).
        self.eps_start = 1.0
        self.eps_end   = 0.10
        self.eps_decay_episodes = 30000

        self.alpha_start = 1.0
        self.alpha_end   = 0.10
        self.alpha_decay_episodes = 30000

        # Shorter training to stay within the time limit.
        self.max_steps_per_ep = 100
        self.train_episodes   = 35000

        # Evaluation-policy state, fully initialised before any other method runs.
        self._prev_action: Optional[int] = None
        # Opposite move for each action (0=LEFT, 1=DOWN, 2=RIGHT, 3=UP).
        self._inverse = {0: 2, 2: 0, 1: 3, 3: 1}

        self._train_q_learning()

        # Start evaluation from a clean environment. Retrying the identical call
        # after a failure cannot help, so reset is simply called once.
        self.env.reset()
        self._prev_action = None

    @staticmethod
    def _lin(start, end, step, total):
        """Linear schedule from ``start`` to ``end``; returns ``end`` once ``step >= total`` or if ``total <= 0``."""
        if total <= 0:
            return end
        if step >= total:
            return end
        return start + (end - start) * (step / float(total))

    def _train_q_learning(self):
        """Run ``train_episodes`` epsilon-greedy episodes and update ``self.Q`` in place."""
        Q = self.Q  # local aliases keep the inner loop short
        gamma = self.gamma
        env = self.env
        max_steps = self.max_steps_per_ep

        for ep in range(self.train_episodes):
            # gymnasium returns (obs, info); a bare observation is accepted as well.
            out = env.reset()
            state = int(out[0] if isinstance(out, tuple) else out)

            eps   = self._lin(self.eps_start,   self.eps_end,   ep, self.eps_decay_episodes)
            alpha = self._lin(self.alpha_start, self.alpha_end, ep, self.alpha_decay_episodes)

            for _ in range(max_steps):
                # Epsilon-greedy action selection.
                if np.random.random() < eps:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(Q[state]))

                # gymnasium: (obs, reward, terminated, truncated, info)
                step_out = env.step(action)
                next_state = int(step_out[0])
                reward     = float(step_out[1])
                terminated = bool(step_out[2])
                truncated  = bool(step_out[3])

                # Q-learning update; only a true terminal state removes the bootstrap term.
                best_next = float(np.max(Q[next_state]))
                td_target = reward + gamma * best_next * (0.0 if terminated else 1.0)
                Q[state, action] += alpha * (td_target - float(Q[state, action]))

                state = next_state
                if terminated or truncated:
                    break

    def choose_action(self, observation, reward=0.0, terminated=False, truncated=False, info=None):
        """
        Fast, static evaluation policy (no online update).

        - Considers the actions with the highest Q value for ``observation``.
        - Drops the inverse of the previous action when another best action
          remains (breaks back-and-forth cycles).
        - Deterministic tie-break: RIGHT > DOWN > LEFT > UP; if none of these
          is a candidate, the lowest-indexed best action is returned.

        The remembered previous action is cleared when the agent is on state 0
        or when ``terminated``/``truncated`` is set.
        """
        state = int(observation)
        if state == 0 or terminated or truncated:
            self._prev_action = None

        row = self.Q[state]
        # Best actions, computed over the actual row width rather than a fixed 4.
        candidates = [int(a) for a in np.flatnonzero(row == np.max(row))]

        # Exclude the inverse of the previous move when another best action remains.
        if self._prev_action is not None and len(candidates) > 1:
            inv = self._inverse.get(self._prev_action)
            candidates = [a for a in candidates if a != inv] or candidates

        # Fixed preference order keeps evaluation deterministic.
        pref_order = (2, 1, 0, 3)  # RIGHT, DOWN, LEFT, UP
        for a in pref_order:
            if a in candidates:
                action = a
                break
        else:
            # No preferred action available: fall back instead of leaving `action` unbound.
            action = candidates[0] if candidates else int(np.argmax(row))

        self._prev_action = action
        return action


### Before submit
Test that your agent has the right attributes

In [10]:
# Smoke test: build an agent on a fresh 8x8 FrozenLake env and play one episode.
env = gym.make('FrozenLake-v1', map_name="8x8")
agent = Agent(env)

observation, _ = env.reset()
# The first call passes reward=None and no info; both flags start as False.
reward = info = None
terminated = truncated = False
rewards = []
while not terminated and not truncated:
    action = agent.choose_action(
        observation,
        reward=reward,
        terminated=terminated,
        truncated=truncated,
        info=info,
    )
    observation, reward, terminated, truncated, info = env.step(action)
    rewards.append(reward)
print(f'Cumulative Reward: {sum(rewards)}')

Cumulative Reward: 0.0


In [11]:
# Evaluation: train a fresh agent, then average the episode return over 100 episodes.
env = gym.make('FrozenLake-v1', map_name="8x8")
agent = Agent(env)

scores = []
for ep in range(100):
    observation, _ = env.reset()
    # Each episode starts with reward=None, no info and both end flags cleared.
    reward = info = None
    terminated = truncated = False
    rewards = []
    while not terminated and not truncated:
        action = agent.choose_action(
            observation,
            reward=reward,
            terminated=terminated,
            truncated=truncated,
            info=info,
        )
        observation, reward, terminated, truncated, info = env.step(action)
        rewards.append(reward)
    scores.append(sum(rewards))

print(f"Mean reward over 100 eval episodes: {np.mean(scores):.3f} ± {np.std(scores):.3f}")


Mean reward over 100 eval episodes: 0.440 ± 0.496
