In [None]:
#| default_exp environments

# Imports

In [None]:
#| export
import numpy as np
from abc import ABC, abstractmethod

# Abstract Environment

In [None]:
#| export
class Abstract_Env(ABC):
    """Base class from which every environment should be derived.

    Subclasses must override `transition` and `get_observation`; until both
    are overridden the class cannot be instantiated.
    """

    def __init__(self, state: object):
        """Store the initial state of the environment.

        Args:
            state: an object that defines the state of the environment
        """
        self.state = state

    @abstractmethod
    def transition(self, action):
        """Process an action.

        Args:
            action: an action (or actions) to process
        """
        raise NotImplementedError

    @abstractmethod
    def get_observation(self):
        """Determine and return an observation for an agent (or agents) as a function of self.state."""
        raise NotImplementedError

# RLGL

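A two-state environment whose state is labelled "red" (0) or "green" (1). The state changes according to a 2x2 transition matrix (uniform by default) and is not affected by the agent's action; the action only determines the reward, which is 1 when the action equals the current state and 0 otherwise.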

In [None]:
#| export
class RLGL(Abstract_Env):
    """Two-state environment with states labelled "red" (0) and "green" (1).

    The state evolves according to a 2x2 transition matrix and is not
    influenced by the agent's action; the action only determines the reward.
    """

    def __init__(self, state = 0, transition_matrix = None):
        """
        Args:
            state: initial state, 0 ("red") or 1 ("green")
            transition_matrix: 2x2 array-like; row i holds the probabilities of
                moving from state i to state 0 and to state 1. Defaults to
                uniform probabilities.

        Raises:
            ValueError: if transition_matrix does not have shape (2, 2)
        """
        super().__init__(state = state)  #assigns state to self.state
        self.state_labels = {0: "red", 1: "green"}
        if transition_matrix is None:
            #create uniform transition probabilities
            transition_matrix = [[0.5, 0.5], [0.5, 0.5]]
        #convert to an array so that nested lists can also be indexed by row in transition()
        transition_matrix = np.asarray(transition_matrix, dtype = float)
        if transition_matrix.shape != (2, 2):
            raise ValueError(
                f"transition_matrix must have shape (2, 2), got {transition_matrix.shape}"
            )
        self.transition_matrix = transition_matrix

    def transition(self, action):
        '''
        In this environment the agents action determines the reward but does not determine the state

        Args:
            action: ignored; the next state is sampled from the row of the
                transition matrix that belongs to the current state
        '''
        next_state = np.random.choice(2, p = self.transition_matrix[self.state])
        self.state = int(next_state)  #keep the state a plain int rather than a numpy scalar

    def get_observation(self):
        """Return the label ("red" or "green") of the current state."""
        return self.state_labels[self.state]

    def get_reward(self, action):
        """Return 1 if the action equals the current state, otherwise 0."""
        return 1 if action == self.state else 0

A minimal example

# Delayed Response

The delayed response environment presents the agent with one stimulus from a set of N stimuli. Each stimulus is associated with a different correct response. However, giving the correct response immediately has no effect - the agent must wait some number of steps W (the wait time), during which no stimulus is available, before the correct action will produce a reward. There is no limit to the number of steps the agent can wait before selecting a response, but after W steps any incorrect response will cause the environment to present a "fail" stimulus and then reset without reward. A maximum on the number of steps before a trial resets prevents trials from running forever if the agent learns to always wait (to avoid the surprising event of a new random stimulus, for example).

In [None]:
#| export
class Delayed_Response(Abstract_Env):
    """Delayed response task.

    At the start of a trial a random stimulus in 1..N is presented. Responses
    (non-zero actions) given before W steps have passed have no effect. From
    step W on, the action equal to the presented stimulus makes a reward
    available, and any other non-zero action puts the environment in the fail
    state. Action 0 means "wait".

    self.state is a dict with the keys:
        "current_stimulus": 1..N for a stimulus, 0 for no stimulus, -1 for the fail stimulus
        "rewarded_action": the action that is rewarded in this trial
        "reward_available": True on the step after a correct response
        "trial_time": number of steps taken in the current trial
    """

    def __init__(self, 
                 W: int, #number of steps agent must wait before response is evaluated
                 N: int, #number of stimuli (and associated actions) that are presented by random selection to agent at start of trial
                 max_trial_length: int = 10, #number of steps after which a trial without outcome is reset
                 current_stimulus: int = None, #initial stimulus; drawn at random from 1..N if None
                 rewarded_action: int = None, #initial rewarded action; defaults to the current stimulus if None
                 reward_available: bool = False, #whether the environment starts with a reward available
                 trial_time: int = None #initial trial time; defaults to 0 if None
                ):
        """
        Raises:
            ValueError: if max_trial_length is smaller than W
        """
        super().__init__(state = {"current_stimulus": current_stimulus, "rewarded_action": rewarded_action, "reward_available": reward_available, "trial_time": trial_time}) #assigns state to self.state
        self.W = W
        self.N = N
        self.max_trial_length = max_trial_length
        if self.max_trial_length < self.W:
            raise ValueError("max_trial_length must be greater than or equal to W")
        if self.state["current_stimulus"] is None:
            self.state["current_stimulus"] = np.random.randint(self.N) + 1  #randint selects an integer from 0 to N-1. Add one because 0 is used to denote no stimulus
        if self.state["rewarded_action"] is None:
            self.state["rewarded_action"] = self.state["current_stimulus"]  #stores which stimulus was presented, and thus the correct action for the agent.
            #If the user inputs "current_stimulus = 0", "rewarded_action" should be passed explicitly: otherwise it becomes 0,
            #which no response can match (action 0 is never evaluated), so every response after W steps leads to the fail state
        if self.state["trial_time"] is None:
            self.state["trial_time"] = 0  #at trial time 0 the agent is presented with a stimulus.

    def _outcome_pending(self):
        #True if a reward or the fail stimulus is currently being presented
        return self.state["reward_available"] or self.state["current_stimulus"] == -1

    def transition(self, action):
        """Advance the environment by one step.

        Args:
            action: 0 to wait, any other value is treated as a response
        """
        if self._outcome_pending(): #reset after reward or fail
            self.reset()
            return

        #if agent acted and the wait time has passed, check if action was correct
        #(responses before W steps have no effect)
        if action != 0 and self.state["trial_time"] >= self.W:
            if action == self.state["rewarded_action"]: #correct action, give reward
                self.state["reward_available"] = True
            else: #incorrect action, set fail state
                self.state["current_stimulus"] = -1

        #in every non-reset case the trial continues by one step
        self.step()

    def step(self):
        """Move the trial one step forward, resetting it if it timed out."""
        if not self.state["current_stimulus"] == -1: #unless in fail state, agent only receives stimulus on trial reset
            self.state["current_stimulus"] = 0
        self.state["trial_time"] += 1
        #Only time out trials without an outcome. A reward or fail set on this step must stay
        #observable; the next call to transition() resets the trial anyway. Without this check a
        #response at trial_time == max_trial_length would be wiped before the agent could observe it.
        if self.state["trial_time"] > self.max_trial_length and not self._outcome_pending():
            self.reset()

    def reset(self):
        """Start a new trial, presenting the agent with a new random stimulus."""
        self.state["trial_time"] = 0
        self.state["current_stimulus"] = np.random.randint(self.N) + 1
        self.state["rewarded_action"] = self.state["current_stimulus"]
        self.state["reward_available"] = False

    def get_observation(self):
        """Return [current stimulus, reward availability (0 or 1)]."""
        return [self.state["current_stimulus"], int(self.state["reward_available"])] #agents gets current stimulus and reward availability as observation
            
            

## Timed Response

In [None]:
#| export

class Timed_Response(Abstract_Env):
    """Timed response task with an integer state.

    State 0 is the start state (no light, no reward). In states 1..delay+1 the
    light is on; pushing (action 1) before the delay has finished resets to the
    start state. In state delay+1 the delay has finished: waiting (action 0)
    resets, anything else moves to the reward state delay+2, from which the
    environment goes back to state 1.
    """

    def __init__(self,
                 delay: int = 1,
                 start_state: int = 0):
        """
        Args:
            delay: number of light-on states in which a push resets the trial
            start_state: initial value of self.state
        """
        super().__init__(state = start_state)
        self.delay = delay

    def transition(self, action):
        """Update self.state according to the current state and the action."""
        delay_finished = self.delay + 1
        reward_state = self.delay + 2

        #start state, no light no reward: always turn on light
        if self.state == 0:
            self.state = 1
            return

        #light on, delay not finished: push resets, waiting lets the delay progress
        if self.state <= self.delay:
            self.state = 0 if action == 1 else self.state + 1
            return

        #light on, delay finished: waiting resets, pushing moves to the reward state
        if self.state == delay_finished:
            self.state = 0 if action == 0 else self.state + 1
            return

        #reward state: go back to light on
        if self.state == reward_state:
            self.state = 1

    def get_observation(self):
        """Return [light, reward]: [0,0] in the start state, [1,0] while the light is on, [0,1] otherwise."""
        if self.state == 0:
            return [0,0]
        light_on = self.state <= self.delay + 1
        return [1,0] if light_on else [0,1]
        