In [19]:
import numpy as np

In [20]:
class ActionSpace():
    """Discrete space of joint actions with a Gym-style ``sample``/``contains`` API.

    Each member is a pair of 0/1 choices, one entry per agent.
    Please refer to [Gym Documentation](https://gym.openai.com/docs/#spaces)
    """
    def __init__(self):
        self.actions = [(0, 0), (0, 1), (1, 0), (1, 1)]

    def sample(self, seed=None):
        """Return one action drawn uniformly at random.

        With a ``seed`` a fresh ``RandomState`` is built for this single draw, so the
        same seed always yields the same action. Without one the global NumPy
        generator is used.
        """
        rng = np.random if seed is None else np.random.RandomState(seed=seed)
        index = rng.randint(low=0, high=len(self.actions))
        return self.actions[index]

    def contains(self, x):
        """Return boolean specifying if x is a valid member of this space
        """
        return x in self.actions

In [21]:
class StateSpace():
    """Discrete space of joint states with a Gym-style ``sample``/``contains`` API.

    Each member is a pair ``(i, j)`` with both entries in 1..8, giving 64 states in
    total. Not every pair is reachable through the environment dynamics.
    Please refer to [Gym Documentation](https://gym.openai.com/docs/#spaces)
    """
    def __init__(self):
        self.states = [(i, j) for j in range(1, 9) for i in range(1, 9)]  # Some are not reachable

    def sample(self, seed=None):
        """Return one state drawn uniformly at random.

        With a ``seed`` a fresh ``RandomState`` is built for this single draw, so the
        same seed always yields the same state. Without one the global NumPy
        generator is used.
        """
        rng = np.random if seed is None else np.random.RandomState(seed=seed)
        # Draw an index rather than calling ``choice(self.states)``: NumPy turns a
        # list of tuples into a 2-D array and ``choice`` only accepts 1-D input.
        index = rng.randint(low=0, high=len(self.states))
        return self.states[index]

    def contains(self, x):
        """Return boolean specifying if x is a valid member of this space
        """
        return x in self.states

In [31]:
class Enviroment():
    """Two-agent environment over integer positions 1..8 with a Gym-like API.

    The joint state is a pair ``(position_A, position_B)`` and a joint action is a
    pair ``(action_A, action_B)`` of 0/1 choices. Agent A moves "up" on action 1 and
    "right" otherwise; agent B moves "up" on action 1 and "left" otherwise. An
    episode ends as soon as either agent reaches position 7 or both stand on
    position 4.
    """

    def __init__(self):
        #reward_range = (-np.inf, np.inf)
        self.action_space = ActionSpace()
        self.state_space = StateSpace() # state_space
        # One 2x2x2 table per joint state, indexed [action_A][action_B]. The last
        # axis presumably holds one value per agent -- TODO confirm with the learner.
        # Float dtype so fractional Q-updates are not truncated to integers.
        self.q_table = {}
        for st in self.state_space.states:
            self.q_table[st] = np.zeros((2, 2, 2))
        self.info = {}
        self.t = 0
        self.start_state = (1, 2)
        self.current_state = (1, 2)
        self._seed = None

    def _next_state_A(self, state_A, action_A):
        """Return agent A's next position: "up" when ``action_A == 1``, else "right"."""
        up = {1:3, 3:6, 6:6, 4:7, 7:7, 2:5, 5:8, 8:8}
        right = {1:4, 3:4, 6:7, 4:5, 7:8, 2:8, 5:8, 8:8}
        if action_A == 1:
            return up[state_A]
        return right[state_A]

    def _next_state_B(self, state_B, action_B):
        """Return agent B's next position: "up" when ``action_B == 1``, else "left"."""
        up = {1:3, 3:6, 6:6, 4:7, 7:7, 2:5, 5:8, 8:8}
        left = {1:1, 3:3, 6:6, 4:3, 7:6, 2:4, 5:4, 8:7}
        if action_B == 1:
            return up[state_B]
        return left[state_B]

    def _is_final_state(self, new_state):
        """Return True when ``new_state`` ends the episode, False otherwise."""
        assert(self.state_space.contains(new_state))
        pos_a, pos_b = new_state
        ## Co operation: All 3 ways
        if pos_a == 7 and pos_b == 7:
            return True

        ## Fight!: All 2 ways
        if pos_a == 4 and pos_b == 4:
            return True

        ## A hunts B: Just one way this happens: Slightly tricky
        if pos_a == 7 and pos_b != 7:
            return True

        ## B hunts A: Just one way this happens: Slightly tricky
        if pos_a != 7 and pos_b == 7:
            return True

        # Explicit False instead of falling through to None.
        return False

    def _reward(self, new_state):
        """Return the reward pair ``(reward_A, reward_B)`` for entering ``new_state``."""
        assert(self.state_space.contains(new_state))
        pos_a, pos_b = new_state
        ## Co operation
        if pos_a == 7 and pos_b == 7:
            return (3, 3)

        ## Fight
        if pos_a == 4 and pos_b == 4:
            return (1, 1)

        ## A hunts B
        if pos_a == 7 and pos_b != 7:
            return (4, 0)

        ## B hunts A
        if pos_a != 7 and pos_b == 7:
            return (0, 4)
        return (0, 0)

    def step(self, action):
        """Run one timestep of the environment's dynamics.
        Accepts an action and returns a tuple (observation, reward, done, info).
        # Arguments
            action (tuple): Joint action ``(action_A, action_B)`` chosen by the agents.
        # Returns
            observation (tuple): The new joint state ``(position_A, position_B)``.
            reward (tuple): Reward pair ``(reward_A, reward_B)`` for entering the new state.
            done (boolean): Whether the episode has ended, in which case further step() calls will return undefined results.
            info (tuple): The ``(new_state, reward, done)`` record stored for this timestep.
        """
        new_state = (self._next_state_A(self.current_state[0], action[0]),
                     self._next_state_B(self.current_state[1], action[1]))
        done = self._is_final_state(new_state)
        reward = self._reward(new_state)
        self.t += 1
        self.info[self.t] = (new_state, reward, done)
        self.current_state = new_state
        return new_state, reward, done, self.info[self.t]

    def reset(self):
        """
        Resets the state of the environment and returns an initial observation.
        # Returns
            observation (object): The initial observation of the space. Initial reward is assumed to be 0.
        """
        self.t = 0
        self.info = {}
        self.current_state = self.start_state
        # Return the observation so callers can write ``state = env.reset()``.
        return self.current_state

    def render(self, mode='human', close=False):
        """Renders the environment.
        The set of supported modes varies per environment. (And some
        environments do not support rendering at all.)
        # Arguments
            mode (str): The mode to render with.
            close (bool): Close all open renderings.
        """
        raise NotImplementedError()

    def close(self):
        """Override in your subclass to perform any necessary cleanup.
        Environments will automatically close() themselves when
        garbage collected or when the program exits.
        """
        raise NotImplementedError()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        # Returns
            The seed that was stored on the environment.
        """
        # Store on the instance; a bare local assignment would be discarded.
        self._seed = seed
        return self._seed

    def configure(self, *args, **kwargs):
        """Provides runtime configuration to the environment.
        This configuration should consist of data that tells your
        environment how to run (such as an address of a remote server,
        or path to your ImageNet data). It should not affect the
        semantics of the environment.
        """
        raise NotImplementedError()

    # NOTE(review): left disabled -- close() currently raises NotImplementedError,
    # so calling it from __del__ would raise during garbage collection.
    #def __del__(self):
    #    self.close()

    def __str__(self):
        return '<{} instance>'.format(type(self).__name__)

In [32]:
# Build the environment and keep a module-level alias to its Q-table. The alias
# refers to the same dict object, so writes through `q_table` are also visible
# as `env.q_table`.
env = Enviroment()
q_table = env.q_table

In [35]:
# Training loop: run `num_episodes` episodes of at most `max_steps_per_episode`
# steps, updating the Q-table entry of the visited (state, joint action) pair
# after every step.
# NOTE(review): `alpha`, `discount_rate` and `nash_equilibrium` are not defined in
# the visible cells -- they must already exist in the notebook session.
num_episodes = 1
max_steps_per_episode = 5
exploration_rate = 0.5  # loop-invariant, so set once up front
for episode in range(num_episodes):
    # Read the start state from the environment itself instead of relying on
    # reset()'s return value; a None state would break the q_table lookup below.
    env.reset()
    state = env.current_state
    for step in range(max_steps_per_episode):
        #r = np.random.uniform(0, 1)
        r = 0  # forces the exploration branch on every step
        if r > exploration_rate:
            #action = nash_equilibrium(q_table[state])
            # Fail loudly rather than continuing with an unbound or stale action.
            raise NotImplementedError("greedy action selection is not implemented yet")
        action = env.action_space.sample(seed=env._seed)
        #print(action, end=" ")
        new_state, reward, done, info = env.step(action)

        print(new_state, reward, done)
        q_table[state][action[0]][action[1]] = (1 - alpha) * q_table[state][action[0]][action[1]] + \
                        alpha * (reward + discount_rate * nash_equilibrium(q_table[new_state]))
        state = new_state
        if done:
            break
    #exploration_rate etc

(1, 0) (3, 4) (0, 0) None
(0, 1) (4, 7) (0, 4) True
