In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [229]:
class Environment:
    """Class for a reinforcement learning environment"""
    def __init__(self, nstate=5, naction=2, nagents = 2):
        """Create a new environment"""
        self.Ns = nstate   # number of states
        self.Na = naction  # number of actions
        self.Nag = nagents #number of agents
        
    def start(self):
        """start an episode"""
        # randomly pick a state
#         self.state = np.random.randint(self.Ns)
        self.state = 0
        return(self.state)
    
#     def step(self, action):
#         """step by an action"""
#         # random reward
#         self.reward = np.random.random()  # between 0 and 1
#         # shift up/down and rotate in [0,Ns)
#         self.state = (self.state+(-1 if action==0 else 1))%self.Ns
#         return(self.reward, self.state)

In [230]:
class Agent:
    """Class for a reinforcement learning agent"""
    
    def __init__(self, nstate, naction):
        """Create a new agent"""
        self.Ns = nstate   # number of states
        self.Na = naction  # number of actions
        
    def start(self, state):
        """first action, without reward feedback"""
        # randomly pick an action
        self.action = np.random.randint(self.Na)
        return(self.action)
    
    def step(self, reward, state):
        """learn by reward and take an action"""
        # do nothing for reward
        # randomly pick an action
        self.action = np.random.randint(self.Na)
        return(self.action)

In [246]:
class RL:
    """Reinforcement learning by interacton of Environment and Agents"""
 
    def __init__(self, environment, east_agent, west_agent, nstate, naction):
        """Create the environment and the agent"""
        self.env = environment(nstate, naction)
        self.east_agent = east_agent(nstate, naction)
        self.west_agent = west_agent(nstate, naction)
    
    def episode(self, tmax=10000):
        """One episode"""
        # First contact
        state = self.env.start()
        print('initial_state:', state)
        east_action = self.east_agent.start(state)
        west_action = self.west_agent.start(state)
        # Table of t, er, wr, s, ea, wa
        Trsa = np.zeros((tmax+1,6))
        Trsa[0,:] = [0, 0, 0, state, east_action, west_action]
        # Repeat interaction
        for t in range(1, tmax+1):
            print('current_state:', state)
            east_reward, west_reward, state = self.env.step(east_action, west_action)
            east_action = self.east_agent.step(east_reward, state)
            west_action = self.west_agent.step(west_reward, state)
            Trsa[t,:] = [t, east_reward, west_reward, state, east_action, west_action]
        return(Trsa)
    
    def run(self, nrun=10, tmax=10000):
        """Multiple runs of episodes"""
        East_Return = np.zeros(nrun)
        West_Return = np.zeros(nrun)
        for n in range(nrun):
            east_r = self.episode(tmax)[:,1]  # reward sequence for east agent
            west_r = self.episode(tmax)[:,2]  # reward sequence for west agent
            East_Return[n] = sum(east_r)
            West_Return[n] = sum(west_r)
        return(East_Return, West_Return)

In [247]:
class PainGain(Environment):
    """Pain-Gain environment """
    def __init__(self, nstate=4, naction=2, gain=5):
        self.Ns = nstate   # number of states
        self.Na = naction  # number of actions
        # setup the reward function as an array
        self.R = np.ones((self.Ns, self.Na))
        self.R[0,1] = -1   # small pains for action 1
        self.R[0,0] = -gain  # large pain
        self.R[-1,1] = gain  # large gain
    
    def step(self, action):
        """step by an action"""
        self.reward = self.R[self.state, action]  # reward
        self.state = (self.state + 2*action-1)%self.Ns  # move left or right
        return(self.reward, self.state)

In [248]:
class QL(Agent):
    """Class for a Q-learning agent"""  
    def __init__(self, nstate, naction):
        self.Ns = nstate   # number of states
        self.Na = naction  # number of actions
        # allocate Q table
        self.Q = np.zeros((nstate, naction))
        # default parameters
        self.alpha = 0.1  # learning rate
        self.beta = 1.0   # inverse temperature
        self.gamma = 0.9  # discount factor
    
    def boltzmann(self, q):
        """Boltzmann selection"""
        p = np.exp( self.beta*q)   # unnormalized probability
        p = p/sum(p)    # probability
        # take the index of a segment in [0,1]
        return(np.searchsorted( np.cumsum(p), np.random.random()))      
    
    def start(self, state):
        """first action, without reward feedback"""
        # Boltzmann action selection
        self.action = self.boltzmann( self.Q[state,:])
        # remember the state
        self.state = state
        return(self.action)
    
    def step(self, reward, state):
        """learn by reward and take an action"""
        # TD error: self.state/action retains the previous ones
        delta = reward + self.gamma*max(self.Q[state,:]) - self.Q[self.state,self.action]
        # Update the value for previous state and action
        self.Q[self.state,self.action] += self.alpha*delta
        # Boltzmann action selection
        self.action = self.boltzmann( self.Q[state,:])
        # remember the state
        self.state = state
        return(self.action)

In [256]:
class IG(Environment):
        def __init__(self, nstate=6, nagents = 2, naction=2, gain=5):
            self.Ns = nstate   # number of states
            self.Na = naction*nagents  # number of actions
            # setup the reward function as an array
            self.state_dict = {0:[1, 0, 0, -1],
                          1:[1, 0, -1, 0],
                          2:[1, -1, 0, 0],
                          3:[0, 1, 0, -1], 
                          4:[0, 1, -1, 0],
                          5:[0, 0, 1, -1]}
            
            self.state_dict_inv = {(1, 0, 0, -1):0,
                          (1, 0, -1, 0):1,
                          (1, -1, 0, 0):2,
                          (0, 1, 0, -1):3, 
                          (0, 1, -1, 0):4,
                          (0, 0, 1, -1):5}
            
            self.joint_action_dict = {(0, 0):0,
                                 (0, 1):1,
                                 (1, 0):2,
                                 (1, 1):3}
                
            #reward_tables
            #east agent rewards
            self.east_reward = np.zeros((self.Ns, self.Na))
            self.east_reward = np.zeros((6, 4))
            self.east_reward[1:,0] = -1
            self.east_reward[5, 1] = -1
            self.east_reward[3:5,1] = 1
            self.east_reward[0:3, 2:4] = -1

            #west agent rewards
            self.west_reward = np.zeros((self.Ns, self.Na))
            self.west_reward[1:,0] = -1
            self.west_reward[[0,3,5], 1:4:2] = -1
            self.west_reward[2,2] = -1
            self.west_reward[[1,4], 2] = 1            
            
            
        def step(self, east_action, west_action):
            curr_state = self.state
            joint_action = joint_action_dict[tuple((east_action, west_action))]
            self.east_agent_reward = self.east_reward[curr_state, joint_action]
            self.west_agent_reward = self.west_reward[curr_state, joint_action]
            
            print(east_action, west_action)
            
            east_pos = np.where(np.array(self.state_dict[curr_state])==1)[0][0]
            print(curr_state, self.state_dict[curr_state])
            west_pos = np.where(np.array(self.state_dict[curr_state])==-1)[0][0]

            state_array = np.zeros(4)

            if east_action: 
                new_east_pos = east_pos - 1
            else:
                new_east_pos = east_pos + 1
            
            if west_action:
                new_west_pos = west_pos + 1
            else:
                new_west_pos = west_pos - 1
            
            
            if new_east_pos < 0: 
                print('Out of bounds for east agent')
                new_east_pos = 0
                
            if new_west_pos > 3:
                print('Out of bounds for west agent')
                new_west_pos = 3

            if new_east_pos > new_west_pos:
                new_east_pos = east_pos
                new_west_pos = west_pos
                print('Collision')
            
            if new_east_pos == new_west_pos: 
                new_east_pos = east_pos
                new_west_pos = west_pos
                print('Collision')
                
            print("Old pos:", east_pos, west_pos)
            print("New pos:",new_east_pos, new_west_pos)
            
            state_array[new_east_pos] = 1
            state_array[new_west_pos] = -1
            
            new_state = self.state_dict_inv[tuple(state_array)]
            self.state = new_state
            print("New state:", new_state, state_array)
            
            print("east_reward:",self.east_agent_reward, "west_reward:", self.west_agent_reward)
            return (self.east_agent_reward, self.west_agent_reward, self.state)         

In [257]:
east_agent = QL
west_agent = QL

igq = RL(IG, east_agent,west_agent, 6, 2)

In [258]:
trsa = igq.episode(1000)

initial_state: 0
current_state: 0
1 0
0 [1, 0, 0, -1]
Out of bounds for east agent
Old pos: 0 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
1 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 0
1 0
0 [1, 0, 0, -1]
Out of bounds for east agent
Old pos: 0 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
1 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 0 3
New state: 0 [ 1.  0.

New state: 5 [ 0.  0.  1. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 5
0 0
5 [0, 0, 1, -1]
Collision
Old pos: 2 3
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 5
1 1
5 [0, 0, 1, -1]
Out of bounds for west agent
Old pos: 2 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 3
1 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New st

current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: 0.0
current_state: 5
1 1
5 [0, 0, 1, -1]
Out of bounds for west agent
Old pos: 2 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
1 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 0
1 0
0 [1, 0, 0, -1]
Out of bounds for east agent
Old pos: 0 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 0
1 [1, 0, -1, 0]
Collision
Old pos: 0 2
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_rew

0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 0
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: 0.0 west_reward: 1.0
current_state: 2
0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
eas

New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 3
1 0
3 [0, 1, 0, -1]
Old pos: 1 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 1
1 1
1 [1, 0, -1, 0]
Out of bounds for east agent
Old pos: 0 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: -1.0 west_reward: 0.0
current_state: 0
1 0
0 [1, 0, 0, -1]
Out of bounds for east agent
Old pos: 0 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
0 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New

0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 0
1 0
0 [1, 0, 0, -1]
Out of bounds for east agent
Old pos: 0 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
0 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: -1.0
current_state: 5
1 1
5 [0, 0, 1, -1]


New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: 1.0
current_state: 2
0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
1 0
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: 0.0 west_reward: 1.0
current_state: 2
0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: 0.0
current_state: 5
1 1
5 [0, 0, 1, -1]
Out of bounds for west agent
Old pos: 2 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 3
1 0
3 [0, 1, 0, -1]
Old pos: 1 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: 0.0 west_reward: 

east_reward: -1.0 west_reward: 0.0
current_state: 1
1 0
1 [1, 0, -1, 0]
Out of bounds for east agent
Old pos: 0 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: 1.0
current_state: 2
1 1
2 [1, -1, 0, 0]
Out of bounds for east agent
Old pos: 0 1
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
1 0
1 [1, 0, -1, 0]
Out of bounds for east agent
Old pos: 0 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: 1.0
current_state: 2
0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
eas

Old pos: 1 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: 0.0 west_reward: 1.0
current_state: 2
0 0
2 [1, -1, 0, 0]
Collision
Old pos: 0 1
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 2
1 1
2 [1, -1, 0, 0]
Out of bounds for east agent
Old pos: 0 1
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
1 0
3 [0, 1, 0, -1]
Old pos: 1 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 1
1 1
1 [1, 0, -1, 0]
Out of bounds for east agent
Old pos: 0 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: -1.0 west_reward: 0.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New st

current_state: 4
1 0
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: 0.0 west_reward: 1.0
current_state: 2
1 1
2 [1, -1, 0, 0]
Out of bounds for east agent
Old pos: 0 1
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 0
1 [1, 0, -1, 0]
Collision
Old pos: 0 2
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
0 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: -1.0
current_state: 5
0 0
5 [0, 0, 1, -1]
Collision
Old pos: 2 3
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 5
1 0
5 [0, 0, 1, -1]
Old pos: 2 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 0

0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: 0.0
current_state: 5
1 0
5 [0, 0, 1, -1]
Old pos: 2 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: 0.0
current_state: 5
1 1
5 [0, 0, 1, -1]
Out of bounds for west agent
Old pos: 2 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 3
1 0
3 [0, 1, 0, -1]
Old pos: 1 3
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 1
0 0
1 [1, 0, -1, 0]
Collision
Old pos: 0 2
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0

New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 2
1 1
2 [1, -1, 0, 0]
Out of bounds for east agent
Old pos: 0 1
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 0
1 [1, 0, -1, 0]
Collision
Old pos: 0 2
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 1
1 0
1 [1, 0, -1, 0]
Out of bounds for east agent
Old pos: 0 2
New pos: 0 1
New state: 2 [ 1. -1.  0.  0.]
east_reward: -1.0 west_reward: 1.0
current_state: 2
0 1
2 [1, -1, 0, 0]
Old pos: 0 1
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
1 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1

2 [1, -1, 0, 0]
Out of bounds for east agent
Old pos: 0 1
New pos: 0 2
New state: 1 [ 1.  0. -1.  0.]
east_reward: -1.0 west_reward: 0.0
current_state: 1
0 1
1 [1, 0, -1, 0]
Old pos: 0 2
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: 0.0 west_reward: 0.0
current_state: 3
0 0
3 [0, 1, 0, -1]
Collision
Old pos: 1 3
New pos: 1 3
New state: 3 [ 0.  1.  0. -1.]
east_reward: -1.0 west_reward: -1.0
current_state: 3
1 1
3 [0, 1, 0, -1]
Out of bounds for west agent
Old pos: 1 3
New pos: 0 3
New state: 0 [ 1.  0.  0. -1.]
east_reward: 0.0 west_reward: -1.0
current_state: 0
0 0
0 [1, 0, 0, -1]
Old pos: 0 3
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: 0.0 west_reward: 0.0
current_state: 4
0 0
4 [0, 1, -1, 0]
Collision
Old pos: 1 2
New pos: 1 2
New state: 4 [ 0.  1. -1.  0.]
east_reward: -1.0 west_reward: -1.0
current_state: 4
0 1
4 [0, 1, -1, 0]
Old pos: 1 2
New pos: 2 3
New state: 5 [ 0.  0.  1. -1.]
east_reward: 1.0 west_reward: 0.0
current_state: 5
0 0
5 [0, 0, 1, -1]
Coll

In [237]:
igq.

array([[-0.12824225, -1.69249195],
       [-1.01207336, -1.37505773],
       [-0.33513446, -1.75881455],
       [ 0.23908937, -0.32106355],
       [-0.01057663, -0.23969134],
       [-1.13538685, -0.06090171]])

In [238]:
igq.west_agent.Q

array([[-0.10939696, -1.75602191],
       [-0.21179065, -0.77624558],
       [-0.9384595 , -0.04945055],
       [-1.31433493, -1.21700079],
       [-0.40278476, -0.20517264],
       [-0.36115856, -1.78026538]])

In [243]:
trsa[0:100, 1:6]

array([[ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  4.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  4.,  0.,  0.],
       [-1., -1.,  4.,  0.,  1.],
       [ 1.,  0.,  5.,  0.,  1.],
       [-1., -1.,  5.,  0.,  0.],
       [-1., -1.,  5.,  0.,  1.],
       [-1., -1.,  5.,  0.,  0.],
       [-1., -1.,  5.,  0.,  0.],
       [-1., -1.,  5.,  1.,  1.],
       [ 0., -1.,  3.,  1.,  0.],
       [ 0.,  0.,  1.,  1.,  1.],
       [-1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  4.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  4.,  1.,  1.],
       [ 0.,  0.,  0.,  1.,  0.],
       [-1.,  0.,  1.,  1.,  1.],
       [-1.,  0.,  0.,  1.,  1.],
       [-1., -1.,  0.,  1.,  1.],
       [-1., -1.,  0.,  1.,  0.],
       [-1.,  0.,  1.,  0.,  0.],
       [-1., -1.,  1.,  0.,  1.],
       [ 0.,  0.,  3.,  1.,  1.],
       [ 0., -1.,  0.,  1.,  1.],
       [-1., -1.,  0.,  1.,  1.],
       [-1., -1.,  0.,  0.,  1.],
       [ 0., -1.,  3.,  0.,  0.],
       [-1., -

In [260]:
trsa[200:500, 3:6]

array([[0., 0., 1.],
       [3., 1., 0.],
       [1., 0., 0.],
       [1., 0., 1.],
       [3., 0., 0.],
       [3., 0., 0.],
       [3., 1., 0.],
       [1., 1., 0.],
       [2., 1., 1.],
       [1., 0., 1.],
       [3., 1., 0.],
       [1., 0., 1.],
       [3., 0., 0.],
       [3., 0., 1.],
       [5., 1., 1.],
       [3., 1., 0.],
       [1., 0., 1.],
       [3., 1., 0.],
       [1., 1., 0.],
       [2., 0., 1.],
       [4., 1., 1.],
       [0., 0., 0.],
       [4., 0., 1.],
       [5., 0., 0.],
       [5., 1., 0.],
       [4., 0., 0.],
       [4., 1., 1.],
       [0., 0., 0.],
       [4., 1., 1.],
       [0., 0., 0.],
       [4., 1., 0.],
       [2., 0., 1.],
       [4., 0., 0.],
       [4., 1., 0.],
       [2., 0., 1.],
       [4., 1., 0.],
       [2., 0., 1.],
       [4., 1., 0.],
       [2., 0., 1.],
       [4., 0., 0.],
       [4., 1., 1.],
       [0., 0., 0.],
       [4., 1., 0.],
       [2., 0., 1.],
       [4., 0., 0.],
       [4., 1., 1.],
       [0., 0., 1.],
       [3., 0