In [9]:
#!/usr/bin/env python


import numpy as np
from base_gridworld import GridWorld
import time
#from base_csp_test import *
from copy import deepcopy

def update_state_action(state_action_matrix, visit_counter_matrix, observation, new_observation,
                   action, new_action, reward, alpha, gamma, n_cols=3):
    '''Return the state-action matrix after one temporal-difference update.

    Applies Q(s, a) <- Q(s, a) + alpha * (reward + gamma * Q(s', a') - Q(s, a))
    in place, where every grid cell (row, col) is stored in column
    ``col + row * n_cols`` of the matrix.

    @param state_action_matrix the matrix before the update (actions x states)
    @param visit_counter_matrix visit counts per state-action pair; kept in the
           signature for callers but not read. To use a visit-based step size,
           replace ``alpha`` with 1 / (1 + visit_counter_matrix[action, col]).
    @param observation the state observed at t, as (row, col)
    @param new_observation the state observed at t+1, as (row, col)
    @param action the action at t
    @param new_action the action at t+1 (converted with int())
    @param reward the reward observed after the action
    @param alpha the step size (learning rate)
    @param gamma the discount factor
    @param n_cols number of columns of the grid, i.e. the row stride used to
           flatten (row, col); defaults to 3
    @return the updated state action matrix (same object as the input)
    '''
    # Flatten the grid coordinates of the current and the next state.
    col = observation[1] + (observation[0] * n_cols)
    col_t1 = new_observation[1] + (new_observation[0] * n_cols)
    # Values of Q at t and at t+1.
    q = state_action_matrix[action, col]
    q_t1 = state_action_matrix[int(new_action), col_t1]
    # Move Q(s, a) towards the bootstrapped target by a fraction alpha.
    state_action_matrix[action, col] = q + alpha * (reward + gamma * q_t1 - q)
    return state_action_matrix

def update_visit_counter(visit_counter_matrix, observation, action, n_cols=3):
    '''Increment the visit counter of one state-action pair.

    Counts how many times a state-action pair has been visited. The grid
    cell (row, col) is stored in column ``col + row * n_cols``, the same
    layout the state-action matrix uses, so both tables stay aligned.

    @param visit_counter_matrix a matrix initialised with zeros (actions x states)
    @param observation the state observed, as (row, col)
    @param action the action taken
    @param n_cols number of columns of the grid (row stride); defaults to 3
    @return the updated visit counter matrix (same object as the input)
    '''
    col = observation[1] + (observation[0] * n_cols)
    visit_counter_matrix[action, col] += 1.0
    return visit_counter_matrix

def update_policy(policy_matrix, state_action_matrix, observation, n_cols=3):
    '''Return the policy matrix made greedy for the observed state.

    Looks up the column of the state-action matrix that belongs to
    ``observation`` (column ``col + row * n_cols``) and stores the index of
    its largest entry as the action for that grid cell.

    @param policy_matrix the matrix before the update (rows x cols of the grid)
    @param state_action_matrix the state-action matrix (actions x states)
    @param observation the state observed at t, as (row, col)
    @param n_cols number of columns of the grid (row stride); defaults to 3
    @return the updated policy matrix (same object as the input)
    '''
    col = observation[1] + (observation[0] * n_cols)
    # Index of the action with the highest utility in this state.
    best_action = np.argmax(state_action_matrix[:, col])
    policy_matrix[observation[0], observation[1]] = best_action
    return policy_matrix

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1, tot_actions=None):
    '''Return an action chosen epsilon-greedily from the policy.

    The action stored in the policy for ``observation`` is drawn with
    probability 1 - epsilon + epsilon / tot_actions; every other action is
    drawn with probability epsilon / tot_actions.

    @param policy_matrix the current policy (one action index per grid cell)
    @param observation the state observed at t, as (row, col)
    @param epsilon the value used for computing the probabilities
    @param tot_actions size of the action set. When omitted it is inferred as
           the largest action currently in the policy plus one, which hides
           every action above that value from exploration; pass the real
           count to sample from the full action set.
    @return a numpy array of shape (1,) holding the sampled action index
    '''
    if tot_actions is None:
        # Legacy behaviour: guess the action count from the policy content.
        tot_actions = int(np.nanmax(policy_matrix) + 1)
    action = int(policy_matrix[observation[0], observation[1]])
    non_greedy_prob = epsilon / tot_actions
    greedy_prob = 1 - epsilon + non_greedy_prob
    weight_array = np.full((tot_actions), non_greedy_prob)
    weight_array[action] = greedy_prob
    return np.random.choice(tot_actions, 1, p=weight_array)

def print_policy(policy_matrix):
    '''Print the policy, one text row per grid row.

    **       cell holding -1
    B0 .. B8 cell holding the action index 0 .. 8
    #        cell holding NaN
    Any other value contributes nothing to its row.
    '''
    def cell_symbol(value):
        # Same precedence as the printed legend: -1, then 0..8, then NaN.
        if value == -1:
            return " **  "
        for action_id in range(9):
            if value == action_id:
                return " B%d  " % action_id
        if np.isnan(value):
            return " #   "
        return ""

    shape = policy_matrix.shape
    rendered_rows = []
    for row in range(shape[0]):
        symbols = [cell_symbol(policy_matrix[row, col]) for col in range(shape[1])]
        rendered_rows.append("".join(symbols) + '\n')
    print("".join(rendered_rows))

def return_decayed_value(starting_value, global_step, decay_step, decay_rate=0.1):
    """Returns the exponentially decayed value.

    decayed_value = starting_value * decay_rate ^ (global_step / decay_step)
    @param starting_value the value before decaying
    @param global_step the global step to use for decay (positive integer)
    @param decay_step the number of steps after which the value has been
           multiplied by decay_rate once
    @param decay_rate the multiplicative decay per decay_step; defaults to 0.1
    @return the decayed value
    """
    decayed_value = starting_value * np.power(decay_rate, (global_step / decay_step))
    return decayed_value


def policy_maker(destination,other_agent):
    '''Learn a tabular policy that leads the agent to ``destination``.

    Builds a 6x13 GridWorld in which ``destination`` is marked with 1 in the
    state matrix and rewarded with 1.5 (every other cell gives -0.1). It then
    runs 50000 episodes of at most 1000 steps. The first action of an episode
    is uniformly random; later actions are drawn epsilon-greedily from the
    current policy with an epsilon that decays over the episodes. After each
    step the state-action matrix, the greedy policy and the visit counter
    are updated.

    @param destination (row, col) index of the goal cell
    @param other_agent accepted from the caller but not used
    @return tuple (policy_matrix, exe_time, utility_val, episod): the learned
            policy, the training time in seconds, a copy of the state-action
            matrix taken every 1000 episodes, and the step count per episode
    '''
    n_rows, n_cols = 6, 13
    n_actions = 9

    start_time = time.time()
    env = GridWorld(n_rows, n_cols)

    #Define the state matrix
    state_matrix = np.zeros((n_rows, n_cols))
    state_matrix[destination] = 1

    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((n_rows, n_cols), -0.1)
    reward_matrix[destination] = 1.5

    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.eye(n_actions)

    #Random policy; it needs one entry per grid cell, otherwise indexing it
    #with an observation or with the destination fails for columns >= 3
    policy_matrix = np.random.randint(low=0, high=n_actions, size=(n_rows, n_cols)).astype(np.float32)
    policy_matrix[destination] = -1 #No action for the terminal states

    print("Policy Matrix:")
    print(policy_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    # NOTE(review): the update_* helpers flatten (row, col) into a column
    # index with their own row stride; confirm that stride equals n_cols,
    # otherwise distinct grid cells share a column of these tables.
    state_action_matrix = np.zeros((n_actions, n_rows * n_cols))
    visit_counter_matrix = np.zeros((n_actions, n_rows * n_cols))
    gamma = 0.999
    alpha = 0.001 #constant step size
    tot_epoch = 50000
    print_epoch = 1000
    utility_val = []
    episod = []
    for epoch in range(tot_epoch):
        iteration = 0
        epsilon = return_decayed_value(0.1, epoch, decay_step=100000)
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        is_starting = True
        for step in range(1000):
            iteration += 1

            #Explore with the decayed epsilon that is also reported below
            action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=epsilon)
            if(is_starting):
                #Exploring starts: the first action of an episode is random
                action = np.random.randint(0, n_actions)
                is_starting = False
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_action = policy_matrix[new_observation[0], new_observation[1]]
            #Updating the state-action matrix
            state_action_matrix = update_state_action(state_action_matrix, visit_counter_matrix, observation, new_observation,
                                                      action, new_action, reward, alpha, gamma)

            #Updating the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action)
            observation = new_observation

            if done: break
        episod.append(iteration)
        if(epoch % print_epoch == 0):
            print("")
            print("Epsilon: " + str(epsilon))
            print("State-Action matrix after " + str(epoch+1) + " iterations:")
            #Store a snapshot, the live matrix keeps changing
            sam = state_action_matrix.copy()
            utility_val.append(sam)
            print(sam)
            print("Policy matrix after " + str(epoch+1) + " iterations:")
            print_policy(policy_matrix)
    end_time = time.time()
    exe_time = end_time - start_time
    return policy_matrix,exe_time,utility_val,episod

# Result accumulators for agent 1, one entry appended per goal.
all_policies_agent1 = []
all_exe_time_agent1 = []
all_iteration = []
uv_agent1 = []
episodes_agent1 = []

# Goals of agent 1.
# NOTE(review): `ocurrent_position` is not defined in this cell; it has to
# exist already (for example from an earlier notebook cell), otherwise the
# next line raises NameError.
goals_agent1 = ocurrent_position.copy()

# States to avoid (neighbouring states), one per goal.
state_avoid_agent1 = [(0, 12)] * 3

for i in range(len(goals_agent1)):
    # Train for the first goal and whenever the goal differs from the
    # previous one; otherwise the previous result is recorded again.
    goal_changed = i == 0 or goals_agent1[i] != goals_agent1[i - 1]
    if goal_changed:
        policy_a1, e_time_a1, utility_value_a1, episod_a1 = policy_maker(
            goals_agent1[i], state_avoid_agent1[i])

    # agent 1 updates
    all_policies_agent1.append(policy_a1)
    uv_agent1.append(utility_value_a1)
    episodes_agent1.append(episod_a1)
    all_exe_time_agent1.append(e_time_a1)
    print(policy_a1)

# Persist the collected results.
np.save("allpolicies_agent1.npy", all_policies_agent1)
np.save("goals_agent1.npy", goals_agent1)
np.save("utility_value_agent1", uv_agent1)
np.save("episodes_agent1", episodes_agent1)


        

NameError: name 'ocurrent_position' is not defined

In [19]:
#!/usr/bin/env python


import numpy as np
from base_gridworld import GridWorld
import time
#from base_csp_test import *
from copy import deepcopy

def update_state_action(state_action_matrix, visit_counter_matrix, observation, new_observation,
                   action, new_action, reward, alpha, gamma):
    '''Apply one temporal-difference update to the state-action matrix.

    Grid cell (row, col) lives in column ``col + row * 3``. The entry for
    (action, observation) is moved by ``alpha`` times the difference between
    ``reward + gamma * Q(new_observation, new_action)`` and its old value.

    @param state_action_matrix the matrix before the update
    @param visit_counter_matrix visit counts per state-action pair
    @param observation the state observed at t, as (row, col)
    @param new_observation the state observed at t+1, as (row, col)
    @param action the action at t
    @param new_action the action at t+1 (converted with int())
    @param reward the reward observed after the action
    @param alpha the step size (learning rate)
    @param gamma the discount factor
    @return the updated state action matrix (same object as the input)
    '''
    current_idx = observation[0] * 3 + observation[1]
    q_now = state_action_matrix[action, current_idx]
    next_idx = new_observation[0] * 3 + new_observation[1]
    q_next = state_action_matrix[int(new_action), next_idx]
    # Visit-based step size; looked up but not applied, the constant alpha is used.
    alpha_counted = 1.0 / (1.0 + visit_counter_matrix[action, current_idx])

    td_error = reward + gamma * q_next - q_now
    state_action_matrix[action, current_idx] = state_action_matrix[action, current_idx] + alpha * td_error
    return state_action_matrix

def update_visit_counter(visit_counter_matrix, observation, action):
    '''Add one visit to the counter of a state-action pair.

    @param visit_counter_matrix the counter matrix (actions x states)
    @param observation the state observed, as (row, col); it is stored in
           column ``col + row * 3``
    @param action the action taken
    @return the updated counter matrix (same object as the input)
    '''
    row, column = observation[0], observation[1]
    flat_index = row * 3 + column
    visit_counter_matrix[action, flat_index] += 1.0
    return visit_counter_matrix

def update_policy(policy_matrix, state_action_matrix, observation):
    '''Make the policy greedy for the observed state.

    @param policy_matrix the matrix before the update
    @param state_action_matrix the state-action matrix (actions x states)
    @param observation the state observed at t, as (row, col); its values
           are read from column ``col + row * 3``
    @return the updated policy matrix (same object as the input)
    '''
    row, column = observation[0], observation[1]
    # Utilities of every action in this state.
    action_values = state_action_matrix[:, column + (row * 3)]
    policy_matrix[row, column] = np.argmax(action_values)
    return policy_matrix

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1, tot_actions=None):
    '''Sample an action epsilon-greedily around the policy's choice.

    The policy's action for ``observation`` gets probability
    1 - epsilon + epsilon / tot_actions, each remaining action gets
    epsilon / tot_actions.

    @param policy_matrix the current policy (one action index per grid cell)
    @param observation the state observed at t, as (row, col)
    @param epsilon the value used for computing the probabilities
    @param tot_actions size of the action set. If omitted it is inferred as
           the largest action currently stored in the policy plus one, so
           actions above that value cannot be sampled; pass the real count
           to explore the whole action set.
    @return a numpy array of shape (1,) holding the sampled action index
    '''
    if tot_actions is None:
        # Legacy behaviour: guess the action count from the policy content.
        tot_actions = int(np.nanmax(policy_matrix) + 1)
    action = int(policy_matrix[observation[0], observation[1]])
    non_greedy_prob = epsilon / tot_actions
    greedy_prob = 1 - epsilon + non_greedy_prob
    weight_array = np.full((tot_actions), non_greedy_prob)
    weight_array[action] = greedy_prob
    return np.random.choice(tot_actions, 1, p=weight_array)

def print_policy(policy_matrix):
    '''Print the policy, one text row per grid row.

    **       cell holding -1
    A0 .. A8 cell holding the action index 0 .. 8
    #        cell holding NaN
    Any other value contributes nothing to its row.
    '''
    def cell_symbol(value):
        # Checked in this order: -1, then 0..8, then NaN.
        if value == -1:
            return " **  "
        for action_id in range(9):
            if value == action_id:
                return " A" + str(action_id) + "  "
        if np.isnan(value):
            return " #   "
        return ""

    shape = policy_matrix.shape
    policy_string = ""
    for row in range(shape[0]):
        row_symbols = (cell_symbol(policy_matrix[row, col]) for col in range(shape[1]))
        policy_string += "".join(row_symbols) + '\n'
    print(policy_string)

def return_decayed_value(starting_value, global_step, decay_step):
    '''Return ``starting_value`` decayed exponentially with the step count.

    The value is multiplied by 0.1 once per ``decay_step`` steps:
    starting_value * 0.1 ^ (global_step / decay_step).

    @param starting_value the value before decaying
    @param global_step the step to evaluate the decay at
    @param decay_step the number of steps per factor of 0.1
    @return the decayed value
    '''
    exponent = global_step / decay_step
    return starting_value * np.power(0.1, exponent)


def policy_maker(destination,other_agent):
    '''Learn a tabular policy that leads the agent to ``destination``.

    Builds a 6x3 GridWorld in which ``destination`` is marked with 1 in the
    state matrix and rewarded with 1.5 (every other cell gives -0.1). It then
    runs 50000 episodes of at most 1000 steps. The first action of an episode
    is uniformly random; later actions are drawn epsilon-greedily from the
    current policy with an epsilon that decays over the episodes. After each
    step the state-action matrix, the greedy policy and the visit counter
    are updated.

    @param destination (row, col) index of the goal cell
    @param other_agent accepted from the caller but not used
    @return tuple (policy_matrix, exe_time, utility_val, episod): the learned
            policy, the training time in seconds, a copy of the state-action
            matrix taken every 1000 episodes, and the step count per episode
    '''
    n_rows, n_cols = 6, 3
    n_actions = 9

    start_time = time.time()
    env = GridWorld(n_rows, n_cols)

    #Define the state matrix
    state_matrix = np.zeros((n_rows, n_cols))
    state_matrix[destination] = 1

    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((n_rows, n_cols), -0.1)
    reward_matrix[destination] = 1.5

    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.eye(n_actions)

    #Random policy
    policy_matrix = np.random.randint(low=0, high=n_actions, size=(n_rows, n_cols)).astype(np.float32)
    policy_matrix[destination] = -1 #No action for the terminal states

    print("Policy Matrix:")
    print(policy_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    state_action_matrix = np.zeros((n_actions, n_rows * n_cols))
    visit_counter_matrix = np.zeros((n_actions, n_rows * n_cols))
    gamma = 0.999
    alpha = 0.001 #constant step size
    tot_epoch = 50000
    print_epoch = 1000
    utility_val = []
    episod = []
    for epoch in range(tot_epoch):
        iteration = 0
        epsilon = return_decayed_value(0.1, epoch, decay_step=100000)
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        is_starting = True
        for step in range(1000):
            iteration += 1

            #Explore with the decayed epsilon that is also reported below
            action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=epsilon)
            if(is_starting):
                #Exploring starts: the first action of an episode is random
                action = np.random.randint(0, n_actions)
                is_starting = False
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            new_action = policy_matrix[new_observation[0], new_observation[1]]
            #Updating the state-action matrix
            state_action_matrix = update_state_action(state_action_matrix, visit_counter_matrix, observation, new_observation,
                                                      action, new_action, reward, alpha, gamma)

            #Updating the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action)
            observation = new_observation

            if done: break
        episod.append(iteration)
        if(epoch % print_epoch == 0):
            print("")
            print("Epsilon: " + str(epsilon))
            print("State-Action matrix after " + str(epoch+1) + " iterations:")
            #Store a snapshot, the live matrix keeps changing
            sam = state_action_matrix.copy()
            utility_val.append(sam)
            print(sam)
            print("Policy matrix after " + str(epoch+1) + " iterations:")
            print_policy(policy_matrix)

    end_time = time.time()
    exe_time = end_time - start_time
    return policy_matrix,exe_time,utility_val,episod

# Result accumulators for agent 1 (not filled by the single run below).
all_policies_agent1 = []
all_exe_time_agent1 = []
all_iteration = []
uv_agent1 = []
episodes_agent1 = []

# Goal cell of agent 1, as (row, col).
goals_agent1 = (4, 2)

# State to avoid (neighbouring state).
state_avoid_agent1 = (1, 0)

# Train one policy for the single goal above.
policy_a1, e_time_a1, utility_value_a1, episod_a1 = policy_maker(
    destination=goals_agent1,
    other_agent=state_avoid_agent1,
)

# for i in range(len(goals_agent1)):
#     if i==0:
#         #policy,e_time,iteration=policy_maker(goals[i])
#         policy_a1,e_time_a1,utility_value_a1,episod_a1=policy_maker(goals_agent1[i],state_avoid_agent1[i])
#     if i>0:
#         if goals_agent1[i]!=goals_agent1[i-1]:
#             #policy,e_time,iteration=policy_maker(goals[i])
#             policy_a1,e_time_a1,utility_value_a1,episod_a1=policy_maker(goals_agent1[i],state_avoid_agent1[i])

#     #agent 1 updates
#     all_policies_agent1.append(policy_a1)
#     uv_agent1.append(utility_value_a1)
#     episodes_agent1.append(episod_a1)
#     all_exe_time_agent1.append(e_time_a1)
#     print (policy_a1)

# np.save("allpolicies_agent1.npy",all_policies_agent1)
# np.save("goals_agent1.npy",goals_agent1)
# np.save("utility_value_agent1",uv_agent1)
# np.save("episodes_agent1",episodes_agent1)


        

State Matrix:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 1.]
 [0. 0. 0.]]
Reward Matrix:
[[-0.1 -0.1 -0.1]
 [-0.1 -0.1 -0.1]
 [-0.1 -0.1 -0.1]
 [-0.1 -0.1 -0.1]
 [-0.1 -0.1  1.5]
 [-0.1 -0.1 -0.1]]
Policy Matrix:
[[ 1.  6.  5.]
 [ 8.  1.  4.]
 [ 5.  5.  3.]
 [ 5.  2.  5.]
 [ 2.  3. -1.]
 [ 5.  6.  6.]]

Epsilon: 0.1
State-Action matrix after 1 iterations:
[[ 0.         0.        -0.0001     0.         0.        -0.0001
   0.         0.        -0.0001999  0.         0.         0.
   0.         0.         0.         0.         0.         0.       ]
 [ 0.         0.        -0.0001     0.         0.        -0.0001
   0.         0.        -0.0001     0.         0.         0.
   0.         0.         0.        -0.0001     0.         0.       ]
 [ 0.         0.        -0.0001     0.         0.         0.
   0.         0.        -0.0001     0.        -0.0001     0.
  -0.0001     0.         0.         0.         0.         0.       ]
 [ 0.         0.        -0.0001     0.         0


Epsilon: 0.09120108393559098
State-Action matrix after 4001 iterations:
[[-1.52853783e-02 -1.35948775e-02 -1.45953561e-02 -8.25925191e-03
  -6.99857893e-03 -8.12639555e-03 -5.64755243e-04  8.83526131e-03
  -3.36081815e-04 -8.20474951e-04  5.77669032e-02  3.66873697e-03
  -6.35374578e-05  3.39647366e-03  0.00000000e+00 -2.76999036e-03
   6.84505041e-03  1.82798397e-03]
 [-1.45955826e-02 -1.34945795e-02 -1.43979980e-02 -8.74450468e-03
  -7.83963091e-03 -8.99172054e-03 -2.85548721e-03 -3.59821651e-03
  -3.27333529e-03  3.02393140e-04  2.07671948e-02  8.61974888e-04
   2.65138362e-04  2.62730845e-02  0.00000000e+00 -6.70155079e-04
   3.27253547e-03  3.51643811e-01]
 [-1.47911327e-02 -1.38934083e-02 -1.47952607e-02 -8.42092240e-03
  -9.02756413e-03 -7.66788215e-03 -3.43734245e-03 -4.49566909e-03
  -1.59457759e-03  7.91304663e-03  2.58593300e-03  3.48341670e-03
   2.23840767e-01  3.70537691e-03  0.00000000e+00  3.45360629e-03
   5.38977580e-01  3.36771299e-03]
 [-1.51322335e-02 -1.42546916e


Epsilon: 0.08511380382023764
State-Action matrix after 7001 iterations:
[[-1.67425577e-02 -1.49112936e-02 -1.68106518e-02 -7.46468298e-03
  -9.47605525e-04 -5.76203455e-03  4.62116037e-03  4.80695631e-02
   9.29851638e-03  5.45366598e-03  1.16140175e-01  1.36853546e-02
   5.79953927e-03  1.09865446e-02  0.00000000e+00 -2.88923266e-03
   2.06641015e-02  8.35864678e-03]
 [-1.61307281e-02 -1.50947929e-02 -1.53322885e-02 -1.07390295e-02
  -1.00728956e-02 -1.04929638e-02 -2.32843564e-03  5.22499063e-03
  -8.87399996e-04  4.01288247e-03  7.06397997e-02  7.22387476e-03
   5.34758469e-03  5.64140777e-02  0.00000000e+00  6.19166344e-03
   1.58460227e-02  5.31254688e-01]
 [-1.70207118e-02 -1.58897602e-02 -1.59197770e-02 -9.88848430e-03
  -1.20782384e-02 -6.23197050e-03  2.09467115e-03 -1.09302255e-03
   3.86350044e-03  2.39404688e-02  1.52282033e-02  1.14645852e-02
   4.35807689e-01  1.34514204e-02  0.00000000e+00  1.43702031e-02
   8.22897529e-01  9.10916270e-03]
 [-1.67690061e-02 -1.56956061e


Epsilon: 0.07943282347242815
State-Action matrix after 10001 iterations:
[[-1.65936324e-02 -1.47909829e-02 -1.68485218e-02 -2.97552008e-03
   1.41854471e-02 -2.17558991e-03  1.51436608e-02  9.81210184e-02
   2.36428874e-02  1.15654736e-02  1.72340061e-01  2.59060333e-02
   1.73016296e-02  2.61170694e-02  0.00000000e+00 -5.07595475e-04
   3.90440007e-02  2.16080092e-02]
 [-1.61759533e-02 -1.48193828e-02 -1.52343011e-02 -1.10069106e-02
  -1.00210265e-02 -1.06174616e-02  2.75187242e-03  2.15916355e-02
   6.91778163e-03  1.38636007e-02  1.24987538e-01  2.14780162e-02
   1.34471579e-02  7.89462411e-02  0.00000000e+00  1.71319216e-02
   3.19326126e-02  6.97358889e-01]
 [-1.70675383e-02 -1.60029747e-02 -1.58360390e-02 -9.82347609e-03
  -1.18836513e-02 -2.46391579e-04  1.34365565e-02  8.72454975e-03
   2.05940554e-02  4.67707914e-02  3.80575902e-02  2.90720193e-02
   6.28718780e-01  2.85401825e-02  0.00000000e+00  2.53826715e-02
   1.01134363e+00  2.29168174e-02]
 [-1.65425530e-02 -1.54932705


Epsilon: 0.07413102413009175
State-Action matrix after 13001 iterations:
[[-1.37829042e-02 -1.21005147e-02 -1.48692023e-02  5.33922019e-03
   4.20462817e-02  9.83427856e-03  3.62107246e-02  1.47088608e-01
   4.13865845e-02  2.59771807e-02  2.22758839e-01  4.09159624e-02
   2.88392049e-02  4.05213697e-02  0.00000000e+00  5.58104687e-03
   6.81852585e-02  3.53012415e-02]
 [-1.40640048e-02 -1.13373775e-02 -1.31030872e-02 -7.61517049e-03
  -6.78524216e-03 -8.32799835e-03  9.97479482e-03  5.39567127e-02
   1.42637352e-02  2.74308412e-02  1.89918845e-01  3.67357870e-02
   2.27275309e-02  1.07657112e-01  0.00000000e+00  3.17232778e-02
   5.24236385e-02  8.44232299e-01]
 [-1.48608392e-02 -1.36674457e-02 -1.26997574e-02 -7.33949854e-03
  -7.78102635e-03  9.16487460e-03  3.07241848e-02  2.41720144e-02
   3.69677821e-02  7.44374678e-02  7.55516177e-02  4.15008025e-02
   7.64393292e-01  4.94613601e-02  0.00000000e+00  4.80544613e-02
   1.15879622e+00  4.01439486e-02]
 [-1.31369526e-02 -1.20450287


Epsilon: 0.06760829753919818
State-Action matrix after 17001 iterations:
[[-6.57476443e-03 -3.80696720e-03 -9.42457634e-03  1.63015501e-02
   8.87116944e-02  2.78491739e-02  5.42543081e-02  1.98366995e-01
   6.29888140e-02  4.52242618e-02  2.94574961e-01  6.90087128e-02
   5.95325038e-02  7.33992762e-02  0.00000000e+00  1.70163795e-02
   1.06194624e-01  5.33674293e-02]
 [-4.73829672e-03  3.34519025e-04 -3.55058797e-03 -1.30493882e-03
   5.46091127e-04 -1.78769544e-03  2.50777410e-02  1.00299390e-01
   3.24923132e-02  4.47935883e-02  2.61074595e-01  6.67479295e-02
   4.17383212e-02  1.53320150e-01  0.00000000e+00  5.16846031e-02
   7.88390398e-02  9.91391011e-01]
 [-8.73044853e-03 -2.85794932e-03 -4.88691850e-03 -1.95122788e-07
  -2.26396164e-03  2.84434796e-02  5.37098206e-02  4.91919644e-02
   5.98663617e-02  1.13839998e-01  1.17959515e-01  6.53973584e-02
   9.26435284e-01  7.09087563e-02  0.00000000e+00  6.96455828e-02
   1.29247927e+00  5.88894495e-02]
 [-2.78684488e-03 -1.01178865


Epsilon: 0.06309573444801933
State-Action matrix after 20001 iterations:
[[7.42787787e-05 2.34749950e-03 5.06036498e-04 3.01555937e-02
  1.18327854e-01 4.22047755e-02 7.02127814e-02 2.42434402e-01
  8.31051971e-02 6.92893059e-02 3.64080943e-01 8.90665198e-02
  7.79763377e-02 9.40366277e-02 0.00000000e+00 2.73847385e-02
  1.31639747e-01 8.11520731e-02]
 [6.85807365e-03 5.63615361e-03 4.38409739e-03 6.17319618e-03
  1.32584707e-02 5.38596144e-03 4.19311079e-02 1.34511493e-01
  4.72639169e-02 6.62644408e-02 3.10520491e-01 8.10506811e-02
  6.35162296e-02 1.74319568e-01 0.00000000e+00 5.98640986e-02
  1.03073263e-01 1.08279385e+00]
 [5.73676242e-04 5.15741864e-03 4.81159353e-03 1.26405353e-02
  8.12091749e-03 4.70735351e-02 8.63364582e-02 7.61122398e-02
  8.17438091e-02 1.38396234e-01 1.65917754e-01 8.54385599e-02
  1.01049384e+00 8.89461638e-02 0.00000000e+00 9.11928455e-02
  1.35128011e+00 7.02438495e-02]
 [8.81029746e-03 3.79548268e-03 5.94549284e-03 8.03246695e-02
  6.04730663e-02 4.50


Epsilon: 0.057543993733715694
State-Action matrix after 24001 iterations:
[[0.01508115 0.01213672 0.01245442 0.05433208 0.16838193 0.06546067
  0.09191718 0.28783385 0.11899791 0.09556144 0.43982005 0.12470409
  0.09884414 0.12686833 0.         0.04148688 0.16445868 0.10449691]
 [0.02346743 0.02113116 0.02330357 0.02922968 0.0314187  0.01736656
  0.0697031  0.18577431 0.06644123 0.09309815 0.37074901 0.10834144
  0.09952791 0.21289216 0.         0.09347535 0.1453644  1.17414357]
 [0.01858865 0.0206178  0.01762558 0.02593391 0.03068894 0.07421557
  0.11512379 0.10857721 0.10914333 0.16700223 0.23455206 0.10902603
  1.1011563  0.12451369 0.         0.11245671 1.40298642 0.10227568]
 [0.02111765 0.02204834 0.02055805 0.12068637 0.08874264 0.06959685
  0.17777789 0.18205185 0.11385763 1.10702312 0.26421784 0.12132949
  0.11626606 1.24549174 0.         0.81357947 0.12070851 0.10193316]
 [0.71121072 0.0767607  0.01525723 0.91813949 0.14118706 0.07021054
  1.10032109 0.18639849 0.12293175 0.


Epsilon: 0.05248074602497726
State-Action matrix after 28001 iterations:
[[0.03194897 0.03341318 0.03156819 0.06962314 0.21698154 0.09290464
  0.11991406 0.33707481 0.15786932 0.1134478  0.50107974 0.15422731
  0.11824677 0.16909322 0.         0.06663988 0.20537645 0.13706552]
 [0.03680856 0.03834659 0.04835891 0.05183617 0.05966905 0.03450662
  0.09525207 0.2297009  0.10497984 0.12088366 0.41770494 0.13593426
  0.12933457 0.23754251 0.         0.12042536 0.1869177  1.24726796]
 [0.03632992 0.03613619 0.03463614 0.04474864 0.05447022 0.11408486
  0.15291217 0.15346955 0.13763733 0.19601762 0.29920698 0.13885612
  1.17335386 0.15388999 0.         0.14501714 1.43925913 0.13799581]
 [0.04142758 0.04162287 0.04215273 0.1552106  0.1240823  0.08869565
  0.21520961 0.23590734 0.14411972 1.17468303 0.32849089 0.15302399
  0.14543878 1.309969   0.         0.92882664 0.15480986 0.13392553]
 [0.81759753 0.10756247 0.03817604 1.00344886 0.19169004 0.09512039
  1.16953672 0.23270604 0.14904002 0.1


Epsilon: 0.04786300923226384
State-Action matrix after 32001 iterations:
[[0.05929657 0.05715786 0.05208964 0.09444886 0.24294804 0.12362698
  0.14602862 0.37933812 0.18330437 0.14178613 0.55758402 0.17899003
  0.14548767 0.18907522 0.         0.09149604 0.23593568 0.16463234]
 [0.06052536 0.06090852 0.07005008 0.07495835 0.08536158 0.05390865
  0.11688764 0.28842107 0.13624194 0.15252987 0.47882377 0.16791646
  0.15725634 0.26622082 0.         0.14753134 0.21674697 1.29760551]
 [0.05660605 0.05599996 0.05358865 0.06372872 0.07724364 0.14599313
  0.17652373 0.19349835 0.18175892 0.2402698  0.34663825 0.17257309
  1.23306215 0.18722932 0.         0.17502646 1.46283486 0.16444483]
 [0.06208629 0.066157   0.06410715 0.18683608 0.15456906 0.1124386
  0.24404347 0.2850382  0.17393385 1.23021044 0.38868296 0.18528695
  0.18065048 1.35364186 0.         1.03358428 0.17594408 0.16354368]
 [0.90602236 0.13520133 0.05445065 1.07025081 0.23644411 0.11877738
  1.22338769 0.27524383 0.17868795 0.17


Epsilon: 0.043651583224016605
State-Action matrix after 36001 iterations:
[[0.07627664 0.07790074 0.07892028 0.11844476 0.28390413 0.15675486
  0.17453399 0.42673589 0.21889523 0.17147688 0.60816198 0.22550382
  0.18403501 0.21680527 0.         0.12391222 0.27271634 0.19064529]
 [0.08809378 0.08157267 0.09496415 0.09675658 0.11494523 0.07522625
  0.13697087 0.3274211  0.15814197 0.19142497 0.52957721 0.20097199
  0.19431538 0.30299432 0.         0.17229818 0.24998593 1.34096916]
 [0.0750132  0.08283919 0.08272125 0.09773778 0.11081759 0.17566821
  0.19967074 0.22502978 0.20536003 0.26946491 0.39902093 0.204409
  1.27546763 0.22376876 0.         0.20314867 1.47723724 0.19560771]
 [0.08590642 0.09322722 0.09207927 0.21758965 0.18748001 0.14316855
  0.26696901 0.32872202 0.20377549 1.26991998 0.44411099 0.2124156
  0.21188434 1.3902821  0.         1.11690944 0.21279833 0.19168635]
 [0.96884667 0.16698729 0.07428867 1.11920961 0.28250773 0.15491665
  1.2633618  0.3244073  0.21642227 0.212


Epsilon: 0.03981071705534973
State-Action matrix after 40001 iterations:
[[0.10679248 0.08855859 0.10107482 0.14202269 0.31173114 0.18906262
  0.21293574 0.471686   0.24511647 0.21033212 0.65047739 0.26082399
  0.21185471 0.25511991 0.         0.14394604 0.31190749 0.22529637]
 [0.10952291 0.10346569 0.12080062 0.11098528 0.14114275 0.10163226
  0.16282882 0.36644844 0.19771133 0.2225563  0.58363628 0.23575642
  0.22179334 0.3406923  0.         0.20625342 0.29024778 1.3759141 ]
 [0.10387617 0.11636209 0.11133954 0.12835923 0.14546255 0.19931214
  0.23296161 0.28406266 0.23698065 0.30581966 0.45993004 0.23501483
  1.30645272 0.25700876 0.         0.22379301 1.48576233 0.22802954]
 [0.10433828 0.1134833  0.1162798  0.24838933 0.23019397 0.17133687
  0.29642833 0.37145589 0.23137584 1.30382989 0.49645001 0.24816366
  0.24734485 1.41600318 0.         1.1752537  0.24373436 0.22839681]
 [1.0183836  0.189011   0.10086382 1.1596806  0.3154667  0.19290522
  1.29648117 0.36411752 0.25261221 0.2


Epsilon: 0.03630780547701014
State-Action matrix after 44001 iterations:
[[0.13540054 0.11390731 0.12386155 0.16132904 0.3438503  0.21304478
  0.24136069 0.5205551  0.27112911 0.24391859 0.70246692 0.29054231
  0.23470875 0.28578401 0.         0.17126761 0.34993568 0.25359221]
 [0.13978213 0.12751342 0.14147628 0.13472611 0.17430168 0.12444418
  0.19064489 0.41318919 0.23135257 0.25161218 0.63122835 0.26792643
  0.25305459 0.37709305 0.         0.24590673 0.32784943 1.40181465]
 [0.1373823  0.13701087 0.13305741 0.15250268 0.17661871 0.22577772
  0.25108707 0.3226045  0.27306507 0.33897301 0.51236449 0.26220226
  1.33126827 0.28675247 0.         0.25322139 1.49125355 0.25520438]
 [0.13341115 0.14162516 0.14036251 0.28189591 0.25834407 0.19854628
  0.32017031 0.4247354  0.27062451 1.32795497 0.54131273 0.27707955
  0.28654652 1.43690536 0.         1.22351609 0.2633819  0.24943132]
 [1.06045085 0.21028979 0.12695277 1.18749072 0.35604718 0.21320345
  1.31753299 0.4188043  0.2910117  0.2


Epsilon: 0.03311311214825911
State-Action matrix after 48001 iterations:
[[0.1618177  0.14462546 0.14610522 0.18634086 0.38784407 0.23328399
  0.27045964 0.55399871 0.29870545 0.27129726 0.75084305 0.32296625
  0.26536035 0.31084443 0.         0.20009985 0.37972165 0.29108323]
 [0.16538297 0.14534339 0.17067283 0.16128763 0.2091265  0.15254571
  0.21747697 0.45211537 0.26004419 0.28136824 0.65990558 0.28776025
  0.2763759  0.40530967 0.         0.26847249 0.36668671 1.42498243]
 [0.17187444 0.17091829 0.15001217 0.18271332 0.20533957 0.24839075
  0.28723532 0.36097587 0.29961506 0.3728688  0.55787187 0.29226525
  1.34855129 0.31821257 0.         0.28332728 1.49452379 0.28032925]
 [0.15689246 0.16914446 0.15720587 0.30357671 0.28442644 0.23461297
  0.35000526 0.46192952 0.29918996 1.34465464 0.60055735 0.30675894
  0.32085633 1.45116205 0.         1.26318869 0.30354251 0.2756691 ]
 [1.09287964 0.23508395 0.14997851 1.21293443 0.38830331 0.24600075
  1.33638986 0.4651498  0.31513903 0.3

In [15]:
e_time_a1



17.72572612762451