In [1]:
import numpy as np 
import itertools

In [2]:

class SetupDeterministicTransitionByStateSet2Agent(object):
    """Build the joint transition table for two agents sharing one grid.

    A joint state is a pair ``(agent1Position, agent2Position)`` of distinct
    single-agent states, plus the absorbing marker ``'terminal'``.  A joint
    action is a pair of single-agent actions.  Calling the instance returns
    ``{jointState: {jointAction: {nextJointState: probability}}}``.

    Parameters
    ----------
    stateSet : list of hashable tuples
        Positions a single agent may occupy.
    actionSet : list of tuples
        Moves available to a single agent; ``(0, 0)`` is treated as "stay"
        when resolving collisions.
    goalStates : list of tuples, optional
        Positions that end the episode: once either agent stands on one,
        every action leads to ``'terminal'``.  Defaults to no goals.
    """

    def __init__(self, stateSet, actionSet, goalStates=None):
        self.stateSet = stateSet
        # create a joint state set from a single agent state set, add terminal state to the set
        self.jointStateSet = [(s1, s2) for s1, s2 in itertools.product(stateSet, stateSet) if s1 != s2] + ['terminal']
        self.jointActionSet = list(itertools.product(actionSet, actionSet))
        # A fresh list per instance: a literal `[]` default would be shared by every instance.
        self.goalStates = goalStates if goalStates is not None else []
        # Hash-based copies used only for membership tests, so each lookup is O(1)
        # instead of a scan over the whole list.
        self._singleStateLookup = set(stateSet)
        self._jointStateLookup = set(self.jointStateSet)

    def __call__(self):
        """Return the full table ``{jointState: {jointAction: {nextState: prob}}}``."""
        return {state: self.getStateTransition(state) for state in self.jointStateSet}

    def getStateTransition(self, state):
        """Return ``{jointAction: {nextState: prob}}`` for one joint state."""
        return {action: self.getStateActionTransition(state, action) for action in self.jointActionSet}

    def getStateActionTransition(self, currentState, action):
        """Return the next-state distribution for one (joint state, joint action) pair.

        The terminal marker and any state in which an agent stands on a goal
        lead to ``'terminal'`` with probability 1, whatever the action.
        """
        # Check the marker first so the string is never iterated character by character.
        if currentState == 'terminal':
            return {'terminal': 1.0}
        if any(agentPosition in self.goalStates for agentPosition in currentState):
            return {'terminal': 1.0}
        return self.getTransitionDistribution(currentState, action)

    def getTransitionDistribution(self, state, action):
        """Apply both agents' moves and resolve off-board moves and collisions.

        Raises
        ------
        ValueError
            If no rule applies, which only happens when ``state`` is not a
            valid joint state built from ``stateSet``.
        """
        # if you directly apply the action to the current state, what the potential next state for each agent is
        agent1NextState, agent2NextState = [self.addTuples(agentS, agentA) for agentS, agentA in zip(state, action)]

        # if a move takes you off the board, you cannot take it and instead that agent
        # remains stationary; a "fixed" agent must remain stationary
        agent1Fixed = agent1NextState not in self._singleStateLookup
        if agent1Fixed:
            agent1NextState = state[0]
        agent2Fixed = agent2NextState not in self._singleStateLookup
        if agent2Fixed:
            agent2NextState = state[1]

        # resulting joint state from taking into account moves off the board - is it a viable move
        onBoardPotentialNextState = (agent1NextState, agent2NextState)

        # if it is viable, agents will not collide and it should be in the joint state set
        if onBoardPotentialNextState in self._jointStateLookup:
            return {onBoardPotentialNextState: 1.0}

        # if it is not in the joint state set, there is a collision
        if agent1NextState == agent2NextState:
            # collision 1: one agent runs into the stationary other
            if action[0] == (0, 0) or action[1] == (0, 0):
                return {state: 1.0}
            # collision 2: one agent tries to move off the board (and must stay stationary),
            # the other collides into it there
            if agent1Fixed or agent2Fixed:
                return {state: 1.0}
            # collision 3: a collision on the board, each agent wins the cell half of the time
            agent1Moves = (agent1NextState, state[1])
            agent2Moves = (state[0], agent2NextState)
            return {agent1Moves: .5, agent2Moves: .5}

        # Neither a valid joint state nor a collision: fail loudly rather than return None.
        raise ValueError('No transition defined from state {} under action {}'.format(state, action))

    def addTuples(self, tuple1, tuple2):
        """Add two tuples element-wise, truncating to the shorter one."""
        return tuple(first + second for first, second in zip(tuple1, tuple2))

    
class SetupRewardTable2AgentDistanceCost(object):
    """Build a reward table R(s, a, s') for the two-agent grid world.

    The reward of a transition is the movement cost of the joint action plus
    the value of any special tile (goal or trap) occupied in the next state.

    Parameters
    ----------
    transitionTable : dict
        ``{state: {action: {nextState: probability}}}``; only its keys are used.
    goalStates : list of tuples, optional
        Goal positions.  Defaults to none.
    trapStates : list of tuples, optional
        Trap positions.  Defaults to none.
    """

    def __init__(self, transitionTable, goalStates=None, trapStates=None):
        self.transitionTable = transitionTable
        # Fresh lists per instance: literal `[]` defaults would be shared by every instance.
        self.goalStates = goalStates if goalStates is not None else []
        self.trapStates = trapStates if trapStates is not None else []

    def __call__(self, goalReward = 10, trapCost = -100, costOfNoMovement = .1):
        """Return ``{state: {action: {nextState: reward}}}`` mirroring the transition table."""
        rewardTable = {
            state: {
                action: {
                    nextState: self.applyRewardFunction(state, action, nextState,
                                                        goalReward, trapCost, costOfNoMovement)
                    for nextState in nextStateDict.keys()
                }
                for action, nextStateDict in actionDict.items()
            }
            for state, actionDict in self.transitionTable.items()
        }
        return rewardTable

    def applyRewardFunction(self, state, action, nextState, goalReward, trapCost, costOfNoMovement):
        """Return the reward for moving from ``state`` to ``nextState`` via ``action``."""
        # terminal state has no reward or cost
        if state == 'terminal':
            return 0
        # Unless already in the terminal state, incur the cost of action
        movementCosts = self.getCosts(state, action, costOfNoMovement)

        # if the next state is a special tile,
        # the cost/reward of s, a, s' corresponds to the value of that tile
        specialTileCosts = self.getSpecialTileRewards(nextState, trapCost, goalReward)
        return movementCosts + specialTileCosts

    def getSpecialTileRewards(self, state, trapCost, goalReward):
        """Return the goal/trap value of the tiles occupied in ``state``.

        The goal reward is added once if either agent is on a goal, and the
        trap cost is subtracted once if either agent is on a trap.  Signs of
        ``goalReward`` and ``trapCost`` are ignored (``abs`` is applied).
        """
        # The terminal marker is a string, not a pair of positions: it holds no tiles.
        if state == 'terminal':
            return 0

        agent1State, agent2State = state[0], state[1]
        reward = 0
        if (agent1State in self.goalStates) or (agent2State in self.goalStates):
            reward = reward + abs(goalReward)
        if agent1State in self.trapStates or agent2State in self.trapStates:
            reward = reward - abs(trapCost)
        return reward

    def getCosts(self, state, action, costOfNoMovement):
        """Return the summed movement cost of both agents' actions.

        No cost is charged when either agent is already on a goal, because
        the transition moves to the terminal state no matter what.
        """
        agent1State, agent2State = state[0], state[1]
        if agent1State in self.goalStates or agent2State in self.goalStates:
            return 0
        return sum([self.getCostOfDistance(agentAction, costOfNoMovement) for agentAction in action])

    def getCostOfDistance(self, action, costOfNoMovement, nullAction = (0,0)):
        """Return the (negative) cost of one agent's action.

        Staying still costs ``abs(costOfNoMovement)``; any other action costs
        the sum of the absolute values of its coordinates.
        """
        # TODO (original author's note): Need to fix this for two agents
        if action == nullAction:
            return -abs(costOfNoMovement)
        actionDistance = sum([abs(actionCoordinate) for actionCoordinate in action])
        return -actionDistance

import math

class BoltzmannValueIteration(object):
    """Value iteration followed by a Boltzmann (softmax) policy read-out.

    Parameters
    ----------
    transitionTable : dict
        ``{state: {action: {nextState: probability}}}``.
    rewardTable : dict
        ``{state: {action: {nextState: reward}}}`` with the same keys.
    valueTable : dict
        ``{state: initialValue}``; updated in place during iteration.
    convergenceTolerance : float
        Sweeps stop once the largest value change in a sweep is at or below this.
    discountingFactor : float
        Discount applied to the value of the next state.
    beta : float
        Multiplier applied to Q-values before exponentiation in the policy.
    """

    def __init__(self, transitionTable, rewardTable, valueTable, convergenceTolerance, discountingFactor, beta):
        self.transitionTable = transitionTable
        self.rewardTable  = rewardTable
        self.valueTable = valueTable
        self.convergenceTolerance = convergenceTolerance
        self.gamma = discountingFactor
        self.beta = beta

    def __call__(self):
        """Run value iteration and return ``[valueTable, policyTable]``.

        State values are updated in place with the maximum Q-value over
        actions; the policy is computed once, after the sweeps have stopped.
        """
        # Start above the tolerance so that (for a positive tolerance) the loop is entered.
        delta = self.convergenceTolerance*100
        while delta > self.convergenceTolerance:
            delta = 0
            for state, actionDict in self.transitionTable.items():
                valueOfStateAtTimeT = self.valueTable[state]
                qforAllActions = [self.getQValue(state, action) for action in actionDict.keys()]
                self.valueTable[state] = max(qforAllActions)
                delta = max(delta, abs(valueOfStateAtTimeT - self.valueTable[state]))
        policyTable = {state: self.getBoltzmannPolicy(state) for state in self.transitionTable.keys()}

        return [self.valueTable, policyTable]

    def getBoltzmannPolicy(self, state, printStatments = False):
        """Return ``{action: probability}`` proportional to ``exp(beta * Q(state, action))``.

        The largest exponent is subtracted from all exponents before calling
        ``exp``.  This leaves the normalized probabilities unchanged but keeps
        ``exp`` from overflowing on large positive exponents and guarantees at
        least one non-zero term when every exponent is very negative.

        ``printStatments`` (name kept as-is for existing callers) prints a
        notice when an exponent above 700 had to be shifted.
        """
        actions = list(self.transitionTable[state].keys())
        if not actions:
            return {}
        exponents = [self.beta*self.getQValue(state, action) for action in actions]

        largestExponent = max(exponents)
        if printStatments and largestExponent > 700:
            print("shifting exponents for numerical stability... On State:")
            print(state)

        statePolicy = {action: math.exp(exponent - largestExponent)
                       for action, exponent in zip(actions, exponents)}
        return self.normalizeDictionaryValues(statePolicy)

    def getQValue(self, state, action):
        """Return the expected reward plus discounted next-state value of ``action`` in ``state``."""
        nextStatesQ = [prob*(self.rewardTable[state][action][nextState]
                             + self.gamma*self.valueTable[nextState])
                       for nextState, prob in self.transitionTable[state][action].items()]
        return sum(nextStatesQ)

    def normalizeDictionaryValues(self, unnormalizedDictionary):
        """Return a copy of the dictionary with values divided by their sum."""
        totalSum = sum(unnormalizedDictionary.values())
        return {originalKey: val/totalSum for originalKey, val in unnormalizedDictionary.items()}




In [3]:
# Single-agent state space: every (x, y) cell of the board.
gridNumberX = 5
gridNumberY = 5
states = list(itertools.product(range(gridNumberX), range(gridNumberY)))
# Single-agent moves; (0, 0) means "stay in place".
actions = [(-1, 0), (0, 1), (1, 0), (0, -1), (0, 0)]

# unambiguous goals
# goals = [(4, 2), (0, 3)]
# goals = [(4, 0), (2, 3)]

# ambiguous goals
goals = [(1, 4), (2, 1)]
gettransition = SetupDeterministicTransitionByStateSet2Agent(states, actions, goals)
transitionTable = gettransition()

# No trap states are passed, so only movement costs and goal rewards apply.
getReward = SetupRewardTable2AgentDistanceCost(transitionTable, goals)
rewardTable = getReward()

convergence = .000001
gamma = .95
# Initial values; value iteration updates this dictionary in place.
valueTable = {state: 0 for state in transitionTable.keys()}
beta = 3

performValueIteration = BoltzmannValueIteration(transitionTable, rewardTable, valueTable, convergence, gamma, beta)
optimalValues, policy = performValueIteration()

# Joint state (agent 1 position, agent 2 position) inspected in the cells below.
initialState = ((0, 0), (4, 4))



In [4]:
actionSpace = [(-1, 0), (1, 0), (0, 1), (0, -1), (0,0)]
policy_initState = policy[initialState]

In [5]:
{action1: sum([policy_initState[(action1, action2)] for action2 in actionSpace]) for action1 in actionSpace}

{(-1, 0): 0.021482853568032607,
 (1, 0): 0.318687597543442,
 (0, 1): 0.318687597543442,
 (0, -1): 0.021482853568032607,
 (0, 0): 0.31965909777705115}

In [6]:
{action2: sum([policy_initState[(action1, action2)] for action1 in actionSpace]) for action2 in actionSpace}

{(-1, 0): 0.318687597543442,
 (1, 0): 0.03810529223482553,
 (0, 1): 0.03810529223482553,
 (0, -1): 0.03810529223482553,
 (0, 0): 0.5669965257520818}

In [7]:
{action2: policy_initState[((1, 0), action2)] for action2 in actionSpace}

{(-1, 0): 0.016879879554834595,
 (1, 0): 0.016879879554834595,
 (0, 1): 0.016879879554834595,
 (0, -1): 0.016879879554834595,
 (0, 0): 0.2511680793241036}

In [8]:
{action2: policy_initState[((0, -1), action2)] for action2 in actionSpace}

{(-1, 0): 0.016879879554834595,
 (1, 0): 0.0002574408880416658,
 (0, 1): 0.0002574408880416658,
 (0, -1): 0.0002574408880416658,
 (0, 0): 0.0038306513490730144}

In [9]:
{action2: policy_initState[((0, 0), action2)] for action2 in actionSpace}

{(-1, 0): 0.2511680793241036,
 (1, 0): 0.0038306513490730144,
 (0, 1): 0.0038306513490730144,
 (0, -1): 0.0038306513490730144,
 (0, 0): 0.056999064405728515}

In [11]:
a = (4,4)
b = [(2,3), (1,2), (1,1)]
a in b

False

In [12]:
a = {'a': 1, 'b': 2}
for i in a.keys():
    print(i)
    print(a[i])

a
1
b
2


In [17]:
np.unique([(1,2), (1,2),(2,3)], axis = 0)

array([[1, 2],
       [2, 3]])

In [21]:
agent1ActionDist =  {(-1, 0): 0.2, (1, 0): 0.2, (0, -1): 0.2, (0, 1): 0.2, (0, 0): 0.2}
agent2ActionDist =  {(-1, 0): 1}

actionDist = {(agent1Action, agent2Action): agent1ActionDist[agent1Action] * agent2ActionDist[agent2Action]
              for agent1Action in list(agent1ActionDist.keys()) for agent2Action in list(agent2ActionDist.keys())}
actionDist

{((-1, 0), (-1, 0)): 0.2,
 ((1, 0), (-1, 0)): 0.2,
 ((0, -1), (-1, 0)): 0.2,
 ((0, 1), (-1, 0)): 0.2,
 ((0, 0), (-1, 0)): 0.2}

In [25]:
list(zip(*a.items()))

[('a', 'b'), (1, 2)]

In [None]:

def getTransitionWithSingleStateActionPair(self, state, action):
    """Return the next-state distribution for one joint state and joint action.

    Written as a method body: ``self`` must provide ``addTuples``,
    ``stateSet`` (single-agent positions) and ``jointStateSet`` (valid
    position pairs).

    Parameters
    ----------
    state : tuple
        ``(agent1Position, agent2Position)``.
    action : tuple
        ``(agent1Action, agent2Action)``.

    Returns
    -------
    dict
        ``{nextJointState: probability}``.

    Raises
    ------
    ValueError
        If the resulting pair is neither a valid joint state nor a collision,
        which only happens when ``state`` itself is not a valid joint state.
    """
    agent1CurrentState, agent2CurrentState = state
    agent1Action, agent2Action = action
    # Each agent's tentative next position comes from its own position and action.
    agent1NextState = self.addTuples(agent1CurrentState, agent1Action)
    agent2NextState = self.addTuples(agent2CurrentState, agent2Action)

    # if a move takes you off the board, you cannot take it and instead that agent
    # remains stationary; a "fixed" agent must remain stationary
    agent1Fixed = agent1NextState not in self.stateSet
    if agent1Fixed:
        agent1NextState = agent1CurrentState
    agent2Fixed = agent2NextState not in self.stateSet
    if agent2Fixed:
        agent2NextState = agent2CurrentState

    # resulting joint state from taking into account moves off the board - is it a viable move
    onBoardPotentialNextState = (agent1NextState, agent2NextState)

    # if it is viable, agents will not collide and it should be in the joint state set
    if onBoardPotentialNextState in self.jointStateSet:
        return {onBoardPotentialNextState: 1.0}

    # if it is not in the joint state set, there is a collision
    if agent1NextState == agent2NextState:
        # collision 1: one agent runs into the stationary other
        if agent1Action == (0, 0) or agent2Action == (0, 0):
            return {state: 1.0}
        # collision 2: one agent tries to move off the board (and must stay stationary),
        # the other collides into it there
        if agent1Fixed or agent2Fixed:
            return {state: 1.0}
        # collision 3: a collision on the board, each agent wins the cell half of the time
        agent1Moves = (agent1NextState, agent2CurrentState)
        agent2Moves = (agent1CurrentState, agent2NextState)
        return {agent1Moves: .5, agent2Moves: .5}

    # Neither a valid joint state nor a collision: fail loudly rather than return None.
    raise ValueError('No transition defined from state {} under action {}'.format(state, action))


In [None]:
list(zip(*a.items()))

In [32]:
a = {(1,0): 1, (1,1): 2}
b = {(1,9): 3, (1,1): 4}
l = [a,b]

In [33]:
[list(zip(*li.items())) for li in l]

[[((1, 0), (1, 1)), (1, 2)], [((1, 9), (1, 1)), (3, 4)]]

In [34]:
[list(li.keys()) for li in l]

[[(1, 0), (1, 1)], [(1, 9), (1, 1)]]

In [37]:
np.unique([list(li.keys()) for li in l], axis = 1)

array([[[1, 0],
        [1, 1]],

       [[1, 9],
        [1, 1]]])

In [61]:
k = np.unique(np.concatenate([list(li.keys()) for li in l]), axis = 0)
{tuple(ki): sum([dic.get(tuple(ki), 0) for dic in l]) for ki in k}

{(1, 0): 1, (1, 1): 6, (1, 9): 3}

In [55]:
k = np.unique(np.concatenate([list(li.keys()) for li in l]), axis = 0).tolist()
for ki in k:
    print(ki)

[1, 0]
[1, 1]
[1, 9]


In [53]:
# Unique keys across the dictionaries in `l`, one coordinate array per key.
# NOTE(review): the saved cell text (`l[]`) was not valid Python; this expression
# reproduces the recorded output — confirm it matches the original intent.
tuple(np.unique(np.concatenate([list(li.keys()) for li in l]), axis = 0))

(array([1, 0]), array([1, 1]), array([1, 9]))

In [66]:
t = transitionTable[initialState]

In [69]:
actionDist = {(agent1Action, agent2Action): agent1ActionDist[agent1Action] * agent2ActionDist[agent2Action]
              for agent1Action in list(agent1ActionDist.keys()) for agent2Action in
              list(agent2ActionDist.keys())}

def scaleDictValues(scale, dic):
    """Return a new dictionary with every value of ``dic`` multiplied by ``scale``."""
    scaled = {}
    for dictKey, value in dic.items():
        scaled[dictKey] = value * scale
    return scaled

In [73]:
actionDist

{((-1, 0), (-1, 0)): 0.2,
 ((1, 0), (-1, 0)): 0.2,
 ((0, -1), (-1, 0)): 0.2,
 ((0, 1), (-1, 0)): 0.2,
 ((0, 0), (-1, 0)): 0.2}

In [78]:
t = {((-1, 0), (-1, 0)): {((0, 0), (3, 4)): 0.5, ((-5, -5),(6,6)): 0.5},
 ((-1, 0), (0, 1)): {((0, 0), (4, 4)): 1.0},
 ((-1, 0), (1, 0)): {((0, 0), (4, 4)): 1.0},
 ((-1, 0), (0, -1)): {((0, 0), (4, 3)): 1.0},
 ((-1, 0), (0, 0)): {((0, 0), (4, 4)): 1.0},
 ((0, 1), (-1, 0)): {((0, 1), (3, 4)): 1.0},
 ((0, 1), (0, 1)): {((0, 1), (4, 4)): 1.0},
 ((0, 1), (1, 0)): {((0, 1), (4, 4)): 1.0},
 ((0, 1), (0, -1)): {((0, 1), (4, 3)): 1.0},
 ((0, 1), (0, 0)): {((0, 1), (4, 4)): 1.0},
 ((1, 0), (-1, 0)): {((1, 0), (3, 4)): 1.0},
 ((1, 0), (0, 1)): {((1, 0), (4, 4)): 1.0},
 ((1, 0), (1, 0)): {((1, 0), (4, 4)): 1.0},
 ((1, 0), (0, -1)): {((1, 0), (4, 3)): 1.0},
 ((1, 0), (0, 0)): {((1, 0), (4, 4)): 1.0},
 ((0, -1), (-1, 0)): {((0, 0), (3, 4)): 1.0},
 ((0, -1), (0, 1)): {((0, 0), (4, 4)): 1.0},
 ((0, -1), (1, 0)): {((0, 0), (4, 4)): 1.0},
 ((0, -1), (0, -1)): {((0, 0), (4, 3)): 1.0},
 ((0, -1), (0, 0)): {((0, 0), (4, 4)): 1.0},
 ((0, 0), (-1, 0)): {((0, 0), (3, 4)): 1.0},
 ((0, 0), (0, 1)): {((0, 0), (4, 4)): 1.0},
 ((0, 0), (1, 0)): {((0, 0), (4, 4)): 1.0},
 ((0, 0), (0, -1)): {((0, 0), (4, 3)): 1.0},
 ((0, 0), (0, 0)): {((0, 0), (4, 4)): 1.0}}

In [74]:
# Next-state distributions for the five joint actions in which agent 2 moves (-1, 0).
{((-1, 0), (-1, 0)): {((0, 0), (3, 4)): 1.0},
 ((1, 0), (-1, 0)): {((1, 0), (3, 4)): 1.0},
 ((0, -1), (-1, 0)): {((0, 0), (3, 4)): 1.0},
 ((0, 1), (-1, 0)): {((0, 1), (3, 4)): 1.0},
 ((0, 0), (-1, 0)): {((0, 0), (3, 4)): 1.0}}


In [81]:
transitionDists

[{((0, 0), (3, 4)): 0.1, ((-5, -5), (6, 6)): 0.1},
 {((1, 0), (3, 4)): 0.2},
 {((0, 0), (3, 4)): 0.2},
 {((0, 1), (3, 4)): 0.2},
 {((0, 0), (3, 4)): 0.2}]

In [95]:
# Weight each joint action's next-state distribution by that action's probability.
transitionDists = [scaleDictValues(scale, t[action]) for action, scale in zip(actionDist.keys(), actionDist.values())]
# Next states reachable under each action; the same state may appear under several actions.
possibleNextStatesWithDuplicates = [list(transition.keys()) for transition in transitionDists]

# Stack all next states into one array and drop duplicate rows along the first axis.
# NOTE(review): this relies on every next state being a pair of coordinate tuples;
# a string key such as 'terminal' presumably would not concatenate — confirm.
possibleNextStates = np.unique(np.concatenate(possibleNextStatesWithDuplicates), axis=0)
# Convert the array rows back into nested tuples so they can be looked up in the dictionaries above.
possibleNextStatesTuple = tuple([tuple([tuple(agentState) for agentState in agentStates]) for agentStates in possibleNextStates])

# Total weighted probability of each next state, summed over all actions.
transitionDistCombined = {nextState: sum([transition.get(nextState, 0) for transition in transitionDists]) for nextState in possibleNextStatesTuple}



def normalizeValues(dic):
    """Return a new dictionary with every value of ``dic`` divided by the sum of all values."""
    # Sum once up front rather than once per key.
    total = sum(dic.values())
    return {dictKey: value/total for dictKey, value in dic.items()}
normalizeValues(transitionDistCombined)

{((-5, -5), (6, 6)): 0.1,
 ((0, 0), (3, 4)): 0.5,
 ((0, 1), (3, 4)): 0.2,
 ((1, 0), (3, 4)): 0.2}

In [90]:
# transitionDistCombined = {tuple(nextState): 1 for nextState in possibleNextStates}
tuple(possibleNextStates[0][0])


(-5, -5)

In [None]:
possibleNextStatesTuple = tuple([tuple([tuple(agentState) for agentState in agentStates]) for agentStates in possibleNextStates])



In [93]:
tuple([tuple([tuple(agentState) for agentState in agentStates]) for agentStates in possibleNextStates])



(((-5, -5), (6, 6)), ((0, 0), (3, 4)), ((0, 1), (3, 4)), ((1, 0), (3, 4)))

In [92]:
possibleNextStates[0]

array([[-5, -5],
       [ 6,  6]])

In [97]:
a
normalizeValues(a)

{(1, 0): 0.3333333333333333, (1, 1): 0.6666666666666666}

In [100]:
{(1, 0): 1, (1, 1): 2} == {(1, 0): 2, (1, 1): 2}

False

In [104]:
a = {((-1, 0), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (0, 1)): {((0, 0), (0, 2)): 1.0}, ((-1, 0), (1, 0)): {((0, 0), (1, 1)): 1.0}, ((-1, 0), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (0, 1)): {((0, 1), (0, 2)): 1.0}, ((0, 1), (1, 0)): {((0, 1), (1, 1)): 1.0}, ((0, 1), (0, -1)): {((0, 1), (0, 0)): 1.0}, ((0, 1), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (-1, 0)): {((1, 0), (0, 1)): 1.0}, ((1, 0), (0, 1)): {((1, 0), (0, 2)): 1.0}, ((1, 0), (1, 0)): {((1, 0), (1, 1)): 1.0}, ((1, 0), (0, -1)): {((1, 0), (0, 0)): 1.0}, ((1, 0), (0, 0)): {((1, 0), (0, 1)): 1.0}, ((0, -1), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (0, 1)): {((0, 0), (0, 2)): 1.0}, ((0, -1), (1, 0)): {((0, 0), (1, 1)): 1.0}, ((0, -1), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (0, 1)): {((0, 0), (0, 2)): 1.0}, ((0, 0), (1, 0)): {((0, 0), (1, 1)): 1.0}, ((0, 0), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (0, 0)): {((0, 0), (0, 1)): 1.0}}

In [106]:
b = {((-1, 0), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (0, 1)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (1, 0)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((-1, 0), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (0, 1)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((0, 1), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (0, 1)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (1, 0)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((1, 0), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (0, 1)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((0, -1), (0, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (-1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (0, 1)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (1, 0)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (0, -1)): {((0, 0), (0, 1)): 1.0}, ((0, 0), (0, 0)): {((0, 0), (0, 1)): 1.0}}

In [110]:
list(a.values())

[{((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 2)): 1.0},
 {((0, 0), (1, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 1), (0, 2)): 1.0},
 {((0, 1), (1, 1)): 1.0},
 {((0, 1), (0, 0)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((1, 0), (0, 1)): 1.0},
 {((1, 0), (0, 2)): 1.0},
 {((1, 0), (1, 1)): 1.0},
 {((1, 0), (0, 0)): 1.0},
 {((1, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 2)): 1.0},
 {((0, 0), (1, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 2)): 1.0},
 {((0, 0), (1, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0}]

In [111]:
list(b.values())

[{((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0},
 {((0, 0), (0, 1)): 1.0}]

In [113]:
policy_pudd = {((-1, 0), (-1, 0)): {((0, 0), (3, 4)): 0.2, ((0, 1), (3, 4)): 0.4, ((0, 2), (3, 4)): 0.2, ((1, 1), (3, 4)): 0.2}, ((-1, 0), (0, 1)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((-1, 0), (1, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((-1, 0), (0, -1)): {((0, 0), (4, 3)): 0.2, ((0, 1), (4, 3)): 0.4, ((0, 2), (4, 3)): 0.2, ((1, 1), (4, 3)): 0.2}, ((-1, 0), (0, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 1), (-1, 0)): {((0, 0), (3, 4)): 0.2, ((0, 1), (3, 4)): 0.4, ((0, 2), (3, 4)): 0.2, ((1, 1), (3, 4)): 0.2}, ((0, 1), (0, 1)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 1), (1, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 1), (0, -1)): {((0, 0), (4, 3)): 0.2, ((0, 1), (4, 3)): 0.4, ((0, 2), (4, 3)): 0.2, ((1, 1), (4, 3)): 0.2}, ((0, 1), (0, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((1, 0), (-1, 0)): {((0, 0), (3, 4)): 0.2, ((0, 1), (3, 4)): 0.4, ((0, 2), (3, 4)): 0.2, ((1, 1), (3, 4)): 0.2}, ((1, 0), (0, 1)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((1, 0), (1, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((1, 0), (0, -1)): {((0, 0), (4, 3)): 0.2, ((0, 1), (4, 3)): 0.4, ((0, 2), (4, 3)): 0.2, ((1, 1), (4, 3)): 0.2}, ((1, 0), (0, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, -1), (-1, 0)): {((0, 0), (3, 4)): 0.2, ((0, 1), (3, 4)): 0.4, ((0, 2), (3, 4)): 0.2, ((1, 1), (3, 4)): 0.2}, ((0, -1), (0, 1)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, -1), (1, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, 
((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, -1), (0, -1)): {((0, 0), (4, 3)): 0.2, ((0, 1), (4, 3)): 0.4, ((0, 2), (4, 3)): 0.2, ((1, 1), (4, 3)): 0.2}, ((0, -1), (0, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 0), (-1, 0)): {((0, 0), (3, 4)): 0.2, ((0, 1), (3, 4)): 0.4, ((0, 2), (3, 4)): 0.2, ((1, 1), (3, 4)): 0.2}, ((0, 0), (0, 1)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 0), (1, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}, ((0, 0), (0, -1)): {((0, 0), (4, 3)): 0.2, ((0, 1), (4, 3)): 0.4, ((0, 2), (4, 3)): 0.2, ((1, 1), (4, 3)): 0.2}, ((0, 0), (0, 0)): {((0, 0), (4, 4)): 0.2, ((0, 1), (4, 4)): 0.4, ((0, 2), (4, 4)): 0.2, ((1, 1), (4, 4)): 0.2}}

In [115]:
{action1: [policy_pudd[(action1, action2)] for action2 in actionSpace] for action1 in actionSpace}


{(-1,
  0): [{((0, 0), (3, 4)): 0.2,
   ((0, 1), (3, 4)): 0.4,
   ((0, 2), (3, 4)): 0.2,
   ((1, 1), (3, 4)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.4,
   ((0, 2), (4, 4)): 0.2,
   ((1, 1), (4, 4)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.4,
   ((0, 2), (4, 4)): 0.2,
   ((1, 1), (4, 4)): 0.2}, {((0, 0), (4, 3)): 0.2,
   ((0, 1), (4, 3)): 0.4,
   ((0, 2), (4, 3)): 0.2,
   ((1, 1), (4, 3)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.4,
   ((0, 2), (4, 4)): 0.2,
   ((1, 1), (4, 4)): 0.2}],
 (1,
  0): [{((0, 0), (3, 4)): 0.2,
   ((0, 1), (3, 4)): 0.4,
   ((0, 2), (3, 4)): 0.2,
   ((1, 1), (3, 4)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.4,
   ((0, 2), (4, 4)): 0.2,
   ((1, 1), (4, 4)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.4,
   ((0, 2), (4, 4)): 0.2,
   ((1, 1), (4, 4)): 0.2}, {((0, 0), (4, 3)): 0.2,
   ((0, 1), (4, 3)): 0.4,
   ((0, 2), (4, 3)): 0.2,
   ((1, 1), (4, 3)): 0.2}, {((0, 0), (4, 4)): 0.2,
   ((0, 1), (4, 4)): 0.

In [117]:
action2 = (-1, 0)
{action1: policy_pudd[(action1, action2)] for action1 in actionSpace}

{(-1, 0): {((0, 0), (3, 4)): 0.2,
  ((0, 1), (3, 4)): 0.4,
  ((0, 2), (3, 4)): 0.2,
  ((1, 1), (3, 4)): 0.2},
 (1, 0): {((0, 0), (3, 4)): 0.2,
  ((0, 1), (3, 4)): 0.4,
  ((0, 2), (3, 4)): 0.2,
  ((1, 1), (3, 4)): 0.2},
 (0, 1): {((0, 0), (3, 4)): 0.2,
  ((0, 1), (3, 4)): 0.4,
  ((0, 2), (3, 4)): 0.2,
  ((1, 1), (3, 4)): 0.2},
 (0, -1): {((0, 0), (3, 4)): 0.2,
  ((0, 1), (3, 4)): 0.4,
  ((0, 2), (3, 4)): 0.2,
  ((1, 1), (3, 4)): 0.2},
 (0, 0): {((0, 0), (3, 4)): 0.2,
  ((0, 1), (3, 4)): 0.4,
  ((0, 2), (3, 4)): 0.2,
  ((1, 1), (3, 4)): 0.2}}

In [118]:
policy_pudd = {((-1, 0), (-1, 0)): 0.0004555442327720293, ((-1, 0), (0, 1)): 0.0004555442327720293, ((-1, 0), (1, 0)): 0.0004555442327720293, ((-1, 0), (0, -1)): 0.024231917428378217, ((-1, 0), (0, 0)): 0.0067783759724608155, ((0, 1), (-1, 0)): 0.002322973921571803, ((0, 1), (0, 1)): 0.002322973921571803, ((0, 1), (1, 0)): 0.002322973921571803, ((0, 1), (0, -1)): 0.024231917428378217, ((0, 1), (0, 0)): 0.03456522875686419, ((1, 0), (-1, 0)): 0.017360477595931404, ((1, 0), (0, 1)): 0.017360477595931404, ((1, 0), (1, 0)): 0.017360477595931404, ((1, 0), (0, -1)): 0.07731942338935156, ((1, 0), (0, 0)): 0.258319249243124, ((0, -1), (-1, 0)): 0.0004555442327720293, ((0, -1), (0, 1)): 0.0004555442327720293, ((0, -1), (1, 0)): 0.0004555442327720293, ((0, -1), (0, -1)): 0.024231917428378217, ((0, -1), (0, 0)): 0.0067783759724608155, ((0, 0), (-1, 0)): 0.0067783759724608155, ((0, 0), (0, 1)): 0.0067783759724608155, ((0, 0), (1, 0)): 0.0067783759724608155, ((0, 0), (0, -1)): 0.3605644305135387, ((0, 0), (0, 0)): 0.10086041600054105}

In [120]:
{action1: sum([policy_pudd[(action1, action2)] for action2 in actionSpace]) for action1 in actionSpace}


{(-1, 0): 0.03237692609915512,
 (1, 0): 0.3877201054202698,
 (0, 1): 0.06576606794995782,
 (0, -1): 0.03237692609915512,
 (0, 0): 0.48175997443146223}

In [121]:
{action2: sum([policy_pudd[(action1, action2)] for action1 in actionSpace]) for action2 in actionSpace}



{(-1, 0): 0.027372915955508084,
 (1, 0): 0.027372915955508084,
 (0, 1): 0.027372915955508084,
 (0, -1): 0.5105796061880249,
 (0, 0): 0.40730164594545093}

In [124]:
policy_pudd1= {((-1, 0), (-1, 0)): 0.0017828406305877775, ((-1, 0), (0, 1)): 0.0003700343678171849, ((-1, 0), (1, 0)): 0.0017828406305877775, ((-1, 0), (0, -1)): 0.022502948406436615, ((-1, 0), (0, 0)): 0.02652819029134927, ((0, 1), (-1, 0)): 0.0017828406305877775, ((0, 1), (0, 1)): 0.0003700343678171849, ((0, 1), (1, 0)): 0.0017828406305877775, ((0, 1), (0, -1)): 0.022502948406436615, ((0, 1), (0, 0)): 0.02652819029134927, ((1, 0), (-1, 0)): 0.0017828406305877775, ((1, 0), (0, 1)): 0.0003700343678171849, ((1, 0), (1, 0)): 0.0017828406305877775, ((1, 0), (0, -1)): 0.022502948406436615, ((1, 0), (0, 0)): 0.02652819029134927, ((0, -1), (-1, 0)): 0.0017828406305877775, ((0, -1), (0, 1)): 0.0003700343678171849, ((0, -1), (1, 0)): 0.0017828406305877775, ((0, -1), (0, -1)): 0.022502948406436615, ((0, -1), (0, 0)): 0.02652819029134927, ((0, 0), (-1, 0)): 0.02652819029134927, ((0, 0), (0, 1)): 0.005506012122102616, ((0, 0), (1, 0)): 0.02652819029134927, ((0, 0), (0, -1)): 0.3348378353064301, ((0, 0), (0, 0)): 0.3947323546816544}

In [125]:
{action2: sum([policy_pudd1[(action1, action2)] for action1 in actionSpace]) for action2 in actionSpace}



{(-1, 0): 0.03365955281370038,
 (1, 0): 0.03365955281370038,
 (0, 1): 0.006986149593371355,
 (0, -1): 0.42484962893217654,
 (0, 0): 0.5008451158470515}

In [126]:
{action1: sum([policy_pudd1[(action1, action2)] for action2 in actionSpace]) for action1 in actionSpace}



{(-1, 0): 0.052966854326778626,
 (1, 0): 0.052966854326778626,
 (0, 1): 0.052966854326778626,
 (0, -1): 0.052966854326778626,
 (0, 0): 0.7881325826928857}