In [148]:
#imports

!pip install ipython-cache
import cache_magic
from random import randint
import sys
import math
import json



# MDP Implementations

In [146]:
# Creating MDP representation for the grid world and organizing its functionalities

class ActionResult:
    def __init__ (self, resultState, prob, reward):
        self.resultState = resultState
        self.prob = prob
        self.reward = reward
        
    def __str__(self):
        return "resultstate: {0}\nprobability: {1}\nReward: {2}".format(str(self.resultState), str(self.prob), str(self.reward))

class Action:
    # abstraction for an action. Expects a state on which this action
    # can be taken and a list of ActionResult
    def __init__ (self, state, results, label):
        self.state = state
        self.results = results
        self.label = label
    
    def __str__(self):
        s = "{0} - {1}\n".format(str(self.state), self.label)
        for r in self.results:
            s = s + str(r) + "\n"
            
        return s
#         return str(self.state) + " " + str([str(r) for r in self.results])
     
class State:
    def __init__(self, id, actions, value):
        self.id = id
        self.actions = actions
        self.value = value       

        def __str__(self):
            return str(self.id) + " " + str(self.value) + " " + str(self.actions)

class GridWorldMDP(object):        
    def __init__ (self, rewards, rows, cols, actions):
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
        self.policy = [['u' for j in range(cols)] for i in range(rows)]
        self.value = [[0 for j in range(cols)] for i in range(rows)]
        self.states = [[State((i, j), actions[(i, j)], 0) for j in range(cols)] for i in range(rows)]
    
    def getStateValue(self, state):
        return self.states[state[0]][state[1]].value

    def setStateValue(self, state, v):
        self.states[state[0]][state[1]].value = v
        
    def upadateStateValue(self, state):
        maxV = float("-inf")
        actions = self.states[state[0]][state[1]].actions
        for i in range(len(actions)):
            action = actions[i]
            resultState = action.results[0].resultState
            r = action.results[0].reward
            v = self.getStateValue(resultState)
            if r + v > maxV:
                maxV = r + v
                self.policy[state[0]][state[1]] = action.label
                self.setStateValue(state, v)
    
    def printValues(self):
        for r in self.value:
            print(r)
            
    def getPolicy(self):
        return self.policy
        
        
    def printPolicy(self):
        for r in self.policy:
            print(r)
        print()

    def iterateValues(self):
        for i in range(self.rows):
            for j in range(self.cols):
                self.upadateStateValue((i, j))

    def iterateValuesUntilConverge(self, printSteps=False):
        while True:
            values = self.value
            self.iterateValues()
            if printSteps:
                print()
                self.printPolicy()      
            if values == self.value:
                break
                
    def repeatIterateValues(self, n, printSteps=False):
        for i in range(n):
            self.iterateValues()
            if printSteps:
                print()
                self.printPolicy()        

class DeterministicGridWorldMDP(GridWorldMDP):
    transitions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
    nTransitionsPerState = len(transitions)
    # creates a gridworld representation where the reward for every transition
    # is given in the rewards matrix which means the reward the agent gets when achieving that reward
    def __init__ (self, rewards, rows, cols):
        actions = DeterministicGridWorldMDP.getDeterministicGridWorldDefaultActions(rewards, rows, cols)
        super(DeterministicGridWorldMDP, self).__init__(rewards, rows, cols, actions)

    @staticmethod
    def getDeterministicGridWorldDefaultActions(rewards, rows, cols):
        keys = []
        for j in range(cols):
            for i in range(rows):
                keys.append((i, j))
        actions = {key: DeterministicGridWorldMDP.generateActions(key[0], key[1], rewards, rows, cols) for key in keys}
        
        return actions
        
    @staticmethod
    def isValidTransition(dest, rows, cols):
        return  not ((dest[0] < 0 or dest[0] >= rows) or (dest[1] < 0 or dest[1] >= cols))
    
    @staticmethod
    def generateActions(i, j, rewards, rows, cols):
        actions = []
        prettyDirs = ['r', 'l', 'd', 'u']
        for transition in range(DeterministicGridWorldMDP.nTransitionsPerState):
            resultState = (i + DeterministicGridWorldMDP.transitions[transition][0], j + DeterministicGridWorldMDP.transitions[transition][1])
            if DeterministicGridWorldMDP.isValidTransition(resultState, rows, cols):
                actions.append(Action((i, j), [ActionResult(resultState, 1, rewards[resultState[0]][resultState[1]])], prettyDirs[transition]))
         
        return actions

# Auxiliary functions

In [91]:
def printMatrix(matrix):
    for r in matrix:
        print(r)
    print()

def generateRandomRewards(row, cols, goalReward, notGoalReward):
    goalCoordX, goalCoordY = randint(0, rows - 1), randint(0, cols - 1)
    rewards = [[notGoalReward for j in range(cols)] for i in range(rows)]
    rewards[goalCoordX][goalCoordY] = goalReward
    return rewards, (goalCoordX, goalCoordY)

def generateRandomRewardsWithObstacles(row, cols, goalReward, notGoalReward, obstacleReward, nObstacles):
    
    rewards = [[notGoalReward for j in range(cols)] for i in range(rows)]
    
    goalCoordX, goalCoordY = randint(0, rows - 1), randint(0, cols - 1)
    goal = (goalCoordX, goalCoordY)
    rewards[goalCoordX][goalCoordY] = goalReward
    
    obstacles = []
    while nObstacles > 0:
        obs = randint(0, rows - 1), randint(0, cols - 1)
        if not obs in obstacles and not obs == goal:
            obstacles.append(obs)
            nObstacles = nObstacles - 1
            rewards[obs[0]][obs[1]] = obstacleReward
    
    return rewards, goal, obstacles


def markGoalAndObstaclesOnGrid(grid, goal, obstacles):
    grid[goal[0]][goal[1]] = 'G'
    for o in obstacles:
        grid[o[0]][o[1]] = 'X'
    
    return grid


# Tests with deterministics gridworld environments

In [125]:
# First dummy smoke test to make sure everything is not absurdly wrong

rewards = [
    [-1, -1, -1],
    [-1, 100, -1],
    [-1, -1, -1]
]

rows, cols = 3, 3

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.printPolicy()
mdp.repeatIterateValues(10)
print ()
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), (1,1), []))

['u', 'u', 'u']
['u', 'u', 'u']
['u', 'u', 'u']


['r', 'd', 'l']
['r', 'G', 'l']
['r', 'u', 'l']



In [93]:
# Testing random test case for 10 rows and cols  

rows, cols = 10, 10

rewards, goal = generateRandomRewards(rows, cols, 100, 0)
mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.printPolicy()
mdp.repeatIterateValues(5)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, []))

['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u', 'u']

['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'd', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'G', 'l', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'u', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r',

In [94]:
# Grid world with obstacles (things are getting interesting)

rows, cols = 5, 5

rewards, goal, obs = generateRandomRewardsWithObstacles(rows, cols, 100, 0, -1000, 5)
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.printPolicy()
mdp.repeatIterateValues(10)
print()

printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

rewards
[0, 0, 0, 0, -1000]
[100, -1000, -1000, -1000, 0]
[0, 0, 0, 0, 0]
[-1000, 0, 0, 0, 0]
[0, 0, 0, 0, 0]

goal (1, 0)
obstacles [(0, 4), (1, 2), (1, 3), (1, 1), (3, 0)]

['u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u']
['u', 'u', 'u', 'u', 'u']


['d', 'r', 'r', 'l', 'X']
['G', 'X', 'X', 'X', 'd']
['u', 'r', 'r', 'r', 'l']
['X', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'l']



In [95]:
# This case proves my iterate until converge is not working propperly

rewards = [
    [0, 0, 0, -1000, 0],
    [0, -1000, 0, 0, -1000],
    [-1000, 0, 0, 0, 0],
    [0, -1000, 100, 0, 0],
    [0, 0, 0, 0, 0]
]

goal = (3, 2)
obs = [(1, 4), (3, 1), (2, 0), (1, 1), (0, 3)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()

printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))


rewards
[0, 0, 0, -1000, 0]
[0, -1000, 0, 0, -1000]
[-1000, 0, 0, 0, 0]
[0, -1000, 100, 0, 0]
[0, 0, 0, 0, 0]

goal (3, 2)
obstacles [(1, 4), (3, 1), (2, 0), (1, 1), (0, 3)]

['r', 'r', 'l', 'X', 'l']
['u', 'X', 'r', 'l', 'X']
['X', 'r', 'd', 'r', 'l']
['d', 'X', 'G', 'l', 'l']
['r', 'r', 'u', 'r', 'l']

['r', 'r', 'l', 'X', 'l']
['u', 'X', 'r', 'l', 'X']
['X', 'r', 'd', 'r', 'l']
['d', 'X', 'G', 'l', 'l']
['r', 'r', 'u', 'r', 'l']



In [96]:
# Another case where iterating until converge do not work

rewards = [
    [0,     0,      0,     0, 0],
    [0,     0,     -1000,  0, -1000],
    [0,     0,     0,      0, 0],
    [0,     -1000, -1000,  -1000, -1000],
    [0,     0,     0,    0, 100]
]

rows, cols = 5, 5

goal = (4, 4)
obs = [(3, 1),(3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

rewards
[0, 0, 0, 0, 0]
[0, 0, -1000, 0, -1000]
[0, 0, 0, 0, 0]
[0, -1000, -1000, -1000, -1000]
[0, 0, 0, 0, 100]

goal (4, 4)
obstacles [(3, 1), (3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']



In [97]:
# Lets have variable obstacles with variable penalties

rewards = [
    [0,     0,      0,     0, 0],
    [0,     0,     -1000,  0, -1000],
    [0,     0,     0,      0, 0],
    [0,     -250, -500,  -750, -1000],
    [0,     0,     0,    0, 100]
]

rows, cols = 5, 5

goal = (4, 4)
obs = [(3, 1),(3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

rewards
[0, 0, 0, 0, 0]
[0, 0, -1000, 0, -1000]
[0, 0, 0, 0, 0]
[0, -250, -500, -750, -1000]
[0, 0, 0, 0, 100]

goal (4, 4)
obstacles [(3, 1), (3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']



In [98]:
# Lets now give negative reward for moving

rewards = [
    [-100,     -100,      -100,     -100, -10],
    [-100,     -100,     -1000,  -100, -1000],
    [-100,     -100,     -100,      -100, -10],
    [-100,     -250, -500,  -750, -1000],
    [-100,     -100,     -100,    -100, 100]
]

rows, cols = 5, 5

goal = (4, 4)
obs = [(3, 1),(3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(20)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

rewards
[-100, -100, -100, -100, -10]
[-100, -100, -1000, -100, -1000]
[-100, -100, -100, -100, -10]
[-100, -250, -500, -750, -1000]
[-100, -100, -100, -100, 100]

goal (4, 4)
obstacles [(3, 1), (3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']



In [99]:
# how long it takes to converge

rows, cols = 100, 100

rewards, goal, obs = generateRandomRewardsWithObstacles(rows, cols, 100, 0, -1000, 100)
print ("rewards")
# printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()
policy1 = mdp.getPolicy()
# printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
policy2 = mdp.getPolicy()
# printMatrix(markGoalAndObstaclesOnGrid(mdp.getPolicy(), goal, obs))

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(50)
policy3 = mdp.getPolicy()

mdp = DeterministicGridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(100)
policy4 = mdp.getPolicy()

print(policy1 == policy2)
print(policy2 == policy3)
print(policy3 == policy4)


rewards
goal (34, 49)
obstacles [(24, 48), (91, 15), (77, 84), (92, 57), (22, 10), (71, 2), (75, 21), (38, 47), (7, 73), (21, 38), (31, 89), (59, 68), (28, 9), (23, 98), (74, 98), (45, 55), (5, 29), (64, 29), (1, 28), (96, 77), (41, 45), (3, 55), (53, 9), (62, 62), (99, 75), (74, 65), (31, 52), (81, 10), (66, 52), (10, 78), (52, 68), (80, 82), (9, 93), (80, 8), (68, 17), (8, 64), (59, 8), (66, 41), (74, 82), (39, 7), (60, 10), (31, 72), (23, 68), (19, 63), (97, 59), (95, 35), (33, 87), (99, 66), (24, 14), (31, 10), (0, 69), (27, 6), (98, 49), (93, 65), (79, 82), (71, 78), (72, 2), (94, 68), (88, 30), (12, 26), (36, 25), (23, 2), (6, 34), (55, 8), (38, 76), (91, 71), (55, 26), (51, 17), (64, 44), (48, 58), (77, 8), (82, 50), (91, 49), (48, 29), (7, 40), (58, 80), (46, 25), (8, 62), (88, 40), (16, 63), (73, 2), (1, 14), (54, 51), (90, 77), (97, 77), (14, 97), (87, 96), (90, 38), (12, 24), (31, 43), (74, 22), (29, 65), (98, 46), (41, 53), (81, 27), (99, 39), (88, 37), (22, 41), (59, 55), 

In [151]:
# First dummy smoke test to make sure everything is not absurdly wrong

rewards = [
    [-1, -1, -1],
    [-1, 100, -1],
    [-1, -1, -1]
]

rows, cols = 3, 3

stateActions = DeterministicGridWorldMDP.getDeterministicGridWorldDefaultActions(rewards, rows, cols)
for s in stateActions:
    for a in stateActions[s]:
        print(str(a))

# print(stateActions.__dict__)
# s = json.dumps(stateActions)
# print(s)

(0, 0) - r
resultstate: (0, 1)
probability: 1
Reward: -1

(0, 0) - d
resultstate: (1, 0)
probability: 1
Reward: -1

(1, 0) - r
resultstate: (1, 1)
probability: 1
Reward: 100

(1, 0) - d
resultstate: (2, 0)
probability: 1
Reward: -1

(1, 0) - u
resultstate: (0, 0)
probability: 1
Reward: -1

(2, 0) - r
resultstate: (2, 1)
probability: 1
Reward: -1

(2, 0) - u
resultstate: (1, 0)
probability: 1
Reward: -1

(0, 1) - r
resultstate: (0, 2)
probability: 1
Reward: -1

(0, 1) - l
resultstate: (0, 0)
probability: 1
Reward: -1

(0, 1) - d
resultstate: (1, 1)
probability: 1
Reward: 100

(1, 1) - r
resultstate: (1, 2)
probability: 1
Reward: -1

(1, 1) - l
resultstate: (1, 0)
probability: 1
Reward: -1

(1, 1) - d
resultstate: (2, 1)
probability: 1
Reward: -1

(1, 1) - u
resultstate: (0, 1)
probability: 1
Reward: -1

(2, 1) - r
resultstate: (2, 2)
probability: 1
Reward: -1

(2, 1) - l
resultstate: (2, 0)
probability: 1
Reward: -1

(2, 1) - u
resultstate: (1, 1)
probability: 1
Reward: 100

(0, 2) - l
