In [0]:
# Creating MDP representation for the grid world and organizing its functionalities

from random import randint
import sys
import math

class GridWorldMDP(object):
    # creates a gridworld representation where the reward for every transition
    # is given in the rewards matrix
    def __init__ (self, rewards, rows, cols, offLimitsPenalty=-100):
        self.transitions = [(0, 1), (0, -1), (1, 0), (-1, 0)]
        self.nTransitionsPerState = len(self.transitions)
        self.policy = [[randint(0, self.nTransitionsPerState-1) for j in range(cols)] for i in range(rows)]
        self.value = [[0 for j in range(cols)] for i in range(rows)]
        self.offLimitsPenalty = offLimitsPenalty
        self.rows = rows
        self.cols = cols
        self.rewards = rewards
    
    def getReward(self, dest):
        return self.rewards[dest[0]][dest[1]]

    def getStateValue(self, state):
        return self.value[state[0]][state[1]]

    def isValidTransition(self, dest):
        return  not ((dest[0] < 0 or dest[0] >= self.rows) or (dest[1] < 0 or dest[1] >= self.cols))

    def upadateStateValue(self, state):
        maxV = float("-inf")
        for i in range(self.nTransitionsPerState):
            destination = (state[0] + self.transitions[i][0], state[1] + self.transitions[i][1])
            if self.isValidTransition(destination):
                r = self.getReward(destination)
                v = self.getStateValue(destination)
                # print (state, destination, r, v, maxV)
                if r + v > maxV:
                    maxV = r + v
                    self.policy[state[0]][state[1]] = i
                    self.value[state[0]][state[1]] = maxV
    
    def printValues(self):
        for r in self.value:
            print(r)
            
    def getPrettyPolicy(self):
        def prettyDirections(d):
            # [(0, 1), (0, -1), (1, 0), (-1, 0)]
            prettyDirs = ['r', 'l', 'd', 'u']
            return prettyDirs[d]
        
        pretty = []
        
        for r in self.policy:
            pretty.append(list(map(prettyDirections, r)))
        
        return pretty
        
        
    def printPolicy(self, pretty=False):
        def prettyDirections(d):
            # [(0, 1), (0, -1), (1, 0), (-1, 0)]
            prettyDirs = ['r', 'l', 'd', 'u']
            return prettyDirs[d]
        for r in self.policy:
            if pretty:
                print(list(map(prettyDirections, r)))
            else:
                print(r)

    def iterateValues(self):
        for i in range(self.rows):
            for j in range(self.cols):
                self.upadateStateValue((i, j))

    def iterateValuesUntilConverge(self, printSteps=False):
        while True:
            values = self.value
            self.iterateValues()
            if printSteps:
                print()
                self.printPolicy(True)      
            if values == self.value:
                break
                
    def repeatIterateValues(self, n, printSteps=False):
        for i in range(n):
            self.iterateValues()
            if printSteps:
                print()
                self.printPolicy(True)

In [49]:
# First dummy smoke test to make sure everything is not absurdly wrong

rewards = [
    [-1, -1, -1],
    [-1, 100, -1],
    [-1, -1, -1]
]

rows, cols = 3, 3

mdp = GridWorldMDP(rewards, rows, cols, -100)
mdp.printPolicy(True)
mdp.iterateValuesUntilConverge(printSteps=True)

['r', 'd', 'd']
['u', 'l', 'd']
['d', 'l', 'u']

['r', 'd', 'l']
['r', 'l', 'l']
['u', 'u', 'l']


In [52]:
# Testing random test case for 10 rows and cols

def printMatrix(matrix):
    for r in matrix:
        print(r)
    print()

def generateRandomRewards(row, cols, goalReward, notGoalReward):
    goalCoordX, goalCoordY = randint(0, rows - 1), randint(0, cols - 1)
    rewards = [[notGoalReward for j in range(cols)] for i in range(rows)]
    rewards[goalCoordX][goalCoordY] = goalReward
    return rewards, (goalCoordX, goalCoordY)
  
rows, cols = 10, 10

rewards, goal = generateRandomRewards(rows, cols, 100, 0)
printMatrix(rewards)

mdp = GridWorldMDP(rewards, rows, cols, -100)
mdp.printPolicy(True)
mdp.iterateValuesUntilConverge(printSteps=True)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 100, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

['u', 'u', 'd', 'r', 'u', 'd', 'u', 'd', 'u', 'u']
['u', 'u', 'l', 'r', 'u', 'l', 'd', 'r', 'u', 'l']
['l', 'd', 'l', 'r', 'u', 'l', 'u', 'u', 'l', 'd']
['u', 'u', 'r', 'd', 'd', 'r', 'r', 'r', 'd', 'd']
['l', 'u', 'l', 'l', 'u', 'r', 'd', 'r', 'l', 'l']
['l', 'l', 'l', 'r', 'r', 'l', 'r', 'u', 'l', 'd']
['d', 'l', 'l', 'd', 'l', 'r', 'l', 'r', 'u', 'r']
['r', 'u', 'l', 'u', 'u', 'd', 'd', 'd', 'r', 'u']
['r', 'u', 'd', 'u', 'd', 'd', 'l', 'd', 'd', 'd']
['l', 'r', 'd', 'u', 'd', 'r', 'l', 'l', 'u', 'd']

['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'r', 'r

In [54]:
# Grid world with obstacles (things are getting interesting)

def generateRandomRewardsWithObstacles(row, cols, goalReward, notGoalReward, obstacleReward, nObstacles):
    
    rewards = [[notGoalReward for j in range(cols)] for i in range(rows)]
    
    goalCoordX, goalCoordY = randint(0, rows - 1), randint(0, cols - 1)
    goal = (goalCoordX, goalCoordY)
    rewards[goalCoordX][goalCoordY] = goalReward
    
    obstacles = []
    while nObstacles > 0:
        obs = randint(0, rows - 1), randint(0, cols - 1)
        if not obs in obstacles and not obs == goal:
            obstacles.append(obs)
            nObstacles = nObstacles - 1
            rewards[obs[0]][obs[1]] = obstacleReward
    
    return rewards, goal, obstacles


def markGoalAndObstaclesOnGrid(grid, goal, obstacles):
    grid[goal[0]][goal[1]] = 'G'
    for o in obstacles:
        grid[o[0]][o[1]] = 'X'
    
    return grid

rows, cols = 5, 5

rewards, goal, obs = generateRandomRewardsWithObstacles(rows, cols, 100, 0, -1000, 5)
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = GridWorldMDP(rewards, rows, cols)
mdp.printPolicy(True)
mdp.iterateValuesUntilConverge()
print()

printMatrix(markGoalAndObstaclesOnGrid(mdp.getPrettyPolicy(), goal, obs))

rewards
[0, 0, 0, 0, 0]
[0, 0, 0, 0, 0]
[-1000, -1000, 0, 100, 0]
[-1000, 0, 0, 0, 0]
[0, -1000, 0, 0, -1000]

goal (2, 3)
obstacles [(4, 1), (4, 4), (2, 0), (2, 1), (3, 0)]

['l', 'r', 'd', 'r', 'l']
['l', 'u', 'l', 'r', 'r']
['r', 'u', 'u', 'r', 'u']
['r', 'u', 'u', 'd', 'r']
['l', 'd', 'r', 'r', 'd']

['r', 'r', 'r', 'r', 'l']
['r', 'r', 'r', 'd', 'l']
['X', 'X', 'r', 'G', 'l']
['X', 'r', 'u', 'u', 'l']
['r', 'X', 'u', 'u', 'X']



In [61]:
# This case proves my iterate until converge is not working propperly

rewards = [
    [0, 0, 0, -1000, 0],
    [0, -1000, 0, 0, -1000],
    [-1000, 0, 0, 0, 0],
    [0, -1000, 100, 0, 0],
    [0, 0, 0, 0, 0]
]

goal = (3, 2)
obs = [(1, 4), (3, 1), (2, 0), (1, 1), (0, 3)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = GridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()

printMatrix(markGoalAndObstaclesOnGrid(mdp.getPrettyPolicy(), goal, obs))

mdp = GridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPrettyPolicy(), goal, obs))


rewards
[0, 0, 0, -1000, 0]
[0, -1000, 0, 0, -1000]
[-1000, 0, 0, 0, 0]
[0, -1000, 100, 0, 0]
[0, 0, 0, 0, 0]

goal (3, 2)
obstacles [(1, 4), (3, 1), (2, 0), (1, 1), (0, 3)]

['r', 'r', 'l', 'X', 'l']
['u', 'X', 'r', 'l', 'X']
['X', 'r', 'd', 'l', 'l']
['d', 'X', 'G', 'l', 'l']
['r', 'r', 'u', 'l', 'l']

['r', 'r', 'd', 'X', 'l']
['u', 'X', 'd', 'l', 'X']
['X', 'r', 'd', 'l', 'l']
['d', 'X', 'G', 'l', 'l']
['r', 'r', 'u', 'l', 'l']



In [67]:
# Another case where iterating until converge do not work

rewards = [
    [0,     0,      0,     0, 0],
    [0,     0,     -1000,  0, -1000],
    [0,     0,     0,      0, 0],
    [0,     -1000, -1000,  -1000, -1000],
    [0,     0,     0,    0, 100]
]

rows, cols = 5, 5

goal = (4, 4)
obs = [(3, 1),(3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]
print ("rewards")
printMatrix(rewards)
print ("goal", goal)
print ("obstacles", obs)
print()

mdp = GridWorldMDP(rewards, rows, cols)
mdp.iterateValuesUntilConverge()
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPrettyPolicy(), goal, obs))

mdp = GridWorldMDP(rewards, rows, cols)
mdp.repeatIterateValues(10)
printMatrix(markGoalAndObstaclesOnGrid(mdp.getPrettyPolicy(), goal, obs))

rewards
[0, 0, 0, 0, 0]
[0, 0, -1000, 0, -1000]
[0, 0, 0, 0, 0]
[0, -1000, -1000, -1000, -1000]
[0, 0, 0, 0, 100]

goal (4, 4)
obstacles [(3, 1), (3, 2), (3, 3), (3, 4), (1, 2), (1, 4)]

['r', 'r', 'r', 'r', 'l']
['r', 'l', 'X', 'd', 'X']
['r', 'r', 'r', 'r', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']

['d', 'l', 'l', 'l', 'l']
['d', 'l', 'X', 'd', 'X']
['d', 'l', 'l', 'l', 'l']
['d', 'X', 'X', 'X', 'X']
['r', 'r', 'r', 'r', 'G']

