In [104]:
import numpy as np
import matplotlib.pyplot as plt

In [105]:
ACTION_SPACE = ('U', 'D', 'L', 'R')
class gridWorld:
    
    def __init__(self,rows,cols,startPosition):
        self.rows = rows
        self.cols = cols
        self.i = startPosition[0]
        self.j = startPosition[1]
        
    
    def setOnGrid(self,rewards,actions):
        self.rewards = rewards
        self.actions = actions
    
    def setState(self,s):
        self.i = s[0]
        self.j = s[1]
    
    def move(self,a):
        if a in self.actions[self.i,self.j] :
            if a =='U':
                self.i -= 1
            elif a =='D':
                self.i += 1
            elif a == 'L':
                self.j -= 1
            elif a == 'R':
                self.j += 1
        return self.rewards.get((self.i,self.j),0)
    
    def getNextState(self,s,a):
        i,j = s[0],s[1]
        
        if a in self.actions[(i,j)]:
            if a =='U':
                i -= 1
            elif a =='D':
                i += 1
            elif a == 'L':
                j -= 1
            elif a == 'R':
                j += 1
        return i,j
                
    def allStates(self):
        return set(self.actions.keys()) | set(self.rewards.keys())
        
    def allActions(self):
        return np.array(['U','D','L','R'])
    
    def currentState(self):
        return (self.i,self.j)
    
    def isTerminal(self,s):
        return s not in self.actions
    
    def reset(self):
        self.i = 2
        self.j = 0
        return (self.i,self.j)
    
    def gameOver(self):
        if (self.i==self.maxX and self.j==self.maxY):
            return True
        else:
            return False
            
    
def standardGrid():
    g = gridWorld(2+1,3+1,(2,0))
    rewards = {(0,3): +1 ,(1,3): -1}
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }
    g.setOnGrid(rewards,actions)
    return g

def negativeGrid():
    g = standardGrid()
    g.rewards.update({
        (0, 0): step_cost,
        (0, 1): step_cost,
        (0, 2): step_cost,
        (1, 0): step_cost,
        (1, 2): step_cost,
        (2, 0): step_cost,
        (2, 1): step_cost,
        (2, 2): step_cost,
        (2, 3): step_cost,
    })
    return g

def printPolicy(P, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            a = P.get((i,j), ' ')
            print("  %s  |" % a, end="")
        print("")

def printValues(V, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            v = V.get((i,j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="")
        print("")
        

In [106]:
rewards = {}
transitionProbs = {}
grid = standardGrid()
                    
for i in range(grid.rows):
    for j in range(grid.cols):
        s = (i,j)
        if not grid.isTerminal(s):
            for a in ACTION_SPACE:
                sPrime = grid.getNextState(s,a)
                transitionProbs[(s,a,sPrime)] = 1
                if sPrime in grid.rewards:
                    rewards [(s,a,sPrime)] = grid.rewards[(sPrime)]

policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'R',
    (2, 2): 'U',
    (2, 3): 'L',
  }
printPolicy(policy,grid)
print("\n\n")
v = {}
for s in grid.allStates():
    v[s] = 0

    
gamma = 0.9
it = 0 
while True:
    delta = 0
    for s in grid.allStates():
        if not grid.isTerminal(s):
            vOld = v[s]
            vNew = 0
            for a in ACTION_SPACE:
                for sPrime in grid.allStates():
                    actionProb = 1 if policy.get(s)==a else 0
                    vNew += actionProb * transitionProbs.get((s,a,sPrime),0) * (rewards.get((s,a,sPrime),0) + gamma*v[sPrime])
            v[s] = vNew
            delta = max(delta , np.abs(vOld - vNew))
    print("iter: ", it ,"delta: ",delta)
    printValues(v,grid)
    it += 1
    
    if delta < 1e-3:
        break
    print("\n\n")

---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  U  |     |
---------------------------
  U  |  R  |  U  |  L  |



iter:  0 delta:  1.0
---------------------------
 0.00| 0.00| 1.00| 0.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|



iter:  1 delta:  0.9
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00| 0.90| 0.00|
---------------------------
 0.00| 0.00| 0.81| 0.00|



iter:  2 delta:  0.7290000000000001
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00| 0.90| 0.00|
---------------------------
 0.66| 0.73| 0.81| 0.73|



iter:  3 delta:  0
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00| 0.90| 0.00|
---------------------------
 0.66| 0.73| 0.81| 0.73|


In [120]:
ACTION_SPACE = ('U', 'D', 'L', 'R')

class windyGridWorld:
    
    def __init__(self,rows,cols,startPosition):
        self.rows = rows
        self.cols = cols
        self.i = startPosition[0]
        self.j = startPosition[1]
        
    
    def setOnGrid(self,rewards,actions,probs):
        self.rewards = rewards
        self.actions = actions
        self.probs = probs
    
    def setState(self,s):
        self.i = s[0]
        self.j = s[1]
    
    def move(self,action):
        s = (self.i,self.j)
        a = action
        next_state_probs = self.probs[(s,a)]
        next_states = next_state_probs.keys()
        next_probs = next_state_probs.values()
        next_state_idx = np.random.choice(len(next_states),p = next_probs)
        next_state = next_states[next_state_idx]
        self.i,self.j = next_state
        return self.rewards.get(next_state,0)
                
    def allStates(self):
        return set(self.actions.keys()) | set(self.rewards.keys())
        
    def allActions(self):
        return np.array(['U','D','L','R'])
    
    def currentState(self):
        return (self.i,self.j)
    
    def isTerminal(self,s):
        return s not in self.actions
    
    def reset(self):
        self.i = 2
        self.j = 0
        return (self.i,self.j)
    
    def gameOver(self):
        if (self.i==self.maxX and self.j==self.maxY):
            return True
        else:
            return False
            
    
def windyGrid():
    g = windyGridWorld(2+1,3+1,(2,0))
    rewards = {(0,3): +1 ,(1,3): -1}
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
    }
    
    probs = {
        ((2, 0), 'U'): {(1, 0): 1.0},
        ((2, 0), 'D'): {(2, 0): 1.0},
        ((2, 0), 'L'): {(2, 0): 1.0},
        ((2, 0), 'R'): {(2, 1): 1.0},
        ((1, 0), 'U'): {(0, 0): 1.0},
        ((1, 0), 'D'): {(2, 0): 1.0},
        ((1, 0), 'L'): {(1, 0): 1.0},
        ((1, 0), 'R'): {(1, 0): 1.0},
        ((0, 0), 'U'): {(0, 0): 1.0},
        ((0, 0), 'D'): {(1, 0): 1.0},
        ((0, 0), 'L'): {(0, 0): 1.0},
        ((0, 0), 'R'): {(0, 1): 1.0},
        ((0, 1), 'U'): {(0, 1): 1.0},
        ((0, 1), 'D'): {(0, 1): 1.0},
        ((0, 1), 'L'): {(0, 0): 1.0},
        ((0, 1), 'R'): {(0, 2): 1.0},
        ((0, 2), 'U'): {(0, 2): 1.0},
        ((0, 2), 'D'): {(1, 2): 1.0},
        ((0, 2), 'L'): {(0, 1): 1.0},
        ((0, 2), 'R'): {(0, 3): 1.0},
        ((2, 1), 'U'): {(2, 1): 1.0},
        ((2, 1), 'D'): {(2, 1): 1.0},
        ((2, 1), 'L'): {(2, 0): 1.0},
        ((2, 1), 'R'): {(2, 2): 1.0},
        ((2, 2), 'U'): {(1, 2): 1.0},
        ((2, 2), 'D'): {(2, 2): 1.0},
        ((2, 2), 'L'): {(2, 1): 1.0},
        ((2, 2), 'R'): {(2, 3): 1.0},
        ((2, 3), 'U'): {(1, 3): 1.0},
        ((2, 3), 'D'): {(2, 3): 1.0},
        ((2, 3), 'L'): {(2, 2): 1.0},
        ((2, 3), 'R'): {(2, 3): 1.0},
        ((1, 2), 'U'): {(0, 2): 0.5, (1, 3): 0.5},
        ((1, 2), 'D'): {(2, 2): 1.0},
        ((1, 2), 'L'): {(1, 2): 1.0},
        ((1, 2), 'R'): {(1, 3): 1.0},
      }
    g.setOnGrid(rewards, actions, probs)
    return g

def print_values(V, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            v = V.get((i,j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="") # -ve sign takes up an extra space
        print("")


def print_policy(P, g):
      for i in range(g.rows):
            print("---------------------------")
            for j in range(g.cols):
                a = P.get((i,j), ' ')
                print("  %s  |" % a, end="")
            print("")



In [125]:
rewards = {}
transitionProbs = {}
grid = windyGrid()


for (s,a),v in grid.probs.items():
    for s2,p in v.items():
        transitionProbs[(s,a,s2)] = p
        rewards[(s,a,s2)] = grid.rewards.get(s2,0)

policy = {
    (2, 0): {'U':0.5,'R': 0.5},
    (1, 0): {'U':1.0},
    (0, 0): {'R':1.0},
    (0, 1): {'R':1.0},
    (0, 2): {'R':1.0},
    (1, 2): {'U':1.0},
    (2, 1): {'R':1.0},
    (2, 2): {'U':1.0},
    (2, 3): {'L':1.0},
  }
printPolicy(policy,grid)
print("\n\n")
v = {}
for s in grid.allStates():
    v[s] = 0

    
gamma = 0.9
it = 0 
while True:
    delta = 0
    for s in grid.allStates():
        if not grid.isTerminal(s):
            vOld = v[s]
            vNew = 0
            for a in ACTION_SPACE:
                for sPrime in grid.allStates():
                    print(s)
                    print(a)
                    actionProb = policy[s].get(a,0)
                    vNew += actionProb * transitionProbs.get((s,a,sPrime),0) * (rewards.get((s,a,sPrime),0) + gamma*v[sPrime])
            v[s] = vNew
            delta = max(delta , np.abs(vOld - vNew))
    print("iter: ", it ,"delta: ",delta)
    printValues(v,grid)
    it += 1
    
    if delta < 1e-3:
        break
print("\n\n")

---------------------------
  {'R': 1.0}  |  {'R': 1.0}  |  {'R': 1.0}  |     |
---------------------------
  {'U': 1.0}  |     |  {'U': 1.0}  |     |
---------------------------
  {'U': 0.5, 'R': 0.5}  |  {'R': 1.0}  |  {'U': 1.0}  |  {'L': 1.0}  |



(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
U
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
D
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
L
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(0, 1)
R
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
U
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
D
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
L
(1, 2)
R
(1, 2)
R
(1, 2)
R
(1, 2)
R
(1, 2)
R
(1, 2)
R


In [117]:
policy[s]

{1.0, 'R'}

In [119]:
s

(0, 1)

In [122]:
policy

{(2, 0): {'U': 0.5, 'R': 0.5},
 (1, 0): {1.0, 'U'},
 (0, 0): {1.0, 'R'},
 (0, 1): {1.0, 'R'},
 (0, 2): {1.0, 'R'},
 (1, 2): {1.0, 'U'},
 (2, 1): {1.0, 'R'},
 (2, 2): {1.0, 'U'},
 (2, 3): {1.0, 'L'}}