In [1]:
import numpy as np

In [18]:
class GridWorld(object):
    def __init__(self):
        # create an array for grid 
        # 0 represents empty cell, 1 represents *, and 2 represents x
        self.grid = np.array([[0,0,0,0,0,0,0,0,0],
                              [0,1,1,1,1,2,0,0,0],
                              [0,2,2,2,0,2,2,0,0],
                              [0,0,0,0,0,0,0,2,0],
                              [0,0,2,2,2,2,0,2,0],
                              [0,0,0,0,0,0,0,2,0],
                              [0,2,2,2,0,2,2,0,0],
                              [0,0,0,0,0,0,0,0,3],
                              [0,1,1,1,2,1,1,0,2]]) 
        self.rewards = {0:-1, 1:5, 2:-20, 3:100} # rewards 
        self.actions = np.array([[1, 0.125],[2, 0.125], [3, 0.625],[4, 0.125]]) #up,right,down,left with probabilities
        self.transition_probability = np.array([0.2,0.6,0.2]) # left diagonal, desired direction, right diagonal
        self.gama = 0.9  # discount factor
        self.threshold = 0.0001  # theta
        self.n=9  # number of rows 
        self.a=4  # number of actions
        self.states = self.n*self.n

    def get_next_state(self,row,col,a):
        # going up
        if a == 1:
            # if state is in row 0 it stays in same row
            if row == 0:
                r = row
                c = col
                reward = -10
            else:
                # if next state is not reachable
                reward = self.rewards[self.grid[row-1,col]]
                if self.grid[row-1,col] == 2:
                    r = row
                    c = col
                # if next state is reachable and no obstacle
                else:
                    r = row-1
                    c = col
        # go right
        elif a == 2:
            # if state is in last column it stays in same column
            if col == 8:
                r = row
                c = col
                reward = -10
            else:
                # if next state is not reachable
                reward = self.rewards[self.grid[row,col+1]]
                if self.grid[row,col+1] == 2:
                    r = row
                    c = col
                # if next state is reachable and no obstacle
                else:
                    r = row
                    c = col+1
        # go down
        elif a == 3:
            # if state is in row 8 it stays in same row
            if row == 8:
                r = row
                c = col
                reward = -10
            else:
                # if next state is not reachable
                reward = self.rewards[self.grid[row+1,col]]
                if self.grid[row+1,col] == 2:
                    r = row
                    c = col
                # if next state is reachable and no obstacle
                else:
                    r = row+1
                    c = col
        # go left
        elif a == 4:
            # if state is in first column it stays in same column
            if col == 0:
                r = row
                c = col
                reward = -10
            else:
                # if next state is not reachable
                reward = self.rewards[self.grid[row,col-1]]
                if self.grid[row,col-1] == 2:
                    r = row
                    c = col
                # if next state is reachable and no obstacle
                else:
                    r = row
                    c = col-1
        return r,c,reward
    
    # states with undeterministic action
    # 0.6 probability to move in desired direction, 0.2 to move left diagonal, 0.2 for right diagonal
    def get_next_non_determinstic_state(self,row,col,a):
        self.possible_states = np.zeros((3,4))
        # going up
        if a == 1:
            for i in range(3):
                # going up,Diagonaly left
                if i == 0:
                    if row == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if col == 0:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row-1,col-1]]
                            if self.grid[row-1,col] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row-1
                                c = col-1
                    prob = self.transition_probability[i]
                #going up, Desired direction
                if i == 1:
                    if row == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        reward = self.rewards[self.grid[row-1,col]]
                        if self.grid[row-1,col] == 2:
                            r = row
                            c = col
                        # if next state is reachable and no obstacle
                        else:
                            r = row-1
                            c = col
                    prob = self.transition_probability[i]
                # going up,Diagonally right
                if i == 2:
                    if row == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if col == 8:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row-1,col+1]]
                            if self.grid[row-1,col+1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row-1
                                c = col+1
                    prob = self.transition_probability[i]
                self.possible_states[i,:] = np.array([r,c,reward,prob])
        # going right 
        if a == 2:
            for i in range(3):
                # going right, Diagonaly left
                if i == 0:
                    if col == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if row == 0:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row-1,col+1]]
                            if self.grid[row-1,col+1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row-1
                                c = col+1
                    prob = self.transition_probability[i]
                # going right,Desired direction
                if i == 1:
                    if col == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        reward = self.rewards[self.grid[row,col+1]]
                        if self.grid[row,col+1] == 2:
                            r = row
                            c = col
                        # if next state is reachable and no obstacle
                        else:
                            r = row
                            c = col+1
                    prob = self.transition_probability[i]
                # going right,Diagonally right
                if i == 2:
                    if col == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if row == 8:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row+1,col+1]]
                            if self.grid[row+1,col+1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row+1
                                c = col+1
                    prob = self.transition_probability[i]
                self.possible_states[i,:] = np.array([r,c,reward,prob])
        # going down
        if a == 3:
            for i in range(3):
                # going down,  Diagonaly left
                if i == 0:
                    if row == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if col == 8:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row+1,col+1]]
                            if self.grid[row+1,col+1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row+1
                                c = col+1
                    prob = self.transition_probability[i]
                # going down, Desired direction
                if i == 1:
                    if row == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        reward = self.rewards[self.grid[row+1,col]]
                        if self.grid[row+1,col] == 2:
                            r = row
                            c = col
                        # if next state is reachable and no obstacle
                        else:
                            r = row+1
                            c = col
                    prob = self.transition_probability[i]
                # going down, Diagonally right
                if i == 2:
                    if row == 8:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if col == 0:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row+1,col-1]]
                            if self.grid[row+1,col-1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row+1
                                c = col-1
                    prob = self.transition_probability[i]
                self.possible_states[i,:] = np.array([r,c,reward,prob])
        # going left 
        if a == 4:
            for i in range(3):
                # going left,  Diagonaly left
                if i == 0:
                    if col == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if row == 8:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row+1,col-1]]
                            if self.grid[row+1,col-1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row+1
                                c = col-1
                    prob = self.transition_probability[i]
                # going left, Desired direction
                if i == 1:
                    if col == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        reward = self.rewards[self.grid[row,col-1]]
                        if self.grid[row,col-1] == 2:
                            r = row
                            c = col
                        # if next state is reachable and no obstacle
                        else:
                            r = row
                            c = col-1
                    prob = self.transition_probability[i]
                # going left, Diagonally right
                if i == 2:
                    if col == 0:
                        r = row
                        c = col
                        reward = -10
                    else:
                        if row == 0:
                            r = row
                            c = col
                            reward = -10
                        else:
                            reward = self.rewards[self.grid[row-1,col-1]]
                            if self.grid[row-1,col-1] == 2:
                                r = row
                                c = col
                            # if next state is reachable and no obstacle
                            else:
                                r = row-1
                                c = col-1
                    prob = self.transition_probability[i]
                self.possible_states[i,:] = np.array([r,c,reward,prob])
        return self.possible_states
        
    # get the expected reward of state
    def get_value(self,row,col):
        index = row*9 + col
        return self.V[index]
    
    # get the values of next state
    def get_next_state_value(self, row, col):
        A_v = np.zeros(self.a)
        for j in range(1,5): # for each action
            r,c,reward = self.get_next_state(row,col,j) # get next state and reward
            next_s = r*9 + c
            next_v = self.get_value(r,c) # get value from next state
            prob = self.actions[j-1,1] # probability of action
            A_v[j-1]  += prob*(reward + self.gama*next_v)
        return A_v
    
    # get the value of non-deterministic states
    def get_next_non_deterministic_state_value(self, row, col):
        A_v = np.zeros(self.a)
        for j in range(1,5): # for each action
            possible_states = self.get_next_non_determinstic_state(row,col,j) # get next state and reward
            for s in range(3):
                next_s = possible_states[s,0]*9 + possible_states[s,1]
                next_v = self.get_value(possible_states[s,0],possible_states[s,1]) # get value from nest state
                prob = self.actions[j-1,1] # probability of action
                A_v[j-1]  +=  possible_states[s,3]*prob*(possible_states[s,2] + self.gama*next_v)
        return A_v
    
    # evaluate the policy
    def policy_evaluation(self):        
        self.V = np.zeros(self.states)
        count = 0
        iterate = True
        while iterate:
        
            delta = 0
            for i in range(self.states):
                row = int(i / self.n)
                col = i % self.n
                s = self.grid[row,col]
                v = 0
                if s != 2 and s != 3:
                    for j in range(1,5): # for each action
                        r,c,reward = self.get_next_state(row,col,j) # get next state and reward
                        next_s = r*9 + c
                        next_v = self.get_value(r,c) # get value from nest state
                        prob = self.actions[j-1,1] # probability of action
                        v  += prob*(reward + self.gama*next_v)
        
                    delta = max(delta, np.abs(v - self.V[i]))
                self.V[i] = v
            count += 1
            # if delta is smaller than certain threshold, it stops.
            if delta < self.threshold:
                break
        return self.V, delta, count

    # value iteration
    def value_iteration(self):        
        self.V = np.zeros(self.states)
        self.policy = np.zeros((self.states,self.a))
        count = 0
        iterate = True
        while iterate:
        #     print(count)
            delta = 0
            for i in range(self.states):
                row = int(i / self.n)
                col = i % self.n
                s = self.grid[row,col]
                best_v = 0
                # if state is either * or x, then values of those state remain same
                if s != 2 and s != 3:
                    values = self.get_next_state_value(row,col)
                    best_v = max(values)
                    delta = max(delta, np.abs(best_v- self.V[i]))
                self.V[i] = best_v
            count += 1
            if delta < self.threshold:
                break
        
        for i in range(self.states):
            row = int(i / self.n)
            col = i % self.n
            values = self.get_next_state_value(row,col)
            # take action with maximum value
            best_v = np.argmax(values)
            self.policy[i,best_v] = 1
        # Deterministic policy     
        return self.V, delta, count, self.policy

    # non-deterministic actions
    def non_deterministic_value_iteration(self):        
        self.V = np.zeros(self.states)
        self.policy = np.zeros((self.states,self.a))
        count = 0
        iterate = True
        while iterate:
        
            delta = 0
            for i in range(self.states):
                row = int(i / self.n)
                col = i % self.n
                s = self.grid[row,col]
                best_v = 0
                if s != 2 and s != 3:
                    values = self.get_next_non_deterministic_state_value(row,col)
                    best_v = max(values)
                    delta = max(delta, np.abs(best_v- self.V[i]))
                self.V[i] = best_v
            count += 1
            if delta < self.threshold:
                break
        # Deterministic policy
        for i in range(self.states):
            row = int(i / self.n)
            col = i % self.n
            values = self.get_next_state_value(row,col)
            best_v = np.argmax(values)
            self.policy[i,best_v] = 1
             
        return self.V, delta, count, self.policy

In [19]:
grid_world = GridWorld()

# Compute the expected value

In [20]:
V,delta, count = grid_world.policy_evaluation()
V.shape = (9,9)
print('delta: ',delta)
print('number of iterations: ',count)
print('V:', V)

('delta: ', 8.8468431698629502e-05)
('number of iterations: ', 91)
('V:', array([[ -50.1324936 ,  -64.58247582,  -72.65010404,  -76.66298262,
         -80.8102371 ,  -99.15175631,  -81.18403045,  -50.63043609,
         -20.12029228],
       [ -50.3772763 ,  -80.22926875,  -89.26562608,  -93.15408361,
         -95.22664654,    0.        ,  -94.35615829,  -55.8449427 ,
         -11.81743254],
       [ -50.96744537,    0.        ,    0.        ,    0.        ,
        -110.7870577 ,    0.        ,    0.        ,  -66.1413208 ,
           0.32553778],
       [ -52.14645007,  -76.29940005, -118.74797657, -131.19534114,
        -124.70584532, -128.54082796, -111.07930378,    0.        ,
          19.88315053],
       [ -53.04445287,  -80.20470504,    0.        ,    0.        ,
           0.        ,    0.        , -117.11229547,    0.        ,
          35.32945071],
       [ -53.44443734,  -94.67650531, -120.093197  , -116.70428329,
         -81.49735801, -119.75857851, -128.91671857,    0.

# Value iteration



In [21]:
V, delta, count, policy = grid_world.value_iteration()
V.shape = (9,9)
print('delta:', delta)
print('number of iterations:', count)
print('value:', V )
print("\n0 = up, 1 = right, 2 = down, 3 = left \n")
print(np.reshape(np.argmax(policy,axis=1),(9,9)))

('delta:', 1.6074364062035329e-12)
('number of iterations:', 9)
('value:', array([[  2.71126761e-01,   3.52112676e+00,   3.52112676e+00,
          3.52112676e+00,   3.52112676e+00,   2.71126761e-01,
         -9.44982394e-02,  -5.78990933e-02,   5.96452504e-01],
       [  7.04225352e-01,   7.04225352e-01,   7.04225352e-01,
          7.04225352e-01,   7.04225352e-01,   0.00000000e+00,
         -1.11579819e-01,   1.19290501e-01,   2.17147112e+00],
       [ -4.57746479e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   7.04225352e-01,   0.00000000e+00,
          0.00000000e+00,   4.34294224e-01,   4.97150421e+00],
       [ -1.30149648e-01,  -1.39641835e-01,  -1.39641835e-01,
         -1.30149648e-01,  -4.57746479e-02,  -1.30149648e-01,
         -1.39641835e-01,   0.00000000e+00,   9.94934082e+00],
       [ -1.39641835e-01,  -1.40709706e-01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -1.40709706e-01,   0.00000000e+00,   1.87988

# Non-deterministic actions

In [22]:
V, delta, count, policy = grid_world.non_deterministic_value_iteration()
V.shape = (9,9)
print('delta:', delta)
print('number of iterations:', count)
print('V: ', V)
print("\n0 = up, 1 = right, 2 = down, 3 = left \n")
print(np.reshape(np.argmax(policy,axis=1),(9,9)))

('delta:', 2.4178113538830814e-05)
('number of iterations:', 12)
('V: ', array([[ -3.44243869e-02,   2.42739026e+00,   3.24868693e+00,
          3.25252722e+00,   2.74076201e-02,  -2.03966779e-01,
         -3.88366809e-01,  -3.88262459e-01,  -3.88366809e-01],
       [ -8.99025689e-02,   1.11169750e-01,   2.22084511e-01,
          3.22016055e-01,  -5.45573757e-02,   0.00000000e+00,
         -1.47190863e-01,  -1.52037335e-01,  -1.47190863e-01],
       [ -2.08252792e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.64474515e-03,   0.00000000e+00,
          0.00000000e+00,  -1.41886109e-01,  -3.71719922e-01],
       [ -6.29541091e-01,  -1.75401765e-01,  -6.29541091e-01,
         -6.89650886e-01,  -1.09949256e+00,  -6.89650886e-01,
         -1.17439941e+00,   0.00000000e+00,  -3.86990826e-01],
       [ -1.43202384e-01,  -1.57191006e-01,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -7.10781693e-01,   0.00000000e+00,   1.5603209

