In [None]:
import numpy as np

In [248]:
# Iterative Policy Evaluation Algorithm
class GridIterativePolicyEvaluation():
    """Evaluate the equiprobable random policy on a rectangular grid world.

    A state is a pair ``[x, y]`` with ``0 <= x < grid_size[0]`` and
    ``0 <= y < grid_size[1]``; the value table is indexed as ``values[x, y]``.
    Four actions (right, left, up, down) are each taken with probability
    ``action_probability``; every step costs ``step_cost`` and an action that
    would leave the grid keeps the agent where it is.

    State values are updated in place along random walks that start in a
    random non-terminal state and stop when a terminal state is reached.
    Convergence is only declared after a full sweep over every non-terminal
    state changes no value by more than ``termination_diff``.

    Parameters
    ----------
    init_state_values : array-like of shape (grid_size[0], grid_size[1])
        Initial guess for the state values. It is copied (as floats), so the
        caller's object is never modified. Values of terminal states are never
        updated and are used as given.
    starting_state : sequence of two ints
        ``[x, y]`` state the first random walk starts from.
    grid_size : sequence of two ints
        Number of states along x and along y.
    terminal_states : sequence of ``[x, y]`` pairs
        Absorbing states; lists, tuples and arrays are all accepted.
    """

    # (dx, dy) offsets of the four actions, in the order right, left, up, down.
    _ACTION_OFFSETS = ((1, 0), (-1, 0), (0, 1), (0, -1))

    def __init__(self, init_state_values, starting_state, grid_size, terminal_states):
        # Private float copy: accepts nested lists / integer arrays and keeps
        # the caller's initial guess intact across repeated runs.
        self.evaluated_state_values = np.array(init_state_values, dtype=float)

        # Undiscounted task with a constant cost per step.
        self.discount = 1
        self.step_cost = -1

        # Grid size as [size along x, size along y].
        self.grid_size = grid_size

        # Terminal (absorbing) states.
        self.terminal_states = terminal_states

        # Stop once the largest value change drops to this threshold or below.
        self.termination_diff = 0.00000001
        # Largest value change seen in the most recent pass; starts above the
        # threshold so that the first pass always runs.
        self.diff = 100

        # State the next random walk continues from, stored as a plain list.
        self.current_state = [int(starting_state[0]), int(starting_state[1])]
        # Probability of each of the four actions under the random policy.
        self.action_probability = 0.25

    def _is_terminal(self, state):
        """Return True if ``state`` equals one of the terminal states (by value)."""
        key = (int(state[0]), int(state[1]))
        return any(key == (int(t[0]), int(t[1])) for t in self.terminal_states)

    def _non_terminal_states(self):
        """Return every in-grid state that is not terminal, as ``[x, y]`` lists."""
        return [
            [x, y]
            for x in range(int(self.grid_size[0]))
            for y in range(int(self.grid_size[1]))
            if not self._is_terminal([x, y])
        ]

    def get_starting_state_random(self):
        """Return a uniformly random non-terminal state as ``[x, y]``.

        Raises
        ------
        ValueError
            If the grid contains no non-terminal state.
        """
        candidates = self._non_terminal_states()
        if not candidates:
            raise ValueError("every grid state is terminal; there is no valid starting state")
        return candidates[np.random.randint(0, len(candidates))]

    def get_step_right(self):
        """Return the state one step in +x from the current state (may be off-grid)."""
        return [self.current_state[0] + 1, self.current_state[1]]

    def get_step_left(self):
        """Return the state one step in -x from the current state (may be off-grid)."""
        return [self.current_state[0] - 1, self.current_state[1]]

    def get_step_up(self):
        """Return the state one step in +y from the current state (may be off-grid)."""
        return [self.current_state[0], self.current_state[1] + 1]

    def get_step_down(self):
        """Return the state one step in -y from the current state (may be off-grid)."""
        return [self.current_state[0], self.current_state[1] - 1]

    def is_out_of_range(self, state):
        """Return True if ``state`` lies outside the grid."""
        inside = (0 <= state[0] < self.grid_size[0]) and (0 <= state[1] < self.grid_size[1])
        return not inside

    def get_state_value(self, state=None):
        """Return the one-step expected value of ``state`` under the random policy.

        Parameters
        ----------
        state : sequence of two ints, optional
            State to back up; defaults to ``self.current_state``.

        Returns
        -------
        float
            Sum over the four actions of
            ``action_probability * (step_cost + discount * V(next_state))``,
            where an off-grid move leaves the agent in ``state``.
        """
        if state is None:
            state = self.current_state
        x, y = state[0], state[1]

        expected_value = 0.0
        for dx, dy in self._ACTION_OFFSETS:
            next_x, next_y = x + dx, y + dy
            if self.is_out_of_range([next_x, next_y]):
                # Bumping into the wall: the agent stays put.
                next_x, next_y = x, y
            expected_value += self.action_probability * (
                self.step_cost + self.discount * self.evaluated_state_values[next_x, next_y]
            )
        return expected_value

    def get_next_state(self, step):
        """Return ``step`` if it is inside the grid, otherwise the current state."""
        if self.is_out_of_range(step):
            return self.current_state
        return step

    def take_step(self):
        """Sample one of the four actions and return the resulting state.

        A single uniform draw is split into four equal intervals, mapped to
        up, down, right and left in that order.
        """
        draw = np.random.rand()
        if draw < self.action_probability:
            return self.get_next_state(self.get_step_up())
        elif draw < 2 * self.action_probability:
            return self.get_next_state(self.get_step_down())
        elif draw < 3 * self.action_probability:
            return self.get_next_state(self.get_step_right())
        else:
            return self.get_next_state(self.get_step_left())

    def _sweep(self):
        """Back up every non-terminal state once, in place.

        Returns the largest absolute value change made during the sweep.
        """
        values = self.evaluated_state_values
        largest_change = 0.0
        for state in self._non_terminal_states():
            old_value = values[state[0], state[1]]
            new_value = self.get_state_value(state)
            values[state[0], state[1]] = new_value
            largest_change = max(largest_change, abs(old_value - new_value))
        return largest_change

    def iterative_evaluation(self):
        """Run policy evaluation until the state values have converged.

        Returns
        -------
        numpy.ndarray
            The evaluated state values (``self.evaluated_state_values``).

        Raises
        ------
        ValueError
            If no terminal state lies inside the grid, because the random
            walks used to order the updates would never end.
        """
        values = self.evaluated_state_values

        non_terminal = self._non_terminal_states()
        if not non_terminal:
            # Nothing to evaluate: every state is terminal.
            self.diff = 0.0
            return values
        if len(non_terminal) == int(self.grid_size[0]) * int(self.grid_size[1]):
            raise ValueError("no terminal state lies inside the grid; random walks would never end")

        # A walk that starts in a terminal state updates nothing, which would
        # otherwise look like instant convergence.
        if self._is_terminal(self.current_state):
            self.current_state = self.get_starting_state_random()

        while self.diff > self.termination_diff:
            episode_diff = 0.0
            while not self._is_terminal(self.current_state):
                x, y = self.current_state[0], self.current_state[1]
                old_state_value = values[x, y]
                new_state_value = self.get_state_value()

                values[x, y] = new_state_value
                episode_diff = max(episode_diff, abs(old_state_value - new_state_value))

                self.current_state = self.take_step()

            # One walk may touch only a few states, so a quiet walk is not
            # proof of convergence: confirm with a sweep over all states.
            if episode_diff <= self.termination_diff:
                episode_diff = self._sweep()
            self.diff = episode_diff

            self.current_state = self.get_starting_state_random()

        return values
        


In [249]:

    
# Seed the global NumPy generator so that Restart & Run All reproduces the
# same initial values and the same starting state.
SEED = 42
np.random.seed(SEED)

# Grid dimensions as [size along x, size along y] and the absorbing states.
grid_size = [5, 5]
terminal_states  = [[4,4], [0,0]]

# Arbitrary initial guess in (-100, 0] for every state; terminal states are
# pinned to 0.
init_state_values = np.random.rand(grid_size[0], grid_size[1])*-100
for terminal_state in terminal_states:
    init_state_values[terminal_state[0]][terminal_state[1]] = 0

# Draw a random starting state, redrawing until it is not a terminal state.
x = np.random.randint(0, grid_size[0])
y = np.random.randint(0, grid_size[1])
while ([x, y] in terminal_states):
    x = np.random.randint(0, grid_size[0])
    y = np.random.randint(0, grid_size[1])
starting_state = [x, y]


In [250]:
# Evaluate the random policy on the grid world configured above; the cell
# displays the resulting value of every state.
GridIterativePolicyEvaluation(
    init_state_values=init_state_values,
    starting_state=starting_state,
    grid_size=grid_size,
    terminal_states=terminal_states,
).iterative_evaluation()

array([[  0.        , -23.00000091, -34.3333347 , -39.66666825,
        -41.66666832],
       [-23.00000108, -30.66666802, -36.33333493, -39.00000167,
        -39.66666833],
       [-34.33333504, -36.33333511, -37.33333514, -36.33333504,
        -34.33333488],
       [-39.66666865, -39.00000199, -36.33333518, -30.66666836,
        -23.00000122],
       [-41.66666882, -39.66666883, -34.33333516, -23.00000117,
          0.        ]])