In [835]:
import numpy as np

In [857]:
# rents out car and is credited with $10 by company
# if he is out of cars then the business is lost
# cars can be rented the day after returned 
# jack can move cars between locations for 2 dollars

# Iterative Policy Evaluation Algorithm
class PolicyIteration(): 
    def __init__(self, init_state_values, starting_actions, starting_state, terminal_state, state_x_size, state_y_size):
        #Initialize state values randomly
        self.evaluated_state_values = init_state_values

        #discount factor
        self.discount = 0.9
        self.move_action_cost = -2.0 
        self.max_move_size = 5
        self.rental_cost = 10.0

        self.return_lambda = [3, 2]
        self.rent_lambda = [3, 4]

        #Termination States
        self.terminal_state = terminal_state

        #When to stop
        self.termination_diff = 0.00000001
        self.diff = 100

        # select random integer between 0 and 3
        self.current_state = starting_state
        self.current_action = starting_actions

        self.state_x_size = state_x_size
        self.state_y_size = state_y_size

        self.action_probabilitiy = 1
    
    def get_starting_state_random(self):
        x = np.random.randint(0, self.state_x_size)
        y = np.random.randint(0, self.state_y_size)
        while (x == self.terminal_state or y == self.terminal_state):
            x = np.random.randint(0, self.state_x_size)
            y = np.random.randint(0, self.state_y_size)
        
        return [x, y]

    def get_action_cost(self, action): 
        return action[1] * self.move_action_cost

    # Get the estimated state values using the action reward and 
    def get_next_state_value(self, state, action):
        expected_value = 0.0

        state_after_action = self.get_state_after_action(state, action)

        rental_rewards = self.get_rental_rewards_next_day(state_after_action)
        move_cost = self.get_action_cost(action)

        net_reward = rental_rewards + move_cost

        # print(rental_rewards, move_cost, net_reward, self.evaluated_state_values[state_after_action[0]][state_after_action[1]], self.action_probabilitiy * (net_reward + self.discount * self.evaluated_state_values[state_after_action[0]][state_after_action[1]]))

        return self.action_probabilitiy * (net_reward + self.discount * self.evaluated_state_values[state_after_action[0]][state_after_action[1]])

    def get_rental_rewards_next_day(self, state):
        rental_earnings = 0.0

        if state[0] >= self.return_lambda[0]: 
            rental_earnings += self.rent_lambda[0] * self.rental_cost
        else: 
            rental_earnings += state[0] * self.rental_cost
        
        if state[1] >= self.return_lambda[1]:
            rental_earnings += self.rent_lambda[1] * self.rental_cost
        else:
            rental_earnings += state[1] * self.rental_cost

        return rental_earnings
    
    def get_state_after_action(self, state, action):
        new_state = state

        x_max_receive = min(self.state_x_size - state[0], action[1])
        y_max_receive = min(self.state_y_size - state[1], action[1])

        x_max_move = min(state[0], action[1])
        y_max_move = min(state[1], action[1])

        #Update based on move
        if action[0] == 0:
            new_state = [state[0] - min(x_max_move, y_max_receive), state[1] + min(x_max_move, y_max_receive)]
        else: 
            new_state = [state[0] + min(x_max_receive, y_max_move), state[1] - min(x_max_receive, y_max_move)]
    
        # update the state based on the net value of rentals and returns
        net_rentals_x = self.return_lambda[0] - self.rent_lambda[0]
        net_rentals_y = self.return_lambda[1] - self.rent_lambda[1]
        
        if (net_rentals_x + new_state[0] > self.state_x_size):
            new_state[0] = self.state_x_size
        elif (net_rentals_x + new_state[0] < 0): 
            new_state[0] = 0
        else: 
            new_state[0] = net_rentals_x + new_state[0]
        
        if (net_rentals_y + new_state[1] > self.state_y_size):
            new_state[1] = self.state_y_size
        elif (net_rentals_y + new_state[1] < 0):
            new_state[1] = 0
        else:
            new_state[1] = net_rentals_y + new_state[1]
        
        return new_state
    
    def iterative_evaluation(self):
        while (self.diff > self.termination_diff):
            self.diff = 0.0

            for i in range(self.state_x_size):
                for j in range(self.state_y_size):
                    self.current_state = [i, j]
                    self.evaluated_state_values[i][j] = self.get_next_state_value(self.current_state, self.current_action[self.current_state[0]][self.current_state[1]])
                    self.diff = max(self.diff, abs(self.evaluated_state_values[i][j] - self.evaluated_state_values[self.current_state[0]][self.current_state[1]]))
                    # print(self.current_state)
        
        print(self.evaluated_state_values)
    
    def policy_improvement(self): 
        for i in range(self.state_x_size):
            for j in range(self.state_y_size):
                self.current_state = [i, j]
                action = self.get_action_with_max_value(self.current_state)
                self.current_action[i][j] = self.get_action_with_max_value(self.current_state)
                # print(self.current_state)
    
    def get_action_with_max_value(self, state):
        max_value = -1000000.0
        max_action = [0, 0]
        for i in range(self.max_move_size + 1):
            action = [0, i]
            new_state = self.get_state_after_action(state, action)

            if (self.evaluated_state_values[new_state[0]][new_state[1]] > max_value):
                max_value = self.evaluated_state_values[new_state[0]][new_state[1]]
                max_action = action
        
        for i in range(self.max_move_size + 1):
            action = [1, i]
            new_state = self.get_state_after_action(state, action)

            if (self.evaluated_state_values[new_state[0]][new_state[1]] > max_value):
                max_value = self.evaluated_state_values[new_state[0]][new_state[1]]
                max_action = action
        
        return max_action

In [868]:
#Initialize Problem
terminal_state = 0
lot_one_size = 20
lot_two_size = 20

x = np.random.randint(0, lot_one_size+1)
y = np.random.randint(0, lot_two_size+1)
while ([x, y] == terminal_state):
    x = np.random.randint(0, lot_one_size+1)
    y = np.random.randint(0, lot_two_size+1)

starting_state = [x, y]

#Initialize a random number
init_state_values = np.random.rand(lot_one_size+1, lot_two_size+1)*100
init_state_values[terminal_state, :] = 0
init_state_values[:, terminal_state] = 0

starting_actions = np.zeros((lot_one_size+1, lot_two_size+1, 2), dtype=int)

In [869]:
# Gives you the expexted value of a state in the grid world
policy = PolicyIteration(init_state_values, starting_actions, starting_state, terminal_state, lot_one_size, lot_two_size)

In [875]:
policy.policy_improvement()

In [876]:
policy.iterative_evaluation()

[[  0.           0.           0.          10.          40.
   49.          76.          84.1        108.4        115.69
  137.56       144.121      163.804      169.7089     187.4236
  192.73801    208.68124    213.464209   227.813116   232.1177881
    0.        ]
 [ 10.          19.          19.          37.1         67.1
   83.39       110.39       125.051      149.351      162.5459
  184.4159     196.29131    215.97431    226.662179   244.376879
  253.9959611  269.9391911  278.59636499 292.94527199 300.73672849
   52.66784802]
 [ 20.          38.          38.          64.2         94.2
  117.78       144.78       166.002      190.302      209.4018
  231.2718     248.46162    268.14462    283.615458   301.330158
  315.2539122  331.1971422  343.72852098 358.07742798 369.35566888
   78.24933928]
 [ 30.          57.          57.          91.3        121.3
  152.17       179.17       206.953      231.253      256.2577
  278.1277     300.63193    320.31493    340.568737   358.283437
  376

In [846]:
policy.policy_improvement()

In [848]:
policy.iterative_evaluation()

[[  0.          10.          40.          49.          76.
   84.1        108.4        115.69       137.56       144.121
  163.804      169.7089     187.4236     192.73801    208.68124
  213.464209   227.813116   232.1177881  245.0318044  248.90600929
    0.        ]
 [ 10.          29.          59.          76.1        103.1
  118.49       142.79       156.641      178.511      190.9769
  210.6599     221.87921    239.59391    249.691289   265.634519
  274.7221601  289.0710671  297.24994409 310.16396039 317.52494968
   24.15761762]
 [ 20.          48.          78.         103.2        130.2
  152.88       177.18       197.592      219.462      237.8328
  257.5158     274.04952    291.76422    306.644568   322.587798
  335.9801112  350.3290182  362.38210008 375.29611638 386.14389007
   94.11382659]
 [ 30.          67.          97.         130.3        157.3
  187.27       211.57       238.543      260.413      284.6887
  304.3717     326.21983    343.93453    363.597847   379.541077
  