In [None]:
import numpy as np

In [None]:
# A wrapper class for parameters of the algorithm
class Params:
    """Bundle of tunable constants for the car-rental algorithms.

    Every value is stored as a plain instance attribute so it can be
    overridden on an instance after construction.
    """

    def __init__(self):
        # --- Capacity limits -------------------------------------------
        # Upper bound on the number of cars held at a single location.
        self.max_car = 20
        # Upper bound on the number of cars moved in one night.
        self.max_move = 5

        # --- Money -----------------------------------------------------
        # Income for each car rented out.
        self.reward_per_car = 10
        # Overnight charge for keeping more than half the maximum number
        # of cars (used by the modified version of Jack's Car Rental).
        self.cost_per_slot_night = 4
        # Price of moving one car.
        self.cost_per_car = 2

        # --- Solver settings -------------------------------------------
        # Accuracy threshold for policy evaluation's estimation.
        self.theta = 0.01
        # Discount value.
        self.gamma = 0.9

        # --- Expected rental requests (first / second location) --------
        self.lambda_request_first = 3
        self.lambda_request_second = 4

        # --- Expected returns (first / second location) ----------------
        self.lambda_return_first = 3
        self.lambda_return_second = 2

        # Names of the supported problem variants.
        self.problem_types = ['original_problem', 'modified_problem']


class GamblersValueIteration():
    """Value-iteration solver for the Gambler's Problem.

    ``params`` must expose ``max_money``, ``theta`` and ``gamma``.
    ``p_h`` weights the transition to ``s + a``; ``1 - p_h`` weights the
    transition to ``s - a``.
    """

    def __init__(self, p_h, params):
        self.p_h = p_h
        self.params = params

        goal = self.params.max_money

        # Non-terminal states: 1 .. max_money - 1
        self.S = np.arange(1, goal)

        # Value table over 0 .. max_money; the two end states are fixed
        # at 0 and 1 and are never updated by the sweeps.
        self.V = np.zeros(goal + 1)
        self.V[0] = 0
        self.V[goal] = 1

        # Snapshots of the value table taken while solving
        self.Vs = []

        # Greedy policy (filled in by solve_problem)
        self.pi = None

        # Number of sweeps performed (filled in by solve_problem)
        self.sweep_count = None

    def _action_values(self, s):
        """
        List the actions available in a state together with their values
        :param s: state
        :return: (actions, values) where values[i] is V_eval(s, actions[i])
        """
        actions = self.A(s)
        return actions, [self.V_eval(s, a) for a in actions]

    def solve_problem(self):
        """
        Resolve Gambler Problem using Value Iteration
        """
        self.sweep_count = 0
        converged = False
        while not converged:
            largest_change = 0
            for state in self.S:
                previous = self.V[state]
                _, values = self._action_values(state)
                self.V[state] = np.max(values)
                largest_change = np.maximum(largest_change, abs(previous - self.V[state]))
            # Only the first three sweeps are recorded as snapshots
            if self.sweep_count < 3:
                self.Vs.append(self.V.copy())
            self.sweep_count += 1
            converged = largest_change < self.params.theta
        print('Sweeps needed:', self.sweep_count)

        # The final value table is always recorded last
        self.Vs.append(self.V.copy())

        # Greedy policy: first action with the highest value in each state
        greedy = []
        for state in self.S:
            actions, values = self._action_values(state)
            greedy.append(actions[np.argmax(values)])
        self.pi = greedy

    def A(self, s):
        """
        Get all possible actions given a state
        :param s: state
        :return: array of stakes 1 .. min(s, max_money - s)
        """
        largest_stake = np.minimum(s, self.params.max_money - s)
        return np.arange(1, largest_stake + 1)

    def V_eval(self, s, a):
        """
        Compute value given a state and an action for the state following the formula:
        sum over all s',r of p(s',r|s, a)[r + gamma*V(s')]
        :param s: state
        :param a: action
        :return: value
        """
        discount = self.params.gamma
        upper_term = discount * self.V[s + a] * self.p_h
        lower_term = discount * self.V[s - a] * (1 - self.p_h)
        return upper_term + lower_term

# Jack's Car Rental: whenever Jack rents out a car he is credited $10 by the company.
# If a location is out of cars, that rental business is lost.
# Returned cars become available for rental the day after they are returned.
# Jack can move cars between the two locations at a cost of $2 per car.

# Iterative Policy Evaluation Algorithm
class PolicyIteration():
    """Policy iteration for a simplified, deterministic car-rental model.

    A state is ``[x, y]``: the number of cars at the first and second lot,
    each ranging over ``0 .. state_x_size`` / ``0 .. state_y_size``
    inclusive. An action is ``[direction, count]``: direction ``0`` moves up
    to ``count`` cars from the first lot to the second, any other direction
    moves them from the second lot to the first.

    Rentals and returns are not sampled: the expected values
    (``rent_lambda`` / ``return_lambda``) are applied directly, so every
    transition is deterministic.

    The value table and the action table passed to the constructor are
    stored by reference and updated in place.
    """

    def __init__(self, init_state_values, starting_actions, starting_state, terminal_state, state_x_size, state_y_size):
        """
        :param init_state_values: 2-D table of initial state values, indexed [x][y];
            must cover indices 0 .. state_x_size and 0 .. state_y_size
        :param starting_actions: table of initial actions, indexed [x][y] -> [direction, count]
        :param starting_state: initial value of ``current_state``
        :param terminal_state: car count avoided by ``get_starting_state_random``
        :param state_x_size: maximum number of cars at the first lot
        :param state_y_size: maximum number of cars at the second lot
        """
        # State-value table (mutated in place by iterative_evaluation)
        self.evaluated_state_values = init_state_values

        # Discount factor
        self.discount = 0.9
        # Reward contribution of one requested car move (negative = cost)
        self.move_action_cost = -2.0
        # Largest move count considered during policy improvement
        self.max_move_size = 5
        # Income per rented car
        self.rental_cost = 10.0

        # Expected returns / rental requests for [first lot, second lot]
        self.return_lambda = [3, 2]
        self.rent_lambda = [3, 4]

        # Termination states
        self.terminal_state = terminal_state

        # Evaluation stops once the largest per-sweep change is at or below this
        self.termination_diff = 0.00000001
        self.diff = 100

        self.current_state = starting_state
        # Action table (mutated in place by policy_improvement)
        self.current_action = starting_actions

        self.state_x_size = state_x_size
        self.state_y_size = state_y_size

        # Attribute name (including its spelling) kept for compatibility
        self.action_probabilitiy = 1

    def get_starting_state_random(self):
        """
        Draw a random state whose coordinates both differ from ``terminal_state``
        :return: [x, y]
        """
        while True:
            # randint's upper bound is exclusive; +1 makes a full lot reachable
            x = np.random.randint(0, self.state_x_size + 1)
            y = np.random.randint(0, self.state_y_size + 1)
            if x != self.terminal_state and y != self.terminal_state:
                return [x, y]

    def get_action_cost(self, action):
        """
        Reward contribution of an action, charged on the requested move count
        :param action: [direction, count]
        :return: count * move_action_cost (zero or negative)
        """
        return action[1] * self.move_action_cost

    def get_next_state_value(self, state, action):
        """
        One-step lookahead value of taking ``action`` in ``state``:
        net reward plus the discounted value of the resulting state
        :param state: [x, y]
        :param action: [direction, count]
        :return: value
        """
        state_after_action = self.get_state_after_action(state, action)

        rental_rewards = self.get_rental_rewards_next_day(state_after_action)
        move_cost = self.get_action_cost(action)
        net_reward = rental_rewards + move_cost

        next_value = self.evaluated_state_values[state_after_action[0]][state_after_action[1]]
        return self.action_probabilitiy * (net_reward + self.discount * next_value)

    def get_rental_rewards_next_day(self, state):
        """
        Rental income for a state: each lot rents out its expected number of
        requests, limited by the number of cars actually at the lot
        :param state: [x, y]
        :return: total income
        """
        rental_earnings = 0.0
        for cars, demand in zip(state, self.rent_lambda):
            # Cannot rent more cars than are available at the lot
            rental_earnings += min(cars, demand) * self.rental_cost
        return rental_earnings

    def get_state_after_action(self, state, action):
        """
        Apply a move and then the expected net returns-minus-rentals
        :param state: [x, y] (not modified)
        :param action: [direction, count]
        :return: new [x, y], each coordinate clamped to 0 .. lot size
        """
        requested = action[1]

        # A move is limited by the request, the cars at the source lot and
        # the free space at the destination lot.
        if action[0] == 0:
            moved = min(state[0], requested, self.state_y_size - state[1])
            cars = [state[0] - moved, state[1] + moved]
        else:
            moved = min(state[1], requested, self.state_x_size - state[0])
            cars = [state[0] + moved, state[1] - moved]

        # Expected net change per lot, then clamp into the valid range
        limits = (self.state_x_size, self.state_y_size)
        new_state = []
        for count, returned, rented, limit in zip(cars, self.return_lambda, self.rent_lambda, limits):
            new_state.append(max(0, min(limit, count + returned - rented)))
        return new_state

    def iterative_evaluation(self):
        """
        Evaluate the current policy in place: sweep all states until the
        largest value change in a sweep is at most ``termination_diff``,
        then print the value table
        """
        while True:
            self.diff = 0.0

            # +1: a lot holding state_*_size cars is a valid state
            for i in range(self.state_x_size + 1):
                for j in range(self.state_y_size + 1):
                    self.current_state = [i, j]
                    # Remember the old value BEFORE overwriting it so the
                    # change can actually be measured.
                    old_value = self.evaluated_state_values[i][j]
                    new_value = self.get_next_state_value(self.current_state, self.current_action[i][j])
                    self.evaluated_state_values[i][j] = new_value
                    self.diff = max(self.diff, abs(new_value - old_value))

            if self.diff <= self.termination_diff:
                break

        print(self.evaluated_state_values)

    def policy_improvement(self):
        """
        Replace the action of every state with the greedy action under the
        current value table
        """
        for i in range(self.state_x_size + 1):
            for j in range(self.state_y_size + 1):
                self.current_state = [i, j]
                self.current_action[i][j] = self.get_action_with_max_value(self.current_state)

    def get_action_with_max_value(self, state):
        """
        Find the action with the highest one-step lookahead value
        (reward plus discounted next-state value)
        :param state: [x, y]
        :return: [direction, count]; ties go to the first candidate, so a
            zero move is reported as [0, 0]
        """
        max_value = float('-inf')
        max_action = [0, 0]
        for direction in (0, 1):
            for count in range(self.max_move_size + 1):
                action = [direction, count]
                value = self.get_next_state_value(state, action)
                if value > max_value:
                    max_value = value
                    max_action = action

        return max_action

In [None]:
# Initialize problem
terminal_state = 0
lot_one_size = 20
lot_two_size = 20

# Draw a random starting state, re-drawing while either lot's car count
# equals terminal_state. (Comparing the list [x, y] to the integer
# terminal_state is always False, so the coordinates are compared one by one.)
x = np.random.randint(0, lot_one_size + 1)
y = np.random.randint(0, lot_two_size + 1)
while x == terminal_state or y == terminal_state:
    x = np.random.randint(0, lot_one_size + 1)
    y = np.random.randint(0, lot_two_size + 1)

starting_state = [x, y]

# Random initial state values in [0, 100); the row and column at index
# terminal_state start at 0.
init_state_values = np.random.rand(lot_one_size + 1, lot_two_size + 1) * 100
init_state_values[terminal_state, :] = 0
init_state_values[:, terminal_state] = 0

# Initial action table: one [0, 0] action for every [x][y] state.
starting_actions = np.zeros((lot_one_size + 1, lot_two_size + 1, 2), dtype=int)

In [None]:
# Build the policy-iteration solver from the initial value table, the initial
# action table, the starting state, the terminal marker and the two lot sizes
policy = PolicyIteration(init_state_values, starting_actions, starting_state, terminal_state, lot_one_size, lot_two_size)

In [None]:
# Improvement step: update the stored per-state actions from the current state values
policy.policy_improvement()

In [None]:
# Evaluation step: update the state values under the stored actions and print the value table
policy.iterative_evaluation()

In [None]:
# Second improvement step, using the values produced by the previous evaluation
policy.policy_improvement()

In [None]:
# Second evaluation step, run against the updated action table; prints the value table
policy.iterative_evaluation()