-
Notifications
You must be signed in to change notification settings - Fork 25
/
JacksCarRentalProblem.m
138 lines (114 loc) · 5.6 KB
/
JacksCarRentalProblem.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
%********************************************* JACK'S CAR RENTAL PROBLEM ********************************************************
% Code written by : Nikhil P V S and Savinay Nagendra
% Date : 30-01-2017
%********************************************************************************************************************************
% Problem parameters: State : No. of cars at each location at the end of each day
% Actions : Net number of cars moved between the two locations overnight
% Rewards : 10$ for each request and -2$ for moving cars between locations overnight
% Time Steps : Days
% Transition Probability: Number of cars requested and returned at each location are Poisson random variables
%********************************************************************************************************************************
% initialization
clc;
clear all;
MZIC = 1; % MATLAB Zero Index Correction
gamma = 0.9; % Discounting factor
max_cars = 20; % Maximum number of cars at a particular location
max_transfers = 5; % Maximum cars that can be transfered between two locations overnight
% lamda = Expected value for Transition Probability
% P(n) = (lamda^n)*(exp(-lamda))/(n!) where n is number of cars requested or returned
lamdaAreq = 3;
lamdaAret = 3;
lamdaBreq = 4;
lamdaBret = 2;
% Pre-calculation of Rewards and Transition Probabilities for all states
[Ra,Pa] = cmpt_P_and_R(lamdaAreq,lamdaAret,max_cars,max_transfers);
[Rb,Pb] = cmpt_P_and_R(lamdaBreq,lamdaBret,max_cars,max_transfers);
theta = 1e-6; % Error convergence limit
delta = inf; % Error
MAX_ITER = 20; % Maximum number of iterations
V = zeros(max_cars+MZIC,max_cars+MZIC); % State value functions
policy_pi = zeros(max_cars+MZIC,max_cars+MZIC); % Policy
Tot_states = (max_cars+MZIC)^2; % Total number of states
policy_stable = 0; % Flag - Is the optimal policy achieved
while(policy_stable == 0)
% POLICY EVALUATION
% *****************
% plot the current policy:
figure; imagesc( 0:max_cars, 0:max_cars, policy_pi ); colorbar; xlabel( 'num at B' ); ylabel( 'num at A' );
title('Current Policy'); axis xy; drawnow;
%fn=sprintf('policy_iter_%d.eps',iterNum); saveas( gcf, fn, 'eps2' );
disp('begin policy evaluation...');
iter_count = 0;
while(delta>theta && iter_count < MAX_ITER)
delta = 0;
for s = 1:Tot_states
[na1,nb1] = ind2sub([max_cars+MZIC,max_cars+MZIC] , s); % Array index to matrix index
na = na1 - 1 ; nb = nb1 - 1; % Actual number of cars
transfers = policy_pi(na1,nb1); % Action
v = V(na1,nb1); % Value of being at state (na1,nb1)
% restrict this action:
transfers = max(-nb,min(transfers,na));
transfers = max(-max_transfers,min(max_transfers,transfers));
vtemp = -2*abs(transfers); % Value of each transition in a state
na_morning = na1 - transfers;
nb_morning = nb1 + transfers;
% Policy evaluation algorithm
for s_pri_A = 1: max_cars+1 % MZIC included here
for s_pri_B = 1:max_cars+1 % MZIC included here
vtemp = vtemp + Pa(na_morning,s_pri_A)*Pb(nb_morning,s_pri_B)*(Ra(na_morning)+Rb(nb_morning)+gamma*V(s_pri_A,s_pri_B));
end
end
V(na1,nb1) = vtemp;
delta = max([delta , abs(v - V(na1,nb1))]);
end
iter_count = iter_count + 1;
fprintf('iteration number = %5d, delta = %10.8f \n',iter_count,delta);
end
disp('end of policy evaluation');
% End of policy evaluation
figure; imagesc( 0:max_cars, 0:max_cars, V ); colorbar;
xlabel( 'num at B' ); ylabel( 'num at A' );
title( 'current state-value function' ); axis xy; drawnow;
%fn=sprintf('state_value_fn_iter_%d.eps',iterNum); saveas( gcf, fn, 'eps2' );
% POLICY IMPROVEMENT
% ******************
disp('policy improvement beginning');
policy_stable = 1;
for s = 1:Tot_states
[na1,nb1] = ind2sub([max_cars+MZIC,max_cars+MZIC] , s); % Array index to matrix index
na = na1 - 1 ; nb = nb1 - 1;
old_policy = policy_pi(na1,nb1);
% Possible actions
possActInA = min(na,max_transfers);
possActInB = min(nb,max_transfers);
TotPosAct = [-possActInB:possActInA];
num = length(TotPosAct);
Q = -Inf*ones(1,num); % Value function for policy improvement
for k = 1:num
transfers = TotPosAct(k);
vtemp = -2*abs(transfers);
% restrict this action:
transfers = max(-nb,min(transfers,na));
transfers = max(-max_transfers,min(max_transfers,transfers));
na_morning = na1 - transfers;
nb_morning = nb1 + transfers;
% Policy improvement algorithm
for s_pri_A = 1: max_cars+1
for s_pri_B = 1:max_cars+1
vtemp = vtemp + Pa(na_morning,s_pri_A)*Pb(nb_morning,s_pri_B)*(Ra(na_morning)+Rb(nb_morning)+gamma*V(s_pri_A,s_pri_B));
end
end
Q(k) = vtemp;
end
% Maximizing over value, and selecting corresponding action
[max_val,max_ind] = max(Q);
maxPosAct = TotPosAct(max_ind);
% Condition for improving policy
if (maxPosAct ~= old_policy)
policy_stable = 0;
policy_pi(na1,nb1) = maxPosAct;
end
end
disp('end of policy_improvement');
end