plan.py
__author__ = 'Austin'
import operator


def bellman_backup(mdp, value_k, discount):
    """
    Given an MDP and its value function with time-to-go k, return the value
    function with time-to-go k+1 together with the greedy action for each state.
    """
    # Compute a new value function, i.e. a list with one entry per state.
    new_value_fn = []
    actions = []
    for s in range(mdp.num_states):
        # V_{k+1}(s) = R(s) + discount * max_a sum_s' T(s, a, s') * V_k(s')
        action_results = []
        for a in range(mdp.num_actions):
            action_results.append(sum(mdp.transition(s, a, s_next) * value_k[s_next]
                                      for s_next in range(mdp.num_states)))
        max_index, max_value = max(enumerate(action_results), key=operator.itemgetter(1))
        actions.append(max_index)
        new_value_fn.append(mdp.rewards[s] + discount * max_value)
    return new_value_fn, actions
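

# A single backup, worked on hypothetical numbers: with two states, one
# action, T(0, 0, 0) = 0.9, T(0, 0, 1) = 0.1, rewards = [1.0, 0.0], and
# V_k = [1.0, 0.0], a backup with discount 0.9 gives
#     V_{k+1}(0) = 1.0 + 0.9 * (0.9 * 1.0 + 0.1 * 0.0) = 1.81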


def max_norm(new, old):
    """Max-norm (L-infinity) distance between two value functions."""
    # Largest absolute componentwise difference between the two vectors.
    return max(abs(new[i] - old[i]) for i in range(len(new)))


def plan(mdp, discount, epsilon):
    """
    Main algorithm: value iteration. Given a description of an MDP, a
    discount factor, and a convergence threshold epsilon, returns a value
    function, the greedy policy with respect to it, and the number of
    iterations performed. The loop stops once the Bellman error (the
    max-norm change between successive value functions) drops below epsilon.
    """
    # Seed the error list with the largest reward so the loop has an
    # initial error to compare against epsilon.
    bellman_error = [max(mdp.rewards)]
    policy = [0] * mdp.num_states
    iterations = 0
    # Initialize at iteration 0 with the reward of each state, copied so the
    # MDP's reward list is never aliased.
    value_function = list(mdp.rewards)
    while bellman_error[-1] > epsilon:
        new_values, policy = bellman_backup(mdp, value_function, discount)
        bellman_error.append(max_norm(new_values, value_function))
        value_function = new_values
        iterations += 1
    # value_function and policy now hold the converged iterate.
    return value_function, policy, iterations
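

# Usage sketch: a minimal two-state, two-action MDP stub exposing the
# interface the functions above assume (num_states, num_actions, rewards,
# and transition(s, a, s_next)). All probabilities and rewards below are
# hypothetical.
if __name__ == "__main__":
    class ToyMDP(object):
        num_states = 2
        num_actions = 2
        rewards = [1.0, 0.0]
        # _transitions[a][s] is the distribution over next states.
        _transitions = [
            [[0.9, 0.1], [0.5, 0.5]],  # action 0
            [[0.2, 0.8], [0.1, 0.9]],  # action 1
        ]

        def transition(self, s, a, s_next):
            return self._transitions[a][s][s_next]

    values, policy, iterations = plan(ToyMDP(), discount=0.9, epsilon=1e-6)
    print("values:", values)
    print("policy:", policy)
    print("iterations:", iterations)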