-
Notifications
You must be signed in to change notification settings - Fork 1
/
ringworld_MC.py
45 lines (37 loc) · 2.26 KB
/
ringworld_MC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from utils import *
from methods import *
from joblib import Parallel, delayed
from MC import *
import numpy.matlib
# Experiment preparation
N = 11  # size argument handed to RingWorldEnv
env = RingWorldEnv(N)
runtimes, episodes = 50, 2500
gamma = lambda x: 0.95  # callable discount: returns 0.95 whatever its argument

# Target and behavior policies are identical here: one [0.5, 0.5] row per
# observation, giving arrays of shape (env.observation_space.n, 2).
# (presumably the two columns are the two action probabilities -- confirm
# against RingWorldEnv)
target_policy = np.tile(np.array([[0.5, 0.5]]), (env.observation_space.n, 1))
behavior_policy = np.tile(np.array([[0.5, 0.5]]), (env.observation_space.n, 1))
# Get ground-truth expectation, variance and stationary distribution.
# They are cached on disk in an .npz archive; when the archive is missing they
# are estimated with monte_carlo() and then written to that archive.
# NOTE(review): the cache filename only encodes N, so an existing archive is
# reused even if other experiment settings change -- confirm this is intended.
filename = 'ground_truths_0.5_0.5_%d.npz' % N
try:
    # np.load on an .npz returns a lazy NpzFile that holds the file open; the
    # context manager closes it as soon as the three arrays have been read.
    with np.load(filename) as loaded:
        true_expectation = loaded['true_expectation']
        true_variance = loaded['true_variance']
        stationary_dist = loaded['stationary_dist']
except FileNotFoundError:
    # NOTE(review): this call passes gamma=0.99 (a float), whereas the
    # experiment's `gamma` is a callable returning 0.95 -- verify that the
    # ground truth is meant to use a different discount.
    E, V, return_counts = monte_carlo(
        env, decide, decide, lambda x: onehot(x, N), N, int(1e6), gamma=0.99
    )
    # Normalise the counts returned by monte_carlo() so they sum to 1.
    stationary_dist = return_counts / np.sum(return_counts)
    # Keep only the last entry of each returned sequence as the ground truth.
    true_expectation, true_variance = E[-1], V[-1]
    np.savez(
        filename,
        true_expectation=true_expectation,
        true_variance=true_variance,
        stationary_dist=stationary_dist,
    )
# Iterative policy evaluation of the target policy, followed by a single
# 10000-episode on-policy MC call (target policy used for both policy arguments).
j, v, s = iterative_policy_evaluation(env, target_policy, gamma=gamma)
print('Iterative policy evaluation')
Lambda = LAMBDA(env, lambda_type='constant', initial_value=np.ones(N))
mc_j, mc_v, mc_counts = MC(env, 10000, target_policy, target_policy, Lambda, gamma)

# Test both on-policy and off-policy.
# Keyword arguments shared by the two eval_method_with_variance() calls.
eval_settings = dict(
    gamma=gamma,
    alpha=0.05,
    beta=0.05,
    runtimes=runtimes,
    episodes=episodes,
)

# A new LAMBDA object is built before each evaluation, as in the original
# statement order (presumably so no state is shared between runs -- confirm).
Lambda = LAMBDA(env, lambda_type='constant', initial_value=np.ones(N))
off_mc_results, off_mc_var_results = eval_method_with_variance(
    MC,
    env,
    true_expectation,
    true_variance,
    stationary_dist,
    behavior_policy,
    target_policy,
    Lambda,
    **eval_settings,
)

Lambda = LAMBDA(env, lambda_type='constant', initial_value=np.ones(N))
on_mc_results, on_mc_var_results = eval_method_with_variance(
    MC,
    env,
    true_expectation,
    true_variance,
    stationary_dist,
    target_policy,
    target_policy,
    Lambda,
    **eval_settings,
)
# Value-estimate curves: plotted before plt.figure() is called, log-scaled y axis.
for results, label in (
    (off_mc_results, 'off-policy MC'),
    (on_mc_results, 'on-policy MC'),
):
    plot_results(results, label=label)
plt.yscale('log')

# Variance-estimate curves: plotted after opening a new figure, log-scaled y axis.
plt.figure()
for results, label in (
    (off_mc_var_results, 'off-policy MC variance'),
    (on_mc_var_results, 'on-policy MC variance'),
):
    plot_results(results, label=label)
plt.yscale('log')

plt.show()
pass  # no-op statement retained from the original script