In [2]:
# %%
import gym
import numpy as np

# Build the FrozenLake environment that the planning functions below read from.
env = gym.make("FrozenLake-v1")
# Number of discrete actions: 4 moves, indexed 0-3 (ean = 4).
ean = env.action_space.n
# Number of discrete states: a 4x4 grid gives 16 cells, indexed 0-15 (eon = 16).
eon = env.observation_space.n


# Policy evaluation
def compute_value_function(policy, gamma=1.0):
    """Evaluate a fixed policy by repeated Bellman expectation backups.

    Reads the module-level ``env`` (for the transition table ``env.P``) and
    ``eon`` (number of states).

    Args:
        policy: Indexable by state; ``policy[state]`` is the action taken in
            that state and is used as a key into ``env.P[state]``.
        gamma: Discount factor applied to the value of the successor state.

    Returns:
        A NumPy array of length ``eon`` holding the value of every state.
    """
    # Stop once the summed absolute change of a sweep is at most this small.
    tolerance = 1e-10
    value_table = np.zeros(eon)

    converged = False
    while not converged:
        # Snapshot of the previous sweep; every backup in this sweep reads
        # from the snapshot, never from values written during the sweep.
        previous = value_table.copy()
        for state in range(eon):
            backup = 0
            # Each entry is (probability, successor, reward, done); the done
            # flag is not used by the backup.
            for prob, successor, reward, _done in env.P[state][policy[state]]:
                backup += prob * (reward + gamma * previous[successor])
            value_table[state] = backup
        converged = np.abs(previous - value_table).sum() <= tolerance

    return value_table


# Greedy policy extraction
def extract_policy(value_table, gamma=1.0):
    """Derive the greedy policy with respect to a state-value table.

    Reads the module-level ``env`` (transition table ``env.P``), ``eon``
    (number of states) and ``ean`` (number of actions).

    Args:
        value_table: Indexable by state; the value of each state.
        gamma: Discount factor applied to the value of the successor state.

    Returns:
        A float NumPy array of length ``eon``; entry ``s`` is the index of
        the action with the largest one-step look-ahead value in state ``s``
        (the lowest index wins ties, as with ``np.argmax``).
    """
    def action_value(state, action):
        # One-step look-ahead: expected reward plus discounted successor value.
        total = 0.0
        for prob, successor, reward, _done in env.P[state][action]:
            total += prob * (reward + gamma * value_table[successor])
        return total

    greedy_actions = [
        np.argmax([action_value(state, action) for action in range(ean)])
        for state in range(eon)
    ]
    # The policy is stored as floats, one entry per state.
    return np.array(greedy_actions, dtype=float)


# Policy iteration
def policy_iteration(env, gamma=1.0, no_of_iterations=20):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Args:
        env: Kept for interface compatibility; this function does not read it.
            ``compute_value_function`` and ``extract_policy`` are called
            without it.
        gamma: Discount factor forwarded to evaluation and improvement.
        no_of_iterations: Upper bound on the number of evaluate/improve
            rounds (defaults to the previously hard-coded 20).

    Returns:
        The last policy produced. If the policy stopped changing before the
        cap was reached, this is the stable policy.
    """
    # Despite the name, the starting policy is deterministic: action 0 in
    # every state.
    random_policy = np.zeros(eon)
    # Pre-bind the result so a cap of 0 returns the starting policy instead
    # of failing on an unbound name.
    new_policy = random_policy
    for i in range(no_of_iterations):
        # Evaluate the current policy.
        new_value_function = compute_value_function(random_policy, gamma)
        print(new_value_function)
        # Improve: act greedily with respect to the fresh value function.
        new_policy = extract_policy(new_value_function, gamma)
        # A policy that no longer changes is a fixed point of improvement;
        # further rounds would only repeat the same evaluation, so stop.
        if np.all(random_policy == new_policy):
            print('Policy-Iteration converged as step %d.' % (i+1))
            break
        # Otherwise the improved policy is evaluated in the next round.
        random_policy = new_policy
    return new_policy


# Run policy iteration on the environment and print the policy it returns
print(policy_iteration(env))


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.         0.         0.03846154 0.01923077 0.         0.
 0.07692308 0.         0.         0.         0.19230769 0.
 0.         0.         0.5        0.        ]
[0.         0.06140351 0.18421053 0.18421053 0.         0.
 0.18421053 0.         0.         0.23684211 0.36842105 0.
 0.         0.34210526 0.68421053 0.        ]
[0.14393939 0.11363636 0.22727273 0.22727273 0.17424242 0.
 0.22727273 0.         0.20454545 0.40909091 0.45454545 0.
 0.         0.56818182 0.72727273 0.        ]
[0.75       0.54166667 0.33333333 0.33333333 0.75       0.
 0.33333333 0.         0.75       0.75       0.66666667 0.
 0.         0.83333333 0.91666667 0.        ]
[0.7804878  0.65853658 0.53658537 0.53658537 0.7804878  0.
 0.41463415 0.         0.7804878  0.7804878  0.70731707 0.
 0.         0.85365854 0.92682927 0.        ]
[0.82352941 0.82352941 0.82352941 0.82352941 0.82352941 0.
 0.52941176 0.         0.82352941 0.82352941 0.76470588 0.
 0.         

In [1]:
import gym
import numpy as np
import diffuser.environments
# Build the environment and fetch its dataset.
# NOTE: this rebinds the notebook-level name `env`.
env=gym.make('myroulette-random-v0')
d = env.get_dataset()
# Sanity check: print the sum of every action vector and flag those that do
# not sum to 1.
for a in d['actions']:
    total = np.sum(a)
    print(total)
    # Compare with a tolerance: a sum of floats that is mathematically 1 can
    # differ from 1 by rounding error, which an exact `!= 1` would flag.
    if not np.isclose(total, 1):
        print("warning")

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'np' is not defined

In [16]:
import numpy as np

# Labels looked up by the index of the largest entry of an action vector.
dire = ['U', 'R', 'D', 'L']

# Print dataset entries 25..299: the chosen direction, its weight and the
# reward, followed by the observation drawn on a 4x12 grid.
for l in range(25, 300):
    action, observation = d['actions'][l], d['observations'][l]
    print("action:   ", dire[np.argmax(action)], np.max(action), d['rewards'][l])
    # Mark every non-zero observation entry with a 1 at the matching grid
    # cell (index -> (row, col) via unravel_index).
    o = np.zeros((4,12))
    pos = np.where(observation!=0)[0]
    o[np.unravel_index(pos, o.shape)] = 1
    print(o)


action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    R 1.0 -1
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 

In [5]:
import wandb
# Start a W&B run and log one custom multi-line chart to it.
wandb.init(
    project='SafeRLDiffusion',
    name='test5',
    group='default',
)
try:
    # One label per y-series: three series are plotted, so three keys are
    # passed (a keys/ys length mismatch is rejected by line_series).
    wandb.log({"my_custom_id" : wandb.plot.line_series(
              xs=[0, 1, 2, 3, 4],
              ys=[[10, 20, 30, 40, 50], [0.5, 11, 72, 3, 41], [-10, -20, -30, -40, -50]],
              keys=["metric Y", "metric Z", "metric W"],
              title="Two Random Metrics",
              xname="x units")})

    # Disabled variant: a second log call under the same id with x in 5..9.
    # wandb.log({"my_custom_id" : wandb.plot.line_series(
    #           xs=[5,6,7,8,9],
    #           ys=[[10, 20, 30, 40, 50], [0.5, 11, 72, 3, 41]],
    #           keys=["metric Y", "metric Z"],
    #           title="Two Random Metrics",
    #           xname="x units")})
finally:
    # Always close the run, even when logging raises, so the run is not left
    # open in the kernel.
    wandb.finish()

In [2]:
import wandb
import time  # only needed if the sleep at the end of the loop is re-enabled

# Start a W&B run and log a scatter chart under the same id at seven steps.
wandb.init(
    project='SafeRLDiffusion',
    name='test5',
    group='default',
)
try:
    for i in range(0,7):
        # Earlier experiments with histograms, kept for reference:
        #wandb.run.summary.update(  # if only in summary, only visible on overview tab
        #    {"test_x": wandb.Histogram(x)})

        # data = [[s] for s in scores]
        # table = wandb.Table(data=data, columns=["scores"])
        # wandb.log({'my_histogram': wandb.plot.histogram(table, "scores",
        #                         title="Histogram")})

        #wandb.log({"test_x": wandb.Histogram(x)})

        # Step i logs i + 1 identical rows [3, i] (class_x = 3, class_y = i).
        data = [[3, i] for _ in range(i + 1)]
        table = wandb.Table(data=data, columns = ["class_x", "class_y"])
        wandb.log({"my_custom_id" : wandb.plot.scatter(table,
                                    "class_x", "class_y")})
        #time.sleep(5)
finally:
    # Always close the run, even when logging raises, so the run is not left
    # open in the kernel.
    wandb.finish()