In [2]:
# %%
import gym
import numpy as np

env = gym.make('FrozenLake-v1')
# 4x4 grid: 16 cells (states), labelled 0-15, so eon = 16.
eon = env.observation_space.n
# 4 actions (the four movement directions), labelled 0-3, so ean = 4.
ean = env.action_space.n


# Policy evaluation
def compute_value_function(policy, gamma=1.0, threshold=1e-10):
    """Evaluate a fixed deterministic policy by repeated Bellman backups.

    Reads the module-level ``env`` (for the transition table ``env.P``) and
    ``eon`` (the number of states).

    Args:
        policy: Sequence indexed by state giving the action taken there.
            Entries may be floats (e.g. a policy built with ``np.zeros``);
            each is cast to ``int`` before indexing the transition table.
        gamma: Discount factor applied to the successor state's value.
        threshold: Sweeping stops once the summed absolute change of the
            value table over one full sweep is at most this value.

    Returns:
        ``np.ndarray`` of length ``eon`` with the value of every state
        under ``policy``.

    Note:
        There is no iteration cap: the loop only ends when the convergence
        test passes.
    """
    value_table = np.zeros(eon)
    while True:
        # Snapshot so every state in this sweep is backed up from the
        # values of the previous sweep, not from partially updated ones.
        previous_values = np.copy(value_table)
        for state in range(eon):
            # Cast so a float-typed policy can index any transition table,
            # including list/array-backed ones that reject float indices.
            action = int(policy[state])
            value_table[state] = sum(
                trans_prob * (reward + gamma * previous_values[next_state])
                for trans_prob, next_state, reward, _done in env.P[state][action]
            )
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break
    return value_table


# Greedy policy extraction
def extract_policy(value_table, gamma=1.0):
    """Return the policy that is greedy with respect to ``value_table``.

    Reads the module-level ``env`` (transition table ``env.P``), ``eon``
    (number of states) and ``ean`` (number of actions).

    Args:
        value_table: Per-state values, indexed by state.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        Float ``np.ndarray`` of length ``eon``; entry ``s`` is the index of
        the action with the largest one-step look-ahead value in state ``s``
        (first index wins on ties, as ``np.argmax`` does).
    """

    def action_value(state, action):
        # Expected one-step return: sum over the listed outcomes of
        # probability * (reward + discounted value of the next state).
        return sum(
            prob * (reward + gamma * value_table[successor])
            for prob, successor, reward, _done in env.P[state][action]
        )

    greedy_policy = np.zeros(eon)
    for state in range(eon):
        q_values = np.zeros(ean)
        for action in range(ean):
            q_values[action] = action_value(state, action)
        greedy_policy[state] = np.argmax(q_values)
    return greedy_policy


# Policy iteration
def policy_iteration(env, gamma=1.0, max_iterations=20):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Args:
        env: Not referenced in this function body; evaluation and extraction
            are delegated to ``compute_value_function`` and ``extract_policy``.
            Kept so existing calls of the form ``policy_iteration(env)`` work.
        gamma: Discount factor forwarded to both helpers.
        max_iterations: Upper bound on evaluate/improve rounds.

    Returns:
        The most recently extracted policy (a float ``np.ndarray`` of length
        ``eon``), or the initial all-zeros policy if ``max_iterations`` is 0.

    Side effects:
        Prints the value table of every round and a message when the policy
        stops changing.
    """
    # Start from the policy that picks action 0 in every state.
    policy = np.zeros(eon)
    for i in range(max_iterations):
        new_value_function = compute_value_function(policy, gamma)
        print(new_value_function)
        new_policy = extract_policy(new_value_function, gamma)
        converged = np.all(policy == new_policy)
        # The freshly extracted policy is what gets evaluated next / returned.
        policy = new_policy
        if converged:
            print('Policy-Iteration converged as step %d.' % (i+1))
            # A stable policy reproduces the same values and the same greedy
            # policy on every further round, so stopping here returns the
            # same result without the redundant sweeps.
            break
    return policy


# Run policy iteration on `env` and print the policy it returns.
print(policy_iteration(env))


[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.         0.         0.03846154 0.01923077 0.         0.
 0.07692308 0.         0.         0.         0.19230769 0.
 0.         0.         0.5        0.        ]
[0.         0.06140351 0.18421053 0.18421053 0.         0.
 0.18421053 0.         0.         0.23684211 0.36842105 0.
 0.         0.34210526 0.68421053 0.        ]
[0.14393939 0.11363636 0.22727273 0.22727273 0.17424242 0.
 0.22727273 0.         0.20454545 0.40909091 0.45454545 0.
 0.         0.56818182 0.72727273 0.        ]
[0.75       0.54166667 0.33333333 0.33333333 0.75       0.
 0.33333333 0.         0.75       0.75       0.66666667 0.
 0.         0.83333333 0.91666667 0.        ]
[0.7804878  0.65853658 0.53658537 0.53658537 0.7804878  0.
 0.41463415 0.         0.7804878  0.7804878  0.70731707 0.
 0.         0.85365854 0.92682927 0.        ]
[0.82352941 0.82352941 0.82352941 0.82352941 0.82352941 0.
 0.52941176 0.         0.82352941 0.82352941 0.76470588 0.
 0.         

In [18]:
from stable_baselines3 import DQN, SAC
import diffuser.environments
import gym
# Roll out a saved SAC agent and print the per-episode reward and cost totals.
dataset = 'SafePendulum-v0'
env = gym.make(dataset, mode='test')
model1_path = f"./dataset/{dataset}.agent"
model = SAC.load(model1_path)

# New episodes keep starting until this many environment steps were taken in
# total; the episode in progress is always finished, so the final count can
# exceed the budget.
MAX_TOTAL_STEPS = 2000
# Per-step cost weight is COST_DISCOUNT ** t; 1 yields the plain undiscounted sum.
COST_DISCOUNT = 1

total_step = 0
while total_step < MAX_TOTAL_STEPS:
    obs = env.reset()
    done = False
    t = 0
    total_reward = 0
    total_cost = 0
    # Episode end relies solely on the `done` flag returned by env.step;
    # no separate timeout flag is tracked.
    while not done:
        # deterministic=False is passed through to the model's predict call.
        action, _ = model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        total_cost += (COST_DISCOUNT ** t) * info['cost']

        t += 1
        total_step += 1
    print(total_reward, total_cost)

In [44]:
import gym
import numpy as np
import diffuser.environments
# Play one episode with uniformly sampled actions, printing every transition
# and finally the summed reward.
env = gym.make('SafeDoublePendulum-random-v0', mode="train")
env.reset()  # the returned observation is not needed: actions are sampled
total_r = 0
while True:
    action = env.action_space.sample()
    next_obs, reward, done, info = env.step(action)
    total_r = total_r + reward
    print(next_obs, action, reward, done, info)
    if done:
        break
print(total_r)

[ 0.10852    -0.11491121  0.21046396  0.99337577  0.97760162  1.88434718
 -3.73856277  5.13213883  0.          0.          0.        ] [0.7897759] 0.9203331397739969 False {'cost': 0.4717903798605534}
[ 0.23174096 -0.35061374  0.50009009  0.93652016  0.86597338  3.0150768
 -5.83106417  6.83811348  0.          0.          0.        ] [0.5202298] 0.9015885888186196 False {'cost': 0.36802992961027686}
[ 0.35075623 -0.56923912  0.73162824  0.82217202  0.68170383  1.77242139
 -4.2611408   5.52001621  0.          0.          0.        ] [-0.5753414] 0.8981901317574896 False {'cost': 0.08950282068178267}
[ 0.40926155 -0.714032    0.8943861   0.70011306  0.44729576  0.58009051
 -3.3975479   6.1272487   0.          0.          0.        ] [-0.6072684] 0.8782645005258021 True {'cost': -0.0}
3.598376360875908


In [6]:
import gym
import numpy as np
import torch
# Load the offline dataset and print each field name with its array shape.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load dataset files from a trusted source.
dataset_name = './dataset/myroulette-v0_random.pkl'
dataset = torch.load(dataset_name)
for key in dataset:
    print(key)
    print(dataset[key].shape)

# Layout of `dataset` (a dict):
#   dataset['actions']      : [num_samples x action_dim]
#   dataset['observations'] : [num_samples x observation_dim]
#   dataset['rewards']      : [num_samples, ]
#   dataset['terminals']    : [num_samples, ]
#   dataset['timeouts']     : [num_samples, ]  (whether the episode ended
#                             because the maximum trajectory length was reached)
#
# Minor remark: samples that are adjacent along the sample axis were also
# generated consecutively. dataset['observations'][100] and
# dataset['observations'][101] are therefore either "terminal state of one
# trajectory, then something from the next", or s_t and s_{t+1} of the same
# trajectory.

actions
(50008, 38)
observations
(50008, 3)
rewards
(50008,)
terminals
(50008,)
timeouts
(50008,)


"\ndataset：字典类型\ndataset['actions']  : [数据集样本数 x 动作维度]\ndataset['observations']  : [数据集样本数 x 观测维度]\ndataset['rewards']  : [数据集样本数, ]\ndataset['terminals']  : [数据集样本数, ] \ndataset['timeouts']  : （表示是否因为达到最大轨迹长度而终止） [数据集样本数, ]\n\n#不太重要的备注：所有[数据集样本数]上相邻的样本他们生成的顺序也是相邻的，\n# dataset['observations'][100]和dataset['observations'][101]要么前者是一条轨迹的终止状态，要么这两个状态分布代表s_t,s_{t+1}\n"

In [16]:
import numpy as np

# NOTE(review): `d` is not defined in this cell; it must already exist in the
# kernel as a mapping with 'actions', 'observations' and 'rewards' entries.
# Confirm which cell creates it so the notebook survives Restart & Run All.

# Label printed for each action index (index -> label).
DIRECTIONS = ['U', 'R', 'D', 'L']
# Shape the flat observation is unfolded into for display.
GRID_SHAPE = (4, 12)
# Half-open range of sample indices to print.
FIRST_SAMPLE, END_SAMPLE = 25, 300

for idx in range(FIRST_SAMPLE, END_SAMPLE):
    action = d['actions'][idx]
    observation = d['observations'][idx]
    # Show the arg-max action's label, its score, and the sample's reward.
    print("action:   ", DIRECTIONS[np.argmax(action)], np.max(action), d['rewards'][idx])
    # Mark every non-zero observation entry on the grid
    # (presumably a flat one-hot position encoding -- TODO confirm).
    grid = np.zeros(GRID_SHAPE)
    pos = np.where(observation != 0)[0]
    grid[np.unravel_index(pos, grid.shape)] = 1
    print(grid)


action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    R 1.0 -1
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 

In [7]:
import wandb
# Start a run, log one multi-series line chart under a custom key, and close the run.
wandb.init(
    project='SafeRLDiffusion',
    name='test6',
    group='default',
)
# Three y-series plotted against one shared x axis.
# `keys=[...]` can be passed to name the series; it is left out here.
line_chart = wandb.plot.line_series(
    xs=[0, 1, 2, 3, 4],
    ys=[[10, 20, 30, 40, 50], [0.5, 11, 72, 3, 41], [-10, -20, -30, -40, -50]],
    title="Two Random Metrics",
    xname="x units",
)
wandb.log({"my_custom_id": line_chart})
wandb.finish()

In [2]:
import wandb
# Start a run, log a scatter plot under the same key on each of 7 steps, and close the run.
wandb.init(
    project='SafeRLDiffusion',
    name='test5',
    group='default',
)
# x coordinate shared by every logged point.
SCATTER_X = 3
for i in range(0, 7):
    # Step i logs i + 1 identical points (SCATTER_X, i).
    data = [[SCATTER_X, i] for _ in range(i + 1)]
    table = wandb.Table(data=data, columns=["class_x", "class_y"])
    wandb.log({"my_custom_id": wandb.plot.scatter(table, "class_x", "class_y")})
wandb.finish()