In [None]:
# %%
import gym
import numpy as np

# Build the FrozenLake environment; `env`, `eon` and `ean` are read as globals below.
env = gym.make('FrozenLake-v1')
# 4x4 grid: 16 cells (states), numbered 0-15, so eon = 16.
eon = env.observation_space.n
# 4 actions (the four move directions), numbered 0-3, so ean = 4.
ean = env.action_space.n


# Policy evaluation
def compute_value_function(policy, gamma=1.0, threshold=1e-10):
    """Evaluate a fixed policy by sweeping the value table until it settles.

    Every sweep recomputes each state's value from the previous sweep's table
    using the transitions stored in the module-level ``env.P``. Sweeping stops
    once the summed absolute change over all states is at most ``threshold``.

    Args:
        policy: Indexable by state; ``policy[state]`` is the action taken in
            that state. Float entries (e.g. an array made with ``np.zeros``)
            are accepted and converted with ``int`` before indexing ``env.P``.
        gamma: Discount factor applied to the successor state's value.
        threshold: Convergence tolerance on the summed absolute difference
            between two consecutive sweeps.

    Returns:
        ``np.ndarray`` of length ``eon`` with the value of each state.

    Note:
        There is no iteration cap; the loop only ends when the tolerance is met.
    """
    value_table = np.zeros(eon)
    while True:
        # Snapshot of the previous sweep: every state below reads from this
        # copy, so values written during the current sweep are not reused.
        previous_values = np.copy(value_table)
        for state in range(eon):
            # Cast so a float-typed policy array is always a valid index/key.
            action = int(policy[state])
            # Expected one-step return under the policy's action.
            value_table[state] = sum(
                trans_prob * (reward + gamma * previous_values[next_state])
                for trans_prob, next_state, reward, _done in env.P[state][action]
            )
        # Converged when the total change across all states is small enough.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            break
    return value_table


# Greedy policy extraction
def extract_policy(value_table, gamma=1.0):
    """Pick, for every state, the action with the largest one-step lookahead value.

    For each state/action pair the transitions in the module-level ``env.P``
    are folded into ``sum(prob * (reward + gamma * value_table[next_state]))``;
    the index of the largest of those sums becomes the state's action.

    Args:
        value_table: Indexable by state, giving the current value estimate.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        Float ``np.ndarray`` of length ``eon`` holding one action index per state.
    """
    greedy_actions = np.zeros(eon)
    for s in range(eon):
        # One lookahead value per action available in this state.
        action_values = np.zeros(ean)
        for a in range(ean):
            action_values[a] = sum(
                prob * (rew + gamma * value_table[nxt])
                for prob, nxt, rew, _terminal in env.P[s][a]
            )
        # np.argmax resolves ties in favour of the lowest action index.
        greedy_actions[s] = np.argmax(action_values)
    return greedy_actions


# Policy iteration
def policy_iteration(env, gamma=1.0, max_iterations=20):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Starts from the all-zeros policy (action 0 in every state), evaluates it
    with ``compute_value_function``, derives a new policy with
    ``extract_policy`` and stops as soon as the new policy equals the one just
    evaluated, or after ``max_iterations`` rounds.

    Args:
        env: Accepted for interface compatibility. This function does not read
            it; the helper functions use the module-level ``env`` instead.
        gamma: Discount factor forwarded to both helpers.
        max_iterations: Upper bound on evaluation/improvement rounds.

    Returns:
        The last policy produced (one action index per state). If
        ``max_iterations`` is 0 the initial all-zeros policy is returned.
    """
    # Initial policy: action 0 everywhere.
    policy = np.zeros(eon)
    for i in range(max_iterations):
        # Evaluate the current policy and show its value table.
        values = compute_value_function(policy, gamma)
        print(values)
        # Improve greedily with respect to those values.
        new_policy = extract_policy(values, gamma)
        converged = np.all(policy == new_policy)
        # The improved policy is what gets evaluated next / returned.
        policy = new_policy
        if converged:
            print('Policy-Iteration converged as step %d.' % (i+1))
            # A stable policy cannot change in later rounds, so stop here
            # instead of repeating identical iterations.
            break
    return policy


# Run policy iteration on the FrozenLake env and print the resulting policy
print(policy_iteration(env))


In [21]:
import numpy as np
# Nine measured values whose arithmetic mean is reported below.
x = [
    70.29895854844958,
    46.91645785465293,
    37.897050091450176,
    54.034548883658104,
    42.734023253598416,
    47.47385859873021,
    50.36202507140178,
    43.78171111741168,
    57.20561996431989,
]
print(np.asarray(x).mean())

50.07825037596364


In [1]:
import gym
import numpy as np
import diffuser.environments
# Play one episode with randomly sampled actions and print the summed reward.
env = gym.make('SafeReacher-v0', mode="test")
env.reset()
total_r = 0
done = False
while True:
    action = env.action_space.sample()
    # step() is unpacked as the 4-tuple (observation, reward, done, info).
    next_obs, reward, done, info = env.step(action)
    total_r += reward
    if done:
        break
print(total_r)

[1.   1.   0.01] [0.20925748 0.01740208 0.01      ]
[1.   1.   0.01] [0.20760746 0.03160781 0.01      ]
[1.   1.   0.01] [0.20468956 0.04660721 0.01      ]
[1.   1.   0.01] [0.20391604 0.04942917 0.01      ]
[1.   1.   0.01] [0.20630474 0.0380869  0.01      ]
[1.   1.   0.01] [0.20957318 0.01081377 0.01      ]
[1.   1.   0.01] [ 0.20771295 -0.03080639  0.01      ]
[1.   1.   0.01] [ 0.19786319 -0.07031004  0.01      ]
[1.   1.   0.01] [ 0.18030701 -0.1072848   0.01      ]
[1.   1.   0.01] [ 0.15445966 -0.14117256  0.01      ]
[1.   1.   0.01] [ 0.12183793 -0.16852743  0.01      ]
[1.   1.   0.01] [ 0.09049189 -0.18512467  0.01      ]
[1.   1.   0.01] [ 0.06553315 -0.19302059  0.01      ]
[1.   1.   0.01] [ 0.03752548 -0.19824739  0.01      ]
[1.   1.   0.01] [ 0.01305906 -0.20074087  0.01      ]
[1.   1.   0.01] [-0.01567729 -0.19977668  0.01      ]
[1.   1.   0.01] [-0.05641655 -0.19074934  0.01      ]
[1.   1.   0.01] [-0.09508786 -0.17369113  0.01      ]
[1.   1.   0.01] [-0.1326443

In [23]:
from stable_baselines3 import DQN, SAC
import diffuser.environments
import gym
# --- Rollout settings ---------------------------------------------------------
dataset = 'SafeReacher-v0'
# New episodes are started only while fewer than this many steps have been taken.
# The cap is checked between episodes, so the final episode may run past it.
MAX_TOTAL_STEPS = 50
# Per-step discount applied to info['cost']; 1 gives a plain undiscounted sum.
COST_DISCOUNT = 1
# Debug switch: when True the action predicted by the SAC model is discarded and
# a zero action is sent to the environment instead. Set to False to actually
# evaluate the loaded policy.
FORCE_ZERO_ACTION = True

env = gym.make(dataset, mode='test')
model1_path = f"./dataset/{dataset}.agent"
model = SAC.load(model1_path)

total_step = 0
while total_step < MAX_TOTAL_STEPS:
    done = False
    # NOTE(review): `timeouts` is never set to True below, so an episode ends
    # only when the environment reports done.
    timeouts = False
    obs = env.reset()
    t = 0
    total_reward = 0
    total_cost = 0
    while not done and not timeouts:
        action, _states = model.predict(obs, deterministic=False)
        if FORCE_ZERO_ACTION:
            action = [0, 0]
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
        total_cost += (COST_DISCOUNT ** t) * info['cost']

        t += 1
        total_step += 1
        # Per-step trace: target vs. fingertip position, then cost and step index.
        print(env.target_position, env.get_body_com("fingertip"))
        print(info['cost'], t)

    # Per-episode summary.
    print(total_reward, total_cost)

-0.0 -1.1377015
-1.2649986443858414 0
[1.   1.   0.01] [0.20958165 0.01235526 0.01      ]
0 1
-1.2649914281240084 0
[1.   1.   0.01] [0.20958111 0.01236475 0.01      ]
0 2
-1.2649843549163877 0
[1.   1.   0.01] [0.20958058 0.01237405 0.01      ]
0 3
-1.2649774218912941 0
[1.   1.   0.01] [0.20958006 0.01238317 0.01      ]
0 4
-1.264970626264461 0
[1.   1.   0.01] [0.20957956 0.01239211 0.01      ]
0 5
-1.2649639653071565 0
[1.   1.   0.01] [0.20957906 0.01240087 0.01      ]
0 6
-1.2649574363450662 0
[1.   1.   0.01] [0.20957857 0.01240945 0.01      ]
0 7
-1.2649510367572006 0
[1.   1.   0.01] [0.20957809 0.01241787 0.01      ]
0 8
-1.264944763974822 0
[1.   1.   0.01] [0.20957762 0.01242612 0.01      ]
0 9
-1.2649386154803952 0
[1.   1.   0.01] [0.20957716 0.01243421 0.01      ]
0 10
-1.2649325888065592 0
[1.   1.   0.01] [0.20957671 0.01244214 0.01      ]
0 11
-1.2649266815351197 0
[1.   1.   0.01] [0.20957626 0.01244991 0.01      ]
0 12
-1.2649208912960619 0
[1.   1.   0.01] [0.20957

In [27]:
import gym
import numpy as np
import torch
dataset_name = './dataset/SafePendulum-v0_medium-replay.pkl'
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load dataset files from a trusted source.
dataset = torch.load(dataset_name)
# Sum of the timeout flags over the whole dataset.
print(np.sum(dataset['timeouts']))
# List every field of the dataset together with its array shape.
for key in dataset:
    print(key)
    print(dataset[key].shape)
# dataset: a dict
#   dataset['actions']      : [number of samples x action dimension]
#   dataset['observations'] : [number of samples x observation dimension]
#   dataset['rewards']      : [number of samples, ]
#   dataset['terminals']    : [number of samples, ]
#   dataset['timeouts']     : [number of samples, ]
#                             (whether the episode ended because the maximum
#                             trajectory length was reached)
#
# Minor note: samples that are adjacent along the sample axis were also generated
# consecutively. For dataset['observations'][100] and dataset['observations'][101],
# either the former is the final state of a trajectory, or the two states are
# s_t and s_{t+1} respectively.

100.0
actions
(20000, 1)
observations
(20000, 3)
rewards
(20000,)
terminals
(20000,)
timeouts
(20000,)
costs
(20000,)


"\ndataset：字典类型\ndataset['actions']  : [数据集样本数 x 动作维度]\ndataset['observations']  : [数据集样本数 x 观测维度]\ndataset['rewards']  : [数据集样本数, ]\ndataset['terminals']  : [数据集样本数, ] \ndataset['timeouts']  : （表示是否因为达到最大轨迹长度而终止） [数据集样本数, ]\n\n#不太重要的备注：所有[数据集样本数]上相邻的样本他们生成的顺序也是相邻的，\n# dataset['observations'][100]和dataset['observations'][101]要么前者是一条轨迹的终止状态，要么这两个状态分布代表s_t,s_{t+1}\n"

In [16]:
import numpy as np

# Label for each action index: 0 -> 'U', 1 -> 'R', 2 -> 'D', 3 -> 'L'.
dire = ['U', 'R', 'D', 'L']

# NOTE(review): `d` is not defined in this cell. It must already exist in the
# kernel as a mapping with 'actions', 'observations' and 'rewards' entries.
for idx in range(25, 300):
    action = d['actions'][idx]
    observation = d['observations'][idx]
    # Label of the highest-scoring action, that score, and the sample's reward.
    print("action:   ", dire[np.argmax(action)], np.max(action), d['rewards'][idx])
    # Mark the non-zero entries of the observation on a 4x12 grid, treating the
    # entry positions as flat (row-major) indices into the grid.
    grid = np.zeros((4, 12))
    pos = np.where(observation != 0)[0]
    grid[np.unravel_index(pos, grid.shape)] = 1
    print(grid)


action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    U 1.0 -1
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
action:    R 1.0 -1
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 

In [7]:
import wandb
# Open a run, log one multi-line chart under a custom id, then close the run.
wandb.init(project='SafeRLDiffusion', name='test6', group='default')
data = []

x_values = [0, 1, 2, 3, 4]
series = [
    [10, 20, 30, 40, 50],
    [0.5, 11, 72, 3, 41],
    [-10, -20, -30, -40, -50],
]
# No `keys` argument is passed, so the three series carry no explicit labels
# (an earlier variant used keys=["metric Y", "metric Z", ""]).
chart = wandb.plot.line_series(
    xs=x_values,
    ys=series,
    title="Two Random Metrics",
    xname="x units",
)
wandb.log({"my_custom_id": chart})

# Disabled follow-up experiment: log a second chart under the same id.
# wandb.log({"my_custom_id" : wandb.plot.line_series(
#           xs=[5,6,7,8,9],
#           ys=[[10, 20, 30, 40, 50], [0.5, 11, 72, 3, 41]],
#           keys=["metric Y", "metric Z"],
#           title="Two Random Metrics",
#           xname="x units")})
wandb.finish()

In [2]:
import wandb
# Open a run and log a scatter chart under the same id seven times; the i-th
# chart is built from i+1 identical points (3, i).
wandb.init(project='SafeRLDiffusion', name='test5', group='default')
import time
data = []
for i in range(7):
    # Rows are [class_x, class_y] pairs: x is always 3, y is the loop index.
    data = [[3, i] for _ in range(i + 1)]
    table = wandb.Table(data=data, columns=["class_x", "class_y"])
    scatter = wandb.plot.scatter(table, "class_x", "class_y")
    wandb.log({"my_custom_id": scatter})
    # Optional pause between logs (disabled):
    #time.sleep(5)
wandb.finish()