In [None]:
# %%
import gym
import numpy as np

env = gym.make('FrozenLake-v1')
# 4*4的网格，有16个格子（状态），分别用0-15表示。eon=16
eon = env.observation_space.n
# 4个动作——上下左右，分别用0-3表示。ean=4
ean = env.action_space.n


# 计算值函数
def compute_value_function(policy, gamma=1.0):
    # 初始化V表
    value_table = np.zeros(eon)
    # 收敛判断阈值
    threshold = 1e-10
    # 循环直到收敛
    while True:
        # 初始化更新后的V表（旧表复制过来）
        updated_value_table = np.copy(value_table)
        # 计算每个状态从策略中得到的动作，然后计算值函数
        # 遍历每个状态
        for state in range(eon):
            # 根据策略取动作
            action = policy[state]
            # 更新该状态的V值（公式）
            value_table[state] = sum([trans_prob*(reward+gamma*updated_value_table[next_state])
                                      for trans_prob, next_state, reward, done in env.P[state][action]])
        # 收敛判断
        if (np.sum((np.fabs(updated_value_table-value_table))) <= threshold):
            break
    # 返回V表
    return value_table


# 策略选取（同上）
def extract_policy(value_table, gamma=1.0):
    # 初始化存储策略的数组
    policy = np.zeros(eon)
    # 对每个状态构建Q表，并在该状态下对每个行为计算Q值，
    for state in range(eon):
        # 初始化Q表
        Q_table = np.zeros(ean)
        # 对每个动作计算
        for action in range(ean):
            # 同上
            for next_sr in env.P[state][action]:
                trans_prob, next_state, reward, done = next_sr
                # 更新Q表，即更新动作对应的Q值（4个动作分别由0-3表示）
                Q_table[action] += (trans_prob *
                                    (reward+gamma*value_table[next_state]))
        # 当前状态下，选取使Q值最大的那个策略
        policy[state] = np.argmax(Q_table)
    # 返回策略
    return policy


# 策略迭代
def policy_iteration(env, gamma=1.0):
    # 初始化随机策略，下句代码即为初始策略全为0（向左走）
    random_policy = np.zeros(eon)
    # 设置迭代次数
    no_of_iterations = 20
    # 开始迭代
    for i in range(no_of_iterations):
        # 计算新的值函数
        new_value_function = compute_value_function(random_policy, gamma)
        print(new_value_function)
        # 得到新的策略
        new_policy = extract_policy(new_value_function, gamma)
        # 判断迭代终止条件（策略不变时）
        if (np.all(random_policy == new_policy)):
            print('Policy-Iteration converged as step %d.' % (i+1))
            #break
        # 新的策略为下一次的执行策略
        random_policy = new_policy
    # 返回新的策略
    return new_policy


# 输出最优策略
print(policy_iteration(env))


In [None]:
from stable_baselines3 import DQN, SAC
import diffuser.environments
import gym
import numpy as np
dataset='SafeReacher-v0'
env = gym.make(dataset, mode='test')
model1_path = f"./dataset/{dataset}.agent" 
model = SAC.load(model1_path)

avg_ratio = []
total_step = 0
while total_step<10000:
    done = False
    timeouts = False
    obs = env.reset()
    t = 0
    total_reward = 0
    total_cost = 0
    discount_total_cost = 0
    while not done and not timeouts:
        action, _states = model.predict(obs, deterministic=False)   
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
        total_cost += info['cost']
        discount_total_cost += (0.99 ** t) * info['cost']

        t += 1
        total_step += 1
        #print(env.target_position, env.get_body_com("fingertip"))
        #print(info['cost'], t)
        
    print(total_reward, total_cost, discount_total_cost)
    avg_ratio.append(discount_total_cost/total_cost)
    #print((1-0.99**50)/(1-0.99)/50, discount_total_cost/total_cost)
    #print(total_cost*(1-0.99**50)/(1-0.99)/50, )
print(np.mean(avg_ratio))

In [1]:
import diffuser.environments
import gym
env=gym.make('ocpm-v0')
print(env._max_episode_steps)
dataset=env.get_dataset()
for d in dataset:
    #d = np.array(d)
    print(d, dataset[d].shape)



  from .autonotebook import tqdm as notebook_tqdm


1000000.0
actions (1163441, 20)
observations (1163441, 33)
rewards (1163441,)
terminals (1163441,)
costs (1163441,)


In [48]:
import einops
import torch 
x=torch.tensor([[1,2], [3,4]])
#y=einops.repeat(x, 'd w -> (repeat d) w', repeat=3)
y=torch.tensor([[1,1], [2,2], [3,3], [4,4], [5,5], [6,6]])
z=torch.tensor([1,2,3,4,5,6])
z=z.reshape(2, 3)
y=y.reshape(2, 3, -1)
print(y)
ind=torch.argmax(z, 1, keepdim=True)
print(ind.unsqueeze(-1).repeat(2,1,2))
print(y.shape, ind.unsqueeze(-1).repeat(1,1,2).shape)
y=torch.gather(y, 1 , ind.unsqueeze(-1).repeat(1,1,2)).squeeze(1)
z=torch.gather(z, 1, ind).squeeze(1)
print(y, z)

tensor([[[1, 1],
         [2, 2],
         [3, 3]],

        [[4, 4],
         [5, 5],
         [6, 6]]])
tensor([[[2, 2]],

        [[2, 2]],

        [[2, 2]],

        [[2, 2]]])
torch.Size([2, 3, 2]) torch.Size([2, 1, 2])
tensor([[3, 3],
        [6, 6]]) tensor([3, 6])


In [53]:
dataset = torch.load(f'./dataset/ocpm_train.pkl')
print(len(dataset['actions']))

1163441


In [54]:
import gym
import numpy as np
import torch
import pickle
dataset_name = f'./dataset/ocpm_train.pickle'
#dataset = torch.load(dataset_name)
with open(dataset_name, 'rb') as handle:
    dataset = pickle.load(handle)
print(dataset.keys())

traj_len = []
p = 0
for i in range(len(dataset['is_terminal'])):
    if dataset['is_terminal'][i]:
        traj_len.append(i-p)
        p=i
print(len(dataset['is_terminal']), len(traj_len))
print(traj_len)

print(np.min(dataset['action']), np.max(traj_len), np.mean(traj_len))
print(np.sum(dataset['residual_constraint_v1']<100))
print(dataset['residual_constraint_v1'][1000:1100])
print(dataset['is_terminal'][:30])
#print(np.sum(dataset['timeouts']))
dataset1 = {}
dataset1['actions'] = np.eye(20)[dataset.pop('action')]
dataset1['observations'] = dataset.pop('state')
dataset1['rewards'] = dataset.pop('reward')
dataset1['terminals'] = dataset.pop('is_terminal')
dataset1['costs'] = dataset.pop('cost')

for d in dataset1:
    #d = np.array(d)
    print(d, dataset1[d].shape)
    #print(process_data[:3])
#torch.save(dataset1, "./dataset/ocpm_train.pkl")
'''
dataset：字典类型
dataset['actions']  : [数据集样本数 x 动作维度]
dataset['observations']  : [数据集样本数 x 观测维度]
dataset['rewards']  : [数据集样本数, ]
dataset['terminals']  : [数据集样本数, ] 
dataset['timeouts']  : （表示是否因为达到最大轨迹长度而终止） [数据集样本数, ]

#不太重要的备注：所有[数据集样本数]上相邻的样本他们生成的顺序也是相邻的，
# dataset['observations'][100]和dataset['observations'][101]要么前者是一条轨迹的终止状态，要么这两个状态分布代表s_t,s_{t+1}
'''  

dict_keys(['dt', 'pvid', 'poi_id', 'state', 'next_state', 'action', 'reward', 'cost', 'residual_constraint_v1', 'is_terminal'])
1196928 196812
[4, 3, 8, 4, 5, 24, 3, 12, 26, 9, 1, 13, 1, 10, 37, 6, 4, 9, 22, 1, 47, 10, 22, 5, 11, 7, 18, 3, 5, 6, 14, 28, 20, 18, 22, 8, 9, 15, 5, 25, 8, 1, 16, 4, 2, 3, 51, 2, 2, 1, 22, 1, 15, 6, 3, 30, 48, 15, 15, 5, 36, 5, 7, 5, 16, 36, 1, 42, 3, 3, 1, 3, 3, 3, 4, 1, 3, 3, 2, 2, 1, 4, 8, 2, 2, 5, 1, 4, 1, 3, 7, 7, 2, 1, 2, 1, 2, 1, 7, 2, 4, 1, 2, 1, 1, 7, 1, 11, 3, 4, 2, 1, 1, 1, 1, 3, 2, 1, 2, 6, 4, 1, 2, 2, 2, 4, 7, 3, 2, 1, 3, 6, 3, 7, 1, 7, 1, 1, 5, 3, 2, 3, 4, 6, 1, 2, 3, 7, 1, 1, 2, 2, 3, 2, 1, 5, 3, 5, 1, 1, 1, 3, 2, 2, 1, 2, 2, 1, 1, 2, 3, 1, 1, 5, 1, 3, 2, 5, 12, 11, 6, 6, 1, 4, 6, 3, 1, 7, 1, 3, 7, 1, 1, 2, 1, 6, 4, 3, 1, 1, 2, 1, 1, 3, 4, 1, 1, 1, 6, 1, 3, 1, 1, 1, 9, 2, 1, 3, 4, 2, 1, 5, 3, 4, 4, 3, 1, 2, 8, 2, 7, 4, 3, 2, 4, 1, 1, 1, 6, 1, 1, 3, 4, 1, 12, 1, 2, 3, 1, 10, 7, 2, 4, 1, 6, 3, 6, 4, 1, 3, 4, 1, 3, 1, 2, 8, 9, 2, 3, 7, 2, 3, 4, 3

"\ndataset：字典类型\ndataset['actions']  : [数据集样本数 x 动作维度]\ndataset['observations']  : [数据集样本数 x 观测维度]\ndataset['rewards']  : [数据集样本数, ]\ndataset['terminals']  : [数据集样本数, ] \ndataset['timeouts']  : （表示是否因为达到最大轨迹长度而终止） [数据集样本数, ]\n\n#不太重要的备注：所有[数据集样本数]上相邻的样本他们生成的顺序也是相邻的，\n# dataset['observations'][100]和dataset['observations'][101]要么前者是一条轨迹的终止状态，要么这两个状态分布代表s_t,s_{t+1}\n"

In [None]:
import gym
import numpy as np
import torch
import pickle
import d4rl
env = gym.make('walker2d-medium-v2')
dataset = env.get_dataset()

print(np.sum(dataset['terminals']), np.sum(dataset['terminals'])+ np.sum(dataset['timeouts']),np.sum(np.logical_or(dataset['terminals'], dataset['timeouts'])))
print(np.sum(np.logical_and(dataset['terminals'], dataset['timeouts']))) 
assert 0

traj_len = []
p = 0
for i in range(len(dataset['terminals'])):
    if dataset['terminals'][i]:
        traj_len.append(i-p)
        p=i
#print(len(traj_len))
print(traj_len)
#print(np.sum(dataset['timeouts']))
# dataset['actions'] = dataset.pop('action')
# dataset['observations'] = dataset.pop('state')
# dataset['rewards'] = dataset.pop('reward')
# dataset['terminals'] = dataset.pop('is_terminal')
# dataset['rewards'] = dataset.pop('reward')

for d in dataset:
    #d = np.array(d)
    print(d, dataset[d].shape)
    #print(process_data[:3])
'''
dataset：字典类型
dataset['actions']  : [数据集样本数 x 动作维度]
dataset['observations']  : [数据集样本数 x 观测维度]
dataset['rewards']  : [数据集样本数, ]
dataset['terminals']  : [数据集样本数, ] 
dataset['timeouts']  : （表示是否因为达到最大轨迹长度而终止） [数据集样本数, ]

#不太重要的备注：所有[数据集样本数]上相邻的样本他们生成的顺序也是相邻的，
# dataset['observations'][100]和dataset['observations'][101]要么前者是一条轨迹的终止状态，要么这两个状态分布代表s_t,s_{t+1}
'''  

In [None]:
for l in range(25,300):
    import numpy as np
    action = d['actions'][l]
    observation = d['observations'][l]
    dire = ['U', 'R', 'D', 'L']
    print("action:   ", dire[np.argmax(action)], np.max(action), d['rewards'][l])
    o = np.zeros((4,12))
    pos = np.where(observation!=0)[0]
    o[ np.unravel_index(pos, o.shape)] = 1
    print(o)
