In [1]:
import torch as th
from typing import Tuple

from stable_baselines3 import PPO
from stable_baselines3.common.policies import BasePolicy

import onnx
import onnxruntime as ort
import numpy as np
import matplotlib.pyplot as plt

import os, sys
sys.path.append(os.path.abspath('./env'))
sys.path.append(os.path.abspath('./common'))

from env.env_move_sector_v3  import HumanMoveSectorActionV3
from env.env_move_ray_v3  import HumanMoveRayActionV3

In [2]:
# Sector-action environment with a randomised target point and
# `object_ignore` switched on.
# NOTE(review): `env` is rebound in the rollout cell further down without this
# instance being closed first -- confirm this cell is still needed.
env = HumanMoveSectorActionV3(
    target_point_rand=True,
    object_ignore=True,
)

In [2]:

class OnnxableSB3Policy(th.nn.Module):
    """Wrap a Stable-Baselines3 policy so it can be handed to ``torch.onnx.export``.

    The wrapper only forwards the call to the wrapped policy, always with
    ``deterministic=True``.
    """

    def __init__(self, policy: BasePolicy):
        """Keep ``policy`` as the ``policy`` attribute of this module.

        Args:
            policy: The policy whose forward pass should be exported.
        """
        super().__init__()
        self.policy = policy

    def forward(self, observation: th.Tensor) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
        """Evaluate the wrapped policy deterministically.

        Preprocessing is part of the exported graph, but postprocessing
        (clipping / unscaling of actions) is not. Image observations, if any,
        must already be transposed to channel-first. Switch to
        ``deterministic=False`` to export the stochastic policy instead.

        Args:
            observation: Input tensor passed to the policy unchanged.

        Returns:
            Whatever the wrapped policy returns; for PPO this is
            ``(actions, values, log_prob)``.
        """
        policy_outputs = self.policy(observation, deterministic=True)
        return policy_outputs



In [3]:


# Restore the trained PPO agent from its checkpoint, placing it on the CPU.
model = PPO.load(
    "env_Wall_MoveRay3_RanTP_exp_0112_115555/model",
    device="cpu",
)


In [4]:

# Wrap the loaded policy in the export-friendly module.
onnx_policy = OnnxableSB3Policy(model.policy)

# Random batch of one sample shaped like the model's observation space; it is
# the example input handed to the ONNX exporter.
observation_size = model.observation_space.shape
dummy_input = th.randn((1,) + tuple(observation_size))


In [2]:
# Destination of the exported ONNX file; the export, checker and rollout
# cells all read this path.
onnx_path = "teached/move_ray_v3_ppo.onnx"



In [6]:

# Create the output folder up front so the export does not fail on a checkout
# where it does not exist yet. A bare file name has no folder part to create.
onnx_dir = os.path.dirname(onnx_path)
if onnx_dir:
    os.makedirs(onnx_dir, exist_ok=True)

# Trace the wrapped policy with the dummy batch and write it to `onnx_path`.
# The single graph input is named "input"; the rollout cell feeds it by that name.
th.onnx.export(
    onnx_policy,
    dummy_input,
    onnx_path,
    opset_version=17,
    input_names=["input"],
)

In [8]:
# Load the exported file back from disk and run the ONNX checker on it.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

In [None]:
from env_move_wall import HumanMoveRayAroundWallActionV3
from pytz import timezone
from datetime import datetime

TZ = timezone('Europe/Moscow')

# Seed derived from the current Moscow wall-clock time (HHMMSS); printed so a
# particular run can be reproduced afterwards.
seed = int( datetime.now(TZ).strftime("%H%M%S") )
print(seed)

# Alternative environment kept for reference:
#env = HumanMoveSectorActionV3(render_mode = 'human', target_point_rand=False, object_ignore=True, seed=seed)
env = HumanMoveRayAroundWallActionV3(render_mode = 'human',
                                target_point_rand = True,
                                object_locate = 'build',
                                wall_border=0,
                                tree_count=0,
                                line_angle=-90,
                                seed=seed,
                           )

# Per-step traces filled in by the rollout below and consumed by the plot cells.
total_reward = 0.
step_reward = []
angle_step_reward = []
speed_step_reward = []
view_step_reward = []
stoper_step_reward = []
obstacle_reward_stop = []
obstacle_reward_move = []
a_speed_x = []
a_speed_y = []
a_speed_a = []

observation, info = env.reset(seed=seed)

# Build the inference session once. It does not depend on the loop state, so
# re-creating it on every tick only reloads the same model file again.
ort_sess = ort.InferenceSession(onnx_path)

for tick in range(1800):
    # Feed a real float32 array with a leading batch axis. The graph was traced
    # with a float32 dummy input, so a float64 observation would be rejected.
    obs_batch = np.asarray(observation, dtype=np.float32)[np.newaxis, ...]
    actions, values, log_prob = ort_sess.run(None, {"input": obs_batch})
    action = actions[0]  # drop the batch axis again

    a_speed_x.append(action[0])
    a_speed_y.append(action[1])
    a_speed_a.append(action[2])

    observation, reward, terminated, truncated, info = env.step(action)

    # The step reward is recorded for every step, including the final one.
    step_reward.append(reward)
    total_reward += reward

    if terminated or truncated:
        observation, info = env.reset()
        print('BREAK',terminated, truncated)
        break

    # Reward components are only collected for non-terminal steps, so these
    # lists can be one entry shorter than `step_reward`.
    rews = env.get_rewards()
    angle_step_reward.append(rews['angle_reward'])
    speed_step_reward.append(rews['speed_reward'])
    view_step_reward.append(rews['view_reward'] )
    stoper_step_reward.append(rews['stoped_reward'])
    obstacle_reward_stop.append(rews['object_stop'])
    obstacle_reward_move.append(rews['object_move'])

print(len(step_reward))
print(total_reward)



In [None]:
# Reward components per step, drawn in a fixed order so the legend entries
# below line up with the curves. The total step reward (`step_reward`) is
# deliberately left out of this figure.
reward_curves = (
    (angle_step_reward, 'b'),
    (speed_step_reward, 'g'),
    (view_step_reward, 'pink'),
    (stoper_step_reward, 'yellow'),
    (obstacle_reward_stop, 'black'),
    (obstacle_reward_move, 'r'),
)
for curve, curve_color in reward_curves:
    plt.plot(curve, color=curve_color)

reward_labels = ['angle', 'speed', 'view', 'stoper', 'obstacle_stop', 'obstacle_free']
plt.legend(reward_labels)
plt.title('Вознаграждения за шаг')
plt.xlabel('Шаг')
plt.ylabel('Вознаграждения')
plt.show()

In [None]:
# The three action components recorded during the rollout, one curve each.
action_curves = (
    (a_speed_x, 'b'),
    (a_speed_y, 'g'),
    (a_speed_a, 'pink'),
)
for curve, curve_color in action_curves:
    plt.plot(curve, color=curve_color)

action_labels = ['speed_x', 'speed_y', 'angle']
plt.legend(action_labels)
plt.title('Управляющие действия')
plt.xlabel('Шаг')
plt.ylabel('Действие')
plt.show()

In [None]:


# Check that the predictions are the same: evaluate the PyTorch policy and the
# exported ONNX graph on the same observation and report how far they differ.
# `observation` comes from the environment as a single unbatched sample, so a
# leading batch axis is added before it is fed to either model.
obs_batch = np.asarray(observation, dtype=np.float32)[np.newaxis, ...]

with th.no_grad():
    torch_outputs = model.policy(th.as_tensor(obs_batch), deterministic=True)
print(torch_outputs)

onnx_outputs = ort.InferenceSession(onnx_path).run(None, {"input": obs_batch})
output_names = ("actions", "values", "log_prob")
for out_name, torch_out, onnx_out in zip(output_names, torch_outputs, onnx_outputs):
    max_diff = np.abs(torch_out.cpu().numpy() - onnx_out).max()
    print(out_name, "max abs diff:", max_diff)

In [9]:
# Close the environment once the rollout and the checks are finished.
env.close()