In [1]:
from UR10 import UR10, visualize
import numpy as np

from stable_baselines3 import PPO, DDPG
from gym.wrappers import FlattenObservation
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import StopTrainingOnNoModelImprovement, EvalCallback
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import torch

pybullet build time: May  2 2023 06:19:30


In [2]:
def linear_schedule(initial_value: float):
    """Build a schedule that scales ``initial_value`` linearly with remaining progress.

    The returned callable multiplies its argument by ``initial_value``. It is
    meant to be passed where a learning-rate schedule is accepted (e.g.
    ``learning_rate=linear_schedule(0.0003)``).

    :param initial_value: value returned when ``progress_remaining`` is 1.
    :return: function mapping ``progress_remaining`` to the scaled value.
    """

    def scaled_value(progress_remaining: float) -> float:
        """Return ``initial_value`` scaled by the remaining-progress fraction."""
        # presumably progress_remaining goes from 1 (start) to 0 (end) — confirm against the caller
        fraction_left = progress_remaining
        return fraction_left * initial_value

    return scaled_value

In [None]:
# --- Configuration -----------------------------------------------------------
# Environment options forwarded to UR10 for both the training and evaluation envs.
angle_control = True
complex_obs_space = True
force = 0.1

# Which algorithm to train: "DDPG" or "PPO". Only the selected model is built;
# previously a PPO model was constructed and then immediately overwritten by DDPG.
ALGORITHM = "DDPG"
TOTAL_TIMESTEPS = 2_000_000
# NOTE: the directory spelling is kept as-is so new runs land next to the existing logs.
TENSORBOARD_LOG_DIR = './tesnsorboard_log/'

# --- Environments ------------------------------------------------------------
# Keyword arguments shared by the training and evaluation environments.
shared_env_kwargs = dict(
    is_train=True, is_fixed=False, angle_control=angle_control, force=force,
    complex_obs_space=complex_obs_space, pos_range=0.5,
)

# Training env: dense reward, simple (non-complex) reward.
env = FlattenObservation(UR10(is_dense=True, complex_reward=False, **shared_env_kwargs))
# TODO (original note: "Normalize"): presumably observation/reward normalization
# was planned here; it is not implemented.

# Evaluation env: sparse reward (is_dense=False); complex_reward is left at UR10's default.
# NOTE(review): is_train=True is also used for evaluation — confirm this is intended.
eval_env = FlattenObservation(UR10(is_dense=False, **shared_env_kwargs))

# --- Callbacks ---------------------------------------------------------------
# Stop once 50 consecutive evaluations bring no new best model, but only after
# at least 200 evaluations have been run.
stop_train_callback = StopTrainingOnNoModelImprovement(max_no_improvement_evals=50, min_evals=200, verbose=1)
# Evaluate for 20 episodes every 20000 steps and keep the best model on disk.
eval_callback = EvalCallback(eval_env, best_model_save_path='./models/bestmodels/',
                             n_eval_episodes=20, eval_freq=20000, callback_after_eval=stop_train_callback,
                             deterministic=True, render=False, verbose=0)

# Disabled experiment: parallel environments via SubprocVecEnv.
# vec_env = SubprocVecEnv(
#     [lambda: FlattenObservation(UR10(is_train=True, is_fixed=False, angle_control=True, is_dense=True, force=0.1))] * 6
# )

# --- Model -------------------------------------------------------------------
# PPO network: ReLU activations, 256x256 for policy and value nets
# (original note: the PPO default width is 64). Only used when ALGORITHM == "PPO".
policy_kwargs = dict(activation_fn=torch.nn.ReLU, net_arch=dict(pi=[256, 256], vf=[256, 256]))

# Exploration noise for DDPG: one Ornstein-Uhlenbeck process per action dimension.
n_actions = env.action_space.shape[-1]
# Alternative that was tried:
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.005 * np.ones(n_actions), dtype=np.float64)
action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions, dtype=np.float32),
    sigma=0.005 * np.ones(n_actions, dtype=np.float32),
)

if ALGORITHM == "PPO":
    model = PPO(
        "MlpPolicy", env, verbose=0, tensorboard_log=TENSORBOARD_LOG_DIR,
        policy_kwargs=policy_kwargs,
        learning_rate=0.0003,  # alternative: learning_rate=linear_schedule(0.0003)
    )
elif ALGORITHM == "DDPG":
    model = DDPG("MlpPolicy", env, learning_rate=1e-4, batch_size=32,
                 action_noise=action_noise, tau=0.001, verbose=1, tensorboard_log=TENSORBOARD_LOG_DIR)
else:
    raise ValueError(f"Unknown ALGORITHM {ALGORITHM!r}; expected 'DDPG' or 'PPO'")

# --- Training ----------------------------------------------------------------
model.learn(total_timesteps=TOTAL_TIMESTEPS, progress_bar=True, callback=eval_callback)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


2023-06-20 21:53:32.209512: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Logging to ./tesnsorboard_log/DDPG_6


Output()

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -430     |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 133      |
|    time_elapsed    | 15       |
|    total_timesteps | 2000     |
| train/             |          |
|    actor_loss      | 1.59     |
|    critic_loss     | 0.00223  |
|    learning_rate   | 0.0001   |
|    n_updates       | 1500     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -458     |
|    success_rate    | 0        |
| time/              |          |
|    episodes        | 8        |
|    fps             | 128      |
|    time_elapsed    | 31       |
|    total_timesteps | 4000     |
| train/             |          |
|    actor_loss      | 3.08     |
|    critic_loss     | 0.00838  |
|    learning_

---------------------------------
| eval/              |          |
|    mean_ep_length  | 472      |
|    mean_reward     | -472     |
|    success_rate    | 0.1      |
| time/              |          |
|    total_timesteps | 20000    |
| train/             |          |
|    actor_loss      | 9.29     |
|    critic_loss     | 0.128    |
|    learning_rate   | 0.0001   |
|    n_updates       | 19996    |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 500      |
|    ep_rew_mean     | -342     |
|    success_rate    | 0.0227   |
| time/              |          |
|    episodes        | 44       |
|    fps             | 114      |
|    time_elapsed    | 192      |
|    total_timesteps | 21996    |
| train/             |          |
|    actor_loss      | 9.8      |
|    critic_loss     | 0.149    |
|    learning_rate   | 0.0001   |
|    n_updates       | 21496    |
---------------------------------
--------------

In [5]:
# Write the trained agent held in `model` to disk.
checkpoint_path = "models/ddpg_ur10_1m.model"
model.save(checkpoint_path)