# ROBOTIC

Using stable_baselines3 for robotic use case


### pip install

Installing gym and panda-gym.

In [1]:
!pip install stable-baselines3
!pip install panda-gym
!pip install tensorboard

Collecting stable-baselines3
  Using cached stable_baselines3-1.5.0-py3-none-any.whl (177 kB)
Collecting numpy
  Using cached numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl (17.6 MB)
Collecting cloudpickle
  Using cached cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting gym==0.21
  Using cached gym-0.21.0-py3-none-any.whl
Collecting matplotlib
  Using cached matplotlib-3.5.2-cp38-cp38-macosx_10_9_x86_64.whl (7.3 MB)
Collecting pandas
  Using cached pandas-1.4.2-cp38-cp38-macosx_10_9_x86_64.whl (11.0 MB)
Collecting torch>=1.8.1
  Using cached torch-1.11.0-cp38-none-macosx_10_9_x86_64.whl (129.9 MB)
Collecting typing-extensions
  Using cached typing_extensions-4.2.0-py3-none-any.whl (24 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.33.3-py3-none-any.whl (930 kB)
Collecting kiwisolver>=1.0.1
  Using cached kiwisolver-1.4.2-cp38-cp38-macosx_10_9_x86_64.whl (65 kB)
Collecting pillow>=6.2.0
  Using cached Pillow-9.1.0-cp38-cp38-macosx_10_9_x86_64.whl (3.1 MB)
Collecting cyc

### import

Importing gym and panda-gym.

In [2]:
import gym
import panda_gym
import stable_baselines3
from stable_baselines3.common.logger import configure
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
from stable_baselines3 import HerReplayBuffer, DDPG

  if not hasattr(tensorboard, '__version__') or LooseVersion(tensorboard.__version__) < LooseVersion('1.15'):


## Testing the environnment

In [4]:
env = gym.make('PandaPush-v2', render=True)

obs = env.reset()
done = False
while not done:
    action = env.action_space.sample() # random action
    obs, reward, done, info = env.step(action)

env.close()

### Setting up model
Hyper-parameters from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/her.yml

In [14]:
env = gym.make("PandaPush-v2")
model = DDPG('MultiInputPolicy', 
             env, 
             replay_buffer_class=HerReplayBuffer, 
             replay_buffer_kwargs=dict(
                 n_sampled_goal=4,
                 goal_selection_strategy='future',
                 online_sampling=True,
             ), 
             buffer_size = 1000000, 
             tau = 0.05, 
             learning_rate = 1e-3, 
             verbose=1, 
             batch_size = 2048, 
             gamma = 0.95, 
             policy_kwargs = dict(
                 n_critics=2, 
                 net_arch=[512, 512, 512]
             ), 
             tensorboard_log="logs/tensorboard/")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### Training the model

In [15]:
checkpoint_callback = CheckpointCallback(save_freq=1000, 
                                         save_path='.', 
                                         name_prefix='PandaPush-v1')


In [16]:
eval_callback = EvalCallback(env, 
                             best_model_save_path='eval_save', 
                             eval_freq=1000)

In [17]:
callback_list = CallbackList([checkpoint_callback, eval_callback])

model.learn(total_timesteps=10000, 
            callback=callback_list, 
            log_interval=1000, 
            tb_log_name='logs_robotics_PandaPush')

Logging to logs/tensorboard/logs_robotics_PandaPush_1
Eval num_timesteps=1000, episode_reward=-40.00 +/- 20.00
Episode length: 50.00 +/- 0.00
Success rate: 20.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -40      |
|    success_rate    | 0.2      |
| time/              |          |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | 0.947    |
|    critic_loss     | 0.0543   |
|    learning_rate   | 0.001    |
|    n_updates       | 850      |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=-40.00 +/- 20.00
Episode length: 50.00 +/- 0.00
Success rate: 20.00%
---------------------------------
| eval/              |          |
|    mean_ep_length  | 50       |
|    mean_reward     | -40      |
|    success_rate    | 0.2      |
| time/              |          |
|    total_timesteps | 2000     |
| train/             |    

<stable_baselines3.ddpg.ddpg.DDPG at 0x7fc4093f9eb0>

### Saving and cleaning the environnment

In [18]:
model.save("PandaPush-v2-DDPG")

del model
del env

### Testing the environnment

In [19]:
env = gym.make("PandaPush-v2", render=True)
model = DDPG.load("PandaPush-v2-DDPG", env=env)
obs = env.reset()
dones = False

while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()

2022-05-04 09:36:30.424 Python[38148:1933253] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/4j/sm6ccfnj0xs0g2nm3mnz28540000gn/T/org.python.python.savedState


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


### installing sb3-contrib

In [None]:
!pip install sb3-contrib

### Testing community trained model

In [None]:
import panda_gym
from sb3_contrib import TQC
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.logger import configure

In [None]:
env = make_vec_env("PandaPush-v2", wrapper_class=TimeFeatureWrapper, env_kwargs={'render':True})
model = TQC.load("PandaPush-v1", custom_objects={'learning_rate':0.001}, env=env)
obs = env.reset()
dones = False
while not dones:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
env.close()