# ROBOTIC

Using Stable-Baselines3 (`stable_baselines3`) for a robotics use case.


### pip install

Installing gym, panda-gym, stable_baselines3 and tensorboard.

In [None]:
!pip install gym
!pip install panda-gym==2.0.1
!pip install stable_baselines3
!pip install tensorboard

### import

Importing gym, panda-gym and the stable_baselines3 components used below (logger, callbacks, DDPG and the HER replay buffer).

In [2]:
import gym
import panda_gym
import stable_baselines3
from stable_baselines3.common.logger import configure
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
from stable_baselines3 import HerReplayBuffer, DDPG

## Testing the environment

In [3]:
# Smoke test: open a rendered PandaPush-v2 environment and drive it with
# random actions until the environment reports the episode as done.
env = gym.make('PandaPush-v2', render=True)
obs = env.reset()

while True:
    # Sample from the action space; no policy is involved here.
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    if done:
        break

# Release the simulation once the episode is over.
env.close()

pybullet build time: Apr 26 2022 03:12:14


### Setting up model
Hyper-parameters from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/hyperparams/her.yml

In [None]:
# Training environment (headless). The id must be the `-v2` variant: the
# panda-gym release pinned in the install cell (2.0.1) registers `-v2` ids, and
# the rest of this notebook (smoke test, saved model name, evaluation) uses
# PandaPush-v2 as well.
env = gym.make("PandaPush-v2")

# DDPG agent with Hindsight Experience Replay. 'MultiInputPolicy' is used
# because the replay buffer is goal-based (HerReplayBuffer).
model = DDPG('MultiInputPolicy', 
             env, 
             replay_buffer_class=HerReplayBuffer, 
             # HER settings: relabelled goals per transition and how they are chosen.
             replay_buffer_kwargs=dict(
                 n_sampled_goal=4,
                 goal_selection_strategy='future',
                 online_sampling=True,
             ), 
             buffer_size = 1000000, 
             tau = 0.05, 
             learning_rate = 1e-3, 
             verbose=1, 
             batch_size = 2048, 
             gamma = 0.95, 
             # Network layout: two critics, three hidden layers of 512 units.
             policy_kwargs = dict(
                 n_critics=2, 
                 net_arch=[512, 512, 512]
             ), 
             # TensorBoard event files are written under this directory.
             tensorboard_log="logs/tensorboard/")

### Training the model

In [None]:
# Periodically snapshot the model during training: a checkpoint is written to
# the current directory every 10,000 calls, using the given file-name prefix.
checkpoint_callback = CheckpointCallback(
    save_freq=10_000,
    save_path='.',
    name_prefix='PandaPush-v1',
)


In [None]:
# Evaluate on a dedicated environment instead of the training `env`.
# Sharing one environment between training and evaluation means every
# evaluation run resets and steps the environment in the middle of a training
# episode, which corrupts the transitions (and HER episodes) being collected.
eval_env = gym.make("PandaPush-v2")

# Every 500 calls the current policy is evaluated on `eval_env`; the best
# model seen so far is stored under 'eval_save'.
eval_callback = EvalCallback(eval_env, 
                             best_model_save_path='eval_save', 
                             eval_freq=500)

In [None]:
# Chain the checkpoint and evaluation callbacks so both run during training.
callback_list = CallbackList([checkpoint_callback, eval_callback])

# Train for 10 million timesteps; TensorBoard runs are stored under the
# 'logs_robotics_PandaPush' run name.
model.learn(
    total_timesteps=int(1e7),
    callback=callback_list,
    log_interval=100,
    tb_log_name='logs_robotics_PandaPush',
)

### Saving and cleaning the environment

In [None]:
# Persist the trained agent; it is reloaded from this file in the test cell below.
model.save("PandaPush-v2-DDPG")

# Close the environment explicitly before dropping the names. `del env` alone
# only removes this reference (the callbacks still hold the environment), so
# the simulation would otherwise stay connected.
env.close()

del model
del env

### Testing the trained model in the environment

In [5]:
# Reload the saved DDPG agent and watch it act in a rendered PandaPush-v2
# environment until the environment reports the episode as done.
env = gym.make("PandaPush-v2", render=True)
model = DDPG.load("PandaPush-v2-DDPG", env=env)

obs = env.reset()
while True:
    # Query the loaded policy for the next action given the latest observation.
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
    if dones:
        break

# Release the simulation / viewer.
env.close()

argv[0]=--background_color_red=0.8745098039215686
argv[1]=
argv[2]=
argv[3]=
argv[4]=
argv[5]=
argv[6]=
argv[7]=
argv[8]=
argv[9]=
argv[10]=
argv[11]=
argv[12]=
argv[13]=
argv[14]=
argv[15]=
argv[16]=
argv[17]=
argv[18]=
argv[19]=
argv[20]=
argv[21]=--background_color_green=0.21176470588235294
argv[22]=
argv[23]=
argv[24]=
argv[25]=
argv[26]=
argv[27]=
argv[28]=
argv[29]=
argv[30]=
argv[31]=
argv[32]=
argv[33]=
argv[34]=
argv[35]=
argv[36]=
argv[37]=
argv[38]=
argv[39]=
argv[40]=
argv[41]=
argv[42]=--background_color_blue=0.17647058823529413
startThreads creating 1 threads.
starting thread 0
started thread 0 
argc=45
argv[0] = --unused
argv[1] = --background_color_red=0.8745098039215686
argv[2] = 
argv[3] = 
argv[4] = 
argv[5] = 
argv[6] = 
argv[7] = 
argv[8] = 
argv[9] = 
argv[10] = 
argv[11] = 
argv[12] = 
argv[13] = 
argv[14] = 
argv[15] = 
argv[16] = 
argv[17] = 
argv[18] = 
argv[19] = 
argv[20] = 
argv[21] = 
argv[22] = --background_color_green=0.21176470588235294
argv[23] = 
argv

### installing sb3-contrib

In [None]:
!pip install sb3-contrib

### Testing community trained model

In [7]:
import panda_gym
from sb3_contrib import TQC
from stable_baselines3.common.env_util import make_vec_env
from sb3_contrib.common.wrappers import TimeFeatureWrapper
from stable_baselines3.common.logger import configure

In [10]:
# Rendered, vectorized PandaPush-v2 environment (make_vec_env creates a single
# environment by default), wrapped with TimeFeatureWrapper.
env = make_vec_env("PandaPush-v2", wrapper_class=TimeFeatureWrapper, env_kwargs={'render':True})

# Load the pre-trained TQC agent from the "PandaPush-v1" file. The stored
# learning rate is overridden through custom_objects.
# NOTE(review): presumably this avoids deserialising the saved schedule — confirm.
model = TQC.load("PandaPush-v1", custom_objects={'learning_rate':0.001}, env=env)

obs = env.reset()
done = False
while not done:
    # TQC has a stochastic policy: request the deterministic action when
    # evaluating so the rollout reflects the learned behaviour, not sampling noise.
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, info = env.step(action)
    env.render()
    # A vectorized env returns one done flag per sub-environment; there is a
    # single sub-environment here, so read its flag explicitly instead of
    # relying on the truth value of a length-1 array.
    done = bool(dones[0])
env.close()

argc=45
argv[0] = --unused
argv[1] = --background_color_red=0.8745098039215686
argv[2] = 
argv[3] = 
argv[4] = 
argv[5] = 
argv[6] = 
argv[7] = 
argv[8] = 
argv[9] = 
argv[10] = 
argv[11] = 
argv[12] = 
argv[13] = 
argv[14] = 
argv[15] = 
argv[16] = 
argv[17] = 
argv[18] = 
argv[19] = 
argv[20] = 
argv[21] = 
argv[22] = --background_color_green=0.21176470588235294
argv[23] = 
argv[24] = 
argv[25] = 
argv[26] = 
argv[27] = 
argv[28] = 
argv[29] = 
argv[30] = 
argv[31] = 
argv[32] = 
argv[33] = 
argv[34] = 
argv[35] = 
argv[36] = 
argv[37] = 
argv[38] = 
argv[39] = 
argv[40] = 
argv[41] = 
argv[42] = 
argv[43] = --background_color_blue=0.17647058823529413
argv[44] = --start_demo_name=Physics Server
ExampleBrowserThreadFunc started
X11 functions dynamically loaded using dlopen/dlsym OK!
X11 functions dynamically loaded using dlopen/dlsym OK!
Creating context
Created GL 3.3 context
Direct GLX rendering context obtained
Making context current
GL_VENDOR=Intel
GL_RENDERER=Mesa Intel(R) HD Gra