Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vss_gym_3v3] implement OU noise for non-controlled agents #18

Merged
merged 1 commit into from
Feb 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions envs/rc_gym/Utils/Utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import numpy as np
import gym


# Math methods
Expand Down Expand Up @@ -68,3 +69,24 @@ def normVt(vt):

def roundTo5(x, base=5):
return int(base * round(float(x) / base))

# Based on the OpenAI baselines implementation of Ornstein-Uhlenbeck noise
class OrnsteinUhlenbeckAction(object):
    """Ornstein-Uhlenbeck process that generates temporally correlated
    samples inside a gym Box action space.

    mu is the center of the action space and sigma is half the distance
    from the center to the high bound, so samples tend to stay within
    the space without being clipped.
    """

    def __init__(self, action_space, theta=.17, dt=0.032, x0=None):
        # theta: mean-reversion rate; dt: integration step (env time step);
        # x0: optional initial state (defaults to zeros on reset()).
        self.theta = theta
        self.mu = (action_space.high + action_space.low) / 2
        self.sigma = (action_space.high - self.mu) / 2
        self.dt = dt
        self.x0 = x0
        self.reset()

    def sample(self):
        """Advance the process one step and return the new state."""
        x = self.x_prev \
            + self.theta * (self.mu - self.x_prev) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = x
        return x

    def reset(self):
        """Restart the process at x0 (or at zeros when x0 is None)."""
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

    def __repr__(self):
        # Fixed: previously reported the wrong class name
        # ('OrnsteinUhlenbeckActionNoise').
        return 'OrnsteinUhlenbeckAction(mu={}, sigma={})'.format(self.mu, self.sigma)
39 changes: 25 additions & 14 deletions envs/rc_gym/vss/env_3v3/vss_gym_3v3.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import random
from rc_gym.Utils.Utils import OrnsteinUhlenbeckAction
from typing import Dict

import gym
Expand Down Expand Up @@ -61,8 +62,8 @@ def __init__(self):
high_obs_bound = [1.2, 1.2, 1.25, 1.25]
high_obs_bound += [1.2, 1.2, 1, 1, 1.25, 1.25, 1.2]*3
high_obs_bound += [1.2, 1.2, 1.25, 1.25, 1.2]*3
low_obs_bound = np.array(low_obs_bound)
high_obs_bound = np.array(high_obs_bound)
low_obs_bound = np.array(low_obs_bound, dtype=np.float32)
high_obs_bound = np.array(high_obs_bound, dtype=np.float32)

self.action_space = gym.spaces.Box(low=-1, high=1,
shape=(2, ), dtype=np.float32)
Expand All @@ -76,11 +77,19 @@ def __init__(self):
self.reward_shaping_total = None
self.v_wheel_deadzone = 0.05

self.ou_actions = []
for i in range(self.n_robots_blue + self.n_robots_yellow):
self.ou_actions.append(
OrnsteinUhlenbeckAction(self.action_space, dt=self.time_step)
)

print('Environment initialized')

def reset(self):
    """Clear episode state and restart every OU noise process, then
    delegate to the base environment's reset."""
    self.reward_shaping_total = None
    self.actions = None
    for noise_process in self.ou_actions:
        noise_process.reset()

    return super().reset()

Expand Down Expand Up @@ -129,14 +138,14 @@ def _get_commands(self, actions):
v_wheel2=v_wheel2))

# Send random commands to the other robots
for i in range(1, 3):
actions = self.action_space.sample()
for i in range(1, self.n_robots_blue):
actions = self.ou_actions[i].sample()
self.actions[i] = actions
v_wheel1, v_wheel2 = self._actions_to_v_wheels(actions)
commands.append(Robot(yellow=False, id=i, v_wheel1=v_wheel1,
v_wheel2=v_wheel2))
for i in range(3):
actions = self.action_space.sample()
for i in range(self.n_robots_yellow):
actions = self.ou_actions[self.n_robots_blue+i].sample()
v_wheel1, v_wheel2 = self._actions_to_v_wheels(actions)
commands.append(Robot(yellow=True, id=i, v_wheel1=v_wheel1,
v_wheel2=v_wheel2))
Expand Down Expand Up @@ -286,47 +295,49 @@ def theta(): return random.uniform(-180, 180)

def same_position_ref(x, y, x_ref, y_ref, radius):
    # True when (x, y) lies inside the axis-aligned square of
    # half-side `radius` centered at (x_ref, y_ref), borders included.
    within_x = x_ref - radius <= x <= x_ref + radius
    within_y = y_ref - radius <= y <= y_ref + radius
    return within_x and within_y

radius_ball = 0.2
radius_robot = 0.2
same_pos = True


while same_pos:
for i in range(len(agents)):
same_pos = False
while same_position_ref(agents[i].x,agents[i].y,pos_frame.ball.x, pos_frame.ball.y, radius_ball):
while same_position_ref(agents[i].x, agents[i].y, pos_frame.ball.x, pos_frame.ball.y, radius_ball):
agents[i] = Robot(x=x(), y=y(), theta=theta())
same_pos = True
for j in range(i + 1, len(agents)):
while same_position_ref(agents[i].x, agents[i].y, agents[j].x, agents[j].y, radius_robot):
agents[i] = Robot(x=x(), y=y(), theta=theta())
same_pos = True

pos_frame.robots_blue[0] = agents[0]
pos_frame.robots_blue[1] = agents[1]
pos_frame.robots_blue[2] = agents[2]
pos_frame.robots_yellow[0] = agents[3]
pos_frame.robots_yellow[1] = agents[4]
pos_frame.robots_yellow[2] = agents[5]
pos_frame.robots_yellow[2] = agents[5]

return pos_frame

def _actions_to_v_wheels(self, actions):
    """Scale normalized actions to wheel speeds, clip them to the
    simulator's supported range and zero-out speeds in the deadzone.

    NOTE(review): the scraped diff showed both the new clip (to
    +-linear_speed_range) and the deleted old clip (to +-2.6); only the
    new clip is kept here, matching the merged PR.

    :param actions: sequence with two values in [-1, 1]
        (left and right wheel commands).
    :return: tuple (left_wheel_speed, right_wheel_speed).
    """
    left_wheel_speed = actions[0] * self.rsim.linear_speed_range
    right_wheel_speed = actions[1] * self.rsim.linear_speed_range

    # Keep both speeds inside the simulator's supported range.
    left_wheel_speed, right_wheel_speed = np.clip(
        (left_wheel_speed, right_wheel_speed),
        -self.rsim.linear_speed_range,
        self.rsim.linear_speed_range
    )

    # Deadzone: commands too small to move the robot become a full stop.
    if -self.v_wheel_deadzone < left_wheel_speed < self.v_wheel_deadzone:
        left_wheel_speed = 0

    if -self.v_wheel_deadzone < right_wheel_speed < self.v_wheel_deadzone:
        right_wheel_speed = 0

    return left_wheel_speed, right_wheel_speed
5 changes: 4 additions & 1 deletion envs/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
# #
# '''

from rc_gym.Utils.Utils import OrnsteinUhlenbeckAction
import gym
import rc_gym
import numpy as np
import time
import rc_gym.Utils

# Using penalty env
env = gym.make('VSS3v3-v0')
Expand Down Expand Up @@ -77,8 +79,9 @@ def print_with_description(state):
# print_with_description(next_state)
epi_rew = 0.
while not done:
action = np.array([0.1, 0.])
action = np.array([0., 0.])
# action = env.action_space.sample()
# time.sleep(0.1)
next_state, reward, done, _ = env.step(action)
epi_rew += reward
# print_with_description(next_state)
Expand Down