In [3]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import gym

Source: https://github-wiki-see.page/m/openai/gym/wiki/BipedalWalker-v2
    
$\textbf{Description}$

Get a 2D biped walker to walk through rough terrain.

$\textbf{Environment : Observation}$
Type: Box(24)

| Num | Observation | Min | Max | Mean |
|---|---|---|---|---|
| 0 | hull_angle | 0 | 2*pi | 0.5 |
| 1 | hull_angularVelocity | -inf | +inf | - |
| 2 | vel_x | -1 | +1 | - |
| 3 | vel_y | -1 | +1 | - |
| 4 | hip_joint_1_angle | -inf | +inf | - |
| 5 | hip_joint_1_speed | -inf | +inf | - |
| 6 | knee_joint_1_angle | -inf | +inf | - |
| 7 | knee_joint_1_speed | -inf | +inf | - |
| 8 | leg_1_ground_contact_flag | 0 | 1 | - |
| 9 | hip_joint_2_angle | -inf | +inf | - |
| 10 | hip_joint_2_speed | -inf | +inf | - |
| 11 | knee_joint_2_angle | -inf | +inf | - |
| 12 | knee_joint_2_speed | -inf | +inf | - |
| 13 | leg_2_ground_contact_flag | 0 | 1 | - |
| 14-23 | 10 lidar readings | -inf | +inf | - |

$\textbf{Actions}$
Type: Box(4) - Torque control(default) / Velocity control - Change inside /envs/box2d/bipedal_walker.py line 363

| Num | Name | Min | Max |
|---|---|---|---|
| 0 | Hip_1 (Torque / Velocity) | -1 | +1 |
| 1 | Knee_1 (Torque / Velocity) | -1 | +1 |
| 2 | Hip_2 (Torque / Velocity) | -1 | +1 |
| 3 | Knee_2 (Torque / Velocity) | -1 | +1 |

$\textbf{Reward}$

Reward is given for moving forward, totaling 300+ points up to the far end. If the robot falls, it gets -100. Applying motor torque costs a small number of points, so a more efficient agent will get a better score. The state consists of the hull angle, angular velocity, horizontal speed, vertical speed, joint positions and joint angular speeds, leg contact with the ground, and 10 lidar rangefinder measurements. There are no coordinates in the state vector.

$\textbf{Starting State}$

A random upright position with mostly straight legs.

$\textbf{Episode Termination}$

The episode ends when the robot's body touches the ground or the robot reaches the far right side of the environment.

In [19]:
# Smoke-test the environment: take 100 steps with randomly sampled actions and
# print the reward returned by each step.
# NOTE(review): the single-value reset() and 4-value step() unpacking below match
# the older gym API; newer gym releases may return different tuples -- confirm
# against the installed version.
env = gym.make("BipedalWalker-v3")
try:
    observation = env.reset()
    for _ in range(100):
        env.render()
        # Placeholder policy: replace with the agent's action once one exists.
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print(reward)
        if done:
            # The episode finished early; start a new one so the remaining
            # iterations keep stepping a live episode.
            observation = env.reset()
finally:
    # Close the environment even if render()/step() raises, so it is not left
    # open in the kernel after a failed run.
    env.close()

-0.02600405580301961
-0.07881398305420957
-0.10048904217158992
-0.0329431512852498
-0.12203473487247904
-0.08520764390131075
0.0619033031380204
-0.15761265008807457
-0.10146729200084886
-0.1177071178058795
-0.06753422384460649
-0.13716558068990706
-0.1612932060956955
-0.10948161645730219
-0.05455836097399512
-0.09624982913335046
-0.11202556393792469
-0.06981079212824268
-0.16314097466071686
-0.1750605053181426
-0.2567293731371574
-0.17012341032425682
-0.27234428238868713
-0.320044668463368
-0.2983010286688805
-0.3272242262810469
-0.26359097677469256
-0.19429904116193572
-0.15472890096902847
-0.1115510514577218
-0.13464818148314955
-0.12716487622261047
-0.0917263320287057
-0.08038823083819437
-0.06705010104179382
-0.047345322052635364
-0.0955392576853446
-0.18988597696026047
-0.23680734825134278
-0.21041946915164592
-0.12724443740646163
-0.2406802662710361
-0.28305651448170105
-0.20346702222526075
-0.24447251375516493
-0.38428648924827574
-0.44001552170515057
-0.4218959700862543
-0.4270

In [6]:
# Display the action and observation spaces of the `env` created in the rollout
# cell; the bare tuple is rendered as this cell's output.
# NOTE(review): execution counts are out of order (this cell is In [6], the rollout
# cell is In [19]) -- re-run top to bottom to confirm the notebook still works.
env.action_space, env.observation_space

(Box(-1.0, 1.0, (4,), float32), Box(-inf, inf, (24,), float32))

In [10]:
# Display the last observation and the last sampled action left over from the
# rollout loop.
# NOTE(review): depends on `observation` and `action` assigned in the rollout cell;
# this cell raises NameError on a fresh kernel unless that cell has been run first.
observation, action

(array([-0.04242516,  0.02229533,  0.06530807, -0.04109291, -0.11813273,
        -0.05345169,  0.78499204, -1.        ,  0.        ,  0.23785482,
         0.21821049,  0.40128577, -1.        ,  0.        ,  0.47013718,
         0.47547629,  0.49211684,  0.52211529,  0.56963158,  0.64253706,
         0.75632179,  0.94486445,  1.        ,  1.        ]),
 array([ 0.3334087 , -0.41505602,  0.16125764, -0.4089472 ], dtype=float32))