In [1]:
import os
# restart notebook
#os.kill(os.getpid(), 9)

In [2]:
# Bring up an off-screen (non-visible) virtual display for this session.
from pyvirtualdisplay import Display

virtual_display = Display(
    visible=0,
    size=(1400, 900),
)
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f60c871c190>

In [3]:
import gym

from huggingface_sb3 import load_from_hub, package_to_hub, push_to_hub
from huggingface_hub import notebook_login # To log to our Hugging Face account to be able to upload models to the Hub.

from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env

In [4]:
import gym

# First, we create our environment called LunarLander-v2
env = gym.make("LunarLander-v2")

# Then, we reset this environment to obtain the initial observation
observation = env.reset()

try:
    for _ in range(20):
        # Take a random action
        action = env.action_space.sample()
        print("Action taken:", action)

        # Do this action in the environment and get
        # next_state, reward, done and info
        observation, reward, done, info = env.step(action)

        # If the episode is over (the lander landed, crashed, or timed out)
        if done:
            # Reset the environment so the remaining steps start a new episode
            print("Environment is reset")
            observation = env.reset()
finally:
    # Release the environment's resources even if a step raises; the next
    # cell builds its own fresh environment, so nothing depends on this one.
    env.close()

Action taken: 2
Action taken: 1
Action taken: 3
Action taken: 3
Action taken: 1
Action taken: 3
Action taken: 2
Action taken: 3
Action taken: 1
Action taken: 1
Action taken: 2
Action taken: 3
Action taken: 3
Action taken: 3
Action taken: 0
Action taken: 3
Action taken: 2
Action taken: 1
Action taken: 2
Action taken: 3


In [5]:
# Build a fresh LunarLander-v2 instance with gym.make("<name_of_the_environment>")
# and reset it before inspecting its observation space.
env = gym.make("LunarLander-v2")
env.reset()

obs_space = env.observation_space
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", obs_space.shape)
print("Sample observation", obs_space.sample())  # draws a random observation

_____OBSERVATION SPACE_____ 

Observation Space Shape (8,)
Sample observation [-1.4011873  -0.8958607   0.96948975 -0.11884433  0.95865786 -0.5399154
 -0.42004403 -0.7763293 ]


We see with `Observation Space Shape (8,)` that the observation is a vector of size 8, where each value contains different information about the lander:
- Horizontal pad coordinate (x)
- Vertical pad coordinate (y)
- Horizontal speed (x)
- Vertical speed (y)
- Angle
- Angular speed
- Whether the left leg is in contact with the ground
- Whether the right leg is in contact with the ground


In [6]:
# Report the number of available actions (`.n`) and draw one at random.
act_space = env.action_space
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", act_space.n)
print("Action Space Sample", act_space.sample())  # draws a random action


 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 3


The action space (the set of possible actions the agent can take) is discrete with 4 actions available 🎮: 

- Do nothing,
- Fire left orientation engine,
- Fire the main engine,
- Fire right orientation engine.

Reward function (the function that gives a reward at each timestep) 💰:

- Moving from the top of the screen to the landing pad and coming to rest (zero speed) is worth about 100~140 points.
- Firing main engine is -0.3 each frame
- Each leg ground contact is +10 points
- The episode finishes if the lander crashes (an additional -100 points) or comes to rest (+100 points)

#### Vectorized Environment
- We create a vectorized environment (a method for stacking multiple independent environments into a single environment) of 32 environments; this way, **we'll have more diverse experiences during training.**

In [7]:
# Build the vectorized training environment: 32 copies of LunarLander-v2
env = make_vec_env(env_id="LunarLander-v2", n_envs=32)

In [13]:
# Create the model
# Training is stochastic; a fixed seed makes a Restart & Run All repeatable
# instead of yielding a different agent on every run.
SEED = 42

model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=512,
    batch_size=128,
    gae_lambda=0.98,
    gamma=0.999,
    ent_coef=0.01,
    normalize_advantage=True,
    seed=SEED,
)


In [15]:
# Name under which the trained model is written to disk
model_name = "ppo-LunarLander-v2"

# Train for one million timesteps, then persist the result
model.learn(total_timesteps=1_000_000)
model.save(model_name)