<a target="_blank" href="https://colab.research.google.com/github/rcpaffenroth/dac_raghu/blob/main/LunarLander.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Setup and libraries

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules

In [2]:
if IN_COLAB:
  ! apt-get install swig
  ! pip install stable-baselines3[extra] gymnasium[box2d] huggingface_sb3
else:
  # Otherwise, install locally and you need the following
  # NOTE: Need "gym" and "gymnasium" installed, since we use "gymnasium" for the LunarLander environment
  #       and "gym" is for huggingface_sb3.
  # sudo apt install swig ffmpeg
  # pip install stable-baselines3[extra] gymnasium[box2d] huggingface_sb3 imageio[ffmpeg] gym ipywidegets ipykernel pandas pyarrow
  pass

In [3]:
import gymnasium as gym
import matplotlib.pylab as plt
import numpy as np

import imageio
from stable_baselines3 import PPO
from huggingface_sb3 import load_from_hub

import pandas as pd

from IPython.display import display
from IPython.display import HTML
from ipywidgets import interact, widgets
from base64 import b64encode
%matplotlib inline

In [4]:
# Make the environment
env = gym.make("LunarLander-v2", render_mode='rgb_array')
observation = env.reset()


### Action Space
There are four discrete actions available:

0: do nothing

1: fire left orientation engine

2: fire main engine

3: fire right orientation engine

### Observation Space

The state is an 8-dimensional vector: the coordinates of the lander in x & y, its linear velocities in x & y, its angle, its angular velocity, and two booleans that represent whether each leg is in contact with the ground or not.

In [5]:
obs_names = ['x', 'y', 'vx', 'vy', 'theta', 'vtheta', 'leg1', 'leg2']

# Train a model

In [6]:
models = {}

In [7]:
class RandomModel(object):
  def __init__(self, env):
    self.env = env

  def predict(self, obs):
    return env.action_space.sample(), None # The second return value is the state value, which the random model does not use

random_model =  RandomModel(env)
models['random'] = {}
models['random']['model'] = random_model
models['random']['runs'] = []

In [8]:
# This is an trained model that has a good architecture and loss function, but is not trained very much.  This takes about 30 sec on 
# a RTX 4090
trained_model = PPO("MlpPolicy", env)
trained_model.learn(total_timesteps=20000)

models['trained'] = {}
models['trained']['model'] = trained_model
models['trained']['runs'] = []

In [9]:
# This is a model from huggingface.co at https://huggingface.co/sb3/a2c-LunarLander-v2
# Mean reward: 181.08 +/- 95.35
checkpoint = load_from_hub(
    repo_id="sb3/a2c-LunarLander-v2",
    filename="a2c-LunarLander-v2.zip",
)

good_model = PPO.load(checkpoint)

models['good'] = {}
models['good']['model'] = good_model
models['good']['runs'] = []

Exception: an integer is required (got type bytes)
Exception: an integer is required (got type bytes)


In [10]:
# This is a model from huggingface.co at https://huggingface.co/araffin/ppo-LunarLander-v2
# Mean reward:  283.49 +/- 13.74
checkpoint = load_from_hub(
    repo_id="araffin/ppo-LunarLander-v2",
    filename="ppo-LunarLander-v2.zip",
)

better_model = PPO.load(checkpoint)
models['better'] = {}
models['better']['model'] = better_model
models['better']['runs'] = []

Exception: an integer is required (got type bytes)


# Evaluate models

In [11]:
def evaluate_model(model_name, models=models, env=env):
   # Make a movie of a trained agent
   obs = env.reset()[0]

   # Get the model
   model = models[model_name]['model']
   images = []
   all_obs = []
   all_actions = []
   all_rewards = []
   done = False
   while not done:
      # This rendering mode puts an image into a numpy array
      images += [env.render()]
      action, _state = model.predict(obs)
      all_obs.append(obs)
      all_actions.append(action)
      obs, reward, done, trunc, info = env.step(action)
      all_rewards.append(reward)
   env.close()

   df = pd.DataFrame(all_obs, columns=obs_names)
   df['action'] = all_actions
   df['reward'] = all_rewards
   models[model_name]['runs'].append({'data':df,'images':images})



In [12]:
for i in range(3):
    evaluate_model('random')
    evaluate_model('trained')
    evaluate_model('good')
    evaluate_model('better')
    

In [13]:
@interact(model_name=models.keys(), run_idx=widgets.IntSlider(min=0, max=9, step=1, value=0))
def show_video(model_name, run_idx):
      images = models[model_name]['runs'][run_idx]['images']      
      # imageio is a nice library for taking a sequence of images and makeing a movie
      name = 'tmp.mp4'
      imageio.mimsave(name, images, fps=15)
      mp4 = open(name,'rb').read()
      data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
      # This puts the video in the notebook
      display(HTML("""
      <video width=400 controls>
            <source src="%s" type="video/mp4">
      </video>
      """ % data_url))
      # plot various data for the run 
      _, ax = plt.subplots(1, 3)
      data = models[model_name]['runs'][run_idx]['data']
      ax[0].plot(data['reward'])
      ax[0].set_title('reward')
      ax[1].plot(data['x'], data['y'])
      ax[1].set_title('position')
      ax[2].plot(data['vx'], data['vy'])
      ax[2].set_title('velocity')
      plt.tight_layout()
      

interactive(children=(Dropdown(description='model_name', options=('random', 'trained', 'good', 'better'), valu…

# Rewards

In [14]:
for model_name in models.keys():
    for run_idx in range(len(models[model_name]['runs'])):
        data = models[model_name]['runs'][run_idx]['data']  
        # Print the total reward for each model
        print(f"{model_name}: {np.sum(data['reward']):.2f}")

random: -477.42
random: -80.93
random: -288.32
trained: -198.97
trained: -320.00
trained: 5.62
good: 296.98
good: 80.09
good: 0.88
better: 270.98
better: 255.24
better: 301.49


# Write files

In [56]:
# This section produces the data for a generative model of the Lunar Lander
# Create a single dataframe with all the data
# each row is a single run of a single model
# each column is a single timestep of a single variable

# NOTE:  There needs to be some thinking here.  I mean, while the position,
# velocity, and angle are all continuous, the thrust is not.  So, we need to
# thinkg about how to interpolate the thrust. I think the data in this case
# needs to be "ragged" in the sense that each row has a different number of
# entries.  However, perhaps we can also just look at the "shortest" run and
# truncate all the other runs to that length. 

def uniform_data_for_autoencoder(filename, entries_per_run=100)
for model_name in models.keys():
    for run_idx in range(len(models[model_name]['runs'])):
        df = models[model_name]['runs'][run_idx]['data'].copy()  

        # index plays the role of timestep
        df['timestamp'] = pd.to_datetime(df.index, unit='s')
        df.set_index('timestamp', inplace=True)

        # We now compute the delta t that gives us 100 total sample points for each run
        # We do this by taking the total time of the run and dividing by 100
        total_time = df.index[-1] - df.index[0]
        delta_t = total_time / entries_per_run
        df = df.resample(delta_t).interpolate()

        df = pd.melt(df, 
                     value_vars=['x', 'y', 'vx', 'vy', 'theta', 'vtheta'], 
                     var_name='variable', 
                     ignore_index=False, 
                     value_name='value')
        df.loc[df.index[0]] = ['model_name', model_name]
        df.loc[df.index[-1]] = ['total_time', total_time]
        

In [53]:
df

Unnamed: 0_level_0,variable,value
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-01-01 00:00:00.000,model_name,better
1970-01-01 00:00:02.360,x,-0.016149
1970-01-01 00:00:04.720,x,-0.025983
1970-01-01 00:00:07.080,x,-0.035816
1970-01-01 00:00:09.440,x,-0.045649
...,...,...
1970-01-01 00:03:46.560,y,-0.003334
1970-01-01 00:03:48.920,y,-0.00278
1970-01-01 00:03:51.280,y,-0.002225
1970-01-01 00:03:53.640,y,-0.001671
