<a target="_blank" href="https://colab.research.google.com/github/rcpaffenroth/dac_raghu/blob/main/LunarLander.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Setup and libraries

In [1]:
import sys
IN_COLAB = 'google.colab' in sys.modules

In [2]:
if IN_COLAB:
  ! apt-get install swig
  ! pip install stable-baselines3[extra] gymnasium[box2d] huggingface_sb3
else:
  # Otherwise, install locally and you need the following
  # NOTE: Need "gym" and "gymnasium" installed, since we use "gymnasium" for the LunarLander environment
  #       and "gym" is for huggingface_sb3.
  # sudo apt install swig ffmpeg
  # pip install stable-baselines3[extra] gymnasium[box2d] huggingface_sb3 imageio[ffmpeg] gym ipywidegets ipykernel pandas pyarrow
  pass

In [3]:
import gymnasium as gym
import matplotlib.pylab as plt
import numpy as np

import imageio
from stable_baselines3 import PPO
from huggingface_sb3 import load_from_hub

import pandas as pd

from IPython.display import display
from IPython.display import HTML
from ipywidgets import interact, widgets
from base64 import b64encode
%matplotlib inline

In [None]:
# Make the environment
env = gym.make("LunarLander-v2", render_mode='rgb_array')
observation = env.reset()


### Action Space
There are four discrete actions available:

0: do nothing

1: fire left orientation engine

2: fire main engine

3: fire right orientation engine

### Observation Space

The state is an 8-dimensional vector: the coordinates of the lander in x & y, its linear velocities in x & y, its angle, its angular velocity, and two booleans that represent whether each leg is in contact with the ground or not.

In [None]:
obs_names = ['x', 'y', 'vx', 'vy', 'theta', 'vtheta', 'leg1', 'leg2']

# Train a model

In [None]:
models = {}

In [None]:
class RandomModel(object):
  def __init__(self, env):
    self.env = env

  def predict(self, obs):
    return env.action_space.sample(), None # The second return value is the state value, which the random model does not use

random_model =  RandomModel(env)
models['random'] = {}
models['random']['model'] = random_model
models['random']['runs'] = []

In [None]:
# This is an trained model that has a good architecture and loss function, but is not trained very much.  This takes about 30 sec on 
# a RTX 4090
trained_model = PPO("MlpPolicy", env)
trained_model.learn(total_timesteps=20000)

models['trained'] = {}
models['trained']['model'] = trained_model
models['trained']['runs'] = []

In [None]:
# This is a model from huggingface.co at https://huggingface.co/sb3/a2c-LunarLander-v2
# Mean reward: 181.08 +/- 95.35
checkpoint = load_from_hub(
    repo_id="sb3/a2c-LunarLander-v2",
    filename="a2c-LunarLander-v2.zip",
)

good_model = PPO.load(checkpoint)

models['good'] = {}
models['good']['model'] = good_model
models['good']['runs'] = []

Exception: an integer is required (got type bytes)
Exception: an integer is required (got type bytes)


In [None]:
# This is a model from huggingface.co at https://huggingface.co/araffin/ppo-LunarLander-v2
# Mean reward:  283.49 +/- 13.74
checkpoint = load_from_hub(
    repo_id="araffin/ppo-LunarLander-v2",
    filename="ppo-LunarLander-v2.zip",
)

better_model = PPO.load(checkpoint)
models['better'] = {}
models['better']['model'] = better_model
models['better']['runs'] = []

Exception: an integer is required (got type bytes)


# Evaluate models

In [None]:
def evaluate_model(model_name, models=models, env=env):
   # Make a movie of a trained agent
   obs = env.reset()[0]

   # Get the model
   model = models[model_name]['model']
   images = []
   all_obs = []
   all_actions = []
   all_rewards = []
   done = False
   while not done:
      # This rendering mode puts an image into a numpy array
      images += [env.render()]
      action, _state = model.predict(obs)
      all_obs.append(obs)
      all_actions.append(action)
      obs, reward, done, trunc, info = env.step(action)
      all_rewards.append(reward)
   env.close()

   df = pd.DataFrame(all_obs, columns=obs_names)
   df['action'] = all_actions
   df['reward'] = all_rewards
   models[model_name]['runs'].append({'data':df,'images':images})



In [None]:
for i in range(3):
    evaluate_model('random')
    evaluate_model('trained')
    evaluate_model('good')
    evaluate_model('better')
    

In [None]:
@interact(model_name=models.keys(), run_idx=widgets.IntSlider(min=0, max=9, step=1, value=0))
def show_video(model_name, run_idx):
      images = models[model_name]['runs'][run_idx]['images']      
      # imageio is a nice library for taking a sequence of images and makeing a movie
      name = 'tmp.mp4'
      imageio.mimsave(name, images, fps=15)
      mp4 = open(name,'rb').read()
      data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
      # This puts the video in the notebook
      display(HTML("""
      <video width=400 controls>
            <source src="%s" type="video/mp4">
      </video>
      """ % data_url))
      # plot various data for the run 
      _, ax = plt.subplots(1, 3)
      data = models[model_name]['runs'][run_idx]['data']
      ax[0].plot(data['reward'])
      ax[0].set_title('reward')
      ax[1].plot(data['x'], data['y'])
      ax[1].set_title('position')
      ax[2].plot(data['vx'], data['vy'])
      ax[2].set_title('velocity')
      plt.tight_layout()
      

interactive(children=(Dropdown(description='model_name', options=('random', 'trained', 'good', 'better'), valu…

# Rewards

In [None]:
for model_name in models.keys():
    for run_idx in range(len(models[model_name]['runs'])):
        data = models[model_name]['runs'][run_idx]['data']  
        # Print the total reward for each model
        print(f"{model_name}: {np.sum(data['reward']):.2f}")

random: -97.89
random: -268.28
random: -120.32
trained: -286.57
trained: -4782.33
trained: -100.21
good: 35.73
good: 243.29
good: 22.98
better: 278.44
better: 273.98
better: 280.76


# Write files

In [None]:
# Create a single dataframe with all the data
# each row is a single run of a single model
# each column is a single timestep
# each cell is the reward at that timestep

for model_name in models.keys():
    for run_idx in range(len(models[model_name]['runs'])):
        df = models[model_name]['runs'][run_idx]['data'].copy()  

        # index plays the role of timestep
        df['timestamp'] = df.index
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        df.set_index('timestamp', inplace=True)

        # We now compute the delta t that gives us 100 total sample points for each run
        # We do this by taking the total time of the run and dividing by 100
        delta_t = (df.index[-1] - df.index[0]) / 100.0
        print(delta_t)
        df_resampled = df.resample(f'{delta_t}T').interpolate()

        # data = pd.melt(data, value_vars=['x', 'y'], var_name='timestep', value_name='position')

0 days 00:00:00


ValueError: Invalid frequency: 0 days 00:00:00T

In [None]:
df

Unnamed: 0_level_0,x,y,vx,vy,theta,vtheta,leg1,leg2,action,reward,timestep,timestam
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1970-01-01 00:00:00.000000000,0.001899,1.404410,0.192341,-0.289357,-0.002194,-0.043568,0.0,0.0,3,-2.568321,0,0
1970-01-01 00:00:00.000000001,0.003894,1.397322,0.204094,-0.315020,-0.006756,-0.091253,0.0,0.0,3,-2.625251,1,1
1970-01-01 00:00:00.000000002,0.005961,1.389639,0.213215,-0.341504,-0.013141,-0.127714,0.0,0.0,2,-0.816648,2,2
1970-01-01 00:00:00.000000003,0.008132,1.381912,0.223055,-0.343501,-0.019054,-0.118278,0.0,0.0,0,-2.022618,3,3
1970-01-01 00:00:00.000000004,0.010303,1.373585,0.223072,-0.370176,-0.024968,-0.118274,0.0,0.0,0,-2.006289,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:00:00.000000064,0.128445,0.126061,0.155184,-1.472933,0.045131,0.121635,0.0,0.0,0,-1.213249,64,64
1970-01-01 00:00:00.000000065,0.130058,0.092325,0.155184,-1.499603,0.051213,0.121635,0.0,0.0,0,-1.699139,65,65
1970-01-01 00:00:00.000000066,0.131672,0.057989,0.155184,-1.526273,0.057295,0.121634,0.0,0.0,3,-2.247434,66,66
1970-01-01 00:00:00.000000067,0.133381,0.023067,0.167276,-1.552195,0.060943,0.072957,0.0,0.0,0,6.953294,67,67
