## Setup

In [None]:
import warnings
# Silence every warning category for the rest of the kernel session so the
# install and training output stays readable. This also hides deprecation
# notices, so comment it out when debugging library problems.
warnings.filterwarnings('ignore')

In [None]:
%pip install --upgrade pip wheel setuptools swig

In [None]:
%pip install ipywidgets

In [None]:
%pip install --upgrade tensorrt
#%pip install nvidia-tensorrt --extra-index-url https://pypi.ngc.nvidia.com


In [None]:
%pip install --upgrade tensorflow[and-cuda]
#%pip install tensorflow-metal 

In [None]:
%pip install blinker --ignore-installed


In [None]:
%pip install mlflow optuna optuna-integration[mlflow] plotly


In [None]:
%pip install stable-baselines3 dagshub 


In [None]:
%pip install gymnasium

In [None]:
%pip install gymnasium[box2d]

In [None]:
%pip install "stable-baselines3[extra]" PyMySQL python-dotenv


In [None]:
# Environment sanity check: report the installed TensorRT version and the
# number of GPU devices TensorFlow can see.
import tensorrt as trt
print("TensorRT version:", trt.__version__)
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
import dagshub
# Initialise the DagsHub client for the given repository with its MLflow
# integration switched on (mlflow=True).
# NOTE(review): presumably this points MLflow tracking at the DagsHub-hosted
# server for this repo — confirm against the dagshub client documentation.
dagshub.init(repo_owner='smileynet', repo_name='gymnasium_experiments', mlflow=True)

In [None]:
%load_ext dotenv
%dotenv

In [None]:
%pip freeze > requirements.txt

In [None]:
import os

# Fetch database connection details from environment variables
db_name = os.getenv("DB_NAME")
db_user = os.getenv("DB_USER")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")

# os.getenv returns None for unset variables, which would be interpolated as
# the literal text "None" below. Warn early so the later storage connection
# failure is easy to trace back to the .env file.
missing_db_vars = [
    var_name
    for var_name, var_value in (
        ("DB_NAME", db_name),
        ("DB_USER", db_user),
        ("DB_PASSWORD", db_password),
        ("DB_HOST", db_host),
        ("DB_PORT", db_port),
    )
    if var_value is None
]
if missing_db_vars:
    print("Warning: unset database environment variables:", ", ".join(missing_db_vars))

# Construct the MySQL (PyMySQL driver) connection URL used by the Optuna storage.
# NOTE(review): credentials are interpolated verbatim; a password containing
# characters such as '@', ':' or '/' probably needs percent-encoding — confirm.
mysql_url = f"mysql+pymysql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"

# Notebook outputs are saved and shared with the file, so only ever display a
# copy of the URL with the password masked.
redacted_mysql_url = f"mysql+pymysql://{db_user}:***@{db_host}:{db_port}/{db_name}"
print("Database connection URL: ", redacted_mysql_url)

## Hyperparameter Optimization

In [None]:
import gymnasium as gym
import optuna
import os
import mlflow
import torch
import sys
import gc
import numpy as np
from stable_baselines3.common.callbacks import BaseCallback
from typing import Any, Dict, Union, Tuple
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.logger import HumanOutputFormat, KVWriter, Logger, configure
from optuna.visualization import plot_optimization_history, plot_param_importances
from IPython.display import clear_output

# Define constants
# Train on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_TRIALS = 15  # number of trials requested from study.optimize()
N_ENVS = 64  # parallel training environments created by make_vec_env
TOTAL_STEPS = 1000000  # timesteps passed to model.learn() in every trial
DATA_DIR = "data"  # directory that holds the local SQLite study file
MODELS_DIR = "models"  # directory for best_model.zip and checkpoint.zip
LOGS_DIR = "logs"  # folder handed to the SB3 Logger

# Ensure the models directory exists
def ensure_directory_exists(directory):
    """Make sure ``directory`` exists on disk and hand the same path back.

    Missing parent directories are created as well. Returning the argument
    lets callers create and bind a path in a single assignment.
    """
    already_there = os.path.isdir(directory)
    if not already_there:
        # exist_ok keeps this safe if something else creates it in between.
        os.makedirs(directory, exist_ok=True)
    return directory

# Create the models, data and logs directories if they don't exist yet
models_dir = ensure_directory_exists(MODELS_DIR)
data_dir = ensure_directory_exists(DATA_DIR)
logs_dir = ensure_directory_exists(LOGS_DIR)

# Where the best model of this kernel session is saved, and the running best
# score it is compared against (starts at -inf so the first trial always wins).
best_model_path = os.path.join(models_dir, "best_model.zip")
best_mean_reward = -float("inf")
# Local SQLite alternative for the study storage; defined but not selected below.
db_path = os.path.join(data_dir, 'study.db')
local_storage = f"sqlite:///{db_path}"


# Remote study storage, built from the mysql_url assembled in the setup cell.
rdb_storage = optuna.storages.RDBStorage(
    url = mysql_url,
    # Optional keyword arguments, currently disabled:
    #heartbeat_interval=60, 
    #grace_period=120
    )

# Storage backend actually used by the study cells below.
storage = rdb_storage

# Set up MLflow experiment tracking
mlflow.set_experiment("PPO-LunarLander-v2")

class ClearOutputFormat(HumanOutputFormat):
    """Human-readable SB3 log writer that wipes the notebook cell output first.

    Each call to :meth:`write` clears the current cell output before the parent
    class prints its table, so the cell shows only the most recent log dump
    instead of an ever-growing list.
    """

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Union[Dict[str, Union[str, Tuple[str, ...]]], None] = None,
        step: int = 0,
    ) -> None:
        """Clear the cell output, then delegate to ``HumanOutputFormat.write``.

        Args:
            key_values: Logged names mapped to their current values.
            key_excluded: Per-key output formats to skip. ``None`` (the default)
                means no key is excluded.
            step: Training step the values belong to.
        """
        if key_excluded is None:
            # Avoid a shared mutable default, and give every key an empty
            # exclusion entry so a default call still prints all values.
            key_excluded = {key: () for key in key_values}
        # wait=True defers the clear until new output arrives, avoiding flicker.
        clear_output(wait=True)
        super().write(key_values, key_excluded, step)

class MLflowOutputFormat(KVWriter):
    """SB3 log writer that forwards numeric values to MLflow as metrics."""

    def write(
        self,
        key_values: Dict[str, Any],
        key_excluded: Dict[str, Union[str, Tuple[str, ...]]],
        step: int = 0,
    ) -> None:
        """Log every numeric entry of ``key_values`` with ``mlflow.log_metric``.

        Args:
            key_values: Logged names mapped to their current values.
            key_excluded: Per-key output formats to skip; keys whose entry
                contains ``"mlflow"`` are not logged.
            step: Training step recorded alongside each metric.
        """
        for key, value in sorted(key_values.items()):
            # Look the exclusion up by key rather than zipping two sorted
            # dicts, so a missing entry cannot shift the pairing.
            excluded = key_excluded.get(key)
            if excluded is not None and "mlflow" in excluded:
                continue
            # Only plain or NumPy numbers are valid metric values.
            if isinstance(value, (int, float, np.integer, np.floating)):
                mlflow.log_metric(key, value, step)
            else:
                print(f"invalid metric of {key}: {value}")

    def close(self) -> None:
        """No resources to release; defined so closing the logger does not fail."""
        return None
                    
                    
def filter_valid_params(hparams):
    """Return the subset of ``hparams`` whose values are int, float or str.

    Entries with any other value type are dropped, and a message naming the
    key, value and type is printed for each one.
    """
    accepted_types = (int, float, str)
    kept = {}
    for k, v in hparams.items():
        if not isinstance(v, accepted_types):
            print(f"Skipping invalid parameter: {k} = {v} (type: {type(v)})")
            continue
        kept[k] = v
    return kept

# Helper function to evaluate the model
def evaluate_model(model, env, num_episodes=10):
    """Run ``model`` for ``num_episodes`` episodes and return the mean return.

    The environment is driven through the vectorised interface used by the
    caller: ``reset()`` returns the observation and ``step()`` returns
    ``(obs, reward, done, info)``. Rewards and done flags may be plain scalars
    or length-1 arrays, so they are unwrapped before use and the result is a
    built-in ``float`` rather than a one-element array.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)``.
        env: Single environment, or a vectorised env wrapping exactly one.
        num_episodes: Number of episodes to average over.

    Returns:
        float: Mean undiscounted episode return.

    Raises:
        ValueError: If ``env`` reports more than one reward/done value per step.
        ZeroDivisionError: If ``num_episodes`` is 0.
    """
    episode_returns = []
    for _ in range(num_episodes):
        obs = env.reset()
        episode_done = False
        episode_return = 0.0
        while not episode_done:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            reward_values = np.asarray(reward).reshape(-1)
            done_values = np.asarray(done).reshape(-1)
            if reward_values.size != 1 or done_values.size != 1:
                raise ValueError(
                    "evaluate_model expects a single environment (n_envs=1)"
                )
            episode_return += float(reward_values[0])
            episode_done = bool(done_values[0])
        episode_returns.append(episode_return)
    return sum(episode_returns) / len(episode_returns)


# Function to set MLflow tags
def set_mlflow_tags(hparams, trial_number):
    """Tag the active MLflow run with trial metadata and log its hyperparameters.

    Args:
        hparams: Hyperparameter dict for this trial; logged via ``log_params``.
        trial_number: Optuna trial number, stored as the ``trial_number`` tag.
    """
    # Local import: only the package's version string is needed here.
    import stable_baselines3

    tags = {
        "trial_number": trial_number,
        "algorithm": "PPO",
        "type": "reinforcement_learning",
        "optimizer": "optuna",
        "policy_type": "MlpPolicy",
        "environment_name": "LunarLander-v2",
        "total_timesteps": TOTAL_STEPS,
        "n_envs": N_ENVS,
        "python_version": sys.version,
        # Real library version; the second component of PPO.__module__ is just
        # the sub-package name ("ppo"), not a version.
        "stable_baselines3_version": stable_baselines3.__version__,
        "device": DEVICE
    }
    mlflow.set_tags(tags)
    mlflow.log_params(hparams)

# Function to create the PPO model
def create_model(hparams, env, checkpoint_path=None):
    """Build a PPO model for ``env`` and optionally warm-start it from a checkpoint.

    Args:
        hparams: Keyword arguments forwarded to the ``PPO`` constructor.
        env: Training environment.
        checkpoint_path: Optional path to a saved model. It is used only when
            the file exists.

    Returns:
        The PPO model, configured with ``hparams``.
    """
    # Create the PPO model
    model = PPO(policy="MlpPolicy", env=env, device=DEVICE, verbose=2, **hparams)

    # Load from checkpoint if available. ``PPO.load`` is a classmethod that
    # returns a new model, so calling it on the instance and discarding the
    # result loads nothing. ``set_parameters`` copies the saved weights into
    # this model in place and keeps the trial's hyperparameters.
    if checkpoint_path and os.path.exists(checkpoint_path):
        model.set_parameters(checkpoint_path, device=DEVICE)

    return model

# Main function to train and evaluate the model
def train_and_evaluate(hparams, trial_number, trial):
    """Train one PPO model with ``hparams``, evaluate it, and return its score.

    The model is trained for ``TOTAL_STEPS`` timesteps on ``N_ENVS`` parallel
    LunarLander-v2 environments, evaluated on a separate single environment,
    and saved to ``best_model_path`` when it beats the best score seen so far
    in this kernel session.

    Args:
        hparams: Keyword arguments for the PPO constructor.
        trial_number: Optuna trial number (currently unused here).
        trial: Optuna trial object (currently unused here).

    Returns:
        float: Mean evaluation reward, also logged to MLflow as ``mean_reward``.

    Raises:
        Exception: Any error raised during setup, training or evaluation is
            printed and re-raised.
    """
    global best_mean_reward

    # Setup environments
    env = make_vec_env("LunarLander-v2", n_envs=N_ENVS)
    eval_env = None

    checkpoint_path = os.path.join(models_dir, "checkpoint.zip")

    try:
        # Created inside the try so the training env is still closed if this fails.
        eval_env = make_vec_env("LunarLander-v2", n_envs=1)

        model = create_model(hparams, env, checkpoint_path)

        # Setup logger with the custom output formats
        logger = Logger(
            folder=logs_dir,
            output_formats=[ClearOutputFormat(sys.stdout), MLflowOutputFormat()],
        )
        model.set_logger(logger)

        # Train the model
        model.learn(total_timesteps=TOTAL_STEPS)

        # Evaluate the model. The helper may return a one-element array, so
        # coerce to a plain float for MLflow, the comparison and Optuna.
        mean_reward = float(np.mean(evaluate_model(model, eval_env)))
        mlflow.log_metric("mean_reward", mean_reward)

        # Save the model if it's the best one so far
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            model.save(best_model_path)
            print(f"New best model saved with mean reward: {mean_reward}")

        return mean_reward

    except Exception as e:
        print(f"Error during training: {e}")
        raise

    finally:
        print("Concluding trial...")
        # Close each environment independently. No reset beforehand: resetting
        # a broken env could raise here, skip close() and mask the real error.
        for vec_env in (env, eval_env):
            if vec_env is None:
                continue
            try:
                vec_env.close()
            except Exception as close_error:
                print(f"Error while closing environment: {close_error}")

# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: sample PPO hyperparameters, train, return the score.

    The sampled values are logged to a nested MLflow run that wraps the whole
    training and evaluation of this trial.
    """
    # Keyword order matches the order in which the values are requested.
    hparams = dict(
        batch_size=trial.suggest_categorical('batch_size', [512, 1024, 2048]),
        n_steps=trial.suggest_categorical('n_steps', [32, 64, 128]),
        n_epochs=trial.suggest_int('n_epochs', 3, 5),
        gamma=trial.suggest_float('gamma', 0.9, 0.999),
        gae_lambda=trial.suggest_float('gae_lambda', 0.8, 1.0),
        ent_coef=trial.suggest_float('ent_coef', 0.0001, 0.01),
    )

    # One MLflow run per trial
    with mlflow.start_run(nested=True):
        set_mlflow_tags(hparams, trial.number)
        mean_reward = train_and_evaluate(hparams, trial.number, trial)
        print("Exiting MLFlow run...")
    return mean_reward

# Main optimization loop
def main():
    """Create or resume the "1M_steps" study, run it, and report the winner.

    Runs ``MAX_TRIALS`` trials of ``objective`` against the configured storage,
    prints the best trial and its parameters, then logs the best parameters
    and the saved best-model file to MLflow.

    NOTE(review): the two MLflow calls at the end run outside the per-trial
    ``with mlflow.start_run(...)`` blocks — confirm which run should own them.
    """
    # Reuse the stored study when one with this name already exists.
    study = optuna.create_study(
        study_name="1M_steps",
        direction="maximize",
        storage=storage,
        load_if_exists=True,
    )

    study.optimize(objective, n_trials=MAX_TRIALS)
    print("Exiting study...")

    # Report the winning trial.
    winner = study.best_trial
    print(f"Best trial: {winner.number}")
    print(f"Value: {winner.value}")
    print("Params: ")
    for param_name, param_value in winner.params.items():
        print(f"    {param_name}: {param_value}")

    best_hparams = study.best_params
    print(f"Best Hyperparameters: {best_hparams}")

    # Record the best hyperparameters and the saved model file in MLflow.
    mlflow.log_params(best_hparams)
    mlflow.log_artifact(best_model_path)


In [None]:
# Run the hyperparameter search. Each of the MAX_TRIALS trials trains for
# TOTAL_STEPS timesteps. Called directly rather than behind a
# `__name__ == "__main__"` guard.
main()

## Analyze

In [None]:
import optuna
# Reload the finished study from the shared storage so the analysis cells
# below work from the stored trials rather than from in-memory state.
study = optuna.study.load_study(study_name='1M_steps', storage=storage)

In [None]:
# Summarise the best trial stored in the study: number, objective value and
# the sampled hyperparameters, one per line.
best_trial = study.best_trial
print(f"Best trial: {best_trial.number}")
print(f"Value: {best_trial.value}")
print("Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Tabulate every trial, restricted to its number, objective value, sampled
# parameters and final state.
df = study.trials_dataframe(attrs=("number", "value", "params", "state"))
print(df)

In [None]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_rank
from optuna.visualization import plot_slice
from optuna.visualization import plot_timeline

plot_optimization_history(study)



In [None]:
plot_intermediate_values(study)


In [None]:
plot_parallel_coordinate(study)

In [None]:
plot_contour(study)


In [None]:
plot_slice(study)


In [None]:
plot_param_importances(study)


In [None]:
plot_edf(study)


In [None]:
plot_rank(study)


In [None]:
plot_timeline(study)

In [None]:
%pip install -U kaleido

In [None]:
import plotly

# Render the optimization-history and parameter-importance figures to PNG
# files in the working directory and attach each file to MLflow.
# NOTE(review): write_image presumably relies on the kaleido package installed
# in the previous cell — confirm.
# NOTE(review): no MLflow run is started in this cell; confirm which run these
# artifacts should be attached to.
optimization_history_figure = plot_optimization_history(study)
optimization_history_figure.write_image("optimization_history.png")
mlflow.log_artifact("optimization_history.png")

param_importance_figure = plot_param_importances(study)
param_importance_figure.write_image("param_importances.png")
mlflow.log_artifact("param_importances.png")


In [None]:
# Quick look at the environment: print the observation-space shape and the
# number of discrete actions, then release the environment.
env = gym.make("LunarLander-v2")
env.reset()
print("Observation Space Shape", env.observation_space.shape)
print("Action Space Shape", env.action_space.n)
env.close()

## Benchmark CPU vs GPU

In [None]:
import time
import torch
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

TOTAL_TIMESTEPS = 100000  # Short benchmark to measure FPS

def measure_fps(device, total_timesteps=None):
    """Measure PPO training throughput on ``device`` in environment steps per second.

    Args:
        device: Device string handed to PPO, e.g. ``"cpu"`` or ``"cuda"``.
        total_timesteps: Training budget for the benchmark. Defaults to the
            module-level ``TOTAL_TIMESTEPS``.

    Returns:
        float: Steps actually executed divided by the elapsed training time.
    """
    if total_timesteps is None:
        total_timesteps = TOTAL_TIMESTEPS

    # Create environment
    env = make_vec_env("LunarLander-v2", n_envs=1)
    try:
        # Initialize the model on the specified device
        model = PPO('MlpPolicy', env, device=device)

        # perf_counter is monotonic, so clock adjustments cannot skew the
        # measured interval.
        start_time = time.perf_counter()

        # Train for a small number of timesteps (benchmark)
        model.learn(total_timesteps=total_timesteps)

        elapsed_time = time.perf_counter() - start_time

        # learn() works in whole rollouts and can run past the requested
        # budget, so count the steps it actually performed.
        steps_done = model.num_timesteps
    finally:
        # Always release the environment, even if training raised.
        env.close()

    return steps_done / elapsed_time

# Benchmark the CPU first, then the GPU when PyTorch reports one.
gpu_available = torch.cuda.is_available()

cpu_fps = measure_fps(device="cpu")
print(f"CPU FPS: {cpu_fps:.2f}")

if not gpu_available:
    # A zero score guarantees the CPU wins the comparison below.
    gpu_fps = 0
    print("GPU is not available.")
else:
    gpu_fps = measure_fps(device="cuda")
    print(f"GPU FPS: {gpu_fps:.2f}")

# Keep whichever device was faster; ties go to the CPU.
gpu_is_faster = gpu_fps > cpu_fps
print("Using GPU for training." if gpu_is_faster else "Using CPU for training.")
chosen_device = "cuda" if gpu_is_faster else "cpu"



## Examples

In [None]:
# TODO: Evaluate the agent with this instead

# Imported here so the cell also runs on a fresh kernel.
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor

# No trained model exists at notebook level (the search keeps its models local
# to each trial), so reload the best model it saved to disk.
model = PPO.load(best_model_path, device=DEVICE)

# Create a new environment for evaluation
eval_env = Monitor(gym.make("LunarLander-v2"))

try:
    # Evaluate the model with 10 evaluation episodes and deterministic=True
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
finally:
    # Release the environment even if evaluation fails.
    eval_env.close()

# Print the results
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Template cell: package the trained agent and push it to the Hugging Face Hub.
# NOTE(review): `Monitor`, `model` and `model_name` are used below but are not
# imported or defined in this cell or at notebook level in the cells shown;
# define them before running.
# NOTE(review): `huggingface_sb3` is not among the packages installed in the
# setup cells shown; confirm it is available.
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env

from huggingface_sb3 import package_to_hub

# PLACE the variables you've just defined two cells above
# Define the name of the environment
env_id = "LunarLander-v2"

# TODO: Define the model architecture we used
model_architecture = "PPO"

## Define a repo_id
## repo_id is the id of the model repository on the Hugging Face Hub, in the form {organization}/{repo_name}, for instance ThomasSimonini/ppo-LunarLander-v2
## CHANGE WITH YOUR REPO ID
repo_id = "ThomasSimonini/ppo-LunarLander-v2"  # Placeholder: replace with your own repo id, you can't push to this one 😄

## Define the commit message
commit_message = "Upload PPO LunarLander-v2 trained agent"

# Create the evaluation env and set the render_mode="rgb_array"
eval_env = DummyVecEnv([lambda: Monitor(gym.make(env_id, render_mode="rgb_array"))])

# PLACE the package_to_hub function you've just filled here
package_to_hub(
    model=model,  # Our trained model
    model_name=model_name,  # The name of our trained model
    model_architecture=model_architecture,  # The model architecture we used: in our case PPO
    env_id=env_id,  # Name of the environment
    eval_env=eval_env,  # Evaluation Environment
    repo_id=repo_id,  # id of the model repository on the Hugging Face Hub ({organization}/{repo_name})
    commit_message=commit_message,
)