In [25]:
from IPython import display
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import torch
import omegaconf
from experiment_logger import (
    create_modquad_experiment_logger,
    log_modquad_experiment_params,
    log_modquad_environment_info,
    log_modquad_model_info,
    log_modquad_agent_info
)

import modquad_copp_env as modquad_env
import meta_learning_base as mlb
# import mbrl.env.reward_fns as reward_fns
# import mbrl.env.termination_fns as termination_fns
import mbrl.models as models
import mbrl.planning as planning
import mbrl.util.common as common_util
import mbrl.util as util
import modquad_utils 
import time
from models import modquad_ModelEnv
# import models. as mq_model


%load_ext autoreload
%autoreload 2

# Larger default font so figures stay readable when rendered inline.
mpl.rcParams.update({"font.size": 16})

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Single source of truth for every random stream created in this cell.
seed = 0
env = modquad_env.ModQuadEnv()
# Seed NumPy from `seed` (previously a hard-coded 0) so that changing `seed`
# reseeds the NumPy and torch generators together.
rng = np.random.default_rng(seed=seed)
generator = torch.Generator(device=device)
generator.manual_seed(seed)
# Observation / action shapes are read from the environment's spaces and reused
# when building the dynamics model and the replay buffer.
obs_shape = env.observation_space.shape
act_shape = env.action_space.shape

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Connecting to CoppeliaSim...
Connected!
4 DoF [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
No propeller. Is it a magic box?


Get the initial replay buffer: either run the simulation now or load existing flight data.

In [26]:
# Load the previously recorded simulation transitions that seed the replay buffer.
import pickle

# File holding the pickled transitions. Another recording referenced by earlier
# experiments was '600_large_sim_data_dt_0.01.pkl'.
SIM_DATA_PATH = 'quat_600_sim_data.pkl'

# To regenerate the recording from a live simulation instead of loading it:
# replay_buffer_sim = env.run_gym_simulation_and_collect_data(cut_at = 600)
# with open(SIM_DATA_PATH, 'wb') as f:
#     pickle.dump(replay_buffer_sim, f)

# SECURITY: pickle.load can execute arbitrary code while deserialising.
# Only load recordings that were produced locally by this project.
with open(SIM_DATA_PATH, 'rb') as f:
    replay_buffer_sim = pickle.load(f)

In [7]:

# End the current simulation, then reset the environment. The value returned by
# reset() is the cell's last expression, so it is shown as the cell output
# (an (observation, info) tuple in the recorded run).
env.end_simulation()
env.reset()


Connected!
4 DoF [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
No propeller. Is it a magic box?


(array([-3.8312464e-08, -1.6547890e-08,  4.6264969e-08, -6.5610797e-09,
         4.8497657e-08,  2.0069294e-07, -1.6270590e-04, -1.3943643e-05,
         6.5666239e-04], dtype=float32),
 {})

In [27]:

def summarize_columns(samples):
    """Print and return per-column summary statistics of a batch of samples.

    The shape is printed first, followed by the mean, variance, standard
    deviation, minimum and maximum taken along axis 0.

    Args:
        samples: array-like whose first axis indexes the samples.

    Returns:
        dict mapping each printed label to the corresponding NumPy result,
        in the order in which the statistics are printed.
    """
    samples = np.asarray(samples)
    print(samples.shape)
    stats = {
        "Means": np.mean(samples, axis=0),
        "Variances": np.var(samples, axis=0),
        "Standard Deviations": np.std(samples, axis=0),
        "Minimums": np.min(samples, axis=0),
        "Maximums": np.max(samples, axis=0),
    }
    for label, values in stats.items():
        print(f"{label}:", values)
    return stats


# Each stored transition starts with (obs, action, ...); only those two fields
# are needed for the statistics below.
all_obs = np.array([transition[0] for transition in replay_buffer_sim])
all_actions = np.array([transition[1] for transition in replay_buffer_sim])

# Actions first, then observations, matching the original output order.
action_stats = summarize_columns(all_actions)
obs_stats = summarize_columns(all_obs)

# Observation layout noted by the author: pos, angles, velocity, angular_velocity
# NOTE(review): the recorded observations have 10 columns — confirm how they map
# onto these four groups.

(14478, 4)
Means: [ 3.25222877e+00 -2.37496764e-05 -1.14858131e-04  1.01522262e-01]
Variances: [0.01127349 0.00054829 0.01140178 0.00576978]
Standard Deviations: [0.10617668 0.02341554 0.10677913 0.07595909]
Minimums: [ 2.3866805  -0.10301613 -0.28670569 -0.08023392]
Maximums: [4.09218592 0.0866907  0.27594365 0.22322088]
(14478, 10)
Means: [ 3.8639180e-04  4.9862633e-03  1.0937074e-02 -1.3712974e-04
 -1.1420795e-03  1.2183436e-03  8.8508496e-06  4.9579536e-05
  5.2475512e-02  9.9724454e-01]
Variances: [3.96992639e-02 9.38166492e-03 1.49817085e-02 2.57527642e-03
 2.06866488e-01 1.44890306e-04 6.68567809e-05 1.11687439e-03
 1.57208880e-03 3.73381022e-06]
Standard Deviations: [0.19924673 0.09685899 0.12239979 0.05074718 0.4548258  0.01203704
 0.0081766  0.03341967 0.03964958 0.00193231]
Minimums: [-0.6478879  -0.46388495 -0.64670855 -0.2343599  -1.7107099  -0.04097146
 -0.03536256 -0.09221905 -0.04908156  0.98986685]
Maximums: [0.6826183  0.4886786  0.63356763 0.18391222 1.6933316  0.064

In [30]:
# ---- Experiment sizing -------------------------------------------------------
trial_length = 200
num_trials = 20
ensemble_size = 5

# Any value written as "???" is a placeholder for a missing option; the utility
# functions used below fill those in from the environment information.

# Dynamics model: a probabilistic (non-deterministic) Gaussian MLP ensemble.
dynamics_model_cfg = {
    "_target_": "mbrl.models.GaussianMLP",
    "device": device,
    "num_layers": 3,
    "ensemble_size": ensemble_size,
    "hid_size": 256,
    "in_size": "???",
    "out_size": "???",
    "deterministic": False,
    "propagation_method": "fixed_model",
    # The activation function of GaussianMLP is configurable as well.
    "activation_fn_cfg": {
        "_target_": "torch.nn.LeakyReLU",
        "negative_slope": 0.01
    }
}

# Options for training the dynamics model.
algorithm_cfg = {
    "dataset_size": 30000,
    "learned_rewards": False,
    "target_is_delta": True,
    "normalize": True,
}

# Experiment-specific options.
overrides_cfg = {
    "trial_length": trial_length,
    "num_steps": num_trials * trial_length,
    "model_batch_size": 64,
    "validation_ratio": 0.1
}

cfg_dict = {
    "dynamics_model": dynamics_model_cfg,
    "algorithm": algorithm_cfg,
    "overrides": overrides_cfg
}
cfg = omegaconf.OmegaConf.create(cfg_dict)

# Build the 1-D dynamics model for this environment ...
dynamics_model = common_util.create_one_dim_tr_model(cfg, obs_shape, act_shape)

# ... and wrap it in a gym-like environment driven by the seeded torch generator.
model_env = modquad_ModelEnv.modquad_ModelEnv(env, dynamics_model, generator=generator)

# Replay buffer, pre-filled with every transition of the loaded simulation data.
replay_buffer = common_util.create_replay_buffer(cfg, obs_shape, act_shape, rng=rng)

for transition in replay_buffer_sim:
    obs, action, next_obs, reward, terminate, truncated = transition
    replay_buffer.add(obs, action, next_obs, reward, terminate, truncated)
print("# samples stored", replay_buffer.num_stored)

# CEM optimizer that generates candidate trajectories and chooses one.
cem_optimizer_cfg = {
    "_target_": "mbrl.planning.CEMOptimizer",
    "num_iterations": 5,
    "elite_ratio": 0.1,
    "population_size": 100,
    "alpha": 0.175,
    "device": device,
    "lower_bound": "???",
    "upper_bound": "???",
    "return_mean_elites": True,
    "clipped_normal": False
}

# Planning agent: evaluates many trajectories and picks the best one.
agent_cfg = omegaconf.OmegaConf.create({
    "_target_": "mbrl.planning.TrajectoryOptimizerAgent",
    "planning_horizon": 10,  # was 15 initially
    "replan_freq": 1,
    "verbose": True,
    "action_lb": "???",
    "action_ub": "???",
    "optimizer_cfg": cem_optimizer_cfg
})

agent = planning.create_trajectory_optim_agent_for_model(
    model_env,
    agent_cfg,
    num_particles=10
)


# samples stored 14478


Main experiment block 

In [31]:
# End the current simulation and reset the environment before the main
# experiment; the tuple returned by reset() is displayed as the cell output.
env.end_simulation()
env.reset()

Connected!
4 DoF [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
No propeller. Is it a magic box?


(array([-3.60691637e-08, -1.70620389e-08,  1.04601611e-07,  6.79286849e-09,
         1.04879405e-07,  2.00711838e-07, -8.13550650e-05, -6.94632308e-06,
         3.28326743e-04,  9.99999940e-01], dtype=float32),
 {})

In [33]:
# Per-epoch training diagnostics, appended to by train_callback.
train_losses = []
val_scores = []
# Containers reserved for timing measurements.
train_time = []
plan_time = []


def train_callback(_model, _total_calls, _epoch, tr_loss, val_score, _best_val):
    """Record the training loss and the mean validation score of one epoch.

    Only ``tr_loss`` and ``val_score`` are used; the remaining arguments are
    accepted so the function matches the trainer's callback signature.

    Args:
        tr_loss: training loss, stored as-is in ``train_losses``.
        val_score: validation score with one entry per ensemble model; its
            mean is stored as a Python scalar in ``val_scores``.
    """
    train_losses.append(tr_loss)
    # Collapse the per-ensemble-member scores into a single number.
    ensemble_mean = val_score.mean()
    val_scores.append(ensemble_mean.item())


# Create a trainer for the dynamics model (learning rate and weight decay are both 5e-5).
model_trainer = models.ModelTrainer(dynamics_model, optim_lr=5e-5, weight_decay=5e-5)
# NOTE(review): the result of this first call is discarded and the trajectory is
# initialised again below with different arguments — confirm this call is still
# needed (e.g. for a side effect on env) before removing it.
env.initialize_target_trajectory(traj = "random trajectory") 
# Keep the model environment's time step equal to the simulator's.
model_env.dt = env.dt

# Create visualization objects (currently disabled)
# fig, axs = plt.subplots(1, 2, figsize=(14, 3.75), gridspec_kw={"width_ratios": [1, 1]})
# ax_text = axs[0].text(300, 50, "")
    
# the states will encapsulate the desired trajectory

# Main PETS loop
# Reward log: starts with a 0 placeholder, one total reward is appended per trial.
all_rewards = [0]
# Generate the random target trajectory that is actually used by the experiment.
trajectory_length, total_time, pos_traj, orient_traj = env.initialize_target_trajectory(traj = "random trajectory", position_change_scale=0.5, num_waypoints=10)
# Trajectory parameters; per the author these are the default arguments used
# for a random trajectory:
    # start_pos=[0.0, 0.0, 2.0], start_yaw=0.0, 
    # start_vel=[0.0, 0.0, 0.0], start_yaw_rate=0.0,
    # position_change_scale=1.0, fixed_pos_change_dist=True,
    # orientation_change_scale=0.1,
    # std_velocity_change=0.0,
    # std_angular_velocity_change=0.0,
    # std_acceleration_change=0.0,
    # std_angular_acceleration_change=0.0,
    # num_waypoints=20, 
    # num_hover_points=3,
    # time_step_duration=20,
    # num_samples=3):
# print(trajectory_length, total_time,"\n", pos_traj[0],"\n", pos_traj[1], "\n", pos_traj[2])

# Hand the same trajectory to the model environment used for planning.
model_env.set_desired_trajectory(total_time, pos_traj, orient_traj)
# total_time *= 2
# logger._write_to_log("\nStarting main experiment loop...")

# Wall-clock seconds each trajectory setpoint stays active. Neither operand
# changes inside the loop, so the period is computed once.
setpoint_period = total_time / trajectory_length

for trial in range(num_trials):

    print("resetting environment, and starting trial :", trial)
    # `_info` instead of `_` so IPython's last-output shortcut is not clobbered.
    obs, _info = env.reset()

    try:
        agent.reset()

        terminated = False
        truncated = False
        total_reward = 0.0
        steps_trial = 0

        # Restart trajectory tracking at the first setpoint in both environments.
        model_env.trajectory_step = 0
        env.trajectory_step = 0
        env.update_setpoint(model_env.trajectory_step)
        time.sleep(0.5)

        # --------------- Model Training -----------------
        # The model is retrained once at the start of every trial on everything
        # currently stored in the replay buffer.
        # NOTE(review): the simulation is not paused while training runs (the
        # pause/resume calls were commented out) — confirm this is intended.
        print("Number of stored transitions: ", replay_buffer.num_stored)
        dynamics_model.update_normalizer(replay_buffer.get_all())  # update normalizer stats

        dataset_train, dataset_val = common_util.get_basic_buffer_iterators(
            replay_buffer,
            batch_size=cfg.overrides.model_batch_size,
            val_ratio=cfg.overrides.validation_ratio,
            ensemble_size=ensemble_size,
            shuffle_each_epoch=True,
            bootstrap_permutes=False,  # build bootstrap dataset using sampling with replacement
        )
        print("Training model")
        training_start_time = time.perf_counter()
        model_trainer.train(
            dataset_train,
            dataset_val=dataset_val,
            num_epochs=50,
            patience=50,
            callback=train_callback,
            silent=True)
        training_elapsed = time.perf_counter() - training_start_time
        train_time.append(training_elapsed)  # one entry per trial
        print("training loss: ", train_losses[-1])
        print("validation loss: ", val_scores[-1])
        print("training time: ", training_elapsed)
        print("Model trained")

        last_setpoint_set = time.perf_counter()

        while not (terminated or truncated):

            # Advance to the next setpoint once the current one has been active
            # for a full period of wall-clock time.
            if time.perf_counter() - last_setpoint_set > setpoint_period:
                model_env.trajectory_step += 1
                env.trajectory_step += 1
                last_setpoint_set = time.perf_counter()
                print("setpoint updated")
                env.update_setpoint(model_env.trajectory_step)

            # --- Doing env step using the agent and adding to model dataset ---
            print("planning for trajectory step: ", model_env.trajectory_step)
            planning_start_time = time.perf_counter()
            # Give the model environment the robot's current position before planning.
            model_env.current_pos = env.robot.get_position()
            new_obs, reward, terminated, truncated, _info = common_util.step_env_and_add_to_buffer(
                env, obs, agent, {}, replay_buffer)
            # The measured span covers both planning and the environment step.
            planning_elapsed = time.perf_counter() - planning_start_time
            plan_time.append(planning_elapsed)  # one entry per environment step
            print("planning time: ", planning_elapsed)

            obs = new_obs
            total_reward += reward
            steps_trial += 1
    finally:
        # Always end the simulation, including when the trial is interrupted
        # (e.g. KeyboardInterrupt) or a step raises.
        env.end_simulation()

    # Only completed trials contribute a reward entry.
    all_rewards.append(total_reward)


x_shape 57
trajectory shape (57, 12)
resetting environment, and starting trial : 0
Connected!
4 DoF [[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
No propeller. Is it a magic box?
setting setpoint to:  0.0 0.0 0.30000000000000004
Number of stored transitions:  14478
Training model
training loss:  -42.72999692430683
validation loss:  0.002649058820679784
training time:  90.44977402687073
Model trained
planning for trajectory step:  0
Planning time: 0.971
action:  [2.6594467  0.06729684 0.00788644 0.09116668] reward:  -0.29999999999491545
planning time:  0.9873785972595215
planning for trajectory step:  0
Planning time: 0.936
action:  [ 4.203225   -0.00855848 -0.01908811  0.12428128] reward:  -0.29999999999491406
planning time:  0.9528024196624756
planning for trajectory step:  0
Planning time: 1.043
action:  [4.2245407  0.01636336 0.06797079 0.10393912] reward:  -0.29999998759246393
planning time:  1.055736780166626
planning for trajectory step:  0
Planning time: 0.976
action:  [ 2.5998476   0.05

KeyboardInterrupt: 

In [142]:
# Print the total reward of every trial; the leading entry is the 0 placeholder
# the list was initialised with.
print(all_rewards)

[0, -103.48258039206779, -102.37088259030631, -106.88939530264432, -102.09854677835945, -104.85436910137211, -107.32944103239133, -105.96439655475936, -102.5801105437926, -104.80176441468943, -107.67793198904218, -101.0672106775768, -105.33021814558785, -101.97609834675292, -102.35064774465766, -102.62514999888869, -104.13621741183316, -102.50408697161838, -101.05234418820046, -105.67069207317091, -102.41506569947421]
