In [1]:
import os
import numpy as np
import pandas as pd 

from stable_baselines3 import PPO, SAC, TD3, DDPG, A2C, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import BaseCallback
from gym_donkeycar.envs.donkey_env import DonkeyEnv
from stable_baselines3.common.callbacks import CheckpointCallback, CallbackList

import sys

from torch.utils.data.dataset import Dataset, random_split

import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

import gym

path = r"c:\Users\spige\memoire\gym-donkeycar-retry\gym-donkeycar\CarConsumptionModel"
sys.path.insert(0, path)

from donkey_environment.ConsumptionWrapper import ConsumptionWrapper
from utils.callbacks import CustomProgressBarCallback
from utils.ExpertDataset import ExpertDataSet





In [3]:
#! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# print(th.version.cuda)
# print(th.cuda.is_available())

11.8
True


### creation of the environment

In [2]:
env = ConsumptionWrapper(level="steep-ascent")

env.close()

starting DonkeyGym env
Setting default: start_delay 0.0
Setting default: max_cte 8.0
Setting default: frame_skip 1
Setting default: cam_resolution (120, 160, 3)
Setting default: log_level 20
Setting default: host localhost
Setting default: port 9091
Setting default: steer_limit 1.0
Setting default: throttle_min -1.0
Setting default: throttle_max 1.0
{'level': 'steep-ascent', 'start_delay': 0.0, 'max_cte': 8.0, 'frame_skip': 1, 'cam_resolution': (120, 160, 3), 'log_level': 20, 'host': 'localhost', 'port': 9091, 'steer_limit': 1.0, 'throttle_min': -1.0, 'throttle_max': 1.0}


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


### normalization methods

In [7]:
def normalize_actions(action: np.ndarray) -> np.ndarray:
     # [-1, -1], [1,1]
    
    low, high = np.array([-1, -1]), np.array([1, 1])
    predicted_action = 2 * (action - low) / (high - low) - 1
    return predicted_action

def denormalize_actions(normalized_action: np.ndarray) -> np.ndarray:
    low, high = np.array([-1, -1]), np.array([1, 1])
    denormalized_action = (normalized_action + 1) / 2 * (high - low) + low
    return denormalized_action


# Example action
action = np.array([0.5, 0.5])
# convert to tensor
action = th.tensor(action)

# Normalize the action
normalized_action = normalize_actions(action)
print("Normalized Action:", normalized_action)

# Denormalize the action
denormalized_action = denormalize_actions(normalized_action)
print("Denormalized Action:", denormalized_action)

Normalized Action: tensor([0.5000, 0.5000], dtype=torch.float64)
Denormalized Action: tensor([0.5000, 0.5000], dtype=torch.float64)


### Load observation_dataset

In [4]:
expert_dataset = ExpertDataSet()
expert_dataset.load_dataset("../data/rollout/dataset_clamped.npz")

print(f" observation shape : {expert_dataset.observations.shape} \n")
print(f" action shape : {expert_dataset.actions.shape} \n")

def transform_observation(obs: np.ndarray) -> np.ndarray:
    # transform (1332, 120, 160, 3) to (1332, 3, 120, 160)
    transformed_obs = np.transpose(obs, (0, 3, 1, 2))
    return transformed_obs

expert_dataset.observations = transform_observation(expert_dataset.observations)
train_size = int(0.8 * len(expert_dataset))

test_size = len(expert_dataset) - train_size

train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

print(f"Train dataset size: {train_size} \n"
        f"Test dataset size: {test_size}")


KeyboardInterrupt: 

### Pretraining script

In [7]:
clone_ppo_model = A2C("CnnPolicy", env, verbose=1, seed=42)
EPOCHS = 1000

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




In [12]:
def pretrain_agent(
    student,
    batch_size=64,
    epochs=1000,
    scheduler_gamma=0.7,
    learning_rate=1.0,
    log_interval=100,
    no_cuda=True,
    seed=1,
    test_batch_size=64,
    csv_file="../data/RL/pretraining.csv",
    test_file="../data/RL/pretraining_test.csv",
    loss_file="../data/RL/pretraining_loss.csv",
    test_loss_file="../data/RL/pretraining_test_loss.csv",
    ppo_model=clone_ppo_model,
):
    use_cuda = not no_cuda and th.cuda.is_available()
    th.manual_seed(seed)
    device = th.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    print(f"Using device {device}") 
    
    criterion = nn.L1Loss()
    #criterion = nn.MSELoss()

    # Extract initial policy
    model = student.policy.to(device)

    file = open(csv_file, "w")
    file.write("observed_steering,observed_throttle,target_steering,target_throttle\n")

    test_file = open(test_file, "w")
    test_file.write("observed_steering,observed_throttle,target_steering,target_throttle\n")

    file_loss = open(loss_file, "w")
    file_loss.write("loss\n")

    test_file_loss = open(test_loss_file, "w")
    test_file_loss.write("loss\n")

    def train(model, device, train_loader, optimizer):

        model.train()

        for batch_idx, (current_observation, target_action) in enumerate(train_loader):
            current_observation, target_action = current_observation.to(device), target_action.to(device)
            optimizer.zero_grad()

            if isinstance(env.action_space, gym.spaces.Box):
                # A2C/PPO policy outputs actions, values, log_prob
                # SAC/TD3 policy outputs actions only
                if isinstance(student, (A2C, PPO)):
                    action, _, _ = model(current_observation)
                else:
                    # SAC/TD3:
                    action = model(current_observation)
                action_prediction = action.double()
            else:
                # Retrieve the logits for A2C/PPO when using discrete actions
                dist = model.get_distribution(current_observation)
                action_prediction = dist.distribution.logits
                target_action = target_action.long()

            loss = criterion(action_prediction, target_action)
            file_loss.write(f"{loss.item()}\n")
            loss.backward()
            optimizer.step()
            for observed_action, target in zip(action_prediction, target_action):
                # extract numpy scalars from PyTorch tensors
                observed_action = observed_action.cpu().detach().numpy()
                target = target.cpu().detach().numpy()

                observed_action = ",".join(map(str, observed_action))
                target = ",".join(map(str, target))
                
                file.write(f"{observed_action},{target}\n")

            
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(current_observation),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )

    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with th.no_grad():
            for current_observation, target_action in test_loader:
                current_observation, target_action = current_observation.to(device), target_action.to(device)

                if isinstance(env.action_space, gym.spaces.Box):
                    # A2C/PPO policy outputs actions, values, log_prob
                    # SAC/TD3 policy outputs actions only
                    if isinstance(student, (A2C, PPO)):
                        action, _, _ = model(current_observation)
                    else:
                        # SAC/TD3:
                        action = model(current_observation)
                    action_prediction = action.double()
                else:
                    # Retrieve the logits for A2C/PPO when using discrete actions
                    dist = model.get_distribution(current_observation)
                    action_prediction = dist.distribution.logits
                    target_action = target_action.long()
                
                test_loss = criterion(action_prediction, target_action)
                for observed_action, target in zip(action_prediction, target_action):
                    # extract numpy scalars from PyTorch tensors
                    observed_action = observed_action.cpu().detach().numpy()
                    target = target.cpu().detach().numpy()

                    observed_action = ",".join(map(str, observed_action))
                    target = ",".join(map(str, target))
                    
                    test_file.write(f"{observed_action},{target}\n")
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss}")
        test_file_loss.write(f"{test_loss}\n")

    # Here, we use PyTorch `DataLoader` to our load previously created `ExpertDataset` for training
    # and testing

    train_loader = th.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = th.utils.data.DataLoader(
        dataset=test_expert_dataset,
        batch_size=test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=scheduler_gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

        scheduler.step()

        model_name = clone_ppo_model.__class__.__name__
        # save the model policy
        th.save(model.state_dict(), f"../models/pretrained_{model_name}_{epoch}.pt")
        ppo_model.policy.load_state_dict(model.state_dict())    

        ppo_model.save(f"../models/pretrained_{model_name}_{epoch}")

    file.close()
    file_loss.close()
    test_file_loss.close()

    # Implant the trained policy network back into the RL student agent
    return model

In [13]:
directory = "../data/RL/"

# get name of "clone_ppo_model" instance

name = clone_ppo_model.__class__.__name__

directory = directory + name + "_"

policy_model = pretrain_agent(
    student = clone_ppo_model,
    epochs = EPOCHS,
    scheduler_gamma=0.7,
    learning_rate=1.0,
    log_interval=100,
    no_cuda=True,
    seed=1,
    batch_size=64,
    test_batch_size=64,
    ppo_model=clone_ppo_model,
    csv_file=directory + "pretraining.csv",
    test_file=directory + "pretraining_test.csv",
    loss_file=directory + "pretraining_loss.csv",
    test_loss_file=directory + "pretraining_test_loss.csv"
)

Using device cpu
Test set: Average loss: 9.829241286528513e-07


KeyboardInterrupt: 

In [6]:
print(env.action_space.low)
print(env.action_space.high)
env.close()

[-1.  0.]
[1. 1.]
