## Behavior Cloning Policy Training Example

In [1]:
import os
import sys

import numpy as np
import pandas as pd

import torch

from tqdm import tqdm

# directory of this script when run as a file; a notebook kernel defines no
# __file__, so fall back to the current working directory
current_directory = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else os.getcwd()
# get the parent directory
parent_directory = os.path.dirname(current_directory)
# add the parent directory to sys.path only once, so that re-running this
# cell does not keep stacking duplicate entries
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils import common, constants
from utils.config import Config
from utils.dataset_loader import PolicyDatasetLoader

from models.policy_model import RobotPolicy

In [2]:
# working directory of the notebook, followed by the two directories above it
current_path = os.getcwd()
# the head returned by os.path.split is the containing directory
parent_path = os.path.split(current_path)[0]
grand_parent_path = os.path.split(parent_path)[0]

In [3]:
# "results" folder located two levels above the working directory
results_path = os.path.join(grand_parent_path, "results")

In [4]:
# "dataset" folder located two levels above the working directory
dataset_path = os.path.join(grand_parent_path, "dataset")
# sub-folder of the dataset that holds the human demonstrations
demo_path = os.path.join(dataset_path, "human_demonstrations")

In [5]:
# name of the collected dataset folder, i.e. the last component of
# dataset/human_demonstrations/<collection_date>
collection_date = "2024_01_23" # year_month_day

In [6]:
# demonstration folder that belongs to the selected collection date
dataset_folder = os.path.join(demo_path,
                              collection_date)

In [7]:
# list the demonstration files inside the "jsons" sub-folder; the path is built
# with os.path.join so it does not depend on the Windows "\\" separator, and the
# result is sorted because os.listdir returns entries in arbitrary order
json_files = sorted(os.listdir(os.path.join(dataset_folder, "jsons")))

In [8]:
# column names as defined in utils.constants
column_names = constants.COLUMN_NAMES

# Test: train the policy on a single demonstration file

In [9]:
# create the project configuration object
configs = Config()
# call the parameters method to set the parameters
# NOTE(review): presumably this populates the fields read below (batch_size,
# policy_lr, state_size, ...) - confirm in utils.config
configs.parameters()

Current Time:  Jan_27_2024-19_40_18


In [10]:
# train on the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Training Device: ", device)
# keep the selected device on the config object for later use
configs.device = device

Training Device:  cpu


In [11]:
# folder under the results directory that holds the policy network parameters
model_directory = os.path.join(results_path, "policy_network_params")

# create the folder if it is missing; exist_ok avoids the check-then-create
# race and keeps the cell safe to re-run
os.makedirs(model_directory, exist_ok=True)

In [12]:
# location used when saving model checkpoints; the exact layout below
# model_directory is decided by Config.model_saving_path (TODO confirm)
saving_path = configs.model_saving_path(directory=model_directory)

In [13]:
# full path of the first entry in json_files; every component is passed to
# os.path.join separately so the path is valid on any operating system
json_path = os.path.join(dataset_folder, "jsons", json_files[0])

In [14]:
# build the policy dataset from a single demonstration file (only json_path is passed)
training_data = PolicyDatasetLoader(demo_data_json_paths=[json_path])



Number of Trajectories:  1
Each Trajectory Length:  8
Full Demo Dataset Size:  8


In [15]:
# wrap the dataset in a PyTorch DataLoader; batch size, shuffling and the
# number of worker processes are all read from configs
torch_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=configs.batch_size,
                                           shuffle=configs.data_shuffle,
                                           num_workers=configs.num_workers)
# show the loader object as the cell output
torch_loader

<torch.utils.data.dataloader.DataLoader at 0x2782aca6770>

In [16]:
# build the policy network; layer sizes, standard deviation bounds and the
# device are all read from configs
policy_network = RobotPolicy(state_size=configs.state_size,
                             hidden_size=configs.hidden_size,
                             out_size=configs.action_size,
                             std_min=configs.policy_std_min,
                             std_max=configs.policy_std_max,
                             device=configs.device)
# show the module structure as the cell output
policy_network

RobotPolicy(
  (backbone): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
  )
  (policy_mu): Sequential(
    (0): Linear(in_features=64, out_features=3, bias=True)
    (1): Tanh()
  )
  (policy_log_std): Sequential(
    (0): Linear(in_features=64, out_features=3, bias=True)
    (1): ReLU()
  )
)

In [17]:
# adam optimizer
optimizer = torch.optim.Adam(policy_network.parameters(),
                             lr=configs.policy_lr)

In [18]:
# number of passes over the training data in the loop below
num_epochs = 2

In [19]:
# Behavior cloning by maximum likelihood: for every batch, minimise the negative
# log-likelihood of the demonstrated actions under the policy distribution, then
# save the network weights once per epoch.
for epoch in range(num_epochs):

    # loss of the most recent batch; stays None when the loader yields nothing
    loss_value = None

    # loop through each batch inside the dataset
    for sample in tqdm(torch_loader):

        # get batch of data
        # NOTE(review): the batch is used on whatever device the loader returns
        # it on - confirm that this matches configs.device when training on GPU
        input_state, output_action = sample

        # forward pass to get the parameters of the Gaussian distribution; the
        # module is called directly (not .forward) so registered hooks still run
        action_pred, action_std = policy_network(x=input_state)
        action_prob, action_dist = policy_network.calculate_distribution(action_mu=action_pred,
                                                                         action_std=action_std)

        # compute negative log-likelihood loss value for maximum likelihood estimation
        loss_nll = - action_dist.log_prob(output_action).sum(axis=-1)
        batch_loss = loss_nll.mean()

        # backward pass and optimization
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_value = round(batch_loss.item(), 5)

        # tqdm.write prints the message without breaking the progress bar
        tqdm.write(f"Epoch {epoch + 1}/{num_epochs}, Batch Loss: {loss_value}")

    # without any batch there is no loss to report; fail with a clear message
    # instead of a NameError or a filename built from a previous run's value
    if loss_value is None:
        raise RuntimeError("torch_loader produced no batches; nothing to train on")

    # the filename carries the loss of the last batch, with "." replaced so the
    # value does not look like a file extension
    loss_value_str = str(loss_value).replace(".", "_")

    # save the action prediction model after each epoch
    filename = f"policy_network_epoch_{epoch + 1}_loss_{loss_value_str}.pt"
    torch.save(obj=policy_network.state_dict(),
               f=os.path.join(saving_path, filename))

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 221.55it/s]


Epoch 1/2, Batch Loss: 2.95097
Epoch 1/2, Batch Loss: 2.94773
Epoch 1/2, Batch Loss: 2.98054
Epoch 1/2, Batch Loss: 2.9507


100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 439.47it/s]

Epoch 2/2, Batch Loss: 2.93442
Epoch 2/2, Batch Loss: 2.93531
Epoch 2/2, Batch Loss: 2.94034
Epoch 2/2, Batch Loss: 2.87056



