## Reward Model Training Example (Given an Already-Trained Policy Model)

In [1]:
import os
import sys

import numpy as np
import pandas as pd

import torch

# Resolve the directory this code runs from: the script's own folder when
# `__file__` is defined, otherwise (e.g. inside a notebook kernel) the current
# working directory.
current_directory = os.path.dirname(os.path.abspath(__file__)) if "__file__" in locals() else os.getcwd()
# the directory one level above it is the one that gets added to the import path
parent_directory = os.path.dirname(current_directory)
# Make modules under the parent directory importable. The membership guard keeps
# the cell idempotent: re-running it no longer stacks duplicate sys.path entries.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils import constants
from utils.dataset_loader import PolicyDatasetLoader

from optimization.updater import Updater
from optimization.functions import setup_config, get_directories, load_policy
from optimization.functions import find_indices_of_trajectory_changes, get_estimated_rewards

from models.policy_model import RobotPolicy
from models.reward_model import RewardFunction

In [2]:
# lift the column display limit so wide dataframes are shown without truncation
pd.set_option("display.max_columns", None)

# Initialization

In [3]:
# choose the compute device: CUDA when torch reports it as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Evaluating Device: ", device)

# build the hyperparameter configuration for the selected device
configs = setup_config(device=device)

# obtain the demonstration json paths and the results path for the chosen data folder
json_paths, results_path = get_directories(
    parent_directory=parent_directory,
    data_folder_name=constants.DEMO_COLLECTION_DATE,
)

Evaluating Device:  cpu
Current Time:  Feb_06_2024-10_59_08


In [4]:
# build the demonstration dataset (train and validation data) from the json paths
all_data = PolicyDatasetLoader(
    demo_data_json_paths=json_paths,
)



Number of Trajectories:  43
Each Trajectory Length:  30
Full Demo Dataset Size:  1379


In [5]:
# instantiate the policy network; every size, log-std bound and the device come from `configs`
policy_network = RobotPolicy(
    state_size=configs.state_size,
    hidden_size=configs.hidden_size,
    out_size=configs.action_size,
    log_std_min=configs.policy_log_std_min,
    log_std_max=configs.policy_log_std_max,
    log_std_init=configs.policy_log_std_init,
    device=configs.device,
)

In [6]:
# instantiate the reward network; input, hidden and output sizes and the device come from `configs`
reward_network = RewardFunction(
    state_action_size=configs.state_action_size,
    hidden_size=configs.hidden_size,
    out_size=configs.reward_size,
    device=configs.device,
)

In [7]:
# hand both networks and the configuration to the updater, then let it create its optimizers
updater = Updater(
    configs=configs,
    policy_network=policy_network,
    reward_network=reward_network,
)
updater.initialize_optimizers()

# Load Policy Model Parameters

In [8]:
# run folder that holds the saved policy parameters
# (layout: "results / policy_network_params / <policy_loading_folder_name>")
policy_loading_folder_name = "Feb_05_2024-16_45_05"
# file name of the saved policy parameters inside that folder
policy_params_name = "policy_network_epoch_100_loss_0_30367.pt"

In [9]:
# folder containing the trained policy parameters; it must already exist from a
# previous policy training run
policy_model_folder_path = os.path.join(
    results_path,
    "policy_network_params",
    policy_loading_folder_name,
)

In [10]:
# full path of the policy parameter file inside the folder selected above
policy_model_path = os.path.join(policy_model_folder_path, policy_params_name)

In [11]:
# load the saved parameters into the policy network and rebind the name to the returned model
policy_network = load_policy(
    policy_network=policy_network,
    model_path=policy_model_path,
)

In [12]:
# freeze the policy: switch off gradient tracking on every parameter ...
for param in policy_network.parameters():
    param.requires_grad_(False)
# ... and put the network into evaluation mode
policy_network = policy_network.eval()

# Training Reward Model with Each Trajectory

In [13]:
# collect the dataset indices at which a new trajectory begins
trajectory_indices = find_indices_of_trajectory_changes(
    dataset=all_data,
)

In [14]:
# index at which the trajectory of interest starts in the dataframe
traj_start_index = 0

# query the helper for the trajectory dataframe, the reward values on the demo and
# estimated data, and the averaged log-probability of the estimated actions
(
    traj_df,
    reward_values_demo_data,
    reward_values_estim_data,
    logprob_action_estim_avg,
) = get_estimated_rewards(
    configs=configs,
    updater_obj=updater,
    data_loader=all_data,
    policy_network=policy_network,
    reward_network=reward_network,
    trajectory_indices=trajectory_indices,
    traj_start_index=traj_start_index,
    is_inference_reward=False,
)

In [15]:
# TODO: the nu weight is fixed at zero for now; it will be updated later
nu_factor = torch.tensor(0.0)

In [16]:
# compute the IRL loss value for this particular trajectory
irl_train_loss = updater.calculate_irl_loss(
    demo_traj_reward=reward_values_demo_data,
    robot_traj_reward=reward_values_estim_data,
    log_probability=logprob_action_estim_avg,
    nu_factor=nu_factor,
)

In [17]:
# hand the loss to the updater for the backward pass and the reward optimizer step
updater.run_reward_optimizer(
    irl_loss=irl_train_loss,
)

In [18]:
# bare last expression: the notebook displays the IRL loss tensor computed above
irl_train_loss

tensor([[-1.3826]], dtype=torch.float64, grad_fn=<NegBackward0>)