In [1]:
import os
import sys

import numpy as np
import pandas as pd

from tqdm import tqdm

import torch
import random

import seaborn as sns
import matplotlib.pyplot as plt

# get the current script's directory
current_directory = os.path.dirname(os.path.abspath(__file__)) if "__file__" in locals() else os.getcwd()
# get the parent directory
parent_directory = os.path.dirname(current_directory)
# add the parent directory to the sys.path
sys.path.append(parent_directory)

from optimization import functions
from optimization.updater import Updater

from utils import constants, common
from utils.config import Config
from utils.dataset_loader import PolicyDatasetLoader

from models.policy_model import RobotPolicy
from models.reward_model import RewardFunction

from environment.environment import RobotEnvironment
from environment.buffer import ReplayBuffer

In [2]:
mean_loss_reward, mean_loss_policy = [], []

EPISODES_TO_PLAY = 10
REWARD_FUNCTION_UPDATE = 5
DEMO_BATCH = 2 # 256
NUM_EPOCHS = 50

In [3]:
pd.set_option("display.max_columns", None)

current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
grand_parent_path = os.path.dirname(parent_path)

results_path = os.path.join(grand_parent_path, "results")

In [4]:
configs = Config()
# call the parameters method to set the parameters
configs.parameters()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Training Device: ", device)
configs.device = device

configs = functions.setup_config(device=device)

random.seed(configs.seed)
np.random.seed(configs.seed)
torch.manual_seed(configs.seed)

Current Time:  Feb_15_2024-18_57_14
Training Device:  cpu
Current Time:  Feb_15_2024-18_57_14


<torch._C.Generator at 0x2a1b4a80810>

In [5]:
policy_saving_path, reward_saving_path = functions.create_directories(configs=configs,
                                                                      results_path=results_path,
                                                                      saving_policy=False,
                                                                      saving_reward=False)

json_paths_train, results_path = functions.get_directories(parent_directory=parent_directory,
                                                           data_folder_name=constants.DEMO_COLLECTION_DATE) # DEMO_COLLECTION_DATE, TEST_COLLECTION_DATE

training_data = PolicyDatasetLoader(demo_data_json_paths=json_paths_train)

train_loader = torch.utils.data.DataLoader(training_data,
                                           batch_size=configs.batch_size,
                                           shuffle=configs.data_shuffle,
                                           num_workers=configs.num_workers)

trajectory_indices = functions.find_indices_of_trajectory_changes(dataset=training_data)



Number of Trajectories:  43
Each Trajectory Length:  20
Full Demo Dataset Size:  879


In [6]:
policy_network = RobotPolicy(state_size=configs.state_size,
                             hidden_size=configs.hidden_size,
                             out_size=configs.action_size,
                             log_std_min=configs.policy_log_std_min,
                             log_std_max=configs.policy_log_std_max,
                             log_std_init=configs.policy_log_std_init,
                             device=configs.device)
reward_network = RewardFunction(state_action_size=configs.state_action_size,
                                hidden_size=configs.hidden_size,
                                out_size=configs.reward_size,
                                device=configs.device)

updater_obj = Updater(configs=configs,
                      policy_network=policy_network,
                      reward_network=reward_network)
updater_obj.initialize_optimizers()

env = RobotEnvironment()
env.set_reward_network(updater_obj.reward_network)

env.is_reward_inference = False

# Functions

In [7]:
def get_cumulative_rewards(rewards, gamma=0.99):
    G = torch.zeros_like(rewards, dtype=torch.float64)

    G[-1] = rewards[-1].clone()

    for idx in range(len(rewards) - 2, -1, -1):
        G[idx] = rewards[idx] + gamma * G[idx + 1]

    return G

In [8]:
def policy_gradient_loss(cumulative_log_probs,
                         advantages,
                         entropy_weight=1e-2):
    
    # negative log-likelihood multiplied by rewards)
#     weighted_log_probs = torch.sum(cumulative_log_probs, dim=1) * advantages.squeeze()
    weighted_log_probs = cumulative_log_probs * advantages
    policy_loss = - torch.mean(weighted_log_probs, dim=0).mean()
    
#     print("shape 1: ", torch.sum(cumulative_log_probs, dim=1).shape)
#     print("shape 2: ", advantages.shape)
    
#     print("sum cumulative_log_probs : ", torch.sum(cumulative_log_probs, dim=1))
#     print("advantages : ", advantages)
#     print("summed cumulative_log_probs : ", torch.sum(cumulative_log_probs, dim=1) * advantages.squeeze())
    
#     print("weighted_log_probs")
#     print(weighted_log_probs, weighted_log_probs.shape)
#     print("torch.mean(weighted_log_probs) : ", torch.mean(weighted_log_probs))
    
    # entropy regularization
    entropy = - torch.sum(torch.exp(cumulative_log_probs) * cumulative_log_probs, dim=1).mean()

#     print("cumulative_log_probs : ", cumulative_log_probs)
    loss = policy_loss + entropy_weight * entropy
    
    print("cumulative_log_probs : ",cumulative_log_probs)
    print("policy_loss : ", policy_loss)
    print("entropy: ", entropy)
    print("total loss : ", loss)
    
    return loss

In [21]:
def generate_session(t_max,
                     agent,
                     is_policy_inference): 
    
    states, traj_log_probs, actions, rewards = [], [], [], []
    
    state = env.reset()
    
    for t in range(t_max):
        action, action_log_prob = agent.get_action(state)
        
#         print("state : ", state)
#         print(" action : ", action)
#         print(" action : ", action.squeeze(0))
#         print(" action_log_prob : ", action_log_prob.squeeze(0))
        
        next_state, reward, done = env.step(state=state,
                                            action=torch.tensor(action.squeeze(0)))
        
        states.append(state.clone())
        actions.append(torch.tensor(action.squeeze(0)).clone())
        traj_log_probs.append(action_log_prob.squeeze(0).clone())
        rewards.append(reward.clone())
        
        state = next_state.detach()
        
        if done:  
            break
    
    return states, actions, traj_log_probs, rewards, None

In [10]:
def preprocess_traj(traj_list_,
                    step_tensor,
                    is_Demo=False):
    for traj_df in traj_list_:
        
        if is_Demo:
            states = torch.tensor(traj_df[["state_label_norm_1", "state_label_norm_2", "state_label_norm_3"]].values)
            actions = torch.tensor(traj_df[["action_label_norm_1", "action_label_norm_2", "action_label_norm_3"]].values)
            log_probs = torch.tensor(np.zeros((actions.shape[0], 1)))
        
        else:
            states = torch.stack(traj_df[0])
            actions = torch.stack(traj_df[1])
            log_probs = torch.stack(traj_df[3])
        
        mdp = torch.cat((states, log_probs, actions), dim=1)
        step_tensor = torch.cat((step_tensor.clone(), mdp.clone()), dim=0)
    
    return step_tensor

# Tests

In [11]:
# capacity = 100
# replay_buffer = ReplayBuffer(capacity)
# for i in range(NUM_EPOCHS):
#     print("Epoch : ", i)
#     for _ in range(EPISODES_TO_PLAY):
#         samp_trajs = generate_session(t_max=constants.TRAJECTORY_SIZE,
#                                       updater_obj=updater_obj,
#                                       replay_buffer=replay_buffer,
#                                       is_policy_inference=False,
#                                       is_policy_update=True)

In [12]:
# sampled_states, sampled_actions, sampled_rewards, sampled_next_states, sampled_dones, sampled_probs = \
#     replay_buffer.sample_trajectory()

# DDPG

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Critic(nn.Module):

    def __init__(self, obs_dim=3, action_dim=3):
        super(Critic, self).__init__()

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.linear1 = nn.Linear(3, 64)
        self.linear2 = nn.Linear(64 + 3, 32)
        self.linear3 = nn.Linear(32, 32)
        self.linear4 = nn.Linear(32, 1)

    def forward(self, x, a):
        x = F.relu(self.linear1(x))
        xa_cat = torch.cat([x,a], 1)
        xa = F.relu(self.linear2(xa_cat))
        xa = F.relu(self.linear3(xa))
        qval = self.linear4(xa)

        return qval

class Actor(nn.Module):

    def __init__(self, obs_dim=3, action_dim=3):
        super(Actor, self).__init__()

        self.obs_dim = obs_dim
        self.action_dim = action_dim

        self.linear1 = nn.Linear(3, 64)
        self.linear2 = nn.Linear(64, 64)
        self.linear3 = nn.Linear(64, 3)

    def forward(self, obs):
        x = F.relu(self.linear1(obs))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)

        return x
    
    def estimate_action(self, obs):
        action_mean = self.forward(obs)
        
        action_distribution = torch.distributions.Normal(action_mean, torch.ones_like(action_mean))
        
        sampled_action = action_distribution.rsample()
        log_prob = action_distribution.log_prob(sampled_action)
        
        return torch.tanh(sampled_action), log_prob

In [14]:
class OUNoise(object):
    def __init__(self, mu=0.0, theta=0.15, max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu           = mu
        self.theta        = theta
        self.sigma        = max_sigma
        self.max_sigma    = max_sigma
        self.min_sigma    = min_sigma
        self.decay_period = decay_period
        self.action_dim   = 3
        self.low          = -1
        self.high         = +1
        self.reset()
        
    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu
        
    def evolve_state(self):
        x  = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state
    
#     def get_action(self, action, t=0):
#         ou_state = self.evolve_state()
#         self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * min(1.0, t / self.decay_period)
#         return np.clip(action + ou_state, self.low, self.high)

In [15]:
import torch
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F


class DDPGAgent:
    
    def __init__(self, env, gamma, tau, buffer_maxlen, critic_learning_rate, actor_learning_rate):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = env
        self.obs_dim = 3
        self.action_dim = 3
        
        # hyperparameters
        self.env = env
        self.gamma = gamma
        self.tau = tau
        
        # initialize actor and critic networks
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)
        
        self.actor = Actor(self.obs_dim, self.action_dim).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim).to(self.device)
    
        # Copy critic target parameters
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        
        # optimizers
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer  = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
    
        self.replay_buffer = ReplayBuffer(buffer_maxlen)      
        self.noise = OUNoise()
        
    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
#         action = self.actor.forward(state)
        action, log_prob = self.actor.estimate_action(state)
    
        action = action.cpu().detach().numpy()

        return action, log_prob
    
    def update(self, batch_size):
        
        states, actions, rewards, next_states, _, _ = self.replay_buffer.sample_batch(batch_size)
        
        state_batch, action_batch, reward_batch, next_state_batch, masks, _ = self.replay_buffer.sample_batch(batch_size)
        
#         print("masks : ", masks.int())
        
        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks.float()).to(self.device)
   
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        expected_Q = reward_batch + self.gamma * next_Q
        
        # update critic
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())

        self.critic_optimizer.zero_grad()
        q_loss.backward() 
        self.critic_optimizer.step()

        # update actor
#         policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()
        policy_loss = -self.critic.forward(state_batch, self.actor.estimate_action(state_batch)[0]).mean()
        
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # update target networks 
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
       
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

In [23]:
def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
    episode_rewards = []

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        
        for step in range(max_steps):
#             action = torch.tensor(agent.get_action(state)[0])
            print("state : ", )
            print("agent.get_action(state)[0] : ", agent.get_action(state)[0])
            next_state, reward, done = env.step(state, torch.tensor(agent.get_action(state)[0].squeeze(0)))
#             print("action : ", action)
#             print("next_state : ", next_state)
#             print("reward : ", reward)
#             print("done : ", done)
            agent.replay_buffer.push(state, torch.tensor(agent.get_action(state)[0].squeeze(0)), reward, next_state, done, torch.tensor([]))
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)   

            if done or step == max_steps-1:
                episode_rewards.append(episode_reward)
                print("Episode " + str(episode) + ": " + str(episode_reward))
                break

            state = next_state

    return episode_rewards

In [17]:
D_demo, D_samp = torch.tensor([]), torch.tensor([])

max_episodes = 100
max_steps = 500
batch_size = 32

gamma = 0.99
tau = 1e-2
buffer_maxlen = 100000
critic_lr = 1e-3
actor_lr = 1e-3

In [18]:
env = RobotEnvironment()
env.set_reward_network(reward_network)
env.is_reward_inference = False

agent = DDPGAgent(env, gamma, tau, buffer_maxlen, critic_lr, actor_lr)

# episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size)

In [19]:
demo_traj_list = []

for traj_start_index in range(len(trajectory_indices)):
    
    traj_df, reward_values_demo_data, reward_values_estim_data, logprob_action_estim_avg = \
        functions.get_estimated_rewards(configs=configs,
                                        updater_obj=updater_obj,
                                        data_loader=training_data,
                                        policy_network=updater_obj.policy_network,
                                        reward_network=updater_obj.reward_network,
                                        trajectory_indices=trajectory_indices,
                                        traj_start_index=traj_start_index,
                                        is_inference_reward=True,
                                        is_inference_policy=True)
    demo_traj_list.append(traj_df)
    del traj_df

D_demo = preprocess_traj(traj_list_=demo_traj_list,
                         step_tensor=D_demo,
                         is_Demo=True)

state :  tensor([[0.1220, 0.4467, 0.2398]])
action_mu, action_std :  tensor([[ 0.0338,  0.0361, -0.0228]]) tensor([[1.0837, 1.0412, 1.0278]])
action_dist.mean :  tensor([[ 0.0338,  0.0361, -0.0228]])
action_dist.stddev :  tensor([[1.0837, 1.0412, 1.0278]])
action_sample :  tensor([[ 0.0338,  0.0361, -0.0228]])
action_log_prob :  tensor([[-0.9993, -0.9593, -0.9464]])
state :  tensor([[0.2116, 0.4693, 0.3819]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0946, 1.0480, 1.0302]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0946, 1.0480, 1.0302]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9487]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

action_mu, action_std :  tensor([[ 0.0326,  0.0357, -0.0224]]) tensor([[1.0820, 1.0392, 1.0271]])
action_dist.mean :  tensor([[ 0.0326,  0.0357, -0.0224]])
action_dist.stddev :  tensor([[1.0820, 1.0392, 1.0271]])
action_sample :  tensor([[ 0.0326,  0.0357, -0.0224]])
action_log_prob :  tensor([[-0.9978, -0.9574, -0.9457]])
state :  tensor([[0.2110, 0.4693, 0.3820]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0946, 1.0480, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0946, 1.0480, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9487]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.01

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.1392, 0.4837, 0.3045]])
action_mu, action_std :  tensor([[ 0.0353,  0.0372, -0.0221]]) tensor([[1.0882, 1.0454, 1.0296]])
action_dist.mean :  tensor([[ 0.0353,  0.0372, -0.0221]])
action_dist.stddev :  tensor([[1.0882, 1.0454, 1.0296]])
action_sample :  tensor([[ 0.0353,  0.0372, -0.0221]])
action_log_prob :  tensor([[-1.0034, -0.9634, -0.9481]])
state :  tensor([[0.2126, 0.4695, 0.3821]])
action_mu, action_std :  tensor([[ 0.0315,  0.0386, -0.0182]]) tensor([[1.0947, 1.0480, 1.0302]])
action_dist.mean :  tensor([[ 0.0315,  0.0386, -0.0182]])
action_dist.stddev :  tensor([[1.0947, 1.0480, 1.0302]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.1385, 0.4840, 0.3006]])
action_mu, action_std :  tensor([[ 0.0353,  0.037

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2137, 0.4719, 0.3840]])
action_mu, action_std :  tensor([[ 0.0316,  0.0385, -0.0181]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0385, -0.0181]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0385, -0.0181]])
action_log_prob :  tensor([[-1.0095, -0.9659, -0.9488]])
state :  tensor([[0.2118, 0.4694, 0.3835]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.1575, 0.5031, 0.3265]])
action_mu, action_std :  tensor([[ 0.0357,  0.0373, -0.0205]]) tensor([[1.0900, 1.0463, 1.0299]])
action_dist.mean :  tensor([[ 0.0357,  0.0373, -0.0205]])
action_dist.stddev :  tensor([[1.0900, 1.0463, 1.0299]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
actio

action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_dist.stddev :  tensor([[1.0947, 1.0481, 1.0303]])
action_sample :  tensor([[ 0.0316,  0.0386, -0.0183]])
action_log_prob :  tensor([[-1.0094, -0.9659, -0.9488]])
state :  tensor([[0.2117, 0.4693, 0.3834]])
action_mu, action_std :  tensor([[ 0.0316,  0.0386, -0.0183]]) tensor([[1.0947, 1.0481, 1.0303]])
action_dist.mean :  tensor([[ 0.0316,  0.0386, -0.0183]])
actio

In [24]:
for i in range(NUM_EPOCHS):
    print("Epoch : ", i)
    
    samp_trajs = [generate_session(t_max=max_steps,
                                   agent=agent,
                                   is_policy_inference=False) for _ in range(EPISODES_TO_PLAY)]
    
    D_samp = preprocess_traj(traj_list_=samp_trajs,
                             step_tensor=D_samp,
                             is_Demo=False)
    
    loss_reward, loss_policy = [], []
    samp_reward_values = []
    is_early_stop = False
    
    for _ in range(REWARD_FUNCTION_UPDATE):
        
        selected_samp = np.random.choice(len(D_samp), DEMO_BATCH, replace=True)
        selected_demo = np.random.choice(len(D_demo), DEMO_BATCH, replace=True)
        
        D_s_samp = D_samp[selected_samp].clone().detach()
        D_s_demo = D_demo[selected_demo].clone().detach()
        
        D_s_samp = torch.cat((D_s_demo, D_s_samp), dim=0)
        D_sr_samp = D_s_samp[torch.randperm(int(D_s_samp.size(0)))]
        
        states_robot, log_probs_robot, actions_robot = D_sr_samp[:, :3], D_sr_samp[:, 3:4], D_sr_samp[:, 4:]
        states_expert, actions_expert = D_s_demo[:, :3], D_s_demo[:, 4:]
        
        samp_rewards = updater_obj.reward_network.estimate_reward(state_action=states_robot.float(), is_inference=False)
        demo_rewards = updater_obj.reward_network.estimate_reward(state_action=states_expert.float(), is_inference=False)
        
        loss_IOC = - torch.mean(demo_rewards) + \
            torch.log(torch.mean(torch.exp(samp_rewards) / (torch.exp(log_probs_robot) + 1e-7)))
        
        print("loss_IOC : ", loss_IOC.detach().numpy())
        
        updater_obj.run_reward_optimizer(irl_loss=loss_IOC)
        
        loss_reward.append(loss_IOC.detach().item())
        
        if torch.mean(demo_rewards) >= 0.99:
            is_early_stop = True
    
    episode_rewards = mini_batch_train(env, agent, max_episodes, max_steps, batch_size)

Epoch :  0
loss_IOC :  -0.2266782596314631
loss_IOC :  -0.22670409902744226
loss_IOC :  -0.22665104833641708
loss_IOC :  -0.22661380200170111
loss_IOC :  -0.22616552736715967
state : 
agent.get_action(state)[0] :  [[-0.49455532 -0.6351016  -0.70126   ]]
state : 
agent.get_action(state)[0] :  [[0.6234666  0.41434664 0.87598324]]
state : 
agent.get_action(state)[0] :  [[0.8159721  0.9538097  0.30330393]]
state : 
agent.get_action(state)[0] :  [[-0.5497063  -0.45127332  0.43321475]]
state : 
agent.get_action(state)[0] :  [[ 0.84627545  0.02798351 -0.0808639 ]]
state : 
agent.get_action(state)[0] :  [[ 0.86010545 -0.6220966   0.7851015 ]]
state : 
agent.get_action(state)[0] :  [[-0.80227643 -0.3762393   0.75453216]]
state : 
agent.get_action(state)[0] :  [[-0.91654336  0.85077244  0.441727  ]]
state : 
agent.get_action(state)[0] :  [[-0.91843575  0.9002074   0.55702996]]
state : 
agent.get_action(state)[0] :  [[ 0.63082296 -0.1797663  -0.89138204]]
state : 
agent.get_action(state)[0] :  [[

state : 
agent.get_action(state)[0] :  [[-0.5814748 -0.936389  -0.3419835]]
state : 
agent.get_action(state)[0] :  [[ 0.56501746  0.6202943  -0.78273726]]
state : 
agent.get_action(state)[0] :  [[ 0.82808226 -0.9014816   0.9813825 ]]
state : 
agent.get_action(state)[0] :  [[-0.6406293  -0.8229641  -0.05245674]]
state : 
agent.get_action(state)[0] :  [[ 0.7280882  -0.30896103  0.27444246]]
Episode 32: tensor([3.0699])
state : 
agent.get_action(state)[0] :  [[-0.26222885  0.88440794  0.9855776 ]]
Episode 33: tensor([0.5142])
state : 
agent.get_action(state)[0] :  [[-0.1531474  -0.22357629  0.8185562 ]]
state : 
agent.get_action(state)[0] :  [[-0.47945678 -0.71580577 -0.57525605]]
state : 
agent.get_action(state)[0] :  [[-0.6214455  -0.26292375  0.73100704]]
Episode 34: tensor([1.5384])
state : 
agent.get_action(state)[0] :  [[-0.89882743  0.39898375  0.7879453 ]]
state : 
agent.get_action(state)[0] :  [[ 0.24386272  0.9156174  -0.62887263]]
state : 
agent.get_action(state)[0] :  [[0.1627

Episode 5: tensor([0.5131])
state : 
agent.get_action(state)[0] :  [[ 0.9944286  -0.999712    0.99971646]]
Episode 6: tensor([0.5123])
state : 
agent.get_action(state)[0] :  [[ 0.9999672  -0.99670464  0.9998429 ]]
Episode 7: tensor([0.5125])
state : 
agent.get_action(state)[0] :  [[ 0.9979926 -0.9922857  0.9848746]]
Episode 8: tensor([0.5124])
state : 
agent.get_action(state)[0] :  [[ 0.9953716 -0.9953525  0.9998549]]
Episode 9: tensor([0.5120])
state : 
agent.get_action(state)[0] :  [[ 0.9998727  -0.99875665  0.99409807]]
Episode 10: tensor([0.5128])
state : 
agent.get_action(state)[0] :  [[ 0.99985707 -0.9929304   0.9060419 ]]
Episode 11: tensor([0.5128])
state : 
agent.get_action(state)[0] :  [[ 0.9829028  -0.99565136  0.993939  ]]
Episode 12: tensor([0.5126])
state : 
agent.get_action(state)[0] :  [[ 0.99940103 -0.9956479   0.9999912 ]]
Episode 13: tensor([0.5131])
state : 
agent.get_action(state)[0] :  [[ 0.99904734 -0.99989694  0.9983335 ]]
Episode 14: tensor([0.5121])
state : 
a

Episode 7: tensor([0.5108])
state : 
agent.get_action(state)[0] :  [[ 0.99979097 -0.99630815  0.9999737 ]]
Episode 8: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9999291  -0.9999796   0.99953705]]
Episode 9: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9993543 -0.999166   0.9984508]]
Episode 10: tensor([0.5114])
state : 
agent.get_action(state)[0] :  [[ 0.99988425 -0.9965942   0.9987139 ]]
Episode 11: tensor([0.5115])
state : 
agent.get_action(state)[0] :  [[ 0.9998405 -0.9999221  0.999797 ]]
Episode 12: tensor([0.5113])
state : 
agent.get_action(state)[0] :  [[ 0.99997205 -0.999866    0.99997646]]
Episode 13: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9997829 -0.9995422  0.9999795]]
Episode 14: tensor([0.5109])
state : 
agent.get_action(state)[0] :  [[ 0.9999957  -0.99999154  0.9997292 ]]
Episode 15: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9996643 -0.9999771  0.9998709]]
Episode 16: tensor([0.5118])
state : 
agent

Episode 84: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9999783 -0.9999758  0.9999976]]
Episode 85: tensor([0.5111])
state : 
agent.get_action(state)[0] :  [[ 0.9998762  -0.99999547  0.99999815]]
Episode 86: tensor([0.5104])
state : 
agent.get_action(state)[0] :  [[ 0.9999739  -0.9997745   0.99999785]]
Episode 87: tensor([0.5112])
state : 
agent.get_action(state)[0] :  [[ 0.9999791  -0.99999774  0.9999932 ]]
Episode 88: tensor([0.5112])
state : 
agent.get_action(state)[0] :  [[ 0.9999629 -0.9999984  0.9999541]]
Episode 89: tensor([0.5102])
state : 
agent.get_action(state)[0] :  [[ 0.99999166 -0.999987    0.9997107 ]]
Episode 90: tensor([0.5107])
state : 
agent.get_action(state)[0] :  [[ 0.99999917 -0.99997073  0.9999881 ]]
Episode 91: tensor([0.5112])
state : 
agent.get_action(state)[0] :  [[ 0.9998788  -0.9999959   0.99998623]]
Episode 92: tensor([0.5113])
state : 
agent.get_action(state)[0] :  [[ 0.9996762 -0.9999109  0.9999955]]
Episode 93: tensor([0.5113])
state : 

Episode 75: tensor([0.5097])
state : 
agent.get_action(state)[0] :  [[ 0.9999845  -0.9999989   0.99999636]]
Episode 76: tensor([0.5094])
state : 
agent.get_action(state)[0] :  [[ 0.9999215  -0.99998915  0.9999949 ]]
Episode 77: tensor([0.5098])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99977225  0.9999941 ]]
Episode 78: tensor([0.5098])
state : 
agent.get_action(state)[0] :  [[ 0.99998736 -0.9994626   0.9999935 ]]
Episode 79: tensor([0.5093])
state : 
agent.get_action(state)[0] :  [[ 0.9999924 -0.9999255  0.9999981]]
Episode 80: tensor([0.5100])
state : 
agent.get_action(state)[0] :  [[ 0.9999975  -0.99999005  0.9999927 ]]
Episode 81: tensor([0.5100])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99999726  0.9999968 ]]
Episode 82: tensor([0.5101])
state : 
agent.get_action(state)[0] :  [[ 0.9999936 -0.9999999  0.9999876]]
Episode 83: tensor([0.5095])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.999991    0.9999999 ]]
Episode 84: tensor([0.5103])
state

state : 
agent.get_action(state)[0] :  [[ 0.9999896 -0.9999933  1.       ]]
Episode 73: tensor([0.5087])
state : 
agent.get_action(state)[0] :  [[ 0.9999997  -0.99999166  0.99999547]]
Episode 74: tensor([0.5082])
state : 
agent.get_action(state)[0] :  [[ 0.99999946 -0.99997294  0.99999994]]
Episode 75: tensor([0.5086])
state : 
agent.get_action(state)[0] :  [[ 0.9999996 -0.9999991  0.9999999]]
Episode 76: tensor([0.5083])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.99999875  0.9999998 ]]
Episode 77: tensor([0.5080])
state : 
agent.get_action(state)[0] :  [[ 0.99998564 -0.99999917  0.9999994 ]]
Episode 78: tensor([0.5081])
state : 
agent.get_action(state)[0] :  [[ 0.99999917 -0.9999969   0.9999995 ]]
Episode 79: tensor([0.5079])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.9999971   0.9999993 ]]
Episode 80: tensor([0.5078])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.9999854   0.9999796 ]]
Episode 81: tensor([0.5080])
state : 
agent.get_action(state)[0

Episode 49: tensor([0.5063])
state : 
agent.get_action(state)[0] :  [[ 1.        -0.9999996  1.       ]]
Episode 50: tensor([0.5062])
state : 
agent.get_action(state)[0] :  [[ 1.        -0.9999991  0.9999999]]
Episode 51: tensor([0.5070])
state : 
agent.get_action(state)[0] :  [[ 0.9999994  -0.99999994  0.9999998 ]]
Episode 52: tensor([0.5070])
state : 
agent.get_action(state)[0] :  [[ 0.9999999  -0.99999964  0.9999999 ]]
Episode 53: tensor([0.5065])
state : 
agent.get_action(state)[0] :  [[ 0.9999999  -1.          0.99999994]]
Episode 54: tensor([0.5065])
state : 
agent.get_action(state)[0] :  [[ 0.9999999 -1.         0.9999999]]
Episode 55: tensor([0.5069])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -1.          0.9999999 ]]
Episode 56: tensor([0.5061])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99999994  0.9999996 ]]
Episode 57: tensor([0.5065])
state : 
agent.get_action(state)[0] :  [[ 0.9999999 -1.         1.       ]]
Episode 58: tensor([0.5060])
state : 
ag

Episode 35: tensor([0.5056])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.99999994  1.        ]]
Episode 36: tensor([0.5053])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 37: tensor([0.5046])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -1.          1.        ]]
Episode 38: tensor([0.5052])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.99999994  0.9999999 ]]
Episode 39: tensor([0.5055])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.9999999   0.99999994]]
Episode 40: tensor([0.5057])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.5053])
state : 
agent.get_action(state)[0] :  [[ 0.9999998  -0.99999994  0.9999997 ]]
Episode 42: tensor([0.5046])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -0.99999994  1.        ]]
Episode 43: tensor([0.5059])
state : 
agent.get_action(state)[0] :  [[ 1.        -0.9999998  1.       ]]
Episode 44: tensor([0.5055])
state : 
agent.get_action(state)[0] :  [[ 0.999999

Episode 36: tensor([0.5045])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -1.          1.        ]]
Episode 37: tensor([0.5038])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 38: tensor([0.5035])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 39: tensor([0.5043])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 40: tensor([0.5041])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.5051])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99999994  1.        ]]
Episode 42: tensor([0.5048])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 43: tensor([0.5039])
state : 
agent.get_action(state)[0] :  [[ 1.         -1.          0.99999994]]
Episode 44: tensor([0.5041])
state : 
agent.get_action(state)[0] :  [[ 0.99999976 -0.99999994  1.        ]]
Episode 45: tensor([0.5040])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 46: tensor([0.5032])
state : 
agent.get_action(state)[0]

Episode 24: tensor([0.5029])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 25: tensor([0.5033])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 26: tensor([0.5028])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99999994  1.        ]]
Episode 27: tensor([0.5031])
state : 
agent.get_action(state)[0] :  [[ 1.         -0.99999994  0.99999994]]
Episode 28: tensor([0.5025])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 29: tensor([0.5026])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 30: tensor([0.5021])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -1.          1.        ]]
Episode 31: tensor([0.5035])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.5026])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.5032])
state : 
agent.get_action(state)[0] :  [[ 0.99999994 -1.          1.        ]]
Episode 34: tensor([0.5023])
state : 
agent.get_action(state)[0]

Episode 44: tensor([0.5000])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 45: tensor([0.5006])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 46: tensor([0.5002])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 47: tensor([0.4999])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 48: tensor([0.5014])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 49: tensor([0.5008])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 50: tensor([0.5010])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 51: tensor([0.5003])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 52: tensor([0.5012])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 53: tensor([0.5012])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 54: tensor([0.5005])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 55: tensor([0.5015])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 46: tensor([0.4993])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 47: tensor([0.4995])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 48: tensor([0.4992])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 49: tensor([0.5003])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 50: tensor([0.4989])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 51: tensor([0.4998])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 52: tensor([0.4997])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 53: tensor([0.4989])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 54: tensor([0.4994])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 55: tensor([0.4989])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 56: tensor([0.4996])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 57: tensor([0.4991])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 58: tensor([0.4983])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 59: tensor([0.4978])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 60: tensor([0.4984])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 61: tensor([0.4979])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 62: tensor([0.4977])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 63: tensor([0.4983])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 64: tensor([0.4985])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 65: tensor([0.4972])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 66: tensor([0.4970])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 67: tensor([0.4977])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 68: tensor([0.4980])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 69: tensor([0.4982])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 65: tensor([0.4972])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 66: tensor([0.4967])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 67: tensor([0.4960])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 68: tensor([0.4965])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 69: tensor([0.4974])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 70: tensor([0.4964])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 71: tensor([0.4981])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 72: tensor([0.4973])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 73: tensor([0.4962])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 74: tensor([0.4964])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 75: tensor([0.4965])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 76: tensor([0.4965])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 78: tensor([0.4954])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 79: tensor([0.4960])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 80: tensor([0.4956])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 81: tensor([0.4957])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 82: tensor([0.4958])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 83: tensor([0.4962])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 84: tensor([0.4961])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 85: tensor([0.4952])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 86: tensor([0.4964])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 87: tensor([0.4970])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 88: tensor([0.4965])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 89: tensor([0.4960])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 77: tensor([0.4954])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 78: tensor([0.4940])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 79: tensor([0.4943])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 80: tensor([0.4944])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 81: tensor([0.4940])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 82: tensor([0.4934])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 83: tensor([0.4946])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 84: tensor([0.4938])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 85: tensor([0.4937])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 86: tensor([0.4962])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 87: tensor([0.4947])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 88: tensor([0.4944])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 96: tensor([0.4940])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 97: tensor([0.4923])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 98: tensor([0.4932])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 99: tensor([0.4937])
Epoch :  16
loss_IOC :  -0.22902495786723437
loss_IOC :  -0.2330254827578634
loss_IOC :  -0.23134779465500394
loss_IOC :  -0.22355593299717558
loss_IOC :  -0.22558115309052523
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 0: tensor([0.4923])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 1: tensor([0.4907])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 2: tensor([0.4928])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 3: tensor([0.4919])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 4: tensor([0.4921])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 5: tensor([0.4925])
state : 
agent.get_action(state)[0] :  [[ 1. 

agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 11: tensor([0.4925])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 12: tensor([0.4901])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 13: tensor([0.4913])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 14: tensor([0.4916])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 15: tensor([0.4909])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 16: tensor([0.4920])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 17: tensor([0.4909])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 18: tensor([0.4898])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 19: tensor([0.4923])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 20: tensor([0.4919])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 21: tensor([0.4918])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 22: tensor([0.4909])
s

Episode 14: tensor([0.4911])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 15: tensor([0.4907])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 16: tensor([0.4907])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 17: tensor([0.4904])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 18: tensor([0.4898])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 19: tensor([0.4902])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 20: tensor([0.4907])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 21: tensor([0.4897])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 22: tensor([0.4909])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 23: tensor([0.4905])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 24: tensor([0.4898])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 25: tensor([0.4903])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 33: tensor([0.4895])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4899])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 35: tensor([0.4899])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 36: tensor([0.4894])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 37: tensor([0.4901])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 38: tensor([0.4893])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 39: tensor([0.4894])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 40: tensor([0.4897])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.4898])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 42: tensor([0.4890])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 43: tensor([0.4892])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 44: tensor([0.4898])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 39: tensor([0.4894])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 40: tensor([0.4884])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.4901])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 42: tensor([0.4893])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 43: tensor([0.4874])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 44: tensor([0.4877])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 45: tensor([0.4882])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 46: tensor([0.4879])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 47: tensor([0.4882])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 48: tensor([0.4878])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 49: tensor([0.4883])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 50: tensor([0.4897])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 59: tensor([0.4866])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 60: tensor([0.4873])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 61: tensor([0.4873])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 62: tensor([0.4881])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 63: tensor([0.4888])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 64: tensor([0.4869])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 65: tensor([0.4867])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 66: tensor([0.4871])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 67: tensor([0.4875])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 68: tensor([0.4876])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 69: tensor([0.4882])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 70: tensor([0.4879])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 62: tensor([0.4860])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 63: tensor([0.4863])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 64: tensor([0.4858])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 65: tensor([0.4869])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 66: tensor([0.4871])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 67: tensor([0.4875])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 68: tensor([0.4867])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 69: tensor([0.4872])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 70: tensor([0.4865])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 71: tensor([0.4861])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 72: tensor([0.4866])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 73: tensor([0.4876])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 82: tensor([0.4869])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 83: tensor([0.4878])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 84: tensor([0.4872])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 85: tensor([0.4867])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 86: tensor([0.4860])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 87: tensor([0.4863])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 88: tensor([0.4866])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 89: tensor([0.4884])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 90: tensor([0.4855])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 91: tensor([0.4864])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 92: tensor([0.4861])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 93: tensor([0.4868])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 90: tensor([0.4855])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 91: tensor([0.4866])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 92: tensor([0.4875])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 93: tensor([0.4855])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 94: tensor([0.4842])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 95: tensor([0.4842])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 96: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 97: tensor([0.4858])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 98: tensor([0.4858])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 99: tensor([0.4842])
Epoch :  25
loss_IOC :  -0.22043310074539435
loss_IOC :  -0.23208748725925815
loss_IOC :  -0.2359176127407956
loss_IOC :  -0.21915509853277026
loss_IOC :  -0.22414765449135055
state : 
agent.get_action(state)[0] :  

loss_IOC :  -0.2340011856363035
loss_IOC :  -0.2234574854695393
loss_IOC :  -0.22317340481707476
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 0: tensor([0.4842])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 1: tensor([0.4833])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 2: tensor([0.4822])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 3: tensor([0.4839])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 4: tensor([0.4841])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 5: tensor([0.4828])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 6: tensor([0.4827])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 7: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 8: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 9: tensor([0.4832])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 10: tensor

Episode 12: tensor([0.4810])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 13: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 14: tensor([0.4836])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 15: tensor([0.4827])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 16: tensor([0.4829])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 17: tensor([0.4824])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 18: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 19: tensor([0.4822])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 20: tensor([0.4821])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 21: tensor([0.4830])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 22: tensor([0.4835])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 23: tensor([0.4831])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 26: tensor([0.4834])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 27: tensor([0.4816])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 28: tensor([0.4794])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 29: tensor([0.4812])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 30: tensor([0.4831])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 31: tensor([0.4809])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.4802])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.4826])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4834])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 35: tensor([0.4813])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 36: tensor([0.4838])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 37: tensor([0.4812])
state : 
agent.get_action(state)[0] :  [[ 1. -1

agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 52: tensor([0.4809])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 53: tensor([0.4813])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 54: tensor([0.4822])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 55: tensor([0.4814])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 56: tensor([0.4806])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 57: tensor([0.4808])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 58: tensor([0.4794])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 59: tensor([0.4811])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 60: tensor([0.4813])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 61: tensor([0.4810])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 62: tensor([0.4796])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 63: tensor([0.4823])
s

Episode 70: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 71: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 72: tensor([0.4794])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 73: tensor([0.4805])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 74: tensor([0.4794])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 75: tensor([0.4791])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 76: tensor([0.4793])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 77: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 78: tensor([0.4814])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 79: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 80: tensor([0.4786])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 81: tensor([0.4786])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 78: tensor([0.4789])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 79: tensor([0.4808])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 80: tensor([0.4781])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 81: tensor([0.4782])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 82: tensor([0.4770])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 83: tensor([0.4787])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 84: tensor([0.4789])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 85: tensor([0.4792])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 86: tensor([0.4792])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 87: tensor([0.4804])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 88: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 89: tensor([0.4800])
state : 
agent.get_action(state)[0] :  [[ 1. -1

loss_IOC :  -0.2398021814880706
loss_IOC :  -0.23390214900294398
loss_IOC :  -0.2200174386249379
loss_IOC :  -0.23610616698723813
loss_IOC :  -0.2307094824278249
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 0: tensor([0.4776])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 1: tensor([0.4771])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 2: tensor([0.4782])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 3: tensor([0.4753])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 4: tensor([0.4802])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 5: tensor([0.4769])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 6: tensor([0.4790])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 7: tensor([0.4780])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 8: tensor([0.4756])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 9: tensor([0.4772])
state : 

Episode 13: tensor([0.4737])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 14: tensor([0.4754])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 15: tensor([0.4772])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 16: tensor([0.4748])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 17: tensor([0.4756])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 18: tensor([0.4769])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 19: tensor([0.4774])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 20: tensor([0.4765])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 21: tensor([0.4780])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 22: tensor([0.4759])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 23: tensor([0.4781])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 24: tensor([0.4744])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 26: tensor([0.4737])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 27: tensor([0.4740])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 28: tensor([0.4748])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 29: tensor([0.4750])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 30: tensor([0.4748])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 31: tensor([0.4756])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.4745])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.4768])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4764])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 35: tensor([0.4757])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 36: tensor([0.4752])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 37: tensor([0.4746])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 23: tensor([0.4722])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 24: tensor([0.4761])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 25: tensor([0.4739])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 26: tensor([0.4729])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 27: tensor([0.4759])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 28: tensor([0.4759])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 29: tensor([0.4727])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 30: tensor([0.4732])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 31: tensor([0.4746])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.4730])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.4744])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4758])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 25: tensor([0.4727])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 26: tensor([0.4725])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 27: tensor([0.4717])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 28: tensor([0.4690])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 29: tensor([0.4745])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 30: tensor([0.4715])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 31: tensor([0.4717])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.4707])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.4732])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4744])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 35: tensor([0.4720])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 36: tensor([0.4706])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 31: tensor([0.4705])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 32: tensor([0.4719])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 33: tensor([0.4694])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 34: tensor([0.4724])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 35: tensor([0.4727])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 36: tensor([0.4705])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 37: tensor([0.4708])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 38: tensor([0.4730])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 39: tensor([0.4714])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 40: tensor([0.4724])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.4703])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 42: tensor([0.4704])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 41: tensor([0.4719])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 42: tensor([0.4703])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 43: tensor([0.4686])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 44: tensor([0.4694])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 45: tensor([0.4718])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 46: tensor([0.4704])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 47: tensor([0.4699])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 48: tensor([0.4732])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 49: tensor([0.4697])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 50: tensor([0.4703])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 51: tensor([0.4713])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 52: tensor([0.4684])
state : 
agent.get_action(state)[0] :  [[ 1. -1

Episode 37: tensor([0.4710])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 38: tensor([0.4705])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 39: tensor([0.4706])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 40: tensor([0.4705])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 41: tensor([0.4677])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 42: tensor([0.4701])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 43: tensor([0.4710])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 44: tensor([0.4709])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 45: tensor([0.4678])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 46: tensor([0.4701])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 47: tensor([0.4676])
state : 
agent.get_action(state)[0] :  [[ 1. -1.  1.]]
Episode 48: tensor([0.4702])
state : 
agent.get_action(state)[0] :  [[ 1. -1

KeyboardInterrupt: 

In [None]:
torch.mean(demo_rewards)

In [None]:
samp_rewards

In [None]:
torch.exp(samp_rewards)

In [None]:
log_probs_robot

In [None]:
torch.exp(log_probs_robot)

In [None]:
torch.exp(log_probs_robot) + 1e-7

In [None]:
torch.exp(samp_rewards) / (torch.exp(log_probs_robot) + 1e-7)

In [None]:
1.6320 / 1.6300

In [None]:
samp_rewards

In [None]:
log_probs_robot

In [None]:
torch.mean(torch.exp(samp_rewards) / (torch.exp(log_probs_robot) + 1e-7))

In [None]:
torch.log(torch.mean(torch.exp(samp_rewards) / (torch.exp(log_probs_robot) + 1e-7)))