# Import Libraries

In [1]:
import copy
import utils
import torch
import constants
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

from env import Env
from agent import Agent
from torchsummary import summary
from torch.distributions import Normal, Categorical

# Initialise Environment

In [2]:
#initialise environment
min_x, max_x =  -0.110 - 0.150,   -0.110 + 0.150
min_y, max_y =   0.560 - 0.125,    0.560 + 0.125
min_z, max_z =               0,              0.4 

workspace_lim = np.asarray([[min_x, max_x], 
                            [min_y, max_y],
                            [min_z, max_z]])

print(f"workspace space: \n{workspace_lim}")

obj_dir = 'objects/blocks/'
N_obj   = 2

env = Env(obj_dir, N_obj, workspace_lim, cluttered_mode=True)

workspace space: 
[[-0.26   0.04 ]
 [ 0.435  0.685]
 [ 0.     0.4  ]]


# Initialise Agent

In [3]:
agent = Agent(env, 
              max_memory_size = 10000, 
              is_debug        = True, 
              N_batch         = 256, 
              N_batch_hld     = 256, 
              lr              = 1e-3, 
              hld_lr          = 1e-3,
              tau             = 0.01,
              tau_hld         = 0.01)

device: cuda
[SUCCESS] initialise environment
[SUCCESS] initialise networks
[LOAD BUFFER] data_length: 10003
[SUCCESS] load low-level buffer
[SUCCESS] load previous buffer
[SUCCESS] initialise memory buffer


# Gather Demonstration Experience

In [4]:
agent.gather_guidance_experience(train_grasp = False)

[SUCCESS] buffer is full


# HLD-net clone

In [5]:
# hld_exp = agent.buffer_replay_hld.get_experience()
# print(f"N_hld_exp: {len(hld_exp[0])}")
# hld_train_loader, hld_test_loader = agent.get_train_test_dataloader_hld_net(hld_exp, train_ratio=0.9)

In [6]:
# agent.behaviour_cloning_hld(hld_train_loader, hld_test_loader, agent.hld_net, agent.hld_net_target, num_epochs = 1000)

# # hld_exp          = None
# # hld_train_loader = None
# # hld_test_loader  = None

In [7]:
def behaviour_cloning_eval_hld(test_loader, hld_net, hld_net_target):
    
    #set network in evaluation mode
    hld_net.eval()
    
    #set target q network in eval mode
    hld_net_target.eval()

    #initialise loss funciton
    mse_loss  = nn.MSELoss()

    #initialise mean loss value
    mean_hld_loss = 0.

    #initialise batch counter
    batch_cnt = 0

    with torch.no_grad():

        for states, next_states, target_action_types, next_target_action_types, rewards, dones in test_loader:            

            states                   = states.to(agent.device)
            next_states              = next_states.to(agent.device)
            target_action_types      = target_action_types.to(agent.device)
            next_target_action_types = next_target_action_types.to(agent.device)
            rewards                  = rewards.unsqueeze(1).to(agent.device)
            dones                    = dones.unsqueeze(1).to(agent.device)

            # compute current q - values
            batch_indices = torch.arange(states.size(0)).long().to(agent.device)
            qs = hld_net(states)[batch_indices, target_action_types.long()]
            qs = qs.unsqueeze(1)

            # Compute target q - values
            with torch.no_grad():
                # next_qs   = torch.max(hld_net_target(next_states), dim=1)[0]
                next_qs   = hld_net_target(next_states)[batch_indices, next_target_action_types.long()]
                next_qs   = next_qs.unsqueeze(1)
                target_qs = rewards + agent.gamma * (1 - dones) * next_qs

            if agent.is_debug:
                if len(qs.shape) != len(target_qs.shape):
                    print("[ERROR] len(qs.shape) != len(target_qs.shape)")
                    break
                else:
                    if qs.shape[0] != target_qs.shape[0]:
                        print("[ERROR] qs.shape[0] != target_qs.sahpe[0]")
                        break
                    elif qs.shape[1] != target_qs.shape[1]:
                        print("[ERROR] qs.shape[1] != target_qs.sahpe[1]")
                        break
            
            all_qs = hld_net(states)
            print("=====")
            for i in range(all_qs.shape[0]):
                print(all_qs[i].argmax(dim = 0).item(), 
                      int(target_action_types[i].item()), 
                      rewards[i].item(),
                      dones[i].item(), 
                      qs[i] - target_qs[i])

            hld_net_loss = mse_loss(qs, target_qs)

            mean_hld_loss += hld_net_loss.item()
        
            batch_cnt += 1

    mean_hld_loss /= batch_cnt

    return mean_hld_loss


In [8]:
# agent.hld_net.load_checkpoint()
# agent.hld_net_target.load_checkpoint()
# behaviour_cloning_eval_hld(hld_train_loader, agent.hld_net, agent.hld_net_target)

# # hld_exp          = None
# # hld_train_loader = None
# # hld_test_loader  = None

# Push Clone

In [9]:
push_exp  = agent.buffer_replay.get_experience_by_action_type(constants.PUSH)
print(f"N_push_exp: {len(push_exp[0])}")
push_train_loader, push_test_loader = agent.get_train_test_dataloader( push_exp, is_grasp = False, train_ratio=0.9)

N_push_exp: 10003


In [10]:
agent.push_critic1.load_checkpoint()
agent.push_critic2.load_checkpoint()
agent.push_critic1_target.load_checkpoint()
agent.push_critic2_target.load_checkpoint()
agent.push_actor.load_checkpoint()

agent.behaviour_cloning(push_train_loader, push_test_loader, 
                        agent.push_critic1, agent.push_critic2, 
                        agent.push_critic1_target, agent.push_critic2_target, 
                        agent.push_actor, num_epochs = 100, is_grasp = False)

push_exp          = None
push_train_loader = None
push_test_loader  = None

Epoch: 1/100 
[TRAIN] critic1 loss: 0.041686, critic2 loss: 0.051378, actor Loss: 0.065225 
[EVAL] critic1 loss eval: 0.029747/inf critic2 loss eval: 0.035720/inf 
[EVAL] actor loss eval: 0.067653/inf 
[SUCCESS] save critic1 model!
[SUCCESS] save critic2 model!
[SUCCESS] save actor model!
Epoch: 2/100 
[TRAIN] critic1 loss: 0.024101, critic2 loss: 0.025333, actor Loss: 0.032747 
[EVAL] critic1 loss eval: 0.015256/0.029747 critic2 loss eval: 0.015303/0.035720 
[EVAL] actor loss eval: 0.032406/0.067653 
[SUCCESS] save critic1 model!
[SUCCESS] save critic2 model!
[SUCCESS] save actor model!
Epoch: 3/100 
[TRAIN] critic1 loss: 0.023217, critic2 loss: 0.026433, actor Loss: 0.029535 
[EVAL] critic1 loss eval: 0.014563/0.015256 critic2 loss eval: 0.018793/0.015303 
[EVAL] actor loss eval: 0.073804/0.032406 
[SUCCESS] save critic1 model!
Epoch: 4/100 
[TRAIN] critic1 loss: 0.021991, critic2 loss: 0.025824, actor Loss: 0.027613 
[EVAL] critic1 loss eval: 0.013617/0.014563 critic2 loss eval: 0.0

# Grasp Clone

In [11]:
grasp_exp = agent.buffer_replay.get_experience_by_action_type(constants.GRASP)
print(f"N_grasp_exp: {len(grasp_exp[0])}")

N_grasp_exp: 0


In [12]:
# agent.grasp_critic1.load_checkpoint()
# agent.grasp_critic2.load_checkpoint()
# agent.grasp_critic1_target.load_checkpoint()
# agent.grasp_critic2_target.load_checkpoint()
# agent.grasp_actor.load_checkpoint()

grasp_train_loader, grasp_test_loader = agent.get_train_test_dataloader(grasp_exp, is_grasp = True, train_ratio=0.9)
agent.behaviour_cloning(grasp_train_loader, grasp_test_loader, 
                        agent.grasp_critic1, agent.grasp_critic2, 
                        agent.grasp_critic1_target, agent.grasp_critic2_target, 
                        agent.grasp_actor, num_epochs = 750, is_grasp = True)

grasp_exp          = None
grasp_train_loader = None
grasp_test_loader  = None

ValueError: num_samples should be a positive integer value, but got num_samples=0