In [2]:
import pickle
import os
from utils import *
import torch
import torch.onnx
from torch import nn

In [3]:
# Load the pickled (policy_net, value_net, running_state) triple from the assets directory.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that come from a trusted source.
model_path = os.path.join(assets_dir(), 'learned_models/{}_ppo_rand_best.p'.format("navigation"))
# Use a context manager so the file handle is closed even if unpickling fails.
with open(model_path, "rb") as model_file:
    policy_net, value_net, running_state = pickle.load(model_file)

In [4]:
# Print the module tree of the loaded policy to inspect its layers and their sizes.
print(policy_net)

Policy(
  (conv1): Conv2d(5, 32, kernel_size=(10, 14), stride=(8, 8), padding=(1, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc_img): Linear(in_features=5120, out_features=512, bias=True)
  (fc_goal): Linear(in_features=15, out_features=96, bias=True)
  (fc_ray): Linear(in_features=5, out_features=32, bias=True)
  (fc_action): Linear(in_features=10, out_features=64, bias=True)
  (img_goal_ray1): Linear(in_features=704, out_features=512, bias=True)
  (img_goal_ray2): Linear(in_features=512, out_features=2, bias=True)
  (relu): ReLU()
  (tanh): Tanh()
)


In [35]:
class MyPolicy(nn.Module):
    """Gaussian policy that reuses a pretrained policy's layers but takes no goal input.

    The convolutional stack, ``fc_img``, ``fc_ray``, ``fc_action``, the output layer
    ``img_goal_ray2`` and ``action_log_std`` are shared (by reference) with
    ``pretrained_net``. Only ``img_goal_ray1`` is new, because the concatenated
    feature vector here has no goal branch and is therefore narrower.

    NOTE(review): ``img_goal_ray1`` is a freshly initialised ``nn.Linear``; it does not
    carry any pretrained weights, so outputs are not those of the pretrained policy
    until this layer is trained or its weights are filled in some other way.

    Args:
        pretrained_net: module exposing ``conv1``, ``conv2``, ``conv3``, ``fc_img``,
            ``fc_ray``, ``fc_action``, ``img_goal_ray2`` and ``action_log_std``.
    """

    def __init__(self, pretrained_net):
        super().__init__()
        self.is_disc_action = False

        # Layers for the depth-image input.
        self.conv1 = pretrained_net.conv1
        self.conv2 = pretrained_net.conv2
        self.conv3 = pretrained_net.conv3
        self.fc_img = pretrained_net.fc_img

        # Layers for the ray and action-history inputs.
        self.fc_ray = pretrained_net.fc_ray
        self.fc_action = pretrained_net.fc_action

        # Layers for the concatenated features. The fused layer's size is derived
        # from the layers it sits between instead of being hard-coded, so it stays
        # consistent with whatever pretrained network is passed in.
        fused_width = (
            self.fc_img.out_features
            + self.fc_ray.out_features
            + self.fc_action.out_features
        )
        hidden_width = pretrained_net.img_goal_ray2.in_features
        self.img_goal_ray1 = nn.Linear(fused_width, hidden_width)
        # two dimensions of actions: upward and downward; turning
        self.img_goal_ray2 = pretrained_net.img_goal_ray2

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        self.action_log_std = pretrained_net.action_log_std

    def forward(self, depth_img, ray, hist_action):
        """Compute the action distribution parameters.

        Args:
            depth_img: input to ``conv1``; its conv features are flattened per sample.
            ray: flattened to ``(batch, -1)`` before ``fc_ray``.
            hist_action: flattened to ``(batch, -1)`` before ``fc_action``.

        Returns:
            Tuple ``(action_mean, action_log_std, action_std)``. ``action_mean`` is
            squashed by tanh; the other two are ``action_log_std`` expanded to the
            shape of ``action_mean`` and its exponential.
        """
        img_feat = self.relu(self.conv1(depth_img))
        img_feat = self.relu(self.conv2(img_feat))
        img_feat = self.relu(self.conv3(img_feat))
        img_feat = img_feat.view(img_feat.size(0), -1)
        img_feat = self.relu(self.fc_img(img_feat))

        ray_feat = ray.view(ray.size(0), -1)
        ray_feat = self.relu(self.fc_ray(ray_feat))

        action_feat = hist_action.view(hist_action.size(0), -1)
        action_feat = self.relu(self.fc_action(action_feat))

        fused = torch.cat((img_feat, ray_feat, action_feat), 1)
        # The new layer and the pretrained layers may have different dtypes, so
        # cast to each layer's own weight dtype rather than assuming float/double.
        fused = self.relu(self.img_goal_ray1(fused.to(self.img_goal_ray1.weight.dtype)))
        action_mean = self.tanh(self.img_goal_ray2(fused.to(self.img_goal_ray2.weight.dtype)))

        action_log_std = self.action_log_std.expand_as(action_mean)
        action_std = torch.exp(action_log_std)

        return action_mean, action_log_std, action_std

    def select_action(self, depth_img, ray, hist_action):
        """Sample an action from N(action_mean, action_std), clamped to [-1, 1]."""
        action_mean, _, action_std = self.forward(depth_img, ray, hist_action)
        action = torch.clamp(torch.normal(action_mean, action_std), -1, 1)
        return action

    def get_log_prob(self, depth_img, ray, hist_action, actions):
        """Return ``normal_log_density`` of ``actions`` under the current distribution."""
        action_mean, action_log_std, action_std = self.forward(depth_img, ray, hist_action)
        return normal_log_density(actions, action_mean, action_log_std, action_std)

In [36]:
# Wrap the loaded policy, then push one batch of random inputs through it.
my_policy = MyPolicy(policy_net)

# All three dummy inputs share the same tensor options.
dummy_opts = {"requires_grad": True, "dtype": torch.double}
depth_img = torch.randn(1, 5, 128, 160, **dummy_opts)
ray = torch.randn(1, 5, **dummy_opts)
hist_action = torch.randn(1, 5, 2, **dummy_opts)

torch_out = my_policy(depth_img, ray, hist_action)

In [37]:
# Display the forward-pass result: the (action_mean, action_log_std, action_std) tuple.
torch_out

(tensor([[0.1735, 0.1958]], dtype=torch.float64, grad_fn=<TanhBackward0>),
 tensor([[-0.7574, -0.7946]], dtype=torch.float64, grad_fn=<ExpandBackward0>),
 tensor([[0.4689, 0.4518]], dtype=torch.float64, grad_fn=<ExpBackward0>))