<a href="https://colab.research.google.com/github/rosshalpin/clip-guided-scene-arrangement/blob/main/Parallel_MultiAgent_DissertationProject_v1_0ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
try:
  import clip
  # import stable_baselines3
  # import sb3_contrib
  import pettingzoo
  from plot_image_grid import image_grid
  import supersuit as ss
  import ray
except ModuleNotFoundError:
  !pip install gym==0.22.0
  !pip install git+https://github.com/openai/CLIP.git
  # !pip install stable-baselines3[extra]
  # !pip install git+https://github.com/Stable-Baselines-Team/stable-baselines3-contrib
  !pip install pettingzoo==1.19.0
  !wget https://raw.githubusercontent.com/facebookresearch/pytorch3d/main/docs/tutorials/utils/plot_image_grid.py
  !pip install supersuit==3.3.4
  !pip install ray
  !pip install lz4




In [2]:
!pip show gym

Name: gym
Version: 0.22.0
Summary: Gym: A universal API for reinforcement learning environments.
Home-page: https://github.com/openai/gym
Author: Gym Community
Author-email: jkterry@umd.edu
License: MIT
Location: /usr/local/lib/python3.7/dist-packages
Requires: cloudpickle, importlib-metadata, gym-notices, numpy
Required-by: SuperSuit, PettingZoo, dopamine-rl


In [3]:
import os, sys
from google.colab import drive
drive.mount('/content/drive')
nb_path = '/content/notebooks'

!ln -s /content/drive/My\ Drive/Colab\ Notebooks/ $nb_path

sys.path.insert(0,nb_path)

!ln -s /content/gdrive/My\ Drive/ /mydrive

Mounted at /content/drive


In [4]:
import pytorch3d

In [5]:
import os
import torch
import matplotlib.pyplot as plt

# Util function for loading meshes
from pytorch3d.io import load_objs_as_meshes, load_obj
from pytorch3d.ops import sample_points_from_meshes

# Data structures and functions for rendering
from pytorch3d.structures import Meshes, join_meshes_as_batch, join_meshes_as_scene, Pointclouds
from pytorch3d.vis.plotly_vis import AxisArgs, plot_batch_individually, plot_scene
from pytorch3d.vis.texture_vis import texturesuv_image_matplotlib
from pytorch3d.renderer import (
    look_at_view_transform,
    FoVPerspectiveCameras, 
    PointLights,
    AmbientLights,
    DirectionalLights, 
    Materials, 
    RasterizationSettings, 
    MeshRenderer, 
    MeshRasterizer,  
    SoftPhongShader,
    TexturesUV,
    TexturesVertex
)

# add path for demo utils functions 
import sys
import os
sys.path.append(os.path.abspath(''))
import IPython.display
import matplotlib.pyplot as plt
from PIL import Image
import torch
import numpy as np

In [6]:
# Setup
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)
else:
    device = torch.device("cpu")

# Set paths
DATA_DIR = '/content/drive/My Drive/DissertationProject_v0.0/data'

In [10]:
def load_mesh(input_path, dev) -> Meshes:
  obj_filename = os.path.join(DATA_DIR, input_path)
  return load_objs_as_meshes([obj_filename], device=dev)

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.cuda().eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

100%|███████████████████████████████████████| 338M/338M [00:10<00:00, 34.3MiB/s]


In [12]:
import copy

from pytorch3d.renderer import (
  HardPhongShader
)

class SceneObject():
    def __init__(self, mesh, scale=1):
      new_mesh = mesh.clone().scale_verts(scale)
      self._mesh = new_mesh
      self._scale = scale
      self._position = self._mesh_position()
      self._prev_position = self._mesh_position()

    @property
    def mesh(self):
      return self._mesh

    @property
    def position(self):
      return self._position

    @position.setter
    def position(self, value):
      offset = [round(a-b,3) for a, b in zip(value, self._position)]
      self._set_position_helper(offset)

    def _mesh_position(self):
      return [round(float(((c.cpu()[0]+c.cpu()[1])/2)), 3) for c in self._mesh.get_bounding_boxes()[0]]

    def _set_position_helper(self, value):
      self._prev_position = copy.deepcopy(self._position)
      offset = self._mesh.verts_padded().new_tensor(value).expand(self._mesh.verts_packed().shape)
      self._mesh = self._mesh.offset_verts(offset)
      self._position = self._mesh_position()

    def translate(self, value):
      self._set_position_helper(value)

    def reset_pos(self):
      self._position = copy.deepcopy(self._prev_position)

class Scene():
  def __init__(self, meshes: list, azim, elev, dist):
    self.AZIM = azim
    self.ELEV = elev
    self.num_cameras = max(len(self.AZIM), len(self.ELEV))
    self._meshes = meshes
    self.CAMERA_DIST = dist
    self._scene = join_meshes_as_scene(meshes).extend(self.num_cameras)
    self.device = device
    

  @property
  def scene(self):
    return self._scene

  @scene.setter
  def scene(self, value):
    self._scene = join_meshes_as_scene(value).extend(self.num_cameras)

  @property
  def _lights(self):
    return PointLights(device=device, location=[[0.0, 5.0, 7.0]])
    # return AmbientLights(device=self.device)

  @property
  def _cameras(self):
    R, T = look_at_view_transform(dist=self.CAMERA_DIST, azim=self.AZIM, elev=self.ELEV)
    return FoVPerspectiveCameras(device=self.device, R=R, T=T)

  @property
  def renderer(self):
    return MeshRenderer(
        rasterizer=MeshRasterizer(
            raster_settings=RasterizationSettings(
              image_size=256, 
              faces_per_pixel=1,
              bin_size=None
            )
        ),
        shader=HardPhongShader(
            device=self.device
        )
    )
  
  def render(self):
    return self.renderer(self.scene, cameras=self._cameras, lights=self._lights).cpu().numpy()


In [13]:
from PIL import Image

def get_pil_image(input):
  return Image.fromarray((input * 255).astype('uint8'))

def clip_sim_3(input: list, description: str):
  text = clip.tokenize(description).to(device)
  with torch.no_grad():
    text_features = model.encode_text(text)
  text_features /= text_features.norm(dim=-1, keepdim=True)
  similarities = []
  for image_input in input:
    # image_input = get_pil_image(image_input[0, ..., :3])
    image_input = preprocess(image_input).unsqueeze(0).to(device)

    with torch.no_grad():
      image_features = model.encode_image(image_input).float()

    image_features /= image_features.norm(dim=-1, keepdim=True)
    similarity = text_features.cpu().numpy() @ image_features.cpu().numpy().T
    # print(similarity[0][0])
    similarities.append(similarity[0][0])
  return similarities

In [14]:
from itertools import product

DIRECTIONS = list(product(range(-1, 2), repeat=3))
mod = 0.2
ALL_DIRECTIONS = [[a * mod for a in b] for b in DIRECTIONS]
ACTIONS_MAP = {
  i: ALL_DIRECTIONS[i] for i in range(len(ALL_DIRECTIONS))
}

In [15]:
import functools
from gym.spaces import Box, Discrete, Dict, MultiDiscrete
from pettingzoo import ParallelEnv
from pettingzoo.utils import wrappers
from pettingzoo.utils import parallel_to_aec
from scipy.stats import mannwhitneyu
from collections import OrderedDict

class RenderEnv(ParallelEnv):
  metadata = {"render_modes": ["human"], "name": "render_v2"}

  def __init__(self, objs, guide, limit=None):
    """
    The init method takes in environment arguments and should define the following attributes:
    - possible_agents
    - action_spaces
    - observation_spaces

    These attributes should not be changed after initialization.
    """
    self.GUIDE_STRING = guide

    self.limit = limit
    self.rounds = 0

    self.camera_config = {
      'azim': torch.linspace(0, 180, 4),
      'elev': [50],
      'dist': 20.0
    }

    self.actions_map = ACTIONS_MAP

    self.limit_box = [[-10,-1,-10],[10,10,10]]
    self.p_threshold = 0.1
    

    self.best = {}
    self.images = None

    self.possible_agents = ["object_" + str(r) for r in range(len(objs))]
    self.agents = self.possible_agents
    self.agent_mapping = dict(
        zip(self.agents, [copy.deepcopy(obj) for obj in objs])
    )

    self.scene = Scene(
      meshes=[a.mesh for a in list(self.agent_mapping.values())], 
      **self.camera_config
    )

    self.num_cameras = self.scene.num_cameras
    self.best_sim_matrix = np.zeros(self.num_cameras).astype(np.float32)
    self.prev_sim_matrix = np.zeros(self.num_cameras).astype(np.float32)

    self.limited = np.ones((len(self.agents))).astype(np.float32)

    self.action_spaces = {agent: Discrete(len(self.actions_map)) for agent in self.possible_agents}
    self.observation_spaces = {
      agent: Box(low=-20, high=20, shape=(3,)) for agent in self.possible_agents
    }

    # self.action_space = {agent: Discrete(len(self.actions_map)) for agent in self.possible_agents}
    # self.observation_space = {
    #     agent: Discrete(len(self.actions_map)) for agent in self.possible_agents
    # }

  # this cache ensures that same space object is returned for the same agent
  # allows action space seeding to work as expected

  @functools.lru_cache(maxsize=None)
  def observation_space(self, agent):
    # Gym spaces are defined and documented here: https://gym.openai.com/docs/#spaces
    return self.observation_spaces[agent]

  @functools.lru_cache(maxsize=None)
  def action_space(self, agent):
    return self.action_spaces[agent]

  def render_scene(self) -> None:
      self.scene = Scene([a.mesh for a in list(self.agent_mapping.values())], **self.camera_config)
      self.images = self.scene.render()

  def clip_scores(self):
    self.render_scene()
    pil_images = [get_pil_image(img[..., :3]) for img in self.images]
    return clip_sim_3(pil_images, self.GUIDE_STRING)

  def limit_action(self, action, i):
    limited = False
    translation_result = [a+b for a,b in zip (list(self.agent_mapping.values())[i].position, action)]
    for i, val in enumerate(translation_result):
        if val < self.limit_box[0][i]:
            limited = True
        elif val > self.limit_box[1][i]:
            limited = True
    return limited

  def perform_test(self, a, b):
    stat, p = mannwhitneyu(a, b, alternative='greater',method='exact')
    return stat, p

  def get_reward(self, sim_matrix) -> int:
    rw = 0

    stat_best, p_best = self.perform_test(sim_matrix, self.best_sim_matrix)
    stat_prev, p_prev = self.perform_test(sim_matrix, self.prev_sim_matrix)
    # tmax = 20
    # tmin = 0
    # stat_best = (stat_best - tmin)/(tmax-tmin)
    # stat_prev = (stat_prev - tmin)/(tmax-tmin)

    if p_best <= self.p_threshold:
      self.best_sim_matrix = sim_matrix
      self.best["images"] = self.images
      self.best["scene"] = self.scene.scene

    rw += 1-p_best
    rw += 1-p_prev

    rw = (2 *(rw - -2)/(2- -2)) - 1

    self.prev_sim_matrix = sim_matrix

    return rw


  def take_action(self, i, action):
    value = action
    action = self.actions_map[action]
    limits = self.limit_action(action[:], i)
    if limits:
      self.limited[i] = 0.0
    else:
      self.agent_mapping[self.agents[i]].translate(action)


  def reset(self, seed=None, options=None):
    """
    Reset needs to initialize the `agents` attribute and must set up the
    environment so that render(), and step() can be called without issues.

    Here it initializes the `num_moves` variable which counts the number of
    hands that are played.

    Returns the observations for each agent
    """
    self.limited = np.ones((len(self.agents))).astype(np.float32)
    self.best_sim_matrix = np.zeros(self.num_cameras).astype(np.float32)
    self.prev_sim_matrix = np.zeros(self.num_cameras).astype(np.float32)
    self.rounds = 0
    return {agent: np.asarray([0,0,0]).astype(np.float32) for agent in self.possible_agents}

  def step(self, actions):
    """
    step(action) takes in an action for each agent and should return the
    - observations
    - rewards
    - dones
    - infos
    dicts where each dict looks like {agent_1: item_1, agent_2: item_2}
    """
    done = False
    self.rounds +=1 

    # If a user passes in actions with no agents, then just return empty observations, etc.
    if not actions:
        self.agents = []
        return {}, {}, {}, {}

    if self.limit != None:
      env_done = self.rounds >= self.limit
    else:
      env_done = False
    dones = {agent: env_done for agent in self.agents}

    for i in range(len(self.possible_agents)):
      self.take_action(i, actions[self.agents[i]])

    # current observation is just the other player's most recent action
    observations = {agent: self.agent_mapping[agent].position  for agent in self.possible_agents}

    sim_matrix = np.asarray(self.clip_scores()).astype(np.float32)

    overall_reward = self.get_reward(sim_matrix)

    rewards = {self.agents[i]: overall_reward * self.limited[i] for i in range(len(self.agents)) }

    self.limited = np.ones((len(self.agents))).astype(np.float32)
    # typically there won't be any information in the infos, but there must
    # still be an entry for each agent
    infos = {agent: {} for agent in self.agents}

    # infos["images"] = self.images
    # infos["best"] = self.best

    if env_done:
        self.agents = []

    return observations, rewards, dones, infos

In [19]:
fruit_mesh = load_mesh(f"{DATA_DIR}/fruit_mesh/pear_export.obj", torch.device("cuda"))
table_mesh = load_mesh(f"{DATA_DIR}/table_mesh/GenericClassicTable001.obj", torch.device("cuda"))

In [20]:
fruit_object = SceneObject(fruit_mesh, scale=0.25)
fruit_object2 = SceneObject(fruit_mesh, scale=0.25)
table_object = SceneObject(table_mesh, scale=8)
table_object.position=[0,0,0]

In [21]:
env = RenderEnv([fruit_object,fruit_object2, table_object], "Pieces of fruit on top of a wooden table")
max_cycles = 2

for step in range(max_cycles):
  actions = {agent: Discrete(len(env.actions_map)).sample() for agent in env.agents}
  observations, rewards, dones, infos = env.step(actions)
  print(observations)
env.reset()

{'object_0': [0.184, 0.436, -0.201], 'object_1': [-0.216, 0.236, 0.199], 'object_2': [-0.2, 0.2, -0.2]}
{'object_0': [-0.016, 0.636, -0.401], 'object_1': [-0.416, 0.236, -0.001], 'object_2': [0.0, 0.4, -0.0]}


{'object_0': array([0., 0., 0.], dtype=float32),
 'object_1': array([0., 0., 0.], dtype=float32),
 'object_2': array([0., 0., 0.], dtype=float32)}

In [22]:
from torch import cuda
cuda.empty_cache()

In [None]:
# image_grid(env.best["images"], rows=1, cols=4, rgb=True)

In [None]:
# plot_batch_individually(env.scene.scene[0])

In [None]:
# import supersuit as ss
# from stable_baselines3 import PPO, A2C

# env = RenderEnv([fruit_object,fruit_object2, table_object], "Pieces of fruit on top of a wooden table")

# vec_env = ss.pettingzoo_env_to_vec_env_v1(env)

# n_envs = 3

# parallel_env = ss.concat_vec_envs_v1(vec_env, n_envs, num_cpus=1, base_class='stable_baselines3')


# n_steps = 64
# total_timesteps = (n_steps * n_envs) * 3
# train_model = PPO('MlpPolicy', parallel_env, verbose=1, n_steps=n_steps).learn(n_eval_episodes=64, total_timesteps=total_timesteps)

In [19]:
from torch import cuda
cuda.empty_cache()

In [23]:
import ray
import ray.rllib.agents.ppo as ppo

ray.shutdown()
ray.init(ignore_reinit_error=True)

RayContext(dashboard_url='', python_version='3.7.13', ray_version='1.13.0', ray_commit='e4ce38d001dbbe09cd21c497fedd03d692b2be3e', address_info={'node_ip_address': '172.28.0.2', 'raylet_ip_address': '172.28.0.2', 'redis_address': None, 'object_store_address': '/tmp/ray/session_2022-08-20_10-34-43_496547_90/sockets/plasma_store', 'raylet_socket_name': '/tmp/ray/session_2022-08-20_10-34-43_496547_90/sockets/raylet', 'webui_url': '', 'session_dir': '/tmp/ray/session_2022-08-20_10-34-43_496547_90', 'metrics_export_port': 45327, 'gcs_address': '172.28.0.2:64980', 'address': '172.28.0.2:64980', 'node_id': 'c14397e05d1b0843c669ad541546cf340b5f9aa5e81594a50dbd58d0'})