In [1]:
import math
import torch

class Camera:
    """
    For the camera there are four different coordinate systems (or spaces):
    - Camera space: We follow OpenGL coordinate system with the
        right-hand rule where we have +X points right, +Y points up and +Z
        point to the camera.
    - Cilp space: Any coordinate outside within the range is being clipped.
        We use this coordinate system as an input for the rasterizer. Note
        That we use homogeneous coordinates, hence the w value (x,y,z,w) is
        used for clipping -w < x,y,z < w. Note that we convert here from
        right-hand rule to left-hand rule, because +Z now points awy from
        the camera.
    - NDC space: This is the normalzied Clip space where each coordiante
        in the frustrum is between (-1,1). We don't performe the
        transformation ourselv we let OpenGL do that.
    - Screen space: The representation of the frustrum defined in pixel space
        instead of a normalzied space, where the top left pixel is (0,0) and
        the bottom right corner is (W,H).
    """

    def __init__(
        self,
        width: int,
        height: int,
        fov_y: float = 45.0,
        near: float = 0.01,
        far: float = 100.0,
        K: torch.Tensor | None = None,
        device: str = "cuda",
    ):
        self.width = width
        self.height = height
        self.near = near
        self.far = far
        self.fov_y = fov_y

        self.device = device

        if K is None:
            self.projection_matrix = self.fov_perspective_projection(
                fov_y=self.fov_y,
                width=self.width,
                height=self.height,
                near=self.near,
                far=self.far,
            )
        else:
            self.projection_matrix = self.intrinsics_perspective_projection(
                K=K,
                width=self.width,
                height=self.height,
                near=self.near,
                far=self.far,
            )


    def fov_perspective_projection(
        self,
        fov_y: float,
        width: int,
        height: int,
        near: float = 1.0,
        far: float = 100.0,
    ):
        """
        Returns:
            P: Perspective projection matrix, (4, 4)
                P = [
                        [2*n/(r-l), 0.0,        (r+l)/(r-l),    0.0         ],
                        [0.0,       2*n/(t-b),  (t+b)/(t-b),    0.0         ],
                        [0.0,       0.0,        -(f+n)/(f-n),   -(f*n)/(f-n)],
                        [0.0,       0.0,        -1.0,            0.0         ]
                    ]
        """
        deg2rad = math.pi / 180
        tan_fov_y = math.tan(fov_y * 0.5 * deg2rad)
        tan_fov_x = tan_fov_y * (width / height)
        top = tan_fov_y * near
        bottom = -top
        right = tan_fov_x * near
        left = -right
        z_sign = -1.0

        proj = torch.zeros([4, 4], device=self.device)

        proj[0, 0] = 2.0 * near / (right - left)
        proj[1, 1] = 2.0 * near / (top - bottom)
        proj[0, 2] = (right + left) / (right - left)
        proj[1, 2] = (top + bottom) / (top - bottom)
        proj[3, 2] = z_sign
        proj[2, 2] = z_sign * (far + near) / (far - near)
        proj[2, 3] = -(2.0 * far * near) / (far - near)
        return proj

    def intrinsics_perspective_projection(
        self,
        K: torch.Tensor,
        width: int,
        height: int,
        near: float = 1.0,
        far: float = 100.0,
    ):
        """
        Transform points from camera space (x: right, y: up, z: out) to clip space (x: right, y: up, z: in)

        For information check out the math:
        https://www.songho.ca/opengl/gl_projectionmatrix.html

        Args:
            K: Intrinsic matrix, (3, 3)
                K = [
                        [fx, 0, cx],
                        [0, fy, cy],
                        [0,  0,  1],
                    ]
        Returns:
            P: Perspective projection matrix, (4, 4)
                P = [
                        [2*fx/w, 0.0,     (w - 2*cx)/w,             0.0                     ],
                        [0.0,    2*fy/h,  (h - 2*cy)/h,             0.0                     ],
                        [0.0,    0.0,     -(far+near) / (far-near), -2*far*near / (far-near)],
                        [0.0,    0.0,     -1.0,                     0.0                     ]
                    ]
        """
        w = width
        h = height
        fx = K[0, 0]
        fy = K[1, 1]
        cx = K[0, 2]
        cy = K[1, 2]

        proj = torch.zeros([4, 4], device=self.device)
        proj[0, 0] = fx * 2 / w
        proj[1, 1] = fy * 2 / h
        proj[0, 2] = (w - 2 * cx) / w
        proj[1, 2] = (h - 2 * cy) / h
        proj[2, 2] = -(far + near) / (far - near)
        proj[2, 3] = -2 * far * near / (far - near)
        proj[3, 2] = -1
        return proj

    def convert_to_homo_coords(self, p: torch.Tensor):
        shape = list(p.shape)
        assert shape[-1] == 3
        shape[-1] = shape[-1] + 1
        p_homo = torch.ones(shape, device=p.device)
        p_homo[..., :3] = p
        return p_homo 

    def clip_transform(self, p_camera: torch.Tensor):
        return torch.matmul(p_camera, self.projection_matrix.T)

    def ndc_transform(self, p_camera: torch.Tensor):
        p_clip = self.clip_transform(p_camera)
        p_clip[..., 0] /= p_clip[..., 3] 
        p_clip[..., 1] /= p_clip[..., 3] 
        p_clip[..., 2] /= p_clip[..., 3] 
        p_clip[..., 3] /= p_clip[..., 3] 
        return p_clip
    
    def screen_transform(self, p_camera: torch.Tensor):
        p_ndc = self.ndc_transform(p_camera)
        depth = p_camera[..., 2]
        u = (p_ndc[..., 0] + 1) * 0.5 * self.width
        v = (p_ndc[..., 1] + 1) * 0.5 * self.height
        return torch.stack([u, v, depth], dim=-1)
    
    def unproject_points(self, xy_depth: torch.Tensor):
        """
        The x,y data contains the values in ndc coordinates space, hence they
        are between (-1, 1) and the depth is the value from the z-plane in
        camera space, hence the sign should be negative.

        xy_depth[i] = [x[i], y[i], depth[i]]
        """
        z_camera = xy_depth[..., 2] 
        p1 = self.projection_matrix[2, 2]
        p2 = self.projection_matrix[2, 3]
        p3 = self.projection_matrix[3, 2] 
        w_clip = p3 * z_camera
        z_ndc = (p1 * z_camera + p2) / w_clip
        p_ndc = torch.stack([xy_depth[..., 0], xy_depth[..., 1], z_ndc])
        p_ndc = self.convert_to_homo_coords(p_ndc)
        # convert back to clip space
        p_clip = p_ndc 
        p_clip[..., 0] *= w_clip  
        p_clip[..., 1] *= w_clip  
        p_clip[..., 2] *= w_clip  
        p_clip[..., 3] *= w_clip  
        # extract only x,y,z component
        p_camera = torch.matmul(p_clip, self.projection_matrix.inverse().T)
        return p_camera[..., :3]

In [None]:
from pathlib import Path
from lib.utils.loader import load_depth, load_intrinsics

# Index of the frame whose depth map is loaded below.
image_idx = 0
# NOTE(review): absolute, machine-specific path -- consider making the data
# root configurable so the notebook runs on other machines.
data_dir = "/home/borth/GuidedResearch/data/dphm_christoph_mouthmove"
# Camera intrinsics; a later cell passes K to Camera(K=...), which reads
# K[0, 0], K[1, 1], K[0, 2] and K[1, 2] (fx, fy, cx, cy).
# Presumably return_tensor="pt" yields a torch tensor -- TODO confirm.
K = load_intrinsics(data_dir, return_tensor="pt")
# Depth map of frame `image_idx`, loaded without smoothing.
# Presumably of shape (height, width) -- TODO confirm against load_depth.
depth = load_depth(data_dir, image_idx, return_tensor="pt", smooth=False)

In [None]:
# Build the camera from the loaded intrinsics. The field-of-view based
# alternative would be:
#   Camera(fov_y=45, width=1920, height=1080, near=0.01, far=100)
camera = Camera(width=1920, height=1080, near=0.01, far=100, K=K)

# Two homogeneous camera-space points (x, y, z, w=1) with negative z.
p_camera = torch.tensor(
    [
        [0.05, -0.16, -0.5, 1.0],
        [-0.1, 0.2, -0.3, 1.0],
    ],
    device="cuda",
)

# Display their NDC coordinates.
camera.ndc_transform(p_camera)

In [None]:
# Regular NDC grid with one sample per pixel: x runs from -1 to 1 along the
# columns, y runs from 1 down to -1 along the rows.
x_ndc = torch.linspace(-1, 1, camera.width)
y_ndc = torch.linspace(1, -1, camera.height)
y_grid, x_grid = torch.meshgrid(y_ndc, x_ndc, indexing="ij")
xy_ndc = torch.stack((x_grid, y_grid), dim=-1)

# Attach the negated depth as third channel, since unproject_points expects
# the camera-space z, which is negative in front of the camera.
# Assumes depth is (height, width) so it lines up with the grid -- TODO confirm.
xy_depth = torch.cat((xy_ndc, -depth[..., None]), dim=-1).to("cuda")

p_camera = camera.unproject_points(xy_depth=xy_depth)
p_camera

In [None]:
# Sanity check: unproject a single (x_ndc, y_ndc, z_camera) triple.
xy_depth = torch.tensor([0.0948, -0.5283, -0.5], device="cuda")
# NOTE(review): despite its name, this holds the camera-space (x, y, z)
# returned by unproject_points, not homogeneous NDC coordinates.
p_ndc_homo = camera.unproject_points(xy_depth=xy_depth)
p_ndc_homo