In [1]:
import os
import sys
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, parent_dir)

parent_dir = os.path.abspath(os.path.join(os.getcwd().split('render')[0],'gaussian_data'))
sys.path.insert(0, parent_dir)

from gaussian_data.Camera import Camera
from gaussian_data.Frame import Frame
import gaussian_data.Plotters
import gaussian_data.Utils as Utils
from GaussianSplat import GaussianSplat
import numpy as np
from plyfile import PlyData
import numpy as np
import argparse
from io import BytesIO
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import scipy.io
import pandas as pd
from Render import Render

%matplotlib qt

# load hull
path = 'I:/My Drive/Research/gs_data/mov19_2022_03_03/'
real_coord = scipy.io.loadmat(f'{path}/3d_pts/real_coord.mat')['all_coords']
points_3d = {body_wing : pd.DataFrame(Utils.load_hull(body_wing,path),columns = ['X','Y','Z','frame']) for body_wing in ['body','rwing','lwing']}
# initilize objects
frames = [1408]

image_names,points_in_idx = Utils.define_frames(frames,points_3d)
cameras = {f'cam{cam + 1}':Camera(path,cam) for cam in range(4)}
frames = {f'{im_name}.jpg':Frame(path,im_name,points_in_idx[im_name.split('CAM')[0]],real_coord,idx) for idx,im_name in enumerate(image_names)}
# map 3d voxels to 2d pixels
[frames[im_name].map_3d_2d(croped_image = True) for im_name in frames.keys()]
voxel_dict,colors_dict = Utils.get_dict_for_points3d(frames)


In [2]:
def build_rotation(r):
    norm = np.sqrt(r[:,0]*r[:,0] + r[:,1]*r[:,1] + r[:,2]*r[:,2] + r[:,3]*r[:,3])

    q = r / norm[:, None]

    R = np.zeros((q.shape[0], 3, 3))

    r = q[:, 0]
    x = q[:, 1]
    y = q[:, 2]
    z = q[:, 3]

    R[:, 0, 0] = 1 - 2 * (y*y + z*z)
    R[:, 0, 1] = 2 * (x*y - r*z)
    R[:, 0, 2] = 2 * (x*z + r*y)
    R[:, 1, 0] = 2 * (x*y + r*z)
    R[:, 1, 1] = 1 - 2 * (x*x + z*z)
    R[:, 1, 2] = 2 * (y*z - r*x)
    R[:, 2, 0] = 2 * (x*z - r*y)
    R[:, 2, 1] = 2 * (y*z + r*x)
    R[:, 2, 2] = 1 - 2 * (x*x + y*y)
    return R


def build_scaling_rotation(s, r):
    L = np.zeros((s.shape[0], 3, 3))
    R = build_rotation(r)

    L[:,0,0] = s[:,0]
    L[:,1,1] = s[:,1]
    L[:,2,2] = s[:,2]

    L = R @ L
    return L

In [3]:

def homogeneous(points):
    """
    homogeneous points
    :param points: [..., 3]
    """
    return np.column_stack((points, np.ones(points.shape[0])))

def homogeneous_vec(vec, vectoadd = [0,0]):
    """
    homogeneous points
    :param points: [..., 3]
    """
    return np.concatenate((vec,np.tile(np.array([vectoadd]).T,(vec.shape[0],1,1))),axis = 2)


In [4]:

def getProjectionMatrix(znear, zfar, fovX, fovY):
    import math
    tanHalfFovY = math.tan((fovY / 2))
    tanHalfFovX = math.tan((fovX / 2))

    top = tanHalfFovY * znear
    bottom = -top
    right = tanHalfFovX * znear
    left = -right

    P = np.zeros((4, 4))

    z_sign = 1.0

    P[0, 0] = 2.0 * znear / (right - left)
    P[1, 1] = 2.0 * znear / (top - bottom)
    P[0, 2] = (right + left) / (right - left)
    P[1, 2] = (top + bottom) / (top - bottom)
    P[3, 2] = z_sign
    P[2, 2] = z_sign * zfar / (zfar - znear)
    P[2, 3] = -(zfar * znear) / (zfar - znear)
    return P


def focal2fov(focal, pixels):
    import math
    return 2*math.atan(pixels/(2*focal))


def get_inputs(num_points=8):
    length = 0.5
    num_points = 8
    x = np.linspace(-1, 1, num_points) * length
    y = np.linspace(-1, 1, num_points) * length
    x, y = np.meshgrid(x, y)
    means3D = np.stack([x,y, 0 * np.random.rand(*x.shape)], axis=-1).reshape(-1,3)
    quats = np.tile(np.zeros((1,4)),(len(means3D), 1))
    quats[..., 0] = 1.
    scale = length /(num_points-1)
    scales = np.tile(np.ones((1,3)),(len(means3D), 1))*scale
    return means3D, scales, quats

def get_cameras():    
    intrins = np.array([[711.1111,   0.0000, 256.0000,   0.0000],
               [  0.0000, 711.1111, 256.0000,   0.0000],
               [  0.0000,   0.0000,   1.0000,   0.0000],
               [  0.0000,   0.0000,   0.0000,   1.0000]])
    c2w = np.array([[-8.6086e-01,  3.7950e-01, -3.3896e-01,  6.7791e-01],
         [ 5.0884e-01,  6.4205e-01, -5.7346e-01,  1.1469e+00],
         [ 1.0934e-08, -6.6614e-01, -7.4583e-01,  1.4917e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]])

    width, height = 512, 512
    focal_x, focal_y = intrins[0, 0], intrins[1, 1]
    viewmat = np.linalg.inv(c2w).T
    FoVx = focal2fov(focal_x, width)
    FoVy = focal2fov(focal_y, height)
    projmat = getProjectionMatrix(znear=0.2, zfar=1000, fovX=FoVx, fovY=FoVy).T
    projmat = viewmat @ projmat
    return intrins, viewmat, projmat, height, width


def intersection_point(pixel,T):
    k = -T[..., 0] + pixel[0]*T[...,3]
    l = -T[..., 1] + pixel[1] * T[..., 3]
    points = np.cross(k, l, axis=-1)
    return points[..., :2] / points[..., -1:]

In [43]:
rotations = build_scaling_rotation(scales, quats)
scales
quats

array([[0.9925334 , 0.78650045, 0.44547302, 0.32422578],
       [0.35393873, 0.39494005, 0.767374  , 0.6268835 ],
       [0.12577748, 0.3875593 , 0.20494738, 0.5106863 ],
       ...,
       [0.5579117 , 0.05666701, 0.6204774 , 0.8053967 ],
       [0.60280967, 0.66645724, 1.0965135 , 0.51852083],
       [0.6569601 , 0.04838631, 0.3349766 , 0.14912188]], dtype=float32)

In [47]:
# Surface splatting (2D Gaussian Splatting)
def setup(means3D, scales, quats, opacities, colors, viewmat, projmat):
    rotations = build_scaling_rotation(scales, quats)

   
    # 1. Viewing transform
    # Eq.4 and Eq.5
    p_view = (means3D @ viewmat[:3,:3]) + viewmat[-1:,:3] # rotate the gaussian mean to camera FoR
    uv_view = (rotations @ viewmat[:3,:3]) # rotate to camera FoR

    # M is H matrix that representes the transformation from tangent plane to camera. 
    # its the scaled axes concatenated to the gaussian mean location - represented in homogeneous coordinates

    # !! need to check that the order of axes ar ok for M !!
    M = np.concatenate((homogeneous_vec(uv_view[:,:2,:]),homogeneous(p_view)[:,np.newaxis]),axis = 1)
    T = M @ projmat # T stands for (WH)^T in Eq.9 - projmat transforms from camera to NDC (screen coordinates)
    # T is the transformation of every gaussian from tangent plane to NDC, its homogebnus coordinates. with the rotation matrix 
    # representing the axes and the translation vector representing the location of the center of each gaussian. 
    # We notice that projmat is a prespective projection matrix. 

    # Next, We calculate the radius of the gaussian. We normalize by w to get homogeneus coordinates. In addition we flip Z axis (not sure why) 
    # we calculate the distance from the camera to the gaussian mean (this is w, the last row of a homogenues coordinate, deviding by it will give perspective view)
    # Notice that the rotation is scaled (in build_scaling_rotation) and is not normalized.

    # point_image - the projectes mean of the gaussian (with flipped z)
    # half_extend - used to calculate the radius of the gaussian, we take 3 sigma. because the ratation is scaled 
    # we calculate the distance for each axis and can get the 3 sigma by multiplying each distance. (we also devide by w to get the prespective view)

    temp_point = np.tile([1,1,-1],(T.shape[0],1))
    distance  = np.sum(temp_point*T[..., 3] * T[..., 3],-1)
    f = (1 / distance[:,np.newaxis]) * temp_point

    # distance = (temp_point * (T[..., 3] * T[..., 3])).sum(dim=-1, keepdims=True)
    point_image = np.column_stack((np.sum(f * T[..., 0] * T[...,3],1),np.sum(f * T[..., 1] * T[...,3],1),np.sum(f * T[..., 2] * T[...,3],1)))





    axes_dist = np.column_stack((np.sum(f * T[..., 0] * T[...,0],1),np.sum(f * T[..., 1] * T[...,1],1),np.sum(f * T[..., 2] * T[...,2],1)))

    half_extend = point_image * point_image - axes_dist
    radii = np.sqrt(np.maximum(half_extend, 1e-4)) * 3
    center = point_image

    # 3. Perform Sorting
    depth = p_view[..., 2] # depth is used only for sorting
    index = np.argsort(depth)
    T = T[index]
    colors = colors[index]
    center = center[index]
    depth = depth[index]
    radii = radii[index]
    return T, colors, opacities, center, depth, radii,half_extend

In [50]:
T, colors, opacities, center, depth, radii,he = setup(means3D, scales, quats, opacity, colors, viewmat, projmat)


In [54]:
he

array([[ 3.46448593e-01,  1.88572573e+00,  0.00000000e+00],
       [ 4.97534301e-01,  1.76865442e+00, -1.11022302e-16],
       [ 1.28061808e+00,  3.87049703e-01,  0.00000000e+00],
       ...,
       [ 3.83391108e-01,  2.70979787e+00,  0.00000000e+00],
       [ 4.14285464e-01,  1.96007882e+00,  0.00000000e+00],
       [ 4.87262153e-01,  1.12683738e+00, -1.11022302e-16]])

In [55]:
np.maximum(he, 1e-4)

array([[3.46448593e-01, 1.88572573e+00, 1.00000000e-04],
       [4.97534301e-01, 1.76865442e+00, 1.00000000e-04],
       [1.28061808e+00, 3.87049703e-01, 1.00000000e-04],
       ...,
       [3.83391108e-01, 2.70979787e+00, 1.00000000e-04],
       [4.14285464e-01, 1.96007882e+00, 1.00000000e-04],
       [4.87262153e-01, 1.12683738e+00, 1.00000000e-04]])

In [53]:
radii2 = np.sqrt(np.maximum(he, 1e-4)) * 3
radii2

array([[1.76579652, 4.11965187, 0.03      ],
       [2.11608334, 3.98972302, 0.03      ],
       [3.39493191, 1.86639956, 0.03      ],
       ...,
       [1.85755753, 4.93843911, 0.03      ],
       [1.93095033, 4.20008445, 0.03      ],
       [2.09412497, 3.1845779 , 0.03      ]])

In [6]:

def alpha_blending(alpha, colors):
    # Calculate cumulative alpha blending weights
    T = np.concatenate([np.ones_like(alpha[-1:]), np.cumprod(1 - alpha, axis=0)[:-1]], axis=0)
    
    # Blend the colors using the alpha values and the cumulative transparency
    image = np.sum(T * alpha * colors, axis=0).reshape(-1, colors.shape[-1])
    
    # Accumulate alpha values for the alpha map
    alphamap = np.sum(T * alpha, axis=0).reshape(-1, 1)
    
    return image, alphamap

def alpha_blending_with_gaussians(dist2, colors, opacities, depth_acc, H, W):
    # Reshape colors and depth accumulation for proper broadcasting
    colors = colors.reshape(-1, 1, colors.shape[-1])
    depth_acc = depth_acc.T[..., None]
    depth_acc = np.repeat(depth_acc, 1, axis=2)  # equivalent to repeating across the 3rd axis

    # Evaluate Gaussians (using cutoff for visualization purposes)
    cutoff = 1#1**2  # Equivalent to 1 sigma^2
    dist2 = dist2.T  # Transpose to match the expected shape
    gaussians = np.exp(-0.5 * dist2) * (dist2 < cutoff)
    gaussians = gaussians[..., None]  # Add an extra dimension for broadcasting
    alpha = opacities[:, None] * gaussians  # Opacities expanded to match the shape of Gaussians

    # Accumulate Gaussians through alpha blending
    image, _ = alpha_blending(alpha, colors)
    depthmap, alphamap = alpha_blending(alpha, depth_acc)
    
    # Normalize depthmap by the alpha map
    depthmap = depthmap / alphamap
    depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)  # Handle NaN and infinity values
    
    return image.reshape(H, W, -1), depthmap.reshape(H, W, -1)

In [38]:
# Make inputs
import matplotlib
import sh_utils


input_file = "I:/My Drive/Research/gs_data/mov19_2022_03_03/time/time2_norm01_dist1000000_iter200_start100_moreFr/1408/point_cloud/iteration_200/point_cloud.ply"

vertices = PlyData.read(input_file)["vertex"]


means3D = np.column_stack((vertices['x'],vertices['y'],vertices['z']))
viewmat = frames['P1408CAM1.jpg'].world_to_cam.T



sh = np.column_stack([vertices[key] for key in vertices.data.dtype.names if 'rest' in key or 'dc' in key])
colors = sh_utils.rgb_from_sh(0,sh,xyz = None,camera_position = None)
opacity = 1 / (1 + np.exp(-vertices["opacity"]))[:,np.newaxis]
quats = np.column_stack([(vertices[f'rot_{idx}']) for idx in range(4)])
s = np.column_stack(([(vertices[f'scale_{idx}']) for idx in range(2)]))
s = np.exp(np.column_stack(([vertices["scale_0"], vertices["scale_1"]])))
scales = np.column_stack((s,vertices['scale_0']*0))
projmat = frames['P1408CAM1.jpg'].full_proj_transform
intrins = frames['P1408CAM1.jpg'].K_crop



In [44]:
frames['P1408CAM1.jpg'].K_crop

array([[-5.30842080e+03,  3.90521789e-08,  1.38258101e+02],
       [ 0.00000000e+00, -5.33634778e+03,  1.73681823e+02],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

In [8]:
# Make inputs
import matplotlib
num_points1=8
means3D, scales, quats = get_inputs(num_points=num_points1)
scales[:,2] = scales[:,2]*0
intrins, viewmat, projmat, height, width = get_cameras()
intrins = intrins[:3,:3]
colors = matplotlib.colormaps['Accent'](np.random.randint(1,64, 64)/64)[..., :3]

opacity = np.ones(means3D[:,:1].shape)


In [32]:
means3D

array([[-4.87189414e-03, -8.92511383e-03, -6.41715669e-05],
       [-4.87817964e-03, -9.02052503e-03, -1.39234980e-04],
       [-4.83089685e-03, -8.93305335e-03, -1.03303864e-04],
       ...,
       [ 9.76864321e-05, -7.71776587e-03, -1.43542327e-03],
       [ 8.52383382e-05, -7.75804883e-03, -1.33686094e-03],
       [ 1.36520335e-04, -7.80833233e-03, -1.39167847e-03]], dtype=float32)

In [45]:
# Rasterization setup
projmat = np.zeros((4,4))
projmat[:3,:3] = intrins
projmat[-1,-2] = 1.0
projmat = projmat.T
T, colors, opacities, center, depth, radii = setup(means3D, scales, quats, opacity, colors, viewmat, projmat)
plt.figure(),plt.plot(depth)

(<Figure size 640x480 with 1 Axes>,
 [<matplotlib.lines.Line2D at 0x1f60c36d600>])

In [46]:
radii

array([[2.17467273, 3.68131276, 0.03      ],
       [2.46709429, 3.28792125, 0.03      ],
       [3.05154894, 2.86103645, 0.03      ],
       ...,
       [3.21051175, 5.61257962, 0.03      ],
       [3.87759254, 3.84584565, 0.03      ],
       [3.48260063, 5.92005488, 0.03      ]])

In [10]:

# Rasterization setup
projmat = np.zeros((4,4))
projmat[:3,:3] = intrins
projmat[-1,-2] = 1.0
projmat = projmat.T
T, colors, opacities, center, depth, radii = setup(means3D, scales, quats, opacity, colors, viewmat, projmat)

# Rasterization
# 1. Generate pixels
W, H = 80,160#int(intrins[0, -1] * 2), int(intrins[1, -1] * 2)
pix_x, pix_y = np.meshgrid(np.arange(W), np.arange(H), indexing='xy')
pix = np.stack([pix_x, pix_y], axis=-1)

# 2. Compute ray splat intersection # Eq.9 and Eq.10
pix_flat = pix.reshape(-1, 1, 2)
pixels = np.squeeze(pix_flat)
x,y = pixels[:,0],pixels[:,1]
s = np.vstack([intersection_point(pixel,T) for pixel in pixels])


# 3. Add 
# -pass filter # Eq.11
# When a point (2D Gaussian) is viewed from a far distance or at a slanted angle,
# the 2D Gaussian falls between pixels, and no fragment is used to rasterize the Gaussian.
# Add a low-pass filter to handle aliasing.
dist3d = np.sum(s * s, axis=-1)
dist3d = np.reshape(dist3d,(len(pixels),T.shape[0]))

filtersze = np.sqrt(2) / 2
dist_xycenter = np.hstack([pixel - center[None, :, :2] for pixel in pixels])
dist2d = (1 / filtersze) ** 2 * np.linalg.norm(dist_xycenter, axis=-1) ** 2
dist2d = np.reshape(dist2d,(len(pixels),T.shape[0]))
# Min of dist2 is equal to max of Gaussian exp(-0.5 * dist2)
dist2 = np.minimum(dist3d, dist2d)

depth_acc = np.sum(homogeneous(s)*np.tile(T[..., -1],(len(pixels),1)),axis = 1)
depth_acc = np.reshape(depth_acc,(len(pixels),T.shape[0]))
image, depthmap = alpha_blending_with_gaussians(dist2, colors, opacities, depth_acc, H, W)


invalid value encountered in divide



In [28]:

# Reshape colors and depth accumulation for proper broadcasting
colors = colors.reshape(-1, 1, colors.shape[-1])
depth_acc = depth_acc.T[..., None]
depth_acc = np.repeat(depth_acc, 1, axis=2)  # equivalent to repeating across the 3rd axis

# Evaluate Gaussians (using cutoff for visualization purposes)
cutoff = 1#1**2  # Equivalent to 1 sigma^2
dist2 = dist2.T  # Transpose to match the expected shape
gaussians = np.exp(-0.5 * dist2) * (dist2 < cutoff)
gaussians = gaussians[..., None]  # Add an extra dimension for broadcasting
alpha = opacities[:, None] * gaussians  # Opacities expanded to match the shape of Gaussians


In [37]:
gaussians[gaussians > 0]

array([0.76463841, 0.61487502, 0.95106797, ..., 0.68078601, 0.90185044,
       0.92000735])

In [17]:
plt.plot(opacities)

[<matplotlib.lines.Line2D at 0x1f59b2f76d0>]

In [26]:



image

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       ...,

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [48]:
gaussians = np.exp(-0.5 * dist2)
gaussians[gaussians>0.005]

array([0.00716238, 0.06831398, 0.05433685, ..., 0.92000735, 0.06057291,
       0.28610291])

In [395]:
import numpy as np
import plotly.graph_objects as go
import numpy as np
import plotly.graph_objects as go

def generate_ellipsoid_points(radius_x, radius_y, radius_z, num_points=50):
    """
    Generate points on an ellipsoid surface.
    Parameters:
        radius_x, radius_y, radius_z: Radii along the X, Y, and Z axes.
        num_points: Resolution of the ellipsoid mesh.
    Returns:
        x, y, z: Coordinates of the ellipsoid surface points.
    """
    u = np.linspace(0, 2 * np.pi, num_points)
    v = np.linspace(0, np.pi, num_points)
    u, v = np.meshgrid(u, v)

    x = radius_x * np.cos(u) * np.sin(v)
    y = radius_y * np.sin(u) * np.sin(v)
    z = radius_z * np.cos(v)

    return x, y, z

def plot_ellipsoids_in_3d(means3D, axes_lengths, colors):
    """
    Plot ellipsoids in 3D using Plotly.
    Parameters:
        means3D: Centers of the ellipsoids.
        axes_lengths: Array of shape (N, 3), each row contains the (radius_x, radius_y, radius_z) for an ellipsoid.
        colors: RGB colors for each ellipsoid.
    """
    fig = go.Figure()

    for i, (center, axes, color) in enumerate(zip(means3D, axes_lengths, colors)):
        # Generate points for the ellipsoid
        x, y, z = generate_ellipsoid_points(*axes)
        
        # Translate the ellipsoid to its 3D center
        x += center[0]
        y += center[1]
        z += center[2]

        # Add the ellipsoid as a 3D mesh
        fig.add_trace(go.Mesh3d(
            x=x.flatten(), y=y.flatten(), z=z.flatten(),
            alphahull=0,
            opacity=0.6,
            color=f'rgb({color[0]}, {color[1]}, {color[2]})',
            name=f'Ellipsoid {i+1}'
        ))

    # Add Gaussian centers as markers
    fig.add_trace(go.Scatter3d(
        x=means3D[:, 0], y=means3D[:, 1], z=means3D[:, 2],
        mode='markers',
        marker=dict(size=5, color='black'),
        name='Gaussian Centers'
    ))

    # Update layout
    fig.update_layout(
        scene=dict(
            xaxis_title='X',
            yaxis_title='Y',
            zaxis_title='Z',
        ),
        title='3D Gaussian Ellipsoids'
    )

    fig.show()



# Example Usage
# Replace these with your actual data

import matplotlib
num_points1=8
means3D, scales, quats = get_inputs(num_points=num_points1)
scales[:,2] = scales[:,2]*0
intrins, viewmat, projmat, height, width = get_cameras()
intrins = intrins[:3,:3]
colors = matplotlib.colormaps['Accent'](np.random.randint(1,64, 64)/64)[..., :3]

opacity = np.ones(means3D[:,:1].shape)

T, colors, opacities, center, depth, radii = setup(means3D, scales, quats, opacity, colors, viewmat, projmat)
idx = (radii <1).all(1)
plot_ellipsoids_in_3d(means3D[idx,:], radii[idx,:],colors[idx,:])


In [398]:
radii.shape

(64, 3)

## 2DGS - Math explained
### Modeling: 
The 2D Gaussian is defined in terms of its local coordinate system, where the $ X$ and $ Y$ axes are scaled according to the Gaussian's shape. The Gaussian is represented in the **world coordinates**, with the **axes** of the Gaussian in world space defined by the tangential vectors $ \mathbf{t_u} $ and $ \mathbf{t_v} $, and its **scaling factors** by $ s_u $ and $ s_v $. These tangential vectors define the directions of the local coordinate axes in the tangent plane (the object FoR).

The **object plane**, which is the tangent plane to the Gaussian in world space, can be described using the plane equation:
$$
P(u, v) = p_k + s_u \mathbf{t_u} u + s_v \mathbf{t_v} v
$$
where $ p_k $ is the center of the Gaussian in world coordinates, and $ u $ and $ v $ represent local coordinates on the tangent plane. This equation expresses the position of any point in the tangent plane in terms of the world coordinates, as modified by the scaling along the tangential axes $ \mathbf{t_u} $ and $ \mathbf{t_v} $.

To transform the local coordinates in the tangent plane (object frame of reference) to world coordinates, we define the following transformation matrix $ H $, which encodes the **scaling**, **rotation**, and **translation** from the object space to the world space:
$$
H = \begin{pmatrix}
s_u \mathbf{t_u} & s_v \mathbf{t_v} & 0 & p_k \\
0 & 0 & 0 & 1
\end{pmatrix}
= \begin{pmatrix}
R & p_k \\
0 & 1
\end{pmatrix}
$$
where:
- $ \mathbf{t_u} = \begin{pmatrix} t_{u_x} \\ t_{u_y} \\ t_{u_z} \end{pmatrix} $ and $ \mathbf{t_v} = \begin{pmatrix} t_{v_x} \\ t_{v_y} \\ t_{v_z} \end{pmatrix} $ are the 3D tangential vectors (defining the axes of the tangent plane in world coordinates),
- $ p_k = \begin{pmatrix} p_{k_x} \\ p_{k_y} \\ p_{k_z} \end{pmatrix} $ is the 3D position of the Gaussian center in world coordinates, and
- $ R $ is the 3x3 rotation matrix that describes the orientation of the tangent plane in world space.

The matrix $ H $ can be interpreted in two parts:
- The first part, $ \begin{pmatrix} s_u \mathbf{t_u} & s_v \mathbf{t_v} & 0 \end{pmatrix} $, represents the scaling and rotation of the axes in the world coordinates. It scales the tangential vectors $ \mathbf{t_u} $ and $ \mathbf{t_v} $ by $ s_u $ and $ s_v $, respectively, and applies any rotational transformation.
- The second part, $ p_k $, represents the **translation** of the Gaussian's center in world coordinates, which shifts the origin of the local tangent plane to the desired world position.

The matrix can also be interpreted in terms of the rotation matrix $ R $ (which describes the orientation of the tangent plane in world space) and the translation vector $ p_k $. The transformation from local coordinates $ (u, v) $ to world coordinates $ (x, y, z) $ is thus achieved through the multiplication of $ H $ by the local coordinate vector:
$$
\begin{pmatrix}
x \\
y \\
z \\
1
\end{pmatrix}
= H \begin{pmatrix}
u \\
v \\
1 \\
1
\end{pmatrix}
$$

This transformation allows us to map a point in the tangent plane (object space) to the corresponding point in the world space, accounting for both the Gaussian's shape (through the scaling $ s_u, s_v $) and its orientation (through the rotation $ R $).

In summary, the matrix $ H $ serves as a **homogeneous transformation** that encapsulates both the geometric properties (scaling and rotation) and the positioning (translation) of the 2D Gaussian in world space.

for every u,v on the tangent plane we can calculate the gaussian power: 
$$ G(u) = \exp\left( -\frac{u^2 + v^2}{2} \right) \tag{6} $$

* $ \mathbf{t_u} $ , $ \mathbf{t_v} , $ $\mathbf{s_u} $ and $\mathbf{s_v}$ are learnable parameters.
* each gaussian is defined by the opacity $\alpha$ and a view dependent color.

### Splatting


The goal of this process is to find the intersection between a ray, originating from a pixel in the image, and the tangent plane of a 2D Gaussian, and then evaluate the power of the Gaussian at the intersection point. Below are the detailed steps.

Step 1: Defining the Image Ray 

We begin by defining the ray from a pixel in the image using two orthogonal planes. The pixel location $(x, y) $ in the screen space can be used to define two planes:


* The x-plane (yz): a plane defined by a normal vector $ \mathbf{n}_x = (-1, 0, 0) $ and an offset $ x $. The 4D homogeneous form is $ h_x = (-1, 0, 0, x) $.
* The y-plane (xz): a plane defined by a normal vector $ \mathbf{n}_y = (0, -1, 0) $ and an offset $ y $. The 4D homogeneous form is $ h_y = (0, -1, 0, y) $.


These planes intersect at a ray in 3D space that is represented as the line of intersection between the two planes.

Step 2: Transforming the Planes to the Tangent Plane

Next, we transform the planes from the image space into the local coordinates of the 2D Gaussian primitive (the tangent plane). Using the transformation matrix $ M = (WH)^{-1} $, we apply the inverse transpose to the planes (W is the prespective projection matrix (transformation from camera to screen space), H is a transformation from tangant plane to camera space):

multiplying W (camera to screen) by H (tangent plane to camera) will transform a coordinate from tangent plane to camera to screen. Transposing it will map from screen space to tangent plane 

$$
h_u = (WH)^\top h_x \\
h_v = (WH)^\top h_y
$$
* those are the planes rotated from image space to tangent space


- We can imagine a ray from the camera center through a pixel, The pixel is described in screen space (here we use NDC) which means we transform it twice. 
1. from NDC to camera coordinate
2. from camera coordinates to gaussian (objec) coordinates. 
* The ray stays the same, it is still begins in the camera center and pass through the pixel. After all transformations we have the ray as seen from the gaussian axes. 

Step 3: Solving for the Intersection

The next step is to solve for the intersection point $(u, v) $ of the ray with the transformed tangent plane. We do this by solving the following system of equations:

$$
h_u \cdot (u, v, 1, 1)^\top = 0 \\
h_v \cdot (u, v, 1, 1)^\top = 0
$$

Expanding these equations gives us:


$$
    h_1^u u + h_2^u v + h_3^u + h_4^u = 0 \\
    h_1^v u + h_2^v v + h_3^v + h_4^v = 0
$$


The solution for $ u $ and $ v $ is obtained by solving this system of equations. This leads to the following expressions for the coordinates $ u(x) $ and $ v(x) $:

$$
u = \frac{h_2^u h_4^v - h_4^u h_2^v}{h_1^u h_2^v - h_2^u h_1^v} \\
v = \frac{h_4^u h_1^v - h_1^u h_4^v}{h_1^u h_2^v - h_2^u h_1^v}
$$

Where $ h_i^u $ and $ h_i^v $ are the homogeneous parameters of the transformed planes.

Step 4: Evaluating the Gaussian at the Intersection

Once we have the coordinates $ (u, v) $, we can evaluate the 2D Gaussian function at the intersection point. The Gaussian function is typically of the form:

$$
G(u, v) = \exp\left( -\frac{u^2 + v^2}{2} \right)
$$

This represents the Gaussian value at the point $ (u, v) $ on the tangent plane.


Once the values of $ u(x) $ and $ v(x) $ are found, we can compute the depth of the intersection point using the following equation:
$$
x = (x_z, y_z, z, z)^\top = W P(u, v) = W H (u, v, 1, 1)^\top
$$
Here, $ W $ is the camera projection matrix, and $ H $ is the transformation matrix from the tangent plane to world coordinates. The last component of the resulting vector $ x = (xz, yz, z, z)^\top $ gives the depth $ z $ of the intersection point (the third element, need to make sure). 



#### Screen space
this transformation returns a 3d coordinate in screen space. 
, where rendering occures. the projection matrix transformes x,y,z to the screen space
* x axis spans from 0 to -1 (width)
* y axis spans from 0 to -1 (hight)
* Z_{NDC} represents the depth (need to check that)
from NDC we can map to pixels

#### Transforming planes
* A point in 3D homogeneous coordinates is represented as $ (x,y,z,w)
* a plane consist of both a normal and an offset - in 3D homogeneous coordinates it is represented as $(a,b,c,d)
where $(a,b,c) is the planes normal vector and $d$ is its offset. 

The plane equation : $ ax + by + cz + dw = 0 $
which means that that dot product of the plane parameters $ (a,b,c,d)$ and a point $(x,y,z,w)$ is zero for a point lying on the plane. 

** Transforming Point on a Plane **
we Define matrix $M$ - a transformation matrix. 
tansforming a point on the plain using $M$: $p'=M\cdot p$
after transforming it, the plane equation should still hold: $h'p' = 0$ were $h', p'$ are the transformed plane and point parameters.

Because $p' = M\cdot p$ we get that $h'(M\cdot p) = 0 $ and since $ h' \cdot(M\cdotp p)=M^T \cdot h' \cdot p $ we can write 
$$
M^T \cdot h' \cdot p = h\cdot p \\
M^T \cdot h' = h
$$

and from that we show that the plane $h'$ remains valid after transformation. 
