In [5]:
from utils import *
from alignment import *

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Paths to the ShapeNetSem subset CSV, its image folder and the processed point clouds.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"
pc_dir = "Data/ProcessedData/PointClouds"

dataset = AlignedModalityDataset(dataset_path, image_dir, pc_dir)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Sanity check: pull one (shuffled) batch and report the shape/type of each modality.
# An empty loader prints nothing, exactly like a for-loop that never iterates.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    idx, token_text, image_tensor, point_cloud = first_batch
    print(idx)
    print(f"Tokenized Text: {token_text.shape}, {type(token_text)}")
    print(f"Image Tensor: {image_tensor.shape}, {type(image_tensor)}")
    print(f"Point Cloud: {point_cloud.shape}, {type(point_cloud)}")

tensor([198])
Tokenized Text: torch.Size([1, 77]), <class 'torch.Tensor'>
Image Tensor: torch.Size([1, 3, 518, 518]), <class 'torch.Tensor'>
Point Cloud: torch.Size([1, 1024, 3]), <class 'torch.Tensor'>


In [7]:
# Load the DINOv2, CLIP and Point-CLIP encoders and switch each one to eval mode.
# Every later cell depends on these three names, so a failure here must stop the
# notebook instead of being silently swallowed: the short message is still printed,
# then the original exception is re-raised so its traceback stays visible.
try:
    dinov2_encoder = load_dinov2()
    clip_encoder = load_clip()
    pclip_encoder = load_point_clip()
    dinov2_encoder.eval()
    clip_encoder.eval()
    pclip_encoder.eval()
    print('All Models loaded succesfully and set to eval mode')
except Exception:
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit pass through.
    print('Error in Loading Models')
    raise

Dinov2 Loaded Successfully!
CLIP Model Loaded Successfully!
Point CLIP Model Loaded Successfully!
All Models loaded succesfully and set to eval mode


In [8]:
# Compute device shared by every later cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Read the saved weights onto the CPU first, so the file loads regardless of
# which device it was written from.
checkpoint_path = "TrainedModels/Baseline/150.pth"
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# NOTE(review): the constructor argument (400) has to match the architecture the
# checkpoint was trained with, otherwise load_state_dict fails — confirm its meaning.
align_model = AlignEncoder(400)
align_model.load_state_dict(state_dict)

# Move to the target device and switch to evaluation mode for inference.
align_model.to(device)
align_model.eval()

print("Model weights loaded successfully!")


Model weights loaded successfully!


# Forward Pass

In [None]:
# Forward-pass smoke test: push one shuffled batch through the three encoders and
# the alignment head, then compute the pairwise contrastive losses.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"
pc_dir = "Data/ProcessedData/PointClouds"

# Set to True to print tensor shapes and the individual losses for the sampled batch.
SHOW_DEBUG = False

# Set up CLIP preprocessing (applied to the depth maps rendered from point clouds below)
preprocess = image_transform(
    clip_encoder.visual.image_size,  # Correct image size for CLIP
    is_train=False  # Ensures we use inference preprocessing
)

dataset = AlignedModalityDataset(dataset_path, image_dir, pc_dir)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
loss_fn = NTXentLoss(temperature=0.07)

with torch.no_grad():
    for batch in dataloader:
        idx, tokenized_text, image_tensor, point_cloud = batch
        tokenized_text = tokenized_text.to(device)  # (B, 77)
        image_tensor = image_tensor.to(device)  # (B, 3, 518, 518)
        point_cloud = point_cloud.to(device)  # (B, 1024, 3)

        # Convert each point cloud in the batch to a depth map, preprocess it and
        # stack the results along a new leading batch dimension.
        depth_maps = torch.stack(
            [preprocess(point_cloud_to_depth_map(cloud)) for cloud in point_cloud],
            dim=0,
        ).to(device)  # Shape: [batch_size, 3, H, W]

        text_emb = clip_encoder.encode_text(tokenized_text)  # expected (B, 768) — TODO confirm
        img_emb = dinov2_encoder(image_tensor)  # expected (B, 384) — TODO confirm
        pc_emb = pclip_encoder.encode_image(depth_maps)  # expected (B, 768) — TODO confirm
        text_proj, img_proj, pc_proj = align_model(text_emb, img_emb, pc_emb)

        # One contrastive loss per modality pair, averaged into a single scalar.
        loss_text_point = loss_fn(text_proj, pc_proj)
        loss_text_image = loss_fn(text_proj, img_proj)
        loss_image_point = loss_fn(img_proj, pc_proj)

        avg_loss = (loss_text_point + loss_text_image + loss_image_point) / 3

        if SHOW_DEBUG:
            print("Input shapes: ", tokenized_text.shape, image_tensor.shape, point_cloud.shape)
            print("Depth Map Shapes: ", depth_maps.shape)
            print("Embedding shapes: ", text_emb.shape, img_emb.shape, pc_emb.shape)
            print("Projection Shapes: ", text_proj.shape, img_proj.shape, pc_proj.shape)
            print("Loss: ", loss_text_point, loss_text_image, loss_image_point)
            print("Average Loss: ",avg_loss)

        # A single batch is enough for this check.
        break

<class 'torch.Tensor'>


# Cross-Modal Retrieval

In [37]:
# Inputs for the retriever: the dataset CSV and the embeddings file
# (presumably embeddings precomputed with the alignment model — confirm).
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
embed_path = "Embeddings/ALIGN/subset_template_200.pt"

# Construct the retrieval helper; the embeddings themselves are loaded in a later
# cell through cmr.load_embeddings().
cmr = CrossModalRetrival(dataset_path, embed_path)

In [30]:
# Embed one randomly drawn sample (batch_size=1) with all three encoders and the
# alignment head; the resulting projections serve as retrieval queries further down.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"
pc_dir = "Data/ProcessedData/PointClouds"

# Set up CLIP preprocessing (applied to the depth maps rendered from point clouds below)
preprocess = image_transform(
    clip_encoder.visual.image_size,  # Correct image size for CLIP
    is_train=False  # Ensures we use inference preprocessing
)

dataset = AlignedModalityDataset(dataset_path, image_dir, pc_dir)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

with torch.no_grad():
    for batch in dataloader:
        idx, tokenized_text, image_tensor, point_cloud = batch
        tokenized_text = tokenized_text.to(device)  # (B, 77)
        image_tensor = image_tensor.to(device)  # (B, 3, 518, 518)
        point_cloud = point_cloud.to(device)  # (B, 1024, 3)

        # Convert each point cloud in the batch to a depth map, preprocess it and
        # stack the results along a new leading batch dimension.
        depth_maps = torch.stack(
            [preprocess(point_cloud_to_depth_map(cloud)) for cloud in point_cloud],
            dim=0,
        ).to(device)  # Shape: [batch_size, 3, H, W]

        text_emb = clip_encoder.encode_text(tokenized_text)  # expected (B, 768) — TODO confirm
        img_emb = dinov2_encoder(image_tensor)  # expected (B, 384) — TODO confirm
        pc_emb = pclip_encoder.encode_image(depth_maps)  # expected (B, 768) — TODO confirm
        text_proj, img_proj, pc_proj = align_model(text_emb, img_emb, pc_emb)

        # Only the first batch is needed as a query.
        break

In [38]:
# Display the current value of `idx`.
# NOTE(review): the recorded output is a 5-element list, whereas the `idx` yielded
# by the DataLoader prints as a tensor. This looks like a value left behind by an
# earlier `cmr.retrieve(...)` run (which rebinds `idx`) — confirm with Restart & Run All.
idx

[71, 76, 82, 80, 85]

In [39]:
# Read the dataset CSV and show the rows at the positions held in `idx`
# (positional lookup via .iloc, not label-based).
df = pd.read_csv(dataset_path)
df.iloc[idx]

Unnamed: 0,fullId,category,name,tags,synset words,synset gloss,template1_desc,template2_desc,template3_desc
71,eaf341c056c79bec1a2c782fdbf60db6,"MediaStorage,TvStand",dresser entertainment chest,"ajberger,bed,bedroom,chest,drawers,dresser,ent...","[nan, 'stand']","[nan, 'a small table for holding articles of v...",A dresser entertainment chest which is commo...,The dresser entertainment chest is a stand o...,dresser entertainment chest is a stand desig...
76,3cdd2855b459d71199bdf8d5a28e79e9,"MediaStorage,TvStand",tv cabinet,"cabinet,center,media,stand,television,tv","[nan, 'stand']","[nan, 'a small table for holding articles of v...",A tv cabinet which is commonly known as cabine...,The tv cabinet is a stand often used for cabin...,"tv cabinet is a stand designed for cabinet,cen..."
82,aa122afea2dcf725db039d8689a74349,"ChestOfDrawers,Nightstand",end table,"arts and crafts,cabinet,craftsman,drawer,dynam...","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A end table which is commonly known as arts an...,"The end table is a chest of drawers,chest,bure...","end table is a chest of drawers,chest,bureau,d..."
80,1844a4ed0ff7ed38c2474c54a2e772f2,"ChestOfDrawers,Nightstand",modern bedside table,"bedside table,end table,modern","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A modern bedside table which is commonly known...,The modern bedside table is a chest of drawers...,"modern bedside table is a chest of drawers,che..."
85,5360cc2de7dfbc237a8c23d2d40f51b8,"ChestOfDrawers,Nightstand",ikea rian table,"bedside table,ikea,rian,small table,table","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A ikea rian table which is commonly known as b...,"The ikea rian table is a chest of drawers,ches...","ikea rian table is a chest of drawers,chest,bu..."


In [40]:
# Load the embeddings into the retriever before issuing queries
# (presumably read from the `embed_path` given at construction — confirm).
cmr.load_embeddings()

In [41]:
# retrieve(self, query, query_modality, target_modality, top_k=5)
# Query with the text projection of the sampled item and fetch the 5 best matches
# in the 'img' modality.
# NOTE(review): this rebinds `idx`, which until now held the DataLoader batch index;
# the cells below rely on the new value, so the name means two different things.
idx, mesh_ids, projs = cmr.retrieve(text_proj, 'text', 'img', 5)

In [42]:
# Show the dataset rows for the retrieved positions (`idx` now comes from cmr.retrieve).
df.iloc[idx]

Unnamed: 0,fullId,category,name,tags,synset words,synset gloss,template1_desc,template2_desc,template3_desc
71,eaf341c056c79bec1a2c782fdbf60db6,"MediaStorage,TvStand",dresser entertainment chest,"ajberger,bed,bedroom,chest,drawers,dresser,ent...","[nan, 'stand']","[nan, 'a small table for holding articles of v...",A dresser entertainment chest which is commo...,The dresser entertainment chest is a stand o...,dresser entertainment chest is a stand desig...
76,3cdd2855b459d71199bdf8d5a28e79e9,"MediaStorage,TvStand",tv cabinet,"cabinet,center,media,stand,television,tv","[nan, 'stand']","[nan, 'a small table for holding articles of v...",A tv cabinet which is commonly known as cabine...,The tv cabinet is a stand often used for cabin...,"tv cabinet is a stand designed for cabinet,cen..."
82,aa122afea2dcf725db039d8689a74349,"ChestOfDrawers,Nightstand",end table,"arts and crafts,cabinet,craftsman,drawer,dynam...","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A end table which is commonly known as arts an...,"The end table is a chest of drawers,chest,bure...","end table is a chest of drawers,chest,bureau,d..."
80,1844a4ed0ff7ed38c2474c54a2e772f2,"ChestOfDrawers,Nightstand",modern bedside table,"bedside table,end table,modern","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A modern bedside table which is commonly known...,The modern bedside table is a chest of drawers...,"modern bedside table is a chest of drawers,che..."
85,5360cc2de7dfbc237a8c23d2d40f51b8,"ChestOfDrawers,Nightstand",ikea rian table,"bedside table,ikea,rian,small table,table","['chest of drawers,chest,bureau,dresser']",['furniture with drawers for keeping clothes'],A ikea rian table which is commonly known as b...,"The ikea rian table is a chest of drawers,ches...","ikea rian table is a chest of drawers,chest,bu..."


In [43]:
# Display the identifiers returned by the retrieval call (the recorded output is the
# `fullId` Series for the same five rows).
mesh_ids

71    eaf341c056c79bec1a2c782fdbf60db6
76    3cdd2855b459d71199bdf8d5a28e79e9
82    aa122afea2dcf725db039d8689a74349
80    1844a4ed0ff7ed38c2474c54a2e772f2
85    5360cc2de7dfbc237a8c23d2d40f51b8
Name: fullId, dtype: object