# Pretrained Encoders

This notebook tests the pretrained encoders to see how they perform out of the box. We check the quality of their outputs using techniques such as t-SNE and UMAP to see whether the embeddings cluster as expected. If they do, we can use the encoders directly out of the box.

## 1. DINO

In [86]:
from dataset import Dinov2ImageModalityDataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np

In [87]:
# Inputs for the 200-sample ShapeNetSem subset: the CSV file and the image directory.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"

# Wrap the image dataset in a shuffled loader that yields batches of 8,
# loading in the main process (num_workers=0).
dino_dataset = Dinov2ImageModalityDataset(dataset_path, image_dir)
dataloader = DataLoader(
    dino_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)

In [88]:
# Smoke-test the DataLoader: pull a single batch and print what it contains.
for batch in dataloader:
    idx, image_paths, image_tensors = batch  # Each batch unpacks into (indices, image identifiers, image tensors)
    print(f"Batch size: {image_tensors.shape}")  # Prints the full tensor shape; the recorded run shows [8, 3, 518, 518], not 224x224
    # NOTE(review): despite the name, the recorded output shows bare hash-like IDs here rather than file paths.
    print(f"Sample image paths: {image_paths}")
    print(idx)
    break  # Just showing one batch

Batch size: torch.Size([8, 3, 518, 518])
Sample image paths: ('ffcf092f4675bd818e257e5bcc125963', '31fcf965836ab3484212ff51b27f0221', '61fe7cce3cc4b7f2f1783a44a88d6274', 'fc821511ed3090ac2846fa729d90e125', '4afb6dd55a1c4bb659ea5f21debb55d9', '48c8f3176fbe7b37384368499a680cf1', '3d15f0b93dc94909ce92f6b720939174', '90ed47303b284a8e8bd3a6724eacfb54')
tensor([ 75,  69, 192,   8,  72, 118,   9,  14])


In [89]:
import pandas as pd

In [90]:
# NOTE(review): this cell repeats the dataset/loader setup done at the start of
# the DINO section verbatim; it only rebuilds the same objects.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"

dino_dataset = Dinov2ImageModalityDataset(dataset_path, image_dir)
dataloader = DataLoader(
    dino_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)

In [91]:
import torch
from PIL import Image
import timm

# Build the DINOv2 ViT-S/14 architecture without downloading weights; the
# weights come from a local checkpoint file instead.
model_path = "PretrainedModels/dinov2_vits14_pretrain.pth"  # Change to your local path
model = timm.create_model("vit_small_patch14_dinov2", pretrained=False)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
checkpoint = torch.load(model_path, map_location="cpu", weights_only=True)
# Drop the "mask_token" entry so it is not reported as an unexpected key.
checkpoint = {k: v for k, v in checkpoint.items() if k != "mask_token"}

# strict=False tolerates key mismatches, so inspect the result instead of
# assuming success: a missing key means that parameter keeps its random
# initialisation and the embeddings would be silently wrong.
load_result = model.load_state_dict(checkpoint, strict=False)
if load_result.missing_keys:
    print(f"WARNING: model parameters not found in checkpoint: {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"WARNING: checkpoint entries ignored by the model: {load_result.unexpected_keys}")

model.eval()  # Set model to evaluation mode
print('Dinov2 Loaded Successfully')

Dinov2 Loaded Successfully


In [92]:
# Rebuild the dataset with batch_size=1 so every step yields a single image.
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"
dino_dataset = Dinov2ImageModalityDataset(dataset_path, image_dir)
dataloader = DataLoader(dino_dataset, batch_size=1, shuffle=True)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# No backprop happens here, so the whole loop runs with gradients disabled.
with torch.no_grad():
    for idx, image_path, image_tensor in dataloader:
        image_tensor = image_tensor.to(device)
        embedding = model(image_tensor)  # feature vector; (1, 384) in the recorded run

        print(f"Image Path: {image_path[0]}")
        print(f"Embedding Shape: {embedding.shape}")
        print(f"Embedding Vector: {embedding.squeeze().cpu().numpy()}")

        break  # one image is enough for this demonstration

Image Path: 26b4c8e753b7dd06a6fd617b2ff5d2d
Embedding Shape: torch.Size([1, 384])
Embedding Vector: [ 3.2771463e+00  7.3204428e-01  3.4540975e-01 -2.6265020e+00
 -7.6223135e-01  1.4487228e+00 -2.4464202e+00 -7.6935977e-01
 -2.2339597e+00 -8.0050230e-02 -3.7161463e-01  8.8754535e-01
  4.7706842e+00  1.0852878e+00  2.1162837e+00  1.9615690e+00
 -2.1609964e+00 -1.6863813e+00  1.7126704e+00 -4.6789956e+00
  6.2038569e+00 -2.4939299e+00  8.1460875e-01 -2.6219106e+00
 -2.9815216e+00 -4.1059089e+00  1.7338355e+00  3.5276411e+00
  3.7680969e+00 -1.4947901e+00 -2.4651363e+00  1.8144444e+00
  3.2125998e-01 -5.0219595e-01 -3.0231531e+00  3.1424706e+00
 -1.3231571e+00  4.6948275e-01  1.8991089e+00  4.0456362e+00
 -1.4451011e+00  2.0422914e+00  2.2484410e+00  2.7931983e+00
  3.1051904e-02 -3.1917313e-01 -2.1890783e+00 -2.7368019e+00
  1.4163955e+00 -1.7167895e+00 -4.1831341e+00  2.4398804e+00
  2.6013696e+00  2.9430735e+00 -1.8108184e+00  8.4120731e+00
  1.0076212e+00  3.8799400e+00  3.7516108e+00 

In [93]:
from torch import Tensor
# Load Dataset
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
image_dir = "Data/ShapeNetSem/Images/subset_200"
dino_dataset = Dinov2ImageModalityDataset(dataset_path, image_dir)
# shuffle=False: every sample is visited exactly once either way, and a fixed
# order keeps the saved embeddings and the downstream projections reproducible.
dataloader = DataLoader(dino_dataset, batch_size=1, shuffle=False)

# Parallel lists, one entry per sample: embedding, dataset index, raw category string.
all_embeddings = []
all_idx = []
all_cats = []
data = pd.read_csv(dataset_path)

# Run Inference
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

with torch.no_grad():  # Inference only: no gradients are needed
    for idx, image_path, image_tensor in dataloader:
        image_tensor = image_tensor.to(device)  # Move to GPU if available

        embedding = model(image_tensor)  # Feature vector; (1, 384) in the recorded run

        # Move the result back to the CPU immediately: later cells call
        # .numpy() on the stacked tensor, which raises for CUDA tensors, and
        # this also avoids holding every embedding in GPU memory.
        all_embeddings.append(embedding.cpu())
        all_idx.append(idx)

        # idx is used as the row label of the CSV to look up the category.
        all_cats.append(data.loc[int(idx.item()), 'category'])


In [94]:
# Sanity check: the three parallel lists should hold the same number of entries.
tuple(len(collected) for collected in (all_embeddings, all_idx, all_cats))

(200, 200, 200)

In [95]:
import os

# Stack the per-sample embeddings into one 2-D tensor. It is detached and
# moved to the CPU so the saved file loads on machines without a GPU and so
# the .numpy() calls in later cells work when inference ran on CUDA.
embeddings_t = torch.concat(all_embeddings, dim=0).detach().cpu()
data_dict = {
    "embedding": embeddings_t,
    "index": all_idx,
    "category": all_cats
}

save_path = "Embeddings/Dinov2/subset_template_200.pt"
# torch.save does not create parent directories; make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(data_dict, save_path)

In [96]:
import sklearn

In [97]:
# Tensor.numpy() only works on CPU tensors that do not require grad, so
# normalise first; the displayed shape is unchanged.
embeddings_t.detach().cpu().numpy().shape

(200, 384)

In [98]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# NumPy view of the embeddings for scikit-learn. detach().cpu() makes the
# conversion safe when the tensor lives on CUDA (Tensor.numpy() raises there).
embeddings = embeddings_t.detach().cpu().numpy()

In [99]:
# Reduce each comma-separated category string to a single label: the first
# entry that does not contain an underscore.
formatted_categories = []
for subcategory in all_cats:
    subcategories = subcategory.split(',')
    new_subcategories = [s for s in subcategories if '_' not in s]
    # When every entry contains an underscore there is nothing left to pick;
    # fall back to the first raw entry instead of failing with IndexError.
    formatted_categories.append(new_subcategories[0] if new_subcategories else subcategories[0])

In [100]:
# Turn the string categories into integer class ids: one-hot encode them and
# take the position of the hot column for each row.
categories = np.array(formatted_categories).reshape(-1, 1)  # single-column 2-D array for the encoder
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(categories)
print(f"After OHE categories shape: {one_hot_encoded.shape}")

labels = one_hot_encoded.argmax(axis=1)
print(f"After Argmax labels shape: {labels.shape}")

# Column i of the encoding corresponds to encoder.categories_[0][i].
cat_mapping = dict(enumerate(encoder.categories_[0]))
print("Mapping (integer label -> string category):", cat_mapping)


After OHE categories shape: (200, 19)
After Argmax labels shape: (200,)
Mapping (integer label -> string category): {0: 'Bed', 1: 'Books', 2: 'Camera', 3: 'CeilingFan', 4: 'ChestOfDrawers', 5: 'Couch', 6: 'DrinkingUtensil', 7: 'Fan', 8: 'Faucet', 9: 'MediaStorage', 10: 'PersonStanding', 11: 'PillBottle', 12: 'Plant', 13: 'PottedPlant', 14: 'Showerhead', 15: 'Table', 16: 'Truck', 17: 'USBStick', 18: 'Vase'}


In [101]:
# Reduce the embeddings to 50 principal components; the result feeds the
# t-SNE step. The recorded run goes from (200, 384) to (200, 50).
# random_state is fixed so that any randomised solver path is seeded.
print(f"Before PCA embedding shape: {embeddings.shape}")
pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(embeddings)
print(f"After PCA embedding shape: {pca_result.shape}")

Before PCA embedding shape: (200, 384)
After PCA embedding shape: (200, 50)


In [102]:
from sklearn.manifold import TSNE

# Project the PCA-reduced embeddings down to 2-D for plotting.
# random_state is fixed to seed the t-SNE optimisation; all other
# hyperparameters are left at the library defaults.
tsne = TSNE(n_components=2, random_state=42)
print(f"Before TSNE embedding shape: {pca_result.shape}")
tsne_result = tsne.fit_transform(pca_result)
print(f"After TSNE embedding shape: {tsne_result.shape}")

Before TSNE embedding shape: (200, 50)
After TSNE embedding shape: (200, 2)


In [103]:
import plotly.express as px

# One row per sample: the 2-D t-SNE coordinates, the integer class id, and the
# category name that id maps to.
plot_data = pd.DataFrame(dict(
    x=tsne_result[:, 0],
    y=tsne_result[:, 1],
    numeric_label=labels,
    category=[cat_mapping[class_id] for class_id in labels],
))

# Interactive scatter coloured by category; hovering shows both label forms.
fig = px.scatter(
    plot_data,
    x='x',
    y='y',
    color='category',
    hover_data=dict(numeric_label=True, category=True),
    title="Interactive t-SNE Visualization with Category Info",
)
fig.show()

## 2. CLIP

In [1]:
from dataset import CLIPTextModalityDataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import numpy as np
import torch
import open_clip
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Text-modality dataset built from the same 200-sample CSV, served in
# shuffled batches of 8 from the main process (num_workers=0).
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
clip_dataset = CLIPTextModalityDataset(dataset_path)
dataloader = DataLoader(
    clip_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=0,
)

In [None]:
import os

# One-time export of the OpenAI CLIP ViT-L/14 weights to a local state-dict
# file. The export used to be disabled by wrapping it in a string literal,
# which left a fresh checkout with no way to create the checkpoint that the
# following cells load. It now runs only when the file is absent, so
# re-running the notebook never re-downloads or overwrites an existing export.
model_name = "ViT-L-14"
pretrained = "openai"  # "openai" selects the original CLIP weights
save_path = "PretrainedModels/clip_vitl14_pretrain.pth"

if os.path.exists(save_path):
    print(f"CLIP checkpoint already present at: {save_path}")
else:
    # Only the model is needed; the two preprocessing transforms are discarded.
    export_model, _, _ = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(export_model.state_dict(), save_path)
    del export_model  # later cells rebuild the model from the saved file
    print(f"CLIP model saved to: {save_path}")



CLIP model saved to: PretrainedModels/clip_vitl14_pretrain.pth


In [4]:
# Load Dataset
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
clip_dataset = CLIPTextModalityDataset(dataset_path)
dataloader = DataLoader(clip_dataset, batch_size=8, shuffle=True, num_workers=0)
model_name = "ViT-L-14"
pretrained = "openai"  # Provenance of the weights stored at save_path
save_path = "PretrainedModels/clip_vitl14_pretrain.pth"

# The OpenAI CLIP weights were trained with the QuickGELU activation, while
# the plain "ViT-L-14" config builds standard GELU. Activations carry no
# parameters, so the state dict loads cleanly either way and the mismatch is
# silent: it only shows up as degraded embeddings. Force QuickGELU whenever
# the checkpoint holds OpenAI weights.
clip_model = open_clip.create_model(
    model_name,
    pretrained=None,  # architecture only; weights come from the local file below
    force_quick_gelu=(pretrained == "openai"),
)

# Load saved state dict. weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state dict contains.
checkpoint = torch.load(save_path, map_location="cpu", weights_only=True)
clip_model.load_state_dict(checkpoint)
clip_model.eval()

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model.to(device)

# Extract text embeddings for a single batch as a smoke test
for idx, tokenized_text, text_prompt in dataloader:
    tokenized_text = tokenized_text.to(device)

    with torch.no_grad():
        text_embedding = clip_model.encode_text(tokenized_text)

    print(f"Text: {text_prompt}")
    print(f"Text Embedding Shape: {text_embedding.shape}")  # Embedding width is 768 in the recorded run
    break  # Process one batch for testing

Text: ('ceiling fan is a fan designed for appliance,ceiling,fan,home,light. It serves the purpose of a device for creating a current of air by movement of a surface or surfaces.', 'bronze meenakari center table is a table, coffee table,cocktail table designed for center table,coffee table,ethnic,meenakari,table. It serves the purpose of a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs; "it was a sturdy table", low table where magazines can be placed and coffee or cocktails are served.', 'A bladeless ceiling fan which is commonly known as bladeless,bladeless fan,ceiling fan,circular fan,dyson,dyson fan,fan,fans,modern fan,room fan. It is associated with the following characteristics: fan.\nA general description of this item is: a device for creating a current of air by movement of a surface or surfaces.', 'The vasque  in sketchup is a vase often used for bowl,escargot,perduccizzi,pot,poterie,pottery,vase,vaso,vasque. It can be describe

In [5]:
from torch import Tensor
# Load Dataset
dataset_path = "Data/ShapeNetSem/Datasets/subset_template_200.csv"
clip_dataset = CLIPTextModalityDataset(dataset_path)
# shuffle=False: every sample is visited exactly once either way, and a fixed
# order keeps the saved embeddings and the downstream projections reproducible.
dataloader = DataLoader(clip_dataset, batch_size=1, shuffle=False, num_workers=0)
model_name = "ViT-L-14"
pretrained = "openai"  # Provenance of the weights stored at save_path
save_path = "PretrainedModels/clip_vitl14_pretrain.pth"

# The OpenAI CLIP weights were trained with the QuickGELU activation, while
# the plain "ViT-L-14" config builds standard GELU. Activations carry no
# parameters, so the state dict loads cleanly either way and the mismatch is
# silent: it only shows up as degraded embeddings. Force QuickGELU whenever
# the checkpoint holds OpenAI weights.
clip_model = open_clip.create_model(
    model_name,
    pretrained=None,  # architecture only; weights come from the local file below
    force_quick_gelu=(pretrained == "openai"),
)

# Load saved state dict. weights_only=True restricts unpickling to tensors and
# plain containers, which is all a state dict contains.
checkpoint = torch.load(save_path, map_location="cpu", weights_only=True)
clip_model.load_state_dict(checkpoint)
clip_model.eval()

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model.to(device)

# Parallel lists, one entry per sample: embedding, dataset index, raw category string.
all_embeddings = []
all_idx = []
all_cats = []
data = pd.read_csv(dataset_path)

with torch.no_grad():
    for idx, tokenized_text, text_prompt in dataloader:
        tokenized_text = tokenized_text.to(device)  # Move to GPU if available

        embedding = clip_model.encode_text(tokenized_text)  # Text feature vector; width 768 in the recorded run

        # Move the result back to the CPU immediately: later cells call
        # .numpy() on the stacked tensor, which raises for CUDA tensors, and
        # this also avoids holding every embedding in GPU memory.
        all_embeddings.append(embedding.cpu())
        all_idx.append(idx)

        # idx is used as the row label of the CSV to look up the category.
        all_cats.append(data.loc[int(idx.item()), 'category'])


KeyboardInterrupt: 

In [10]:
# Sanity check: the three parallel lists should hold the same number of entries.
tuple(len(collected) for collected in (all_embeddings, all_idx, all_cats))

(200, 200, 200)

In [11]:
import os

# Stack the per-sample embeddings into one 2-D tensor. It is detached and
# moved to the CPU so the saved file loads on machines without a GPU and so
# the .numpy() calls in later cells work when inference ran on CUDA.
embeddings_t = torch.concat(all_embeddings, dim=0).detach().cpu()
data_dict = {
    "embedding": embeddings_t,
    "index": all_idx,
    "category": all_cats
}

save_path = "Embeddings/CLIP/subset_template_200.pt"
# torch.save does not create parent directories; make sure the target exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(data_dict, save_path)

In [12]:
import sklearn

In [13]:
# Tensor.numpy() only works on CPU tensors that do not require grad, so
# normalise first; the displayed shape is unchanged.
embeddings_t.detach().cpu().numpy().shape

(200, 768)

In [14]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# NumPy view of the embeddings for scikit-learn. detach().cpu() makes the
# conversion safe when the tensor lives on CUDA (Tensor.numpy() raises there).
embeddings = embeddings_t.detach().cpu().numpy()

In [15]:
# Reduce each comma-separated category string to a single label: the first
# entry that does not contain an underscore.
formatted_categories = []
for subcategory in all_cats:
    subcategories = subcategory.split(',')
    new_subcategories = [s for s in subcategories if '_' not in s]
    # When every entry contains an underscore there is nothing left to pick;
    # fall back to the first raw entry instead of failing with IndexError.
    formatted_categories.append(new_subcategories[0] if new_subcategories else subcategories[0])

In [16]:
# Turn the string categories into integer class ids: one-hot encode them and
# take the position of the hot column for each row.
categories = np.array(formatted_categories).reshape(-1, 1)  # single-column 2-D array for the encoder
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(categories)
print(f"After OHE categories shape: {one_hot_encoded.shape}")

labels = one_hot_encoded.argmax(axis=1)
print(f"After Argmax labels shape: {labels.shape}")

# Column i of the encoding corresponds to encoder.categories_[0][i].
cat_mapping = dict(enumerate(encoder.categories_[0]))
print("Mapping (integer label -> string category):", cat_mapping)


After OHE categories shape: (200, 19)
After Argmax labels shape: (200,)
Mapping (integer label -> string category): {0: 'Bed', 1: 'Books', 2: 'Camera', 3: 'CeilingFan', 4: 'ChestOfDrawers', 5: 'Couch', 6: 'DrinkingUtensil', 7: 'Fan', 8: 'Faucet', 9: 'MediaStorage', 10: 'PersonStanding', 11: 'PillBottle', 12: 'Plant', 13: 'PottedPlant', 14: 'Showerhead', 15: 'Table', 16: 'Truck', 17: 'USBStick', 18: 'Vase'}


In [17]:
# Reduce the embeddings to 100 principal components; the result feeds the
# t-SNE step. The recorded run goes from (200, 768) to (200, 100).
# random_state is fixed so that any randomised solver path is seeded.
print(f"Before PCA embedding shape: {embeddings.shape}")
pca = PCA(n_components=100, random_state=42)
pca_result = pca.fit_transform(embeddings)
print(f"After PCA embedding shape: {pca_result.shape}")

Before PCA embedding shape: (200, 768)
After PCA embedding shape: (200, 100)


In [18]:
from sklearn.manifold import TSNE

# Project the PCA-reduced embeddings down to 2-D for plotting.
# random_state is fixed to seed the t-SNE optimisation; all other
# hyperparameters are left at the library defaults.
tsne = TSNE(n_components=2, random_state=42)
print(f"Before TSNE embedding shape: {pca_result.shape}")
tsne_result = tsne.fit_transform(pca_result)
print(f"After TSNE embedding shape: {tsne_result.shape}")

Before TSNE embedding shape: (200, 100)


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



After TSNE embedding shape: (200, 2)


In [19]:
import plotly.express as px

# One row per sample: the 2-D t-SNE coordinates, the integer class id, and the
# category name that id maps to.
plot_data = pd.DataFrame(dict(
    x=tsne_result[:, 0],
    y=tsne_result[:, 1],
    numeric_label=labels,
    category=[cat_mapping[class_id] for class_id in labels],
))

# Interactive scatter coloured by category; hovering shows both label forms.
fig = px.scatter(
    plot_data,
    x='x',
    y='y',
    color='category',
    hover_data=dict(numeric_label=True, category=True),
    title="Interactive t-SNE Visualization with Category Info",
)
fig.show()