In [1]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Work from the project folder on Drive; the relative paths used in later cells resolve against it.
%cd /content/drive/MyDrive/VRD-IU

/content/drive/MyDrive/VRD-IU


In [3]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from torchvision import transforms
import pickle
from transformers import AutoImageProcessor
import torch
class CompVisualDataset(Dataset):
    """Dataset of component image crops listed in a pickled annotation file.

    The pickle is read as a mapping whose values each carry a ``'components'``
    list; every component is a dict with at least ``'bbox'`` and
    ``'object_id'`` entries. Components whose bbox is exactly
    ``[0.0, 0.0, 0.0, 0.0]`` are left out.

    Args:
        pickle_file: Path of the pickled annotation mapping.
        image_path_root: Directory holding one ``<object_id>.png`` per component.
    """

    def __init__(self, pickle_file,image_path_root):
        super().__init__()
        # NOTE: pickle.load can execute arbitrary code; only open trusted files.
        with open(pickle_file, 'rb') as handle:
            annotations = pickle.load(handle)
        self.root_path = image_path_root
        empty_box = [0.0, 0.0, 0.0, 0.0]
        # Flatten all per-entry component lists, dropping all-zero boxes.
        self.components = [
            component
            for entry in annotations.values()
            for component in entry['components']
            if component['bbox'] != empty_box
        ]

    def __len__(self):
        """Number of components kept after filtering."""
        return len(self.components)

    def __getitem__(self, index):
        """Return ``(image, object_id)`` for the component at ``index``.

        The image is opened from ``<root_path>/<object_id>.png`` and converted
        to RGB.
        """
        object_id = self.components[index]['object_id']
        image_file = os.path.join(self.root_path, f"{object_id}.png")
        return Image.open(image_file).convert("RGB"), object_id

def collate_fn(batch):
    """Split a batch of ``(image, object_id)`` samples into two parallel lists.

    Images are kept as a plain list rather than stacked into a tensor.

    Returns:
        ``(imgs, object_ids)`` in the same order as ``batch``.
    """
    imgs, object_ids = [], []
    for sample in batch:
        imgs.append(sample[0])
        object_ids.append(sample[1])
    return imgs, object_ids

In [4]:
# Index the training components listed in train_data.pkl; their crops are read from train_components/<object_id>.png.
train_dataset = CompVisualDataset('train_data.pkl','train_components')

In [5]:
from transformers import AutoModel
class VisualEncoder(torch.nn.Module):
    """Image encoder built on the pretrained ``facebook/dinov2-base`` model.

    For each image the output is the first (CLS) token concatenated with the
    mean of the remaining (patch) tokens, i.e. twice the backbone's hidden size.
    """

    def __init__(self):
        super().__init__()
        self.dinvov2 = AutoModel.from_pretrained('facebook/dinov2-base')
        # Make the backbone return a plain tuple so forward() can index it.
        self.dinvov2.config.return_dict=False

    def forward(self, pixel_values):
        """Embed a batch of preprocessed images.

        Args:
            pixel_values: Image batch in the form the backbone accepts.

        Returns:
            Tensor with one row per image: ``[cls_token, mean(patch_tokens)]``.
        """
        hidden_states = self.dinvov2(pixel_values)[0]
        cls_embedding = hidden_states[:, 0]
        mean_patch_embedding = hidden_states[:, 1:].mean(dim=1)
        return torch.cat((cls_embedding, mean_patch_embedding), dim=1)
# NOTE(review): `encoder` is not referenced again in the visible cells; a later cell builds a second VisualEncoder as `model` — confirm this instance is needed.
encoder = VisualEncoder()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [7]:
# Build the encoder, move it to the GPU when one is available, and put it in
# eval mode before tracing.
model = VisualEncoder()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# To force CPU execution instead: device = torch.device("cpu")
print(f"Using: {device}")
model = model.to(device).eval()

# Trace once with a random 1x3x224x224 input.
# NOTE(review): the extraction loop in the visible cells calls `model`, not
# `traced_model` — confirm the traced module is used elsewhere.
with torch.no_grad():
    traced_model = torch.jit.trace(
        model,
        torch.rand(1, 3, 224, 224).to(device),
    )

Using: cuda


  if num_channels != self.num_channels:


In [8]:
# Load the preprocessing configuration published with the same 'facebook/dinov2-base' checkpoint the encoder uses.
image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

In [9]:
from tqdm import tqdm
import os
def extract_features(dataloader, feature_path):
    """Encode every batch from ``dataloader`` and save one feature file per object.

    Relies on the notebook-level ``image_processor``, ``model`` and ``device``.

    Args:
        dataloader: Iterable yielding ``(imgs, object_ids)`` batches, as
            produced with ``collate_fn``.
        feature_path: Output directory, created (with parents) if missing.
            Each embedding is written to ``<feature_path>/<object_id>.pt``.

    Note:
        Tensors are saved on whatever device ``model`` produced them on, so
        loading them on a host without that device needs ``map_location``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(feature_path, exist_ok=True)
    with torch.no_grad():
        for imgs, object_ids in tqdm(dataloader):
            image_inputs = image_processor(imgs, return_tensors="pt").to(device)
            features = model(image_inputs.pixel_values)
            for idx, obj_id in enumerate(object_ids):
                # features[idx] is a view that shares the whole batch's storage,
                # and torch.save serializes the full storage behind a view.
                # Clone the row so each file holds only that object's embedding.
                torch.save(
                    features[idx].clone(),
                    os.path.join(feature_path, f"{obj_id}.pt"),
                )

In [None]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [10]:
# Batches of 32 crops loaded by 4 worker processes; collate_fn returns the images and ids as plain lists.
train_dataloader = DataLoader(train_dataset,batch_size=32, collate_fn= collate_fn, num_workers=4)



In [11]:
# Encode the whole training set and write one <object_id>.pt file per component into train_visual_features/.
extract_features(train_dataloader,  'train_visual_features')
print("Extraction completed for training set!")

100%|██████████| 1354/1354 [1:10:24<00:00,  3.12s/it]

Extraction completed for training set!



