In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Make the project folder on Drive the working directory; the relative paths
# used by later cells (e.g. 'train_data.pkl') are resolved against it.
%cd /content/drive/MyDrive/VRD-IU

/content/drive/MyDrive/VRD-IU


In [20]:
# Write the listing of train_visual_features/ to train_objects.txt, then print
# that file's line count.
!ls train_visual_features/ > train_objects.txt
!wc -l train_objects.txt

6880 train_objects.txt


In [1]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
from torchvision import transforms
import pickle
from transformers import AutoImageProcessor
import torch
class CompVisualDataset(Dataset):
    """Dataset of component images, yielded as ``(RGB PIL image, object_id)``.

    Component records come from a pickled mapping whose values each hold a
    ``'components'`` list; every record needs ``'bbox'`` and ``'object_id'``.
    Records whose bbox equals ``[0.0, 0.0, 0.0, 0.0]`` are left out.
    """

    def __init__(self, pickle_file, image_path_root):
        """Read the component records.

        Args:
            pickle_file: Path of the pickle file holding the annotations.
            image_path_root: Directory containing the ``<object_id>.png`` images.
        """
        super().__init__()
        self.root_path = image_path_root
        with open(pickle_file, 'rb') as handle:
            documents = pickle.load(handle)
        # Flatten every document's components into one list, dropping the
        # all-zero bounding boxes.
        self.components = [
            component
            for document in documents.values()
            for component in document['components']
            if component['bbox'] != [0.0, 0.0, 0.0, 0.0]
        ]

    def __len__(self):
        """Number of components kept after filtering."""
        return len(self.components)

    def __getitem__(self, index):
        """Return the RGB image and the object id of the component at ``index``."""
        object_id = self.components[index]['object_id']
        image_file = os.path.join(self.root_path, f"{object_id}.png")
        return Image.open(image_file).convert("RGB"), object_id

def collate_fn(batch):
    """Split a batch of ``(image, object_id)`` samples into two parallel lists.

    Returns:
        ``(imgs, object_ids)``: the first and second item of every sample,
        in batch order. Nothing is stacked or converted.
    """
    imgs, object_ids = [], []
    for sample in batch:
        imgs.append(sample[0])
        object_ids.append(sample[1])
    return imgs, object_ids

In [2]:
# Component dataset for the training split (paths are relative to the working directory).
train_dataset = CompVisualDataset(
    pickle_file='train_data.pkl',
    image_path_root='train_components',
)

In [3]:
from transformers import AutoModel
class VisualEncoder(torch.nn.Module):
    """Wraps the 'facebook/dinov2-base' model to give one vector per image.

    The vector is the first (CLS) token concatenated with the mean of the
    remaining tokens, both taken from the backbone's first output.
    """

    def __init__(self):
        super().__init__()
        backbone = AutoModel.from_pretrained('facebook/dinov2-base')
        # Ask the backbone for a plain tuple of outputs; forward() indexes it.
        backbone.config.return_dict = False
        self.dinvov2 = backbone

    def forward(self, pixel_values):
        """Embed a batch of preprocessed images.

        Args:
            pixel_values: Tensor passed straight to the backbone.

        Returns:
            Tensor formed by concatenating, along dim 1, the first token and
            the mean over the remaining tokens of the backbone's first output.
        """
        hidden_states = self.dinvov2(pixel_values)[0]
        cls_embedding = hidden_states[:, 0]
        mean_patch_embedding = hidden_states[:, 1:].mean(dim=1)
        return torch.cat((cls_embedding, mean_patch_embedding), dim=1)
# NOTE(review): `encoder` is not referenced by any later cell shown here, and the
# next cell builds another VisualEncoder as `model` — confirm whether this
# second copy of the weights is needed.
encoder = VisualEncoder()       

In [4]:
# Instantiate the encoder, move it to the chosen device in eval mode, and trace it.
model = VisualEncoder()
# Inference is pinned to the CPU here; to use a GPU when one is available, swap in:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
print(f"Using: {device}")
model.to(device)
model.eval()
with torch.no_grad():
    # The example input is created on `device` so tracing keeps working if the
    # device above is changed (a CPU tensor fed to a GPU model would raise).
    traced_model = torch.jit.trace(model, torch.rand(1, 3, 224, 224, device=device))
# NOTE(review): extract_features calls `model`, not `traced_model` — confirm
# which of the two is meant to be used for extraction.

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Using: cpu


  if num_channels != self.num_channels:


In [5]:
# Image preprocessor loaded from the same 'facebook/dinov2-base' checkpoint name
# as the encoder; extract_features calls it on each batch of PIL images.
image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')

In [14]:
from tqdm import tqdm
import os
def extract_features(dataloader, feature_path):
    """Encode every batch of ``dataloader`` and save one ``.pt`` file per object.

    Relies on the notebook-level ``image_processor``, ``model`` and ``device``.

    Args:
        dataloader: Iterable yielding ``(images, object_ids)`` batches, as
            produced by ``collate_fn``.
        feature_path: Output directory, created (with parents) if missing.
            Each feature is written to ``<feature_path>/<object_id>.pt``,
            overwriting any existing file of that name.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(feature_path, exist_ok=True)
    with torch.no_grad():
        for imgs, object_ids in tqdm(dataloader):
            image_inputs = image_processor(imgs, return_tensors="pt").to(device)
            features = model(image_inputs.pixel_values)
            for idx, obj_id in enumerate(object_ids):
                # features[idx] is a view into the batch tensor, and torch.save
                # serializes a view's whole underlying storage; clone() so each
                # file holds only this object's vector.
                torch.save(features[idx].clone(), os.path.join(feature_path, f"{obj_id}.pt"))

In [8]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [7]:
# Batches of 24 over the training components from position `start_idx` onward.
start_idx = 0
subset = torch.utils.data.Subset(
    dataset=train_dataset,
    indices=range(start_idx, len(train_dataset)),
)
train_dataloader = DataLoader(
    dataset=subset,
    batch_size=24,
    num_workers=4,
    collate_fn=collate_fn,
)

In [15]:
# Write one feature file per training component into 'train_visual_features'.
extract_features(
    dataloader=train_dataloader,
    feature_path='train_visual_features',
)
print("Extraction completed for training set!")

  1%|          | 17/1806 [02:02<3:35:29,  7.23s/it]


KeyboardInterrupt: 

In [None]:
# Validation split: same dataset class and collate function, batches of 8.
val_dataset = CompVisualDataset(
    pickle_file='val_data.pkl',
    image_path_root='val_components',
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
    num_workers=4,
    collate_fn=collate_fn,
)

In [None]:
# Write one feature file per validation component into 'val_visual_features'.
extract_features(
    dataloader=val_dataloader,
    feature_path='val_visual_features',
)
print("Extraction completed for validation set!")
