In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cp /content/drive/MyDrive/VRD-IU/train_data.pkl .

In [3]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import pickle
from transformers import AutoTokenizer
import torch
class CompTextDataset(Dataset):
    def __init__(self, pickle_file):
        super().__init__()
        with open(pickle_file, 'rb') as file:
            data = pickle.load(file)
        self.components = []
        for k in data.keys():
            for comp in data[k]['components']:
                if comp['bbox'] == [0.0, 0.0, 0.0, 0.0]:
                  continue
                self.components.append(comp)

    def __len__(self):
        return len(self.components)

    def __getitem__(self, index):
        comp = self.components[index]
        try:
            text = comp['text']
        except:
            text = comp['category']
        return text, comp['object_id']

In [4]:
train_dataset = CompTextDataset('train_data.pkl')

In [5]:
from transformers import AutoModel
model = AutoModel.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using: {device}")
model.to(device)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Using: cuda


XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(250002, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine

In [6]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [7]:
from tqdm import tqdm
import os
def extract_features(dataloader, feature_path):
  if not os.path.exists(feature_path):
    os.makedirs(feature_path)
  with torch.no_grad():
      for texts, object_ids in tqdm(dataloader):
          text_inputs = tokenizer(texts, return_tensors="pt",padding=True, truncation=True).to(device)
          outputs = model(**text_inputs)
          features = outputs[0][:,0].detach().cpu()
          for idx, obj_id in enumerate(object_ids):
            torch.save(features[idx],os.path.join(feature_path,f"{obj_id}.pt"))

In [8]:
train_dataloader = DataLoader(train_dataset,batch_size=256, num_workers=2)

In [9]:
extract_features(train_dataloader,  'train_textual_features')
print("Extraction completed for training set!")

100%|██████████| 170/170 [17:55<00:00,  6.33s/it]

Extraction completed for training set!





In [10]:
!zip -r train_textual_features.zip train_textual_features

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: train_textual_features/30041.pt (deflated 37%)
  adding: train_textual_features/16562.pt (deflated 21%)
  adding: train_textual_features/4810.pt (deflated 12%)
  adding: train_textual_features/42036.pt (deflated 14%)
  adding: train_textual_features/58176.pt (deflated 11%)
  adding: train_textual_features/12654.pt (deflated 15%)
  adding: train_textual_features/50584.pt (deflated 15%)
  adding: train_textual_features/39619.pt (deflated 12%)
  adding: train_textual_features/42930.pt (deflated 9%)
  adding: train_textual_features/60442.pt (deflated 58%)
  adding: train_textual_features/51323.pt (deflated 14%)
  adding: train_textual_features/15985.pt (deflated 15%)
  adding: train_textual_features/13406.pt (deflated 31%)
  adding: train_textual_features/53595.pt (deflated 14%)
  adding: train_textual_features/2598.pt (deflated 20%)
  adding: train_textual_features/18729.pt (deflated 10%)
  adding: train_textual_fe