**Generating Visual Embeddings for VisualBert**

Following the tutorial https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing#scrollTo=643yOpAZwRWq.

In [1]:
import torch
torch.__version__

'2.1.2'

In [2]:
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

True


In [3]:
%%capture
#!pip install pyyaml==5.1
#!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' # because FastRCNNOutputs is not there anymore, they changed codebase
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.5'

In [4]:
import torch, torchvision
import tensorflow as tf
import matplotlib.pyplot as plt
import json
import cv2
import numpy as np
from copy import deepcopy

2024-05-20 11:56:06.408103: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 11:56:06.408164: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 11:56:06.409807: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [10]:
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.structures.image_list import ImageList
from detectron2.data import transforms as T
from detectron2.modeling.box_regression import Box2BoxTransform
from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputs # check this!
from detectron2.structures.boxes import Boxes
from detectron2.layers import nms
from detectron2 import model_zoo
from detectron2.config import get_cfg

In [5]:
# paths for kaggle
import pickle
path_test = '/kaggle/input/test-list/image_list_test3.pkl'
with open(path_test, 'rb') as f:
    image_list_test = pickle.load(f)


In [8]:
print(len(image_list_test))
print(type(image_list_test[0])) # list of 

3
<class 'numpy.ndarray'>


**Load Config and Model Weights**


In [11]:
cfg_path = "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml"

def load_config_and_model_weights(cfg_path):
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file(cfg_path))

    # ROI HEADS SCORE THRESHOLD
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5

    # Comment the next line if you're using 'cuda'
    #cfg['MODEL']['DEVICE']='cpu' # CHANGE HERE cpu or comment out

    cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(cfg_path)

    return cfg

cfg = load_config_and_model_weights(cfg_path)

### Load the Object Detection Model
The `build_model` method can be used to load a model from the configuration, the checkpoints have to be loaded using the `DetetionCheckpointer`.

In [12]:
def get_model(cfg):
    # build model
    model = build_model(cfg)

    # load weights
    checkpointer = DetectionCheckpointer(model)
    checkpointer.load(cfg.MODEL.WEIGHTS)

    # eval mode
    model.eval()
    return model

model = get_model(cfg)

### Convert Image to Model Input
The detectron uses resizing and normalization based on the configuration parameters and the input is to be provided using `ImageList`. The `model.backbone.size_divisibility` handles the sizes (padding) such that the FPN lateral and output convolutional features have same dimensions.

In [13]:
def prepare_image_inputs(cfg, img_list):
    # Resizing the image according to the configuration
    transform_gen = T.ResizeShortestEdge(
                [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
            )
    img_list = [transform_gen.get_transform(img).apply_image(img) for img in img_list]

    # Convert to C,H,W format
    convert_to_tensor = lambda x: torch.Tensor(x.astype("float32").transpose(2, 0, 1))

    batched_inputs = [{"image":convert_to_tensor(img), "height": img.shape[0], "width": img.shape[1]} for img in img_list]

    # Normalizing the image
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)
    normalizer = lambda x: (x - pixel_mean) / pixel_std
    images = [normalizer(x["image"]) for x in batched_inputs]

    # Convert to ImageList
    images =  ImageList.from_tensors(images,model.backbone.size_divisibility)
    
    return images, batched_inputs

#images, batched_inputs = prepare_image_inputs(cfg, [img_bgr1, img_bgr2]) # two images

#images, batched_inputs = prepare_image_inputs(cfg, image_list)
#images_train, batched_inputs_train = prepare_image_inputs(cfg, image_list_train)
#images_val, batched_inputs_val = prepare_image_inputs(cfg, image_list_val)
images_test, batched_inputs_test = prepare_image_inputs(cfg, image_list_test)
print(images_test)

<detectron2.structures.image_list.ImageList object at 0x7e98dacae7d0>


### Get ResNet+FPN features
The ResNet model in combination with FPN generates five features for an image at different levels of complexity. For more details, refer to the FPN paper or this [article](https://medium.com/@hirotoschwert/digging-into-detectron-2-47b2e794fabd). For this tutorial, just know that `p2`, `p3`, `p4`, `p5`, `p6` are the features needed by the RPN (Region Proposal Network). The proposals in combination with `p2`, `p3`, `p4`, `p5` are then used by the ROI (Region of Interest) heads to generate box predictions.

In [15]:
#torch.cuda.empty_cache()
print(model.device)
#model.cpu()
print(model.device)
images_test = images_test.to(device)
print(images_test.device)
# Move model to GPU # COMP. EXPENSIVE
def get_features(model, images):
    features = model.backbone(images.tensor)
    return features

features = get_features(model, images_test)

#features = get_features(model, images) # AUFPASSEN, OB DAS GLEICHE MODEL BENUTZT WERDEN KANN!
#features_train = get_features(model, images_train)
#features_val = get_features(model, images_val)
# Move images to GPU
#images_test.tensor = images_test.tensor.to(device)
#images = images_test.tensor[0:1]
#image_list = ImageList(images, len(images))
#print(type(image_list))
#print(images_test.tensor.device)
#subset_images = images_test.tensor[0] # how to slice ImageList

#features_test_test = get_features(model, image_list_test)

cuda:0
cuda:0
cuda:0


In [18]:
#features.keys()
#print(features_train.keys())
#print(features_val.keys())
print(features.keys())
print(images_test.device)
images_test = images_test.to("cuda")
print("now:",images_test.device)

# push everything back to CPU
#model = model.to("cpu")

dict_keys(['p2', 'p3', 'p4', 'p5', 'p6'])
cpu
now: cuda:0


### Get region proposals from RPN
This RPN takes in the features and images and generates the proposals. Based on the configuration we chose, we get 1000 proposals.

In [20]:
def get_proposals(model, images, features):
    proposals, _ = model.proposal_generator(images, features)
    return proposals

#proposals = get_proposals(model, images, features)
#proposals_train = get_proposals(model, images_train, features_train)
#proposals_val = get_proposals(model, images_val, features_val)
print(model.device)
model = model.to("cuda") # ALLE AUF CUDA
print(model.device)
print(images_test.device)
proposals = get_proposals(model, images_test, features)


cpu
cuda:0
cuda:0


### Get Box Features for the proposals

The proposals and features are then used by the ROI heads to get the predictions. In this case, the partial execution of layers becomes significant. We want the `box_features` to be the `fc2` outputs of the regions. Hence, I use only the layers that are needed until that step.

In [21]:
def get_box_features(model, features, proposals, n_images):
    features_list = [features[f] for f in ['p2', 'p3', 'p4', 'p5']]
    box_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    box_features = model.roi_heads.box_head.flatten(box_features)
    box_features = model.roi_heads.box_head.fc1(box_features)
    box_features = model.roi_heads.box_head.fc_relu1(box_features)
    box_features = model.roi_heads.box_head.fc2(box_features)

    box_features = box_features.reshape(n_images, 1000, 1024) # depends on your config and batch size HERE NUMBER OF IMAGES
    return box_features, features_list

box_features_test, features_list_test = get_box_features(model, features, proposals, 3)

### Get prediction logits and boxes
The prediction class logits and the box predictions from the ROI heads, this is used in the next step to get the boxes and scores from the `FastRCNNOutputs`


In [22]:
def get_prediction_logits(model, features_list, proposals):
    cls_features = model.roi_heads.box_pooler(features_list, [x.proposal_boxes for x in proposals])
    cls_features = model.roi_heads.box_head(cls_features)
    pred_class_logits, pred_proposal_deltas = model.roi_heads.box_predictor(cls_features)
    return pred_class_logits, pred_proposal_deltas

#pred_class_logits, pred_proposal_deltas = get_prediction_logits(model, features_list, proposals)
pred_class_logits_test, pred_proposal_deltas_test = get_prediction_logits(model, features_list_test, proposals)

### Get FastRCNN scores and boxes

This results in the softmax scores and the boxes.

In [23]:
def get_box_scores(cfg, pred_class_logits, pred_proposal_deltas):
    box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA

    outputs = FastRCNNOutputs(
        box2box_transform,
        pred_class_logits,
        pred_proposal_deltas,
        proposals,
        smooth_l1_beta,
    )

    boxes = outputs.predict_boxes()
    scores = outputs.predict_probs()
    image_shapes = outputs.image_shapes

    return boxes, scores, image_shapes

#boxes, scores, image_shapes = get_box_scores(cfg, pred_class_logits, pred_proposal_deltas)
boxes_test, scores_test, image_shapes_test = get_box_scores(cfg, pred_class_logits_test, pred_proposal_deltas_test)

In [24]:
print(boxes_test)

(tensor([[5.0165e+00, 1.2599e+02, 5.4441e+02,  ..., 1.7521e+02, 5.4957e+02,
         7.7213e+02],
        [4.8264e-01, 1.1108e+02, 5.4405e+02,  ..., 1.3701e+02, 5.7439e+02,
         7.7916e+02],
        [3.0443e+02, 3.2622e+01, 1.0087e+03,  ..., 3.4832e+01, 9.9732e+02,
         7.6673e+02],
        ...,
        [6.8528e+02, 5.7050e+02, 7.5886e+02,  ..., 5.7205e+02, 7.5744e+02,
         6.4177e+02],
        [3.4345e+02, 6.5554e+02, 4.0448e+02,  ..., 6.5773e+02, 4.0367e+02,
         7.1470e+02],
        [9.2576e+02, 9.5258e+01, 9.8316e+02,  ..., 1.0454e+02, 9.8668e+02,
         1.6626e+02]], device='cuda:0', grad_fn=<SplitWithSizesBackward0>), tensor([[553.2299, 526.1436, 783.7120,  ..., 508.2873, 780.4779, 914.5804],
        [ 22.1054, 409.3750, 270.0865,  ..., 405.0153, 266.0139, 915.0680],
        [ 51.8678, 414.9601, 397.3161,  ..., 411.6564, 393.8779, 925.1323],
        ...,
        [274.9309, 190.6020, 299.4752,  ..., 192.3023, 297.6428, 250.5865],
        [521.8513, 212.5611, 722.

### Rescale the boxes to original image size
We want to rescale the boxes to original size as this is done in the detectron2 library. This is done for sanity and to keep it similar to the visualbert repository.

In [25]:
def get_output_boxes(boxes, batched_inputs, image_size):
    proposal_boxes = boxes.reshape(-1, 4).clone()
    scale_x, scale_y = (batched_inputs["width"] / image_size[1], batched_inputs["height"] / image_size[0])
    output_boxes = Boxes(proposal_boxes)

    output_boxes.scale(scale_x, scale_y)
    output_boxes.clip(image_size)

    return output_boxes

#output_boxes = [get_output_boxes(boxes[i], batched_inputs[i], proposals[i].image_size) for i in range(len(proposals))]
output_boxes_test = [get_output_boxes(boxes_test[i], batched_inputs_test[i], proposals[i].image_size) for i in range(len(proposals))]

### Select the Boxes using NMS
We need two thresholds - NMS threshold for the NMS box section, and score threshold for the score based section.

First NMS is performed for all the classes and the max scores of each proposal box and each class is updated.

Then the class score threshold is used to select the boxes from those.

In [87]:
def select_boxes(cfg, output_boxes, scores):
    test_score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST
    test_nms_thresh = cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST
    cls_prob = scores.detach()
    cls_boxes = output_boxes.tensor.detach().reshape(1000,80,4) # dets must be a cuda tensor
    #print(f"Type of boxes: {type(cls_boxes)}") # torch tensor
    
    max_conf = torch.zeros((cls_boxes.shape[0]))
    for cls_ind in range(0, cls_prob.shape[1]-1):
        cls_scores = cls_prob[:, cls_ind+1]
        det_boxes = cls_boxes[:,cls_ind,:]
        # tensor.detach().cpu().numpy()
        #print(type(det_boxes))
        #print(type(nms(det_boxes, cls_scores, test_nms_thresh)))
        #print(type(nms(det_boxes, cls_scores, test_nms_thresh).detach().cpu().numpy())) # numpy array
        keep = np.array(nms(det_boxes, cls_scores, test_nms_thresh).detach().cpu().numpy()) # HIER FEHLER
        #print(type(cls_scores))
        # push to CPU
        cls_scores = cls_scores.detach().cpu()
        max_conf = max_conf.detach().cpu()
        max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep], cls_scores[keep], max_conf[keep])
    keep_boxes = torch.where(max_conf >= test_score_thresh)[0]
    return keep_boxes, max_conf

In [88]:
print(type(output_boxes_test)) # liste von Boxes
print(type(output_boxes_test[0])) # eine Box 
print(output_boxes_test[0].device) # ist auf cuda:0. Wir möchten 


# HIER FEHLER
temp_test = [
    select_boxes(cfg, output_boxes_test[i], scores_test[i])
    for i in range(len(scores_test))
]

#temp_test = [select_boxes(cfg, output_boxes_test[i], scores_test[i]) for i in range(len(scores_test))]
keep_boxes_test, max_conf_test = [],[]
for keep_box, mx_conf in temp_test:
    keep_boxes_test.append(keep_box)
    max_conf_test.append(mx_conf)

<class 'list'>
<class 'detectron2.structures.boxes.Boxes'>
cuda:0


### Limit the total number of boxes
In order to get the box features for the best few proposals and limit the sequence length, we set minimum and maximum boxes and pick those box features.



In [89]:
MIN_BOXES=10
MAX_BOXES=100
def filter_boxes(keep_boxes, max_conf, min_boxes, max_boxes):
    if len(keep_boxes) < min_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:min_boxes]
    elif len(keep_boxes) > max_boxes:
        keep_boxes = np.argsort(max_conf).numpy()[::-1][:max_boxes]
    return keep_boxes

#keep_boxes = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes, max_conf)]
keep_boxes_test = [filter_boxes(keep_box, mx_conf, MIN_BOXES, MAX_BOXES) for keep_box, mx_conf in zip(keep_boxes_test, max_conf_test)]

### Get the visual embeddings :)
Finally, the boxes are chosen using the `keep_boxes` indices and from the `box_features` tensor.

In [90]:
def get_visual_embeds(box_features, keep_boxes):
    return box_features[keep_boxes.copy()]

#visual_embeds = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features, keep_boxes)]
visual_embeds_test = [get_visual_embeds(box_feature, keep_box) for box_feature, keep_box in zip(box_features_test, keep_boxes_test)]
print(type(visual_embeds_test[0]))
print(visual_embeds_test[0])
print(len(visual_embeds_test))

<class 'torch.Tensor'>
tensor([[-0.7501,  0.4076, -0.3822,  ..., -1.3299, -1.4076,  0.2907],
        [-0.7095,  0.8197,  1.8599,  ..., -3.3833, -1.8338, -0.5927],
        [ 1.6250,  1.2148,  1.9470,  ..., -1.0207, -1.7997, -0.0202],
        ...,
        [-0.2814,  1.5401, -0.1353,  ..., -0.1910, -0.8598,  0.6094],
        [-0.1560, -0.0267, -0.0982,  ..., -0.4272, -1.7346,  0.3153],
        [ 0.5300,  1.4253,  0.6216,  ..., -2.3827, -0.6693, -2.3209]],
       device='cuda:0', grad_fn=<IndexBackward0>)
3


In [91]:
import pickle

# Assuming 'tensor_list' is a list of PyTorch tensors
with open('/kaggle/working/v_embeds_test.pkl', 'wb') as f:
    pickle.dump(visual_embeds_test, f) # WORKS!

Now I can use the embeddings to pass it to vilbert and to use them as input for a classifier MLP.

Can I save the embeddings somewhere so I can have another notebook for vilBERT and BERT or should I put everything together?

## Using the embeddings with VisualBert

In [None]:
import os
from getpass import getpass
import urllib
# %cd /content/
# user = input('User name: ')
# password = getpass('Password: ')
# password = urllib.parse.quote(password) # your password is converted into url format
# cmd_string = f'git clone -b add_visualbert --single-branch https://{user}:{password}@github.com/gchhablani/transformers.git'
# os.system(cmd_string)
# cmd_string, password = "", "" # removing the password from the variable
# %cd transformers
# !pip install -e ".[dev]"
!pip install transformers

In [None]:
from transformers import BertTokenizer, VisualBertForPreTraining

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:

#comments_train = train_df["DESCRIPTION"].tolist()
#comments_val = train_df["DESCRIPTION"].tolist() # LOAD DF
comments_test = train_df["DESCRIPTION"].tolist()
#comments = ["This is a test.", "This is a test.", "This is a test."]
#tokens = tokenizer(comments, padding='max_length', max_length=512) # limit for BERT
#tokens_train = tokenizer(comments_train, padding='max_length', max_length=512)
#tokens_val = tokenizer(comments_val, padding='max_length', max_length=512)
tokens_test = tokenizer(comments_test, padding='max_length', max_length=512)

In [None]:
#input_ids = torch.tensor(tokens["input_ids"])
#attention_mask = torch.tensor(tokens["attention_mask"])
#token_type_ids = torch.tensor(tokens["token_type_ids"])

input_ids_train = torch.tensor(tokens_train["input_ids"])
attention_mask_train = torch.tensor(tokens_train["attention_mask"])
token_type_ids_train = torch.tensor(tokens_train["token_type_ids"])

input_ids_val = torch.tensor(tokens_val["input_ids"])
attention_mask_val = torch.tensor(tokens_val["attention_mask"])
token_type_ids_val = torch.tensor(tokens_val["token_type_ids"])

input_ids_test = torch.tensor(tokens_test["input_ids"])
attention_mask_test = torch.tensor(tokens_test["attention_mask"])
token_type_ids_test = torch.tensor(tokens_test["token_type_ids"])

In [None]:
#visual_embeds = torch.stack(visual_embeds)
#visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
#visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)

visual_embeds_train = torch.stack(visual_embeds_train)
visual_attention_mask_train = torch.ones(visual_embeds_train.shape[:-1], dtype=torch.long)
visual_token_type_ids_train = torch.ones(visual_embeds_train.shape[:-1], dtype=torch.long)

visual_embeds_val = torch.stack(visual_embeds_val)
visual_attention_mask_val = torch.ones(visual_embeds_val.shape[:-1], dtype=torch.long)
visual_token_type_ids_val = torch.ones(visual_embeds_val.shape[:-1], dtype=torch.long)

visual_embeds_test = torch.stack(visual_embeds_test)
visual_attention_mask_test = torch.ones(visual_embeds_test.shape[:-1], dtype=torch.long)
visual_token_type_ids_test = torch.ones(visual_embeds_test.shape[:-1], dtype=torch.long)

In [None]:
model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-nlvr2-coco-pre') # this checkpoint has 1024 dimensional visual embeddings projection

In [None]:
# outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, visual_embeds=visual_embeds, visual_attention_mask=visual_attention_mask, visual_token_type_ids=visual_token_type_ids, output_hidden_states=True)

outputs_train = model(input_ids=input_ids_train, attention_mask=attention_mask_train, token_type_ids=token_type_ids_train, visual_embeds=visual_embeds_train, visual_attention_mask=visual_attention_mask_train, visual_token_type_ids=visual_token_type_ids_train, output_hidden_states=True)
outputs_val = model(input_ids=input_ids_val, attention_mask=attention_mask_val, token_type_ids=token_type_ids_val, visual_embeds=visual_embeds_val, visual_attention_mask=visual_attention_mask_val, visual_token_type_ids=visual_token_type_ids_val, output_hidden_states=True)
outputs_test = model(input_ids=input_ids_test, attention_mask=attention_mask_test, token_type_ids=token_type_ids_test, visual_embeds=visual_embeds_test, visual_attention_mask=visual_attention_mask_test, visual_token_type_ids=visual_token_type_ids_test, output_hidden_states=True)

In [None]:
outputs_train

In [None]:
print(len(outputs_train.hidden_states[-1]))
#last_hidden_state = outputs.hidden_states[-1].shape # KANN MAN DAS IRGENDWIE ANPASSEN, dass die visual features = joint features sind?
#last_hidden_state
#last_hidden_state = outputs.last_hidden_state # NOT IN TUTORIAL, ADDED FROM TUT BELOW
# CHECK DIMENSIONS!!!!!!
#last_hidden_state.shape

last_hidden_state_train = outputs_train.hidden_states[-1]
last_hidden_state_val = outputs_val.hidden_states[-1]
last_hidden_state_test = outputs_test.hidden_states[-1]

print(last_hidden_state_train.shape())
print(last_hidden_state_val.shape())
print(last_hidden_state.test.shape())