<a href="https://colab.research.google.com/github/rathianandk/demo_repo_2/blob/main/nhoodcls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate==0.27.2
!pip install transformers evaluate datasets

In [None]:
import requests
import torch
from PIL import Image
from transformers import *
from tqdm import tqdm
from transformers import AutoModelForZeroShotObjectDetection
from transformers import AutoProcessor, OwlViTForObjectDetection

device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import AutoProcessor, OwlViTForObjectDetection

#model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
model = AutoModelForZeroShotObjectDetection.from_pretrained("google/owlvit-base-patch32")

processor = AutoProcessor.from_pretrained("google/owlvit-base-patch32")


# Loading our Dataset

In [65]:
from datasets import load_dataset

# download & load the dataset
ds = load_dataset("rathi2023/owlvitnhood")
ds =  ds["train"].select(range(30))

Downloading readme:   0%|          | 0.00/448 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.63M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/41 [00:00<?, ? examples/s]

# Exploring the Data

In [66]:
print(ds)
ds = ds.train_test_split(test_size=0.10)

Dataset({
    features: ['image', 'image_id', 'objects'],
    num_rows: 30
})


# Preprocessing the Data

In [67]:
checkpoint = "google/owlvit-base-patch32"

from transformers import OwlViTProcessor, OwlViTForObjectDetection
from transformers import AutoImageProcessor
from transformers import AutoProcessor

image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")


loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}



In [68]:
import albumentations
import numpy as np
from datasets import Dataset
import pandas as pd
# Apply transformations to the modified train dataset
import copy


# download & load the dataset
# Add a new column 'image_id' with unique identifiers
num_rows = len(ds)
unique_image_ids = np.arange(num_rows)

# Shuffle the unique image IDs to ensure randomness
np.random.shuffle(unique_image_ids)

from transformers import AutoImageProcessor
from transformers import AutoProcessor

image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")

import albumentations
import numpy as np

transform = albumentations.Compose(
    [
        albumentations.Resize(480, 480),
        albumentations.HorizontalFlip(p=1.0),
        albumentations.RandomBrightnessContrast(p=1.0),
    ],
    bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
)
def formatted_anns(image_id, category, area, bbox):
    annotations = []
    for i in range(0, len(category)):
        new_ann = {
            "image_id": image_id,
            "category_id": category[i],
            "isCrowd": 0,
            "area": area,
            "bbox": list(bbox[i]),
        }
        annotations.append(new_ann)

    return annotations


# transforming a batch

# transforming a batch
# transforming a batch
def transform_aug_ann(examples):
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    transformed_data = []
    for image, objects in zip(examples["image"], examples["objects"]):
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        out = transform(image=image, bboxes=objects["bbox"], category=objects["category_id"])
        last_two_values = objects["bbox"][0][-2:]

    # Unpack the values into separate variables
        width, height = last_two_values

        area.append(width * height)
        area.append(area)
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])
        transformed_data.append(processor(text="ASIN", images=image, return_tensors="pt"))


    return {"transformed_data":transformed_data}

def transform_aug_ann_labels(examples):
    image_ids = examples["image_id"]
    images, bboxes, area, categories = [], [], [], []
    for image, objects in zip(examples["image"], examples["objects"]):
        image = np.array(image.convert("RGB"))[:, :, ::-1]
        out = transform(image=image, bboxes=objects["bbox"], category=objects["category_id"])

        last_two_values = objects["bbox"][0][-2:]

    # Unpack the values into separate variables
        width, height = last_two_values

        area.append(width * height)
        images.append(out["image"])
        bboxes.append(out["bboxes"])
        categories.append(out["category"])

    targets = [
        {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
        for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
    ]

    return image_processor(images=images, annotations=targets, return_tensors="pt")

# Preprocessed Training Data

import pandas as pd
from datasets import Dataset

transform_1 = ds["train"].with_transform(transform_aug_ann)
transform_2 = ds["train"].with_transform(transform_aug_ann_labels)
data = []
for i in range(len(transform_1)):
    dict_ = {}
    dict_["input_ids"] = transform_1[i]["transformed_data"]["input_ids"]
    dict_["attention_mask"] = transform_1[i]["transformed_data"]["attention_mask"]
    dict_["pixel_values"] = transform_1[i]["transformed_data"]["pixel_values"][0]
    dict_["labels"] = transform_2[i]["labels"]
    data.append(dict_)

# Preprocessed Training Data
train_dataset = Dataset.from_list(data)
train_dataset.features

print(ds["train"][0])
# Using Detr-Loss calculation https://github.com/facebookresearch/detr/blob/main/models/matcher.py
# https://www.kaggle.com/code/bibhasmondal96/detr-from-scratch
class BoxUtils(object):
    @staticmethod
    def box_cxcywh_to_xyxy(x):
        x_c, y_c, w, h = x.unbind(-1)
        b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
             (x_c + 0.5 * w), (y_c + 0.5 * h)]
        return torch.stack(b, dim=-1)

    @staticmethod
    def box_xyxy_to_cxcywh(x):
        x0, y0, x1, y1 = x.unbind(-1)
        b = [(x0 + x1) / 2, (y0 + y1) / 2,
             (x1 - x0), (y1 - y0)]
        return torch.stack(b, dim=-1)

    @staticmethod
    def rescale_bboxes(out_bbox, size):
        img_h, img_w = size
        b = BoxUtils.box_cxcywh_to_xyxy(out_bbox)
        b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
        return b

    @staticmethod
    def box_area(boxes):
        """
        Computes the area of a set of bounding boxes, which are specified by its
        (x1, y1, x2, y2) coordinates.
        Arguments:
            boxes (Tensor[N, 4]): boxes for which the area will be computed. They
                are expected to be in (x1, y1, x2, y2) format
        Returns:
            area (Tensor[N]): area for each box
        """
        return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    @staticmethod
    # modified from torchvision to also return the union
    def box_iou(boxes1, boxes2):
        area1 = BoxUtils.box_area(boxes1)
        area2 = BoxUtils.box_area(boxes2)

        lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
        rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

        wh = (rb - lt).clamp(min=0)  # [N,M,2]
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

        union = area1[:, None] + area2 - inter

        iou = inter / union
        return iou, union

    @staticmethod
    def generalized_box_iou(boxes1, boxes2):
        """
        Generalized IoU from https://giou.stanford.edu/
        The boxes should be in [x0, y0, x1, y1] format
        Returns a [N, M] pairwise matrix, where N = len(boxes1)
        and M = len(boxes2)
        """
        # degenerate boxes gives inf / nan results
        # so do an early check
        assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
        assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
        iou, union = BoxUtils.box_iou(boxes1, boxes2)

        lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
        rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

        wh = (rb - lt).clamp(min=0)  # [N,M,2]
        area = wh[:, :, 0] * wh[:, :, 1]

        return iou - (area - union) / area

loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--facebook--detr-resnet-50/snapshots/1d5f47bd3bdd2c4bbfa585418ffe6da5028b4c0b/preprocessor_config.json
Image processor DetrImageProcessor {
  "do_convert_annotations": true,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "format": "coco_detection",
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "DetrImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "longest_edge": 1333,
    "shortest_edge": 800
  }
}



{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=386x264 at 0x7A77575CFEE0>, 'image_id': '171849', 'objects': {'category_id': [41, 41, 42], 'bbox': [[0.358653, 0.471572, 0.641762, 0.717235], [0.79193, 0.2325, 0.164637, 0.232576], [0.800816, 0.696818, 0.235829, 0.305758]]}}


In [69]:
# Extract the labels from the "objects" field
train_labels = []
for example in ds["train"]:
    objects = example["objects"]["category_id"]
    train_labels.extend(objects)
# Create a set of unique labels
unique_labels = set(train_labels)
train_labels.sort()
# Create a dictionary to map each unique label to a unique integer ID
label2id = {label: idx for idx, label in enumerate(unique_labels)}

In [70]:
from transformers import OwlViTForObjectDetection, OwlViTFeatureExtractor
from transformers import TrainingArguments, Trainer
import torch
model= OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32",
    num_labels=len(unique_labels),
    id2label={str(i): c for i, c in enumerate(unique_labels)},
    label2id={c: str(i) for i, c in enumerate(unique_labels)},
    ignore_mismatched_sizes=True

)
model.to(device)



loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--owlvit-base-patch32/snapshots/cbc355fb364588351c5d51c7f74465e8e7ec6f72/config.json
text_config is None. Initializing the OwlViTTextConfig with default values.
vision_config is None. initializing the OwlViTVisionConfig with default values.
Model config OwlViTConfig {
  "_name_or_path": "google/owlvit-base-patch32",
  "architectures": [
    "OwlViTForObjectDetection"
  ],
  "id2label": {
    "0": 6,
    "1": 7,
    "10": 16,
    "11": 17,
    "12": 19,
    "13": 20,
    "14": 21,
    "15": 22,
    "16": 23,
    "17": 24,
    "18": 25,
    "19": 26,
    "2": 8,
    "20": 27,
    "21": 28,
    "22": 31,
    "23": 32,
    "24": 33,
    "25": 34,
    "26": 35,
    "27": 36,
    "28": 37,
    "29": 38,
    "3": 9,
    "30": 40,
    "31": 41,
    "32": 42,
    "33": 43,
    "34": 44,
    "35": 45,
    "36": 46,
    "37": 47,
    "4": 10,
    "5": 11,
    "6": 12,
    "7": 13,
    "8": 14,
    "9":

OwlViTForObjectDetection(
  (owlvit): OwlViTModel(
    (text_model): OwlViTTextTransformer(
      (embeddings): OwlViTTextEmbeddings(
        (token_embedding): Embedding(49408, 512)
        (position_embedding): Embedding(16, 512)
      )
      (encoder): OwlViTEncoder(
        (layers): ModuleList(
          (0-11): 12 x OwlViTEncoderLayer(
            (self_attn): OwlViTAttention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (mlp): OwlViTMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=512, out_features=2048, bias=True)
              (fc2): Linear(in_features=2048, out_

# Defining the Metrics

In [71]:
from evaluate import load
import numpy as np

# load the accuracy and f1 metrics from the evaluate module
accuracy = load("accuracy")
f1 = load("f1")

def compute_metrics(eval_pred):
  # compute the accuracy and f1 scores & return them
  accuracy_score = accuracy.compute(predictions=np.argmax(eval_pred.predictions, axis=1), references=eval_pred.label_ids)
  f1_score = f1.compute(predictions=np.argmax(eval_pred.predictions, axis=1), references=eval_pred.label_ids, average="macro")
  return {**accuracy_score, **f1_score}

# Training the Model

In [72]:
def collate_fn(batch):
    print("hi:")

    print("Batch size:", len(batch))
    lbps=[]
    input_ids = torch.Tensor([item["input_ids"] for item in batch]).int()
    input_ids = input_ids.to(device)
    # attention_mask = torch.Tensor([item["attention_mask"].tolist() for item in batch]).int()
    attention_mask = torch.Tensor([item["attention_mask"] for item in batch]).int()
    attention_mask = attention_mask.to(device)
    # pixel_values = torch.Tensor([item["pixel_values"].tolist() for item in batch])
    pixel_values = torch.Tensor([item["pixel_values"] for item in batch])
    pixel_values = pixel_values.to(device)

    labels = []
    for item in batch:
        for (key, value) in item["labels"].items():
            item["labels"][key] = torch.Tensor(value).to(device)
        labels.append(item["labels"])

    print("inside batch")

    print(labels[0]["area"])
    # Return the batch with stacked tensors
    return {
        "input_ids": input_ids,
        "pixel_values": pixel_values,
        "labels": labels,
        "attention_mask": attention_mask,
    }




In [73]:

# for owlvit

from transformers import TrainingArguments
import os

training_args = TrainingArguments(
    "owlvit-base-patch32_FT_cppe5",
    per_device_train_batch_size=1,
    num_train_epochs=2,
    fp16=False,
    save_steps=200,
    logging_steps=50,
    learning_rate=1e-5,
    weight_decay=1e-4,
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=True,
    dataloader_pin_memory=False,
    gradient_accumulation_steps=1
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [12]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [115]:
# start training
trainer.train()

***** Running training *****
  Num examples = 27
  Num Epochs = 2
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 54
  Number of trainable parameters = 153,231,879
text_config is None. Initializing the OwlViTTextConfig with default values.
vision_config is None. initializing the OwlViTVisionConfig with default values.


hi:
Batch size: 1
inside batch
tensor([0.7011, 0.7011], device='cuda:0')
hi:
Batch size: 1
inside batch
tensor([0.8213, 0.8213], device='cuda:0')
torch.Size([1, 16])
odict_keys(['logits', 'pred_boxes', 'text_embeds', 'image_embeds', 'class_embeds', 'text_model_output', 'vision_model_output'])
before custom loss calling
inside custom loss calling
output type
<class 'transformers.models.owlvit.modeling_owlvit.OwlViTObjectDetectionOutput'>
target type
[{'area': tensor([0.7011, 0.7011], device='cuda:0'), 'boxes': tensor([[9.9850e-01, 1.7463e-03, 7.1053e-04, 1.1840e-03],
        [9.9924e-01, 1.7197e-03, 6.8695e-04, 1.3131e-03]], device='cuda:0'), 'class_labels': tensor([21., 22.], device='cuda:0'), 'image_id': tensor([2659.], device='cuda:0'), 'iscrowd': tensor([0., 0.], device='cuda:0'), 'orig_size': tensor([480., 480.], device='cuda:0'), 'size': tensor([800., 800.], device='cuda:0')}]
dict_keys(['logits', 'pred_boxes', 'text_embeds', 'image_embeds', 'class_embeds', 'text_model_output', 'v

Step,Training Loss
50,793.2741


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         -4.0762e-02,  7.2119e-03,  5.9054e-02,  6.2488e-02,  3.4397e-02,
         -3.2750e-02, -1.7815e-02, -2.2360e-02,  4.6026e-03,  1.3219e-04,
         -1.4120e-02,  3.9542e-02,  3.1723e-02,  3.7171e-02, -4.3764e-02,
          1.5546e-02, -3.6013e-02,  1.9629e-02, -8.0525e-03,  1.6898e-02,
         -2.1162e-02,  1.2431e-01,  4.3401e-02, -1.7748e-02,  2.1356e-03,
          7.3626e-03, -1.0096e-02,  9.6790e-03, -1.7673e-02,  3.5076e-02,
          1.2322e-04, -8.6096e-03,  6.8733e-02, -3.3034e-02,  3.7413e-02,
          1.5366e-02,  2.5409e-02,  5.5760e-02,  1.9887e-02,  6.1556e-02,
          2.8903e-02,  1.6760e-02,  1.2373e-02, -2.0835e-02,  7.9721e-03,
         -3.9937e-02,  5.4924e-03, -1.8391e-04,  3.1114e-02,  2.2475e-02,
          1.2185e-02,  1.9588e-02,  9.9008e-03,  1.7765e-02,  1.1052e-02,
         -3.7955e-03, -2.7577e-02, -2.1330e-02, -2.2167e-03, -1.4000e-02,
         -1.3381e-02, -1.3538e-02,  1.9495e-02,



Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=54, training_loss=770.436781141493, metrics={'train_runtime': 55.6086, 'train_samples_per_second': 0.971, 'train_steps_per_second': 0.971, 'total_flos': 660875267520.0, 'train_loss': 770.436781141493, 'epoch': 2.0})

In [114]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight
from scipy.optimize import linear_sum_assignment

class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network
    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_bbox: float = 1, cost_giou: float = 1):
        """Creates the matcher
        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
            cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_bbox = cost_bbox
        self.cost_giou = cost_giou
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        """ Performs the matching
        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        print(outputs.keys())
        bs, num_queries = outputs["logits"].shape[:2]

        # We flatten to compute the cost matrices in a batch
        out_prob = outputs["logits"].flatten(0, 1).softmax(-1)  # [batch_size * num_queries, num_classes]
        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]

        out_prob = torch.randn(576, 1)

        # Expand out_prob along the second dimension to have shape [576, 47]
        out_prob= out_prob.expand(-1, len(train_labels))
        # Also concat the target labels and boxes
        tgt_ids = torch.cat([v["class_labels"] for v in targets])
        print("Index ",type(tgt_ids))
        print(tgt_ids)
        tgt_ids = tgt_ids.int()
        print("Index ",type(tgt_ids))
        print(tgt_ids)

        tgt_bbox = torch.cat([v["boxes"] for v in targets])
        out_prob=out_prob.to(device)

        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it in 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, it can be ommitted.
        cost_class = -out_prob[:, tgt_ids]

        # Compute the L1 cost between boxes
        cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)

        # Compute the giou cost betwen boxes
        cost_giou = -BoxUtils.generalized_box_iou(
            BoxUtils.box_cxcywh_to_xyxy(out_bbox),
            BoxUtils.box_cxcywh_to_xyxy(tgt_bbox)
        )

        # Final cost matrix
        C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
        C = C.view(bs, num_queries, -1).cpu()

        sizes = [len(v["boxes"]) for v in targets]
        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]

class SetCriterion(nn.Module):
    """ This class computes the loss for DETR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth boxes and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
    """
    def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
        """ Create the criterion.
        Parameters:
            num_classes: number of object categories, omitting the special no-object category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            eos_coef: relative classification weight applied to the no-object category
            losses: list of all the losses to be applied. See get_loss for list of available losses.
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.eos_coef = eos_coef
        self.losses = losses
        empty_weight = torch.ones(self.num_classes + 1)
        empty_weight[-1] = self.eos_coef
        self.register_buffer('empty_weight', empty_weight)

    def loss_labels(self, outputs, targets, indices, num_boxes):
        """Classification loss (NLL)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
        """
        print("loss_labels",outputs.keys())
        assert 'logits' in outputs
        src_logits = outputs['logits']

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]).to(torch.int64)
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device).to(torch.int64)
        target_classes[idx] = target_classes_o
        target_classes =target_classes
        src_logits = src_logits
        target_classes = target_classes
        self.empty_weight = self.empty_weight
        print(unique_labels)
        print("train_labels")
        print(train_labels)

        class_weights = compute_class_weight(
                                        class_weight = "balanced",
                                        classes = np.unique(train_labels),
                                        y = train_labels
                                    )
        weights= torch.tensor(class_weights,dtype=torch.float)

# push to GPU
        weights = weights.to(device)

        print("class_weights")
        print(class_weights)
        loss_ce = F.cross_entropy(src_logits.transpose(1, 2),target_classes,reduction='none',ignore_index=28)

        print("loss_ce")
        print(loss_ce)
        losses = {'loss_ce': loss_ce}
        return losses

    @torch.no_grad()
    def loss_cardinality(self, outputs, targets, indices, num_boxes):
        """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes
        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients
        """
        pred_logits = outputs['logits']
        device = pred_logits.device
        tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
        # Count the number of predictions that are NOT "no-object" (which is the last class)
        card_pred = (pred_logits.argmax(-1) != pred_logits.shape[-1] - 1).sum(1)
        card_err = F.l1_loss(card_pred.float(), tgt_lengths.float())
        losses = {'cardinality_error': card_err}
        return losses

    def loss_boxes(self, outputs, targets, indices, num_boxes):
        """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
           targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
           The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
        """
        assert 'pred_boxes' in outputs
        idx = self._get_src_permutation_idx(indices)
        print("loss_boxes",idx)
        src_boxes = outputs['pred_boxes'][idx]
        target_boxes = torch.cat([t['boxes'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')

        losses = {}
        losses['loss_bbox'] = loss_bbox.sum() / num_boxes

        loss_giou = 1 - torch.diag(BoxUtils.generalized_box_iou(
            BoxUtils.box_cxcywh_to_xyxy(src_boxes),
            BoxUtils.box_cxcywh_to_xyxy(target_boxes))
        )
        losses['loss_giou'] = loss_giou.sum() / num_boxes
        return losses

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
        loss_map = {
            'labels': self.loss_labels,
            'cardinality': self.loss_cardinality,
            'boxes': self.loss_boxes,
        }
        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)

    def forward(self, outputs, targets):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
        """
        print("output type")
        print(type(outputs))
        print("target type")
        print(targets)
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target boxes accross all nodes, for normalization purposes
        num_boxes = sum(len(t["class_labels"]) for t in targets)
        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))

        return losses


# custom loss
def custom_loss(logits, labels):
    num_classes = len(unique_labels)
    matcher = HungarianMatcher(cost_class = 1, cost_bbox = 5, cost_giou = 2)
    weight_dict = {'loss_ce': 1, 'loss_bbox': 5, 'loss_giou': 2}
    losses = ['labels', 'boxes', 'cardinality']
    criterion = SetCriterion(num_classes, matcher=matcher, weight_dict=weight_dict, eos_coef=0.1, losses=losses)
    #criterion.to(device)
    print("inside custom loss calling")
    loss = criterion(logits, labels)
    print("inside",loss)
    print("loss val")
    simu =loss.values()
    scalar_values = []
    print("before simu",simu)
    for tensor in simu:
      if torch.is_tensor(tensor):
        if tensor.numel() == 1:  # Check if tensor has only one element
          scalar_values.append(tensor.item())
        else:
          scalar_values.extend(tensor.flatten().tolist())  # Flatten the tensor and extract elements



# Sum the scalar values
    total_loss = sum(scalar_values)
    print("simu",total_loss)

    print("fina;")


    return torch.tensor(total_loss,requires_grad=True)
# subclass trainer
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        inputs["input_ids"] = inputs["input_ids"][0]
        inputs["attention_mask"] = inputs["attention_mask"][0]
        print(inputs["attention_mask"].shape)
        outputs = model(**inputs, return_dict=True)

        print(outputs.keys())

        print("before custom loss calling")
        loss = custom_loss(outputs, labels)
        print("after custom loss calling")
        return (loss, outputs) if return_outputs else loss


# use new trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    tokenizer=processor
)

In [None]:
pip install timm

In [None]:

import torch
from transformers import DetrForObjectDetection, DetrImageProcessor
from transformers import DetrFeatureExtractor
from PIL import Image
import requests
import numpy as np
import requests

from transformers import OwlViTProcessor, OwlViTForObjectDetection
from transformers.image_utils import ImageFeatureExtractionMixin
import torch

import cv2
from PIL import Image
image = Image.open(requests.get("https://aft-vbi-pds.s3.amazonaws.com/bin-images/00934.jpg", stream=True).raw).convert('RGB')

model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
text_queries = ["asin and caption"]
inputs = processor(text=text_queries, images=image, return_tensors="pt").to(device)

# Set model in evaluation mode
model = model.to(device)
model.eval()

# Get predictions
with torch.no_grad():
 outputs = model(**inputs)
# Get prediction logits
logits = torch.max(outputs["logits"][0], dim=-1)
scores = torch.sigmoid(logits.values).cpu().detach().numpy()

# Get prediction labels and boundary boxes
labels = logits.indices.cpu().detach().numpy()
boxes = outputs["pred_boxes"][0].cpu().detach().numpy()
def draw_prediction(image, text_queries, scores, boxes, labels):
 for score, box, label in zip(scores, boxes, labels):

  if score < 0.5:
    continue

  cx, cy, w, h = box
  height, width = image.shape[:2]
  x1 = int(round((cx-w/2)*width))
  x2 = int(round((cx+w/2)*width))
  y1 = int(round((cy-h/2)*height))
  y2 = int(round((cy+h/2)*height))
  cv2.rectangle(image,(x1,y1), (x2,y2),(0,0,255),2)
  cv2.putText(image, f"{text_queries[label]}:{score:1.2f}", (x1,y1-4), cv2.FONT_HERSHEY_SIMPLEX, 0.5,(0,0,255),2)

  cv2_imshow("image",image)
  cv2.waitKey(0)
  cv2.destroyAllWindows()
draw_prediction(image, text_queries, scores, boxes, labels)

In [None]:
!pip install cv2_imshow

In [None]:
!pip install cv2_imshow

In [None]:
import cv2_imshow
cv2.imshow("image",image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [63]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import os
import csv
import json
from collections import defaultdict

def parse_text_file(file_path):
    image_id = os.path.splitext(os.path.basename(file_path))[0]
    category_ids = []
    bboxes = []
    with open(file_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            category_id = int(parts[0])
            bbox = list(map(float, parts[1:]))
            category_ids.append(category_id)
            bboxes.append(bbox)
    objects = {"category_id": category_ids, "bbox": bboxes}
    return image_id, objects

def write_to_csv(data, csv_file):
    with open(csv_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["image_id", "objects", "text_input"])
        writer.writeheader()
        for image_id, objects in data:
            writer.writerow({"image_id": image_id, "objects": json.dumps(objects), "text_input": ""})

folder_path = "/content/drive/MyDrive/nhoowlvit/label"

data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        image_id, objects = parse_text_file(file_path)
        data.append((image_id, objects))

csv_file = "/content/drive/MyDrive/nhoowlvit/label/output.csv"
write_to_csv(data, csv_file)



In [None]:
import csv
import json

def create_metadata_jsonl(csv_file, output_file):
    with open(csv_file, 'r') as f:
        reader = csv.DictReader(f)
        with open(output_file, 'w') as out_file:
            for row in reader:
                image_id = row['image_id']
                objects = json.loads(row['objects'])
                file_name = f"{image_id}.jpg"
                metadata = { "file_name": file_name,"image_id": image_id, "objects": objects}
                json.dump(metadata, out_file)
                out_file.write('\n')

csv_file = "/content/drive/MyDrive/nhoowlvit/labelimg/images/metadata.csv"
output_file = "/content/drive/MyDrive/nhoowlvit/labelimg/images/metadata.jsonl"
create_metadata_jsonl(csv_file, output_file)



In [None]:
!pip install --upgrade huggingface_hub datasets

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [64]:
from datasets import load_dataset
import os

dataset = load_dataset("imagefolder", data_dir="/content/drive/MyDrive/nhoowlvit/labelimg/images")

# next: push to the hub (assuming git-LFS is installed)

dataset.push_to_hub("rathi2023/owlvitnhood",max_shard_size="700mb")

Resolving data files:   0%|          | 0/42 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/rathi2023/owlvitnhood/commit/7c4895c64e76fefbd024e48bae59f4d5df13dc8a', commit_message='Upload dataset', commit_description='', oid='7c4895c64e76fefbd024e48bae59f4d5df13dc8a', pr_url=None, pr_revision=None, pr_num=None)