pip install torch torchvision
pip install git+https://github.com/openai/CLIP.git
pip install transformers
pip install datasets
# Task: `Vision-Language Model`

Given an image and a caption describing a target in that image, return a bounding box corresponding to the target’s location within the image.

Note that targets within a given image are not uniquely identified by their object class (e.g. ”airplane”, “helicopter”); multiple targets within an image may be members of the same object class. Instead, targets provided will correspond to a particular target description (e.g. “black and white drone”).

Not all possible target descriptions will be represented in the training dataset provided to participants. There will also be unseen targets and novel descriptions in the test data used in the hidden test cases of the Virtual Qualifiers, Semi-Finals / Finals. As such, Guardians will have to develop vision models capable of understanding **natural language** to identify the correct target from the scene.

For the **image datasets** provided to both Novice and Advanced Guardians, there will be no noise present. However, it is worth noting that your models will have to be adequately robust as the hidden test cases for the Virtual Qualifiers and the Semi-Finals/Finals will have increasing amounts of noise introduced. This is especially crucial for **Advanced Guardians**, due to the degradation of their robot sensors.

In [1]:
# !pip install torch torchvision
# !pip install git+https://github.com/openai/CLIP.git
# !pip install transformers
# !pip install datasets
# !pip install -q ultralytics

In [2]:
##import all the libraries

import wandb
from PIL import Image
import IPython.display as display
import torch
import requests
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
import torch
import json
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import IterableDataset, DataLoader
from ultralytics import YOLO

from tqdm import tqdm
import torchvision
from torchvision.transforms import functional as F
from torchvision import transforms
from torchinfo import summary
import urllib
import os
import torchvision.transforms as T
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import jsonlines
import cv2
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
## random directories that will be needed

cur_dir = os.getcwd()
vlm_dir = os.path.dirname(cur_dir)
til_dir = os.path.dirname(vlm_dir)
home_dir = os.path.dirname(til_dir)
test_dir = os.path.join(home_dir, 'novice')
img_dir = os.path.join(test_dir, 'images')
data_dir = os.path.join(cur_dir, 'data')

##training data to be added to tune the models
metadata_path = os.path.join(test_dir, 'vlm.jsonl')

# paths for converting datasets to manifest files
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
val_dir = os.path.join(data_dir, "val")

## <span style="color:blue;">Prepare the data set merge the image and captions together</span>

In [None]:
import json
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import clip
from transformers import DetrForObjectDetection, DetrImageProcessor

def split_data(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    
    random.seed(seed)

    total_examples = len(data['image'])
    indices = list(range(total_examples))
    random.shuffle(indices)
    
    train_end = int(train_ratio * total_examples)
    val_end = train_end + int(val_ratio * total_examples)
    
    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]
    
    train_data = {'image': [data['image'][i] for i in train_indices], 'annotations': [data['annotations'][i] for i in train_indices]}
    val_data = {'image': [data['image'][i] for i in val_indices], 'annotations': [data['annotations'][i] for i in val_indices]}
    test_data = {'image': [data['image'][i] for i in test_indices], 'annotations': [data['annotations'][i] for i in test_indices]}
    
    return train_data, val_data, test_data

MAX_FILE_COUNT = None # Set if only want max files

data = {'image': [], 'annotations': []}
data_path = os.path.join(test_dir, "vlm.jsonl")
with jsonlines.open(metadata_path) as reader:
    for obj in reader:
        if MAX_FILE_COUNT and len(data['image']) >= MAX_FILE_COUNT:
            break
        data['image'].append(os.path.join(img_dir, obj['image']))
        data['annotations'].append(obj['annotations'])

In [5]:
print(len(captions_set))

126


Original: blue and red light aircraft
Colors: ['blue', 'red']
Objects: ['light aircraft']
------------------------------
Original: black drone
Colors: ['black']
Objects: ['drone']
------------------------------
Original: white, black, and grey missile
Colors: ['white', 'grey', 'black']
Objects: ['missile']
------------------------------
Original: yellow helicopter
Colors: ['yellow']
Objects: ['helicopter']
------------------------------
Original: white fighter jet
Colors: ['white']
Objects: ['fighter jet']
------------------------------
Original: grey fighter jet
Colors: ['grey']
Objects: ['fighter jet']
------------------------------
Original: red helicopter
Colors: ['red']
Objects: ['helicopter']
------------------------------
Original: white and orange light aircraft
Colors: ['orange', 'white']
Objects: ['light aircraft']
------------------------------
Original: green and black camouflage helicopter
Colors: ['green', 'camouflage', 'black']
Objects: ['helicopter']
-------------------

In [9]:
objects_set

{'cargo aircraft',
 'commercial aircraft',
 'drone',
 'fighter jet',
 'fighter plane',
 'helicopter',
 'light aircraft',
 'missile'}

KeyError: 'color'

## We will train on 126 classes

We can create a mapping

In [25]:
# Create a mapping from captions to integers
caption_to_label = {caption: idx for idx, caption in enumerate(captions_set)}

print("Caption to Label Mapping:")
# print(json.dumps(caption_to_label, indent=2))

# Apply the mapping to the annotations
for img_annotations in data['annotations']:
    for annotation in img_annotations:
        annotation['label'] = caption_to_label[annotation['caption']]
        


Caption to Label Mapping:


In [None]:
train_data, val_data, test_data = split_data(data)

In [26]:
# first_image_path = train_data['image'][0]
# first_image_annotations = train_data['annotations'][0]

# print(f"First image path: {first_image_path}")
# print(f"Annotations: {first_image_annotations}")

# # Load and display the image
# image = cv2.imread(first_image_path)
# if image is None:
#     print(f"Failed to load image at {first_image_path}")
# else:
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     plt.imshow(image)
#     plt.title(f"Annotations: {first_image_annotations}")
#     plt.axis('off')
#     plt.show()

In [1]:
class CLIPPreprocessor:
    def __init__(self, dataset, output_dir, batch_size=32, device="cuda"):
        self.dataset = dataset
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.device = device
        self.clip_model, self.preprocess_clip = clip.load("ViT-B/32", device=device)
        self.augmentations = A.Compose([
            A.GaussianBlur(blur_limit=(3, 7), p=0.2)
        ])
        self.unique_labels = set()

    def preprocess_and_save_batches(self):
        images = self.dataset['image']
        annotations = self.dataset['annotations']
        num_batches = (len(images) + self.batch_size - 1) // self.batch_size

        for batch_idx in tqdm(range(num_batches), desc="Processing Batches"):
            batch_images = images[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_annotations = annotations[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_data = list(zip(batch_images, batch_annotations))
            image_features, text_features, cropped_image_features, bboxes, labels = self.process_batch(batch_data)

            self.pad_batch(image_features, bboxes, labels, botext_featuresxes, cropped_image_features)
            self.save_batch(batch_idx, image_tensors, image_features, text_features, boxes, labels)

        return num_batches

    def process_batch(self, batch_data):
        image_tensors = []
        all_image_features = []
        all_text_features = []
        all_boxes = []
        all_labels = []

        for image_path, image_annotations in batch_data:
            try:
                image = Image.open(image_path).convert("RGB")
                image_np = np.array(image)  # Convert PIL image to numpy array
                augmented = self.augmentations(image=image_np)  # Apply Gaussian blur
                image_np = augmented['image']
                raw_image_tensor = T.ToTensor()(image_np)  # Convert numpy array to tensor
                image_tensor = self.preprocess_clip(Image.fromarray(image_np)).unsqueeze(0).to(self.device)  # Preprocess image for CLIP
                image_features = clip_model.encode_image(image_tensor).squeeze(0).cpu().numpy()
            except Exception as e:
                print(f"Skipping invalid image: {image_path}. Error: {e}")
                continue
            
            
            image_failed = False
            valid_annotations = 0
            image_features = []
            text_features = []
            boxes = []
            labels = []
            
            for idx, annotation in enumerate(image_annotations):
                caption = annotation['caption']
                bbox = annotation['bbox']
                x, y, w, h = bbox

                # Crop the image according to the bounding box
                cropped_image_np = image_np[y:y + h, x:x + w]
                if cropped_image_np.size == 0:
                    if idx == len(image_annotations) - 1 and valid_annotations == 0:
                        image_failed = True
                        print(f"Skipping image with no valid annotations: {image_path}")
                        break
                    continue

                cropped_image_pil = Image.fromarray(cropped_image_np)
                cropped_image_tensor = self.preprocess_clip(cropped_image_pil).unsqueeze(0).to(self.device)

                with torch.no_grad():
                    image_feature = self.clip_model.encode_image(cropped_image_tensor).cpu().numpy()
                    text_feature = self.clip_model.encode_text(clip.tokenize([caption]).to(self.device)).cpu().numpy()

                image_features.append(image_feature)
                text_features.append(text_feature)
                boxes.append(bbox)
                label = annotation.get('label', 0)
                labels.append(label)
                self.unique_labels.add(label)

            if not image_failed:
                image_tensors.append(raw_image_tensor)
                all_image_features.append(image_features)
                all_text_features.append(text_features)
                all_boxes.append(boxes)
                all_labels.append(labels)

        return image_tensors, all_image_features, all_text_features, all_boxes, all_labels

    def pad_batch(self, image_tensors, image_features, text_features, boxes, labels):
        while len(image_tensors) < self.batch_size:
            dummy_image = torch.zeros_like(image_tensors[0])
            image_tensors.append(dummy_image)
            dummy_feature = np.zeros_like(image_features[0][0])
            image_features.append([dummy_feature])
            text_features.append([dummy_feature])
            boxes.append([[0, 0, 0, 0]])
            labels.append([0])

    def save_batch(self, batch_idx, image_tensors, image_features, text_features, boxes, labels):
        batch_output_dir = os.path.join(self.output_dir, f"batch_{batch_idx}")
        os.makedirs(batch_output_dir, exist_ok=True)

        np.save(os.path.join(batch_output_dir, "image_tensors.npy"), np.array([img.cpu().numpy() for img in image_tensors]))
        np.save(os.path.join(batch_output_dir, "image_features.npy"), image_features, allow_pickle=True)
        np.save(os.path.join(batch_output_dir, "text_features.npy"), text_features, allow_pickle=True)
        np.save(os.path.join(batch_output_dir, "boxes.npy"), boxes, allow_pickle=True)
        np.save(os.path.join(batch_output_dir, "labels.npy"), text_features, allow_pickle=True)
        

In [67]:
train_preprocessor = CLIPPreprocessor(train_data, train_dir)
val_preprocessor = CLIPPreprocessor(val_data, val_dir)
test_preprocessor = CLIPPreprocessor(test_data, test_dir)

In [53]:
os.makedirs(data_dir, exist_ok=True)
train_num_batches = train_preprocessor.preprocess_and_save_batches()
val_num_batches = val_preprocessor.preprocess_and_save_batches()
test_num_batches = test_preprocessor.preprocess_and_save_batches()


Processing Batches:   0%|          | 0/2 [00:13<?, ?it/s][A


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (64,) + inhomogeneous part.

In [27]:
# def preprocess_and_save_batches(dataset, augmentations, output_dir, batch_size=64, device="cuda"):
#     clip_model, preprocess_clip = clip.load("ViT-B/32", device=device)
#     images = dataset['image']
#     annotations = dataset['annotations']
#     num_batches = (len(images) + batch_size - 1) // batch_size

#     for batch_idx in tqdm(range(num_batches), desc="Processing Batches"):
#         batch_images = images[batch_idx * batch_size:(batch_idx + 1) * batch_size]
#         batch_annotations = annotations[batch_idx * batch_size:(batch_idx + 1) * batch_size]
#         batch_data = list(zip(batch_images, batch_annotations))
#         image_tensors = []
#         all_bboxes = []
#         all_labels = []
#         image_features = []
#         text_features = []

#         for image_path, image_annotations in batch_data:
#             # Load image
#             image = cv2.imread(image_path)
#             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

#             if image is None or image.shape[0] == 0 or image.shape[1] == 0:
#                 print(f"Skipping invalid image: {image_path}")
#                 continue

#             # Apply augmentations
#             augmented = augmentations(image=image)
#             augmented_image = augmented['image'].permute(1, 2, 0).numpy()
#             augmented_image = (augmented_image * 255).astype(np.uint8)
            
#             bboxes = []
#             labels = []

#             num_annotations = len(image_annotations)
#             annotated = 0
#             image_failed = False
#             for idx, annotation in enumerate(image_annotations):
#                 caption = annotation['caption']
#                 bbox = annotation['bbox']

#                 x, y, w, h = bbox
#                 if w == 0 or h == 0:
#                     print(f"Skipping zero-sized bbox: {bbox} in image: {image_path}")
#                     if idx == num_annotations - 1 and annotated == 0:
#                         image_failed = True
#                         break
#                     else:
#                         continue

#                 # Attempt to crop augmented image
#                 cropped_image = augmented_image[y:y + h, x:x + w]
#                 if cropped_image.size == 0:
#                     # Fall back to original image if augmented crop is invalid
#                     print(f"Fallback to original image for bbox: {bbox} in image: {image_path}")
#                     cropped_image = image[y:y + h, x:x + w]

#                 if cropped_image.size == 0:
#                     print(f"Skipping empty cropped image for bbox: {bbox} in image: {image_path}")
#                     if idx == num_annotations - 1 and annotated == 0:
#                         image_failed = True
#                         break
#                     else:
#                         continue
                    
#                 bboxes.append(bbox)
#                 labels.append(caption)
                
#                 annotated += 1

#                 cropped_pil_image = Image.fromarray(cropped_image.astype('uint8'))

#                 # Preprocess the image for CLIP
#                 cropped_preprocessed = preprocess_clip(cropped_pil_image).unsqueeze(0).to(device)
                
#                 # Encode features using CLIP
#                 with torch.no_grad():
#                     if cropped_preprocessed.shape[1] == 3:
#                         image_feature = clip_model.encode_image(cropped_preprocessed).cpu().numpy()
#                         text_feature = clip_model.encode_text(clip.tokenize([caption]).to(device)).cpu().numpy()
#                         image_features.append(image_feature)
#                         text_features.append(text_feature)
#                     else:
#                         print(f"Skipping encoding due to incorrect shape: {cropped_preprocessed.shape}")

#             if image_failed:
#                 continue

#             image_tensors.append(augmented['image'])
#             all_bboxes.append(bboxes)
#             all_labels.append(labels)
        
#         if len(image_tensors) < batch_size:
#             print(f"Padding batch {batch_idx} with dummy data to reach target batch size of {batch_size}")
#         while len(image_tensors) < batch_size:
#             dummy_image = torch.zeros_like(augmented['image'])
#             image_tensors.append(dummy_image)
#             all_bboxes.append([])
#             all_labels.append("")
#             image_features.append(np.zeros_like(image_features[0]))
#             text_features.append(np.zeros_like(text_features[0]))
        
#         # Create batch directory
#         batch_output_dir = os.path.join(output_dir, f"batch_{batch_idx}")
#         os.makedirs(batch_output_dir, exist_ok=True)

#         # Save batch data
#         image_batch_memmap_path = os.path.join(batch_output_dir, "image_batch.npy")
#         np.save(image_batch_memmap_path, np.array([img.cpu().numpy() for img in image_tensors]))

#         bboxes_path = os.path.join(batch_output_dir, "bboxes_batch.npy")
#         np.save(bboxes_path, np.array(all_bboxes, dtype=object), allow_pickle=True)

#         labels_path = os.path.join(batch_output_dir, "labels_batch.npy")
#         np.save(labels_path, np.array(all_labels, dtype=object), allow_pickle=True)

#         image_features_path = os.path.join(batch_output_dir, "image_features_batch.npy")
#         np.save(image_features_path, np.array(image_features))

#         text_features_path = os.path.join(batch_output_dir, "text_features_batch.npy")
#         np.save(text_features_path, np.array(text_features))
        
#     return num_batches

In [28]:
# os.makedirs(data_dir, exist_ok=True)
# train_num_batches = preprocess_and_save_batches(train_data, augmentations, train_dir)
# val_num_batches = preprocess_and_save_batches(val_data, augmentations, val_dir)
# test_num_batches = preprocess_and_save_batches(test_data, augmentations, test_dir)

In [29]:
# print(f"Number of batches for train: {train_num_batches}")
# print(f"Number of batches for val: {val_num_batches}")
# print(f"Number of batches for test: {test_num_batches}")

train_num_batches = 63
val_num_batches = 8
test_num_batches = 8

In [30]:
# import matplotlib.pyplot as plt
# import os
# import numpy as np
# import matplotlib.patches as patches

# def visualize_image_with_bbox(image_tensor, bbox, label):
#     image = image_tensor.permute(1, 2, 0).numpy()  # Convert from CHW to HWC format
#     image = (image * 255).astype(np.uint8)  # Convert to uint8

#     fig, ax = plt.subplots(1)
#     ax.imshow(image[..., ::-1])
    
#     # Draw the bounding box
#     x, y, w, h = bbox
#     rect = patches.Rectangle((x, y), w, h, linewidth=2, edgecolor='r', facecolor='none')
#     ax.add_patch(rect)
    
#     # Add the label
#     plt.text(x, y - 10, label, color='red', fontsize=12, backgroundcolor='black')
    
#     plt.show()

# # Load one batch and visualize
# def load_and_visualize_batch(batch_dir):
#     image_batch_path = os.path.join(batch_dir, "image_batch.npy")
#     bboxes_path = os.path.join(batch_dir, "bboxes_batch.npy")
#     labels_path = os.path.join(batch_dir, "labels_batch.npy")

#     images = np.load(image_batch_path)
#     bboxes = np.load(bboxes_path, allow_pickle=True)
#     labels = np.load(labels_path, allow_pickle=True)

#     # Assuming the first image in the batch
#     image_tensor = torch.tensor(images[0])
#     bbox = bboxes[0][0]
#     label = labels[0][0]

#     # Visualize the image with the bounding box
#     visualize_image_with_bbox(image_tensor, bbox, label)

# # Example usage
# batch_output_dir = "./data/train"  # Adjust this to your actual batch output directory
# batch_idx = 0  # Index of the batch to visualize
# load_and_visualize_batch(os.path.join(batch_output_dir, f"batch_{batch_idx}"))

In [40]:
class MemmapIterableDataset(IterableDataset):
    def __init__(self, type_dir, num_batches, shuffle=False):
        self.type_dir = type_dir
        self.num_batches = num_batches
        self.shuffle=shuffle

    def __iter__(self):
        for batch_idx in range(self.num_batches):
            batch_path = os.path.join(self.type_dir, f"batch_{batch_idx}")

            image_batch_memmap_path = os.path.join(batch_path, "image_batch.npy")
            image_batch = np.load(image_batch_memmap_path, mmap_mode='r')
            image_batch = torch.tensor(image_batch)

            bboxes_path = os.path.join(batch_path, "bboxes_batch.npy")
            bboxes_batch = np.load(bboxes_path, allow_pickle=True)
            bboxes_batch = [torch.tensor(b) for b in bboxes_batch]

            labels_path = os.path.join(batch_path, "labels_batch.npy")
            labels_batch = np.load(labels_path, allow_pickle=True)
            labels_batch = [l for l in labels_batch]
            
            print(labels_batch[0])

            image_features_path = os.path.join(batch_path, "image_features_batch.npy")
            image_features_batch = np.load(image_features_path, mmap_mode='r')
            image_features_batch = torch.tensor(image_features_batch)

            text_features_path = os.path.join(batch_path, "text_features_batch.npy")
            text_features_batch = np.load(text_features_path, mmap_mode='r')
            text_features_batch = torch.tensor(text_features_batch)

            yield image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch

In [35]:
class YOLOCLIPModel(pl.LightningModule):
    def __init__(self, yolo_model_path):
        super(YOLOCLIPModel, self).__init__()
        self.yolo_model = YOLO(yolo_model_path)

    def forward(self, image):
        return self.yolo_model(image)
    
    def compute_yolo_loss(self, yolo_results, bboxes, labels):
        # This is a simplified YOLO loss function
        # You should replace it with the actual YOLO loss function used in your project
        yolo_pred = yolo_results.xyxy[0]  # Example extraction of YOLO prediction
        bboxes = torch.stack(bboxes).to(yolo_pred.device)
        
        # Assuming yolo_pred and bboxes are of the same shape for simplicity
        bbox_loss = F.mse_loss(yolo_pred[:, :4], bboxes)
        obj_loss = F.binary_cross_entropy(yolo_pred[:, 4], torch.ones_like(yolo_pred[:, 4]))
        
        # If labels are provided, compute classification loss
        class_loss = 0
        if labels:
            labels = torch.stack(labels).to(yolo_pred.device)
            class_loss = F.cross_entropy(yolo_pred[:, 5:], labels)
        
        total_loss = bbox_loss + obj_loss + class_loss
        return total_loss

    def training_step(self, batch, batch_idx):
        # print("In training Step")
        image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch = batch
        total_loss = 0

        for image_tensor, bboxes, labels, image_features, text_features in zip(image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch):
            yolo_results = self(image_tensor)  # YOLO inference
            boxes = yolo_results.xyxy[0]  # Extract bounding boxes

            yolo_loss = self.compute_yolo_loss(yolo_results, bboxes, labels)
            total_loss += yolo_loss

            for image_feature, text_feature in zip(image_features, text_features):
                image_feature = image_feature.to("cuda")
                text_feature = text_feature.to("cuda")

                clip_loss = 1 - F.cosine_similarity(image_feature, text_feature)
                total_loss += clip_loss

        total_loss = total_loss / (len(image_batch) * len(image_features_batch))
        self.log("train_loss", total_loss)
        
        # # Free memory after processing the batch
        # del image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch
        # torch.cuda.empty_cache()
        
        return total_loss

    def validation_step(self, batch, batch_idx):
        image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch = batch
        total_loss = 0

        for image_tensor, bboxes, labels, image_features, text_features in zip(image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch):
            yolo_results = self(image_tensor)  # YOLO inference
            boxes = yolo_results.xyxy[0]  # Extract bounding boxes

            yolo_loss = self.compute_yolo_loss(yolo_results, bboxes, labels)
            total_loss += yolo_loss

            for image_feature, text_feature in zip(image_features, text_features):
                image_feature = image_feature.to("cuda")
                text_feature = text_feature.to("cuda")

                clip_loss = 1 - F.cosine_similarity(image_feature, text_feature)
                total_loss += clip_loss

        total_loss = total_loss / (len(image_batch) * len(image_features_batch))
        self.log("val_loss", total_loss)
        
        # # Free memory after processing the batch
        # del image_batch, bboxes_batch, labels_batch, image_features_batch, text_features_batch
        # torch.cuda.empty_cache()
        
        return total_loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-4)

In [36]:
class VLMDataModule(pl.LightningDataModule):
    def __init__(self, train_data, val_data, test_data, num_workers=0):
        super().__init__()
        self.train_dir, self.train_num_batches = train_data
        self.val_dir, self.val_num_batches = val_data
        self.test_dir, self.test_num_batches = test_data
        self.num_workers = num_workers

    def setup(self, stage=None):
        self.train_dataset = MemmapIterableDataset(self.train_dir, self.train_num_batches, shuffle=True)
        self.val_dataset = MemmapIterableDataset(self.val_dir, self.val_num_batches)
        self.test_dataset = MemmapIterableDataset(self.test_dir, self.test_num_batches)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=None, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=None, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=None, num_workers=self.num_workers)

In [41]:
data_module = VLMDataModule(
    train_data=(train_dir, train_num_batches),
    val_data=(val_dir, val_num_batches),
    test_data=(test_dir, test_num_batches),
    num_workers=0
)

# Initialize the model
model = YOLOCLIPModel("yolov8n.pt")

# Early stopping callback
early_stopping_callback = EarlyStopping(
    monitor='val_loss',  # metric to monitor
    patience=3,          # no of epochs with no improvement to wait before stopping
    verbose=True,        # logging
    mode='min'           # minimize the monitored metric
)

# Model checkpoint callback
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    monitor='val_loss',
    dirpath='model_checkpoints',
    filename='asr_model-{epoch:02d}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)

# Initialize the Trainer
trainer = pl.Trainer(
    max_steps=100*train_num_batches,  # Maximum number of steps (batches) to train for
    callbacks=[checkpoint_callback, early_stopping_callback],
    val_check_interval=train_num_batches,  # Validation check interval
    limit_val_batches=val_num_batches,  # Limit the number of validation batches
    logger=False
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [42]:
print(data_module.train_dir)

/home/jupyter/til-24-base/vlm/src/data/train


In [None]:
# Train the model
trainer.fit(model, datamodule=data_module)

# Test the model
trainer.test(model, datamodule=data_module)

## <span style="color:blue;">Finetune the clip model</span>

In [None]:
import torch.optim as optim

# Define loss and optimizer
clip_criterion = torch.nn.CrossEntropyLoss()
clip_optimizer = optim.Adam(clip_model.parameters(), lr=5e-5)

# Fine-tune the CLIP model
clip_model.train()
for epoch in range(10):  # Number of epochs
    for clip_inputs, captions, _, _ in unified_dataloader:
        clip_inputs = clip_inputs.to(device)
        caption_tokens = torch.cat([clip.tokenize(caption) for caption_list in captions for caption in caption_list]).to(device)
        
        image_features = clip_model.encode_image(clip_inputs)
        text_features = clip_model.encode_text(caption_tokens)
        
        logits_per_image, logits_per_text = clip_model(clip_inputs, caption_tokens)
        loss = clip_criterion(logits_per_image, torch.arange(len(image_features)).to(device))
        
        clip_optimizer.zero_grad()
        loss.backward()
        clip_optimizer.step()
        
    print(f"Epoch {epoch+1}, CLIP Loss: {loss.item()}")


## <span style="color:blue;">Finetune the DETR model</span>

In [None]:
# Define optimizer
detr_optimizer = optim.AdamW(detr_model.parameters(), lr=1e-4)

# Fine-tune the DETR model
detr_model.train()
for epoch in range(10):  # Number of epochs
    for _, _, pixel_values, target in unified_dataloader:
        pixel_values = pixel_values.to(device)
        target = {k: v.to(device) for k, v in target.items()}
        
        outputs = detr_model(pixel_values=pixel_values, labels=target)
        loss = outputs.loss
        
        detr_optimizer.zero_grad()
        loss.backward()
        detr_optimizer.step()
        
    print(f"Epoch {epoch+1}, DETR Loss: {loss.item()}")


## Transfer Learning to make new model adapt to current army data set
Feature extraction

1. **Instantiating a Pre-Trained Model with Weights**
   - Initialize a pre-trained model with its pre-existing weights.

2. **Replacing Classifier Heads**
   - Replace the output layer with a new one that corresponds to the number of categories in our target dataset.

3. **Task-Specific Training**
   - Freeze all the layers from the pre-trained model, leaving only the outer layer (classifier head) to be trained.

since smaller dataset

In [None]:
### placeholder


import torch
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor
from PIL import Image
from transformers import CLIPModel, CLIPProcessor
from transformers import DetrForObjectDetection, DetrImageProcessor

# Define image preprocessing transformations
image_transform = Compose([
    Resize((224, 224)),  # Resize image to a fixed size
    ToTensor(),          # Convert image to tensor
])

# Define your dataset class with feature selection and preprocessing
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, image_transform):
        self.data = data
        self.image_transform = image_transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path, caption, selected_features, bounding_box = self.data[idx]
        
        # Load and preprocess image
        image = Image.open(image_path)
        image = self.image_transform(image)
        
        return image, caption, selected_features, bounding_box

# Define your custom data loader
def collate_fn(batch):
    images, captions, selected_features, bounding_boxes = zip(*batch)
    return images, captions, selected_features, bounding_boxes

# Initialize CLIP and DETR models
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

detr_model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
detr_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50')

# Define loss functions
clip_loss_fn = ...
detr_loss_fn = ...

# Define optimizer
clip_optimizer = torch.optim.Adam(clip_model.parameters(), lr=clip_lr)
detr_optimizer = torch.optim.Adam(detr_model.parameters(), lr=detr_lr)

# Define your dataset and data loader
train_dataset = CustomDataset(data=your_data, image_transform=image_transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Training loop
for epoch in range(num_epochs):
    for batch in train_loader:
        images, captions, selected_features, bounding_boxes = batch
        
        # CLIP forward pass
        clip_inputs = clip_processor(text=captions, images=images, return_tensors="pt", padding=True)
        clip_outputs = clip_model(**clip_inputs)
        
        # Combine embeddings
        image_embedding = clip_outputs.last_hidden_state[:, 0, :]
        caption_embedding = clip_outputs.last_hidden_state[:, 1, :]
        combined_embedding = torch.cat((image_embedding, caption_embedding, selected_features), dim=1)
        
        # DETR forward pass
        detr_outputs = detr_model.forward(features=combined_embedding)
        
        # Compute losses
        clip_loss = clip_loss_fn(...)
        detr_loss = detr_loss_fn(...)
        
        # Backpropagation and optimization
        clip_optimizer.zero_grad()
        detr_optimizer.zero_grad()
        clip_loss.backward()
        detr_loss.backward()
        clip_optimizer.step()
        detr_optimizer.step()

# Save trained models
torch.save(clip_model.state_dict(), "clip_model.pth")
torch.save(detr_model.state_dict(), "detr_model.pth")

In [None]:
import os
import json
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import clip
import albumentations as A

def preprocess_and_save_batches(dataset, augmentations, data_dir, batch_size=64):
    clip_model, preprocess_clip = clip.load("ViT-B/32", device="cpu")  # Use CPU for preprocessing

    images = dataset['image']
    annotations = dataset['annotations']
    num_batches = (len(annotations) + batch_size - 1) // batch_size

    for batch_idx in tqdm(range(num_batches), desc="Processing Batches"):
        batch_images = images[batch_idx * batch_size:(batch_idx + 1) * batch_size]
        batch_annotations = annotations[batch_idx * batch_size:(batch_idx + 1) * batch_size]
        batch_data = list(zip(batch_images, batch_annotations))
        image_tensors = []
        cropped_images = []
        tokenized_captions = []
        image_features = []
        text_features = []

        for image_path, annotations in batch_data:
            # Load image
            image = load_image(image_path)

            for annotation in annotations:
                caption = annotation['caption']
                bbox = annotation['bbox']
                cropped_image = image[bbox[1]:bbox[1] + bbox[3], bbox[0]:bbox[0] + bbox[2]]
                
                # Apply augmentations to cropped image
                augmented = augmentations(image=cropped_image)
                cropped_image_tensor = torch.tensor(augmented["image"]).permute(2, 0, 1)  # Change to (C, H, W) format for PyTorch
                
                # Preprocess using CLIP's preprocessor
                cropped_image_preprocessed = preprocess_clip(cropped_image_tensor).unsqueeze(0)  # Add batch dimension

                # Encode features using CLIP
                with torch.no_grad():
                    image_feature = clip_model.encode_image(cropped_image_preprocessed).numpy()
                    text_feature = clip_model.encode_text(clip.tokenize([caption])).numpy()

                cropped_images.append(cropped_image_preprocessed.numpy())
                tokenized_captions.append(clip.tokenize([caption]).numpy())
                image_features.append(image_feature)
                text_features.append(text_feature)
        
        # Save batch to memmap files
        image_batch_memmap_path = os.path.join(data_dir, f"image_batch_{batch_idx}.npy")
        np.save(image_batch_memmap_path, np.array(image_tensors))
        
        cropped_images_memmap_path = os.path.join(data_dir, f"cropped_images_batch_{batch_idx}.npy")
        np.save(cropped_images_memmap_path, np.array(cropped_images))
        
        tokenized_captions_path = os.path.join(data_dir, f"tokenized_captions_batch_{batch_idx}.npy")
        np.save(tokenized_captions_path, np.array(tokenized_captions))
        
        image_features_path = os.path.join(data_dir, f"image_features_batch_{batch_idx}.npy")
        np.save(image_features_path, np.array(image_features))
        
        text_features_path = os.path.join(data_dir, f"text_features_batch_{batch_idx}.npy")
        np.save(text_features_path, np.array(text_features))

def load_image(image_path):
    with Image.open(image_path) as img:
        img = img.convert('RGB')  # Ensure image is in RGB format
        image_array = np.array(img)
    return image_array

# Example usage
data = {
    "image": ["image_0.jpg", "image_1.jpg"],
    "annotations": [
        [{"caption": "blue and white missile", "bbox": [1224, 284, 44, 36]}, {"caption": "green light aircraft", "bbox": [688, 400, 56, 36]}, {"caption": "blue and white commercial aircraft", "bbox": [800, 320, 128, 36]}],
        [{"caption": "red car", "bbox": [100, 150, 200, 100]}, {"caption": "black bike", "bbox": [300, 350, 50, 50]}]
    ]
}

augmentations = A.Compose([
    A.Resize(224, 224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
])

data_dir = 'data/'
preprocess_and_save_batches(img_dir, augmentations, metadata_path)


In [None]:
# Custom unified dataset class
class UnifiedDataset(Dataset):
    def __init__(self, annotations, image_folder, clip_preprocess, detr_processor):
        self.annotations = annotations
        self.image_folder = image_folder
        self.clip_preprocess = clip_preprocess
        self.detr_processor = detr_processor

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        item = self.annotations[idx]
        image_path = os.path.join(self.image_folder, item['image'])
        image = Image.open(image_path).convert("RGB")
        captions = [ann['caption'] for ann in item['annotations']]
        
        clip_input = self.clip_preprocess(image)
        
        boxes = torch.tensor([ann['bbox'] for ann in item['annotations']], dtype=torch.float32)
        labels = torch.tensor([0] * len(item['annotations']), dtype=torch.int64)  # Assuming all annotations belong to the same class
        target = {
            "image_id": torch.tensor([idx]),
            "boxes": boxes,
            "labels": labels
        }
        detr_input = self.detr_processor(images=image, annotations=target, return_tensors="pt")
        detr_input["labels"] = {k: v.squeeze() for k, v in detr_input["labels"].items()}
        
        return clip_input, captions, detr_input["pixel_values"].squeeze(), detr_input["labels"]

# Load CLIP model and preprocessing
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Load DETR model and processor
detr_model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50').to(device)
detr_processor = DetrImageProcessor.from_pretrained('facebook/detr-resnet-50')

# Create unified dataset and dataloader
unified_dataset = UnifiedDataset(annotations, image_folder, clip_preprocess, detr_processor)
unified_dataloader = DataLoader(unified_dataset, batch_size=2, shuffle=True)