In [1]:
from PIL import Image
import IPython.display as display
import torch
import requests
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
import torch
import json
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import IterableDataset, DataLoader
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights

from tqdm import tqdm
import torchvision
from torchvision.transforms import functional as F
from torchvision import transforms
from torchinfo import summary
import urllib
import os
import json
import torchvision.transforms as T
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import jsonlines
import cv2
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import clip
from transformers import DetrForObjectDetection, DetrImageProcessor

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
cur_dir = os.getcwd()
vlm_dir = os.path.dirname(cur_dir)
til_dir = os.path.dirname(vlm_dir)
home_dir = os.path.dirname(til_dir)
test_dir = os.path.join(home_dir, 'novice')
img_dir = os.path.join(test_dir, 'images')
data_dir = os.path.join(cur_dir, 'data')

##training data to be added to tune the models
metadata_path = os.path.join(test_dir, 'vlm.jsonl')

# paths for converting datasets to manifest files
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
val_dir = os.path.join(data_dir, "val")

rcnn_img_dir = os.path.join(data_dir, "rcnn") # for normal res images
clip_img_dir = os.path.join(data_dir, "clip") # for 224x224 images

for dir in [train_dir, test_dir, val_dir]:
    os.makedirs(dir, exist_ok=True)

In [3]:
def split_data(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    
    random.seed(seed)

    total_examples = len(data['image'])
    indices = list(range(total_examples))
    random.shuffle(indices)
    
    train_end = int(train_ratio * total_examples)
    val_end = train_end + int(val_ratio * total_examples)
    
    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]
    
    train_data = {'image': [data['image'][i] for i in train_indices], 'annotations': [data['annotations'][i] for i in train_indices]}
    val_data = {'image': [data['image'][i] for i in val_indices], 'annotations': [data['annotations'][i] for i in val_indices]}
    test_data = {'image': [data['image'][i] for i in test_indices], 'annotations': [data['annotations'][i] for i in test_indices]}
    
    return train_data, val_data, test_data

MAX_FILE_COUNT = None # Set if only want max files

data = {'image': [], 'annotations': []}
data_path = os.path.join(test_dir, "vlm.jsonl")
with jsonlines.open(metadata_path) as reader:
    for obj in reader:
        if MAX_FILE_COUNT and len(data['image']) >= MAX_FILE_COUNT:
            break
        for annotation in obj['annotations']:
            data['image'].append(os.path.join(img_dir, obj['image']))
            data['annotations'].append(annotation)
            
captions_set = set()

# Loop through the annotations and extract the text_features
for annotation in data['annotations']:
    captions_set.add(annotation['caption'])

# Print the resulting set of text_features
print(captions_set)

{'blue missile', 'green and brown camouflage fighter jet', 'red helicopter', 'black camouflage fighter jet', 'white, red, and blue commercial aircraft', 'grey and white light aircraft', 'blue and red commercial aircraft', 'red and grey missile', 'white and grey helicopter', 'grey and green cargo aircraft', 'white drone', 'blue and grey fighter jet', 'red and white missile', 'yellow and green helicopter', 'black and white missile', 'white, blue, and red commercial aircraft', 'black helicopter', 'white and black cargo aircraft', 'orange light aircraft', 'white commercial aircraft', 'grey and red commercial aircraft', 'white and red helicopter', 'white, black, and grey missile', 'blue and green fighter plane', 'grey and black helicopter', 'green and black missile', 'green fighter plane', 'yellow, black, and red helicopter', 'white light aircraft', 'white and blue light aircraft', 'blue, yellow, and white cargo aircraft', 'blue commercial aircraft', 'yellow, red, and blue fighter plane', '

In [4]:
train_data, val_data, test_data = split_data(data)

In [5]:
def write_to_jsonl(data, file_name):
    with jsonlines.open(file_name, mode='w') as writer:
        for img, ann in zip(data['image'], data['annotations']):
            writer.write({'image': img, 'annotations': ann})

# Write each dataset to a separate JSONL file
write_to_jsonl(train_data, os.path.join(train_dir, "train.jsonl"))
write_to_jsonl(val_data, os.path.join(val_dir, "val.jsonl"))
write_to_jsonl(test_data, os.path.join(test_dir, "test.jsonl"))

In [6]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the custom matcher for multi-word expressions
matcher = Matcher(nlp.vocab)

# Add patterns for multi-word expressions (objects)
multi_word_patterns = [
    {"label": "OBJECT", "pattern": [{"LOWER": "fighter"}, {"LOWER": "jet"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "light"}, {"LOWER": "aircraft"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "commercial"}, {"LOWER": "aircraft"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "fighter"}, {"LOWER": "plane"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "cargo"}, {"LOWER": "aircraft"}]}
]

for pattern in multi_word_patterns:
    matcher.add(pattern["label"], [pattern["pattern"]])

# Add patterns for single-word objects
single_word_objects = ["missile", "helicopter", "drone"]
for obj in single_word_objects:
    matcher.add("OBJECT", [[{"LOWER": obj}]])

# Define custom colors
valid_colors = {"white", "blue", "green", "black", "red", "yellow", "grey", "orange", "silver", "camouflage"}

excluded_adjectives = {"light", "commercial"}

# Function to identify custom colors
def is_valid_color(token):
    return token.text.lower() in valid_colors

def filter_colors(token):
    return (token.pos_ == "ADJ" or is_valid_color(token)) and token.text.lower() not in excluded_adjectives

# Function to filter out overlapping spans
def filter_overlapping_spans(spans):
    sorted_spans = sorted(spans, key=lambda span: (span.start, span.end))
    filtered_spans = []
    last_end = -1
    for span in sorted_spans:
        if span.start >= last_end:
            filtered_spans.append(span)
            last_end = span.end
    return filtered_spans

# Process each string
results = {}
colors_set = set()
objects_set = set()

for string in captions_set:
    doc = nlp(string)
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=label) for match_id, start, end in matches for label in [nlp.vocab.strings[match_id]]]
    
    # Filter out overlapping spans
    spans = filter_overlapping_spans(spans)
    
    doc.ents = spans  # Set the identified multi-word expressions as named entities
    
    colors = set([token.text for token in doc if filter_colors(token)])
    objects = set([ent.text for ent in doc.ents if ent.label_ == "OBJECT"])
    
    colors_set.update(colors)
    objects_set.update(objects)
    
    results[string] = {"colors": list(colors), "objects": list(objects)}

first_two_pairs = {k: results[k] for k in list(results.keys())[:2]}
print(json.dumps(first_two_pairs, indent=2))

{
  "blue missile": {
    "colors": [
      "blue"
    ],
    "objects": [
      "missile"
    ]
  },
  "green and brown camouflage fighter jet": {
    "colors": [
      "camouflage",
      "brown",
      "green"
    ],
    "objects": [
      "fighter jet"
    ]
  }
}


In [7]:
# {
#     "img_paths": []
#     "labels": []
#     "bbox": []
#     "text_features": []
# }

objects_set

{'cargo aircraft',
 'commercial aircraft',
 'drone',
 'fighter jet',
 'fighter plane',
 'helicopter',
 'light aircraft',
 'missile'}

In [8]:
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT


In [9]:
label_to_id_mapping = {
    'padding': 0,
    'cargo aircraft': 1,
    'commercial aircraft': 2,
    'drone': 3,
    'fighter jet': 4,
    'fighter plane': 5,
    'helicopter': 6,
    'light aircraft': 7,
    'missile': 8
}

class ImagePreprocessor:
    def __init__(self, dataset, output_dir, batch_size=2, max_caption_length=20):
        self.dataset = dataset
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.clip_preprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.rcnn_preprocessor = weights.transforms()
        self.rcnn_file_path, self.clip_file_path = self.create_dummy_images()
        self.max_caption_length = max_caption_length

    def preprocess_and_save_batches(self):
        images = self.dataset['image']
        annotations = self.dataset['annotations']
        num_batches = (len(images) + self.batch_size - 1) // self.batch_size

        for batch_idx in tqdm(range(num_batches), desc="Processing Batches"):
            batch_images = images[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_annotations = annotations[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_data = list(zip(batch_images, batch_annotations))
            rcnn_img_paths, clip_img_paths, text_data, labels, bboxes = self.process_batch(batch_data)
            rcnn_img_paths, clip_img_paths, text_data, labels, bboxes = self.pad_batch(rcnn_img_paths, clip_img_paths, text_data, labels, bboxes)
            self.save_batch(batch_idx, rcnn_img_paths, clip_img_paths, text_data, labels, bboxes)
        return num_batches

    def process_batch(self, batch_data):
        rcnn_img_paths = []
        clip_img_paths = []
        text_data = []
        labels = []
        bboxes = []
        
        prev_image_path = None
        prev_image = None
        for image_path, annotation in batch_data:
            try:
                if image_path != prev_image_path:
                    image = Image.open(image_path).convert("RGB")
                    
                    preprocessed_rcnn = self.rcnn_preprocessor(image).to('cuda')  # Move to GPU
                    image_array_rcnn = preprocessed_rcnn.permute(1, 2, 0).cpu().numpy()  # Move back to CPU for saving
                    rcnn_output_path = image_path.replace('/novice/images/', '/til-24-base/vlm/src/data/rcnn/').replace('.jpg', '.npy')
                    np.save(rcnn_output_path, image_array_rcnn)
                    
                    # Preprocess image for CLIP
                    preprocessed_clip = self.clip_preprocessor(images=image, return_tensors="pt")
                    preprocessed_clip = {k: v.to('cuda') for k, v in preprocessed_clip.items()}  # Move to GPU
                    image_array_clip = preprocessed_clip['pixel_values'].squeeze().cpu().numpy()  # Move back to CPU for saving
                    clip_output_path = image_path.replace('/novice/images/', '/til-24-base/vlm/src/data/clip/').replace('.jpg', '_clip.npy')
                    np.save(clip_output_path, image_array_clip)  # Save the numpy array for CLIP
                    
                caption = annotation['caption']
                text_input = self.clip_preprocessor(text=[caption], return_tensors="pt", padding=True, truncation=True, max_length=self.max_caption_length)
                text_input = {k: v.to('cuda') for k, v in text_input.items()}  # Move to GPU
                text_input_serializable = {key: value.cpu().tolist() for key, value in text_input.items()}  # Move back to CPU for saving
                
                bbox = annotation['bbox']
                x, y, w, h = bbox
                bbox = self.correct_bbox_format(bbox)
                
                rcnn_img_paths.append(rcnn_output_path)
                clip_img_paths.append(clip_output_path)
                
                true_label = results.get(caption)
                labels.append(label_to_id_mapping[true_label['objects'][0]])
                bboxes.append(bbox)
                text_data.append(text_input_serializable)
                
            except Exception as e:
                print(f"Skipping invalid image: {image_path}. Error: {e}")
                continue

        return rcnn_img_paths, clip_img_paths, text_data, labels, bboxes

    def pad_batch(self, rcnn_img_paths, clip_img_paths, text_data, labels, bboxes):
        while len(rcnn_img_paths) < self.batch_size:
            rcnn_img_paths.append(self.rcnn_file_path)
            clip_img_paths.append(self.clip_file_path)
            
            caption = ""
            text_input = self.clip_preprocessor(text=[caption], return_tensors="pt", padding=True, truncation=True, max_length=self.max_caption_length)
            text_input = {k: v.to('cuda') for k, v in text_input.items()}  # Move to GPU
            text_input_serializable = {key: value.cpu().tolist() for key, value in text_input.items()}  # Move back to CPU for saving
            text_data.append(text_input_serializable)
            
            labels.append(label_to_id_mapping['padding'])
            bboxes.append([0,0,0,0])
            
        return rcnn_img_paths, clip_img_paths, text_data, labels, bboxes
    
    def create_dummy_images(self):
        # Create a dummy image with the target size and filled with zeros
        rcnn_file_path = os.path.join(data_dir, 'rcnn', 'dummy.npy') # rcnn
        if not os.path.exists(rcnn_file_path):
            image = Image.new('RGB', (1520, 870), (0, 0, 0))
            preprocessed_rcnn = self.rcnn_preprocessor(image).to('cuda')  # Move to GPU
            image_array_rcnn = preprocessed_rcnn.permute(1, 2, 0).cpu().numpy()  # Move back to CPU for saving
            np.save(rcnn_file_path, image_array_rcnn)
            print(f"RCNN Image saved to {rcnn_file_path}")
        
        clip_file_path = os.path.join(data_dir, 'clip', 'dummy.npy') # clip
        if not os.path.exists(clip_file_path):
            image = Image.new('RGB', (1520, 870), (0, 0, 0))
            preprocessed_clip = self.clip_preprocessor(images=image, return_tensors="pt")
            preprocessed_clip = {k: v.to('cuda') for k, v in preprocessed_clip.items()}  # Move to GPU
            image_array_clip = preprocessed_clip['pixel_values'].squeeze().cpu().numpy()  # Move back to CPU for saving
            np.save(clip_file_path, image_array_clip)  # Save the numpy array for CLIP
            print(f"CLIP Image saved to {clip_file_path}")
        
        return rcnn_file_path, clip_file_path

    def save_batch(self, batch_idx, rcnn_img_paths, clip_img_paths, text_data, labels, bboxes):
        batch_output_dir = os.path.join(self.output_dir, f"batch_{batch_idx}")
        os.makedirs(batch_output_dir, exist_ok=True)
        
        # Save image paths as JSON
        with open(os.path.join(batch_output_dir, "rcnn_img_paths.json"), 'w') as f:
            json.dump(rcnn_img_paths, f)
            
        with open(os.path.join(batch_output_dir, "clip_img_paths.json"), 'w') as f:
            json.dump(clip_img_paths, f)

        # Save inputs as JSON
        with open(os.path.join(batch_output_dir, "text_data.json"), 'w') as f:
            json.dump(text_data, f)

        # Save bounding boxes as numpy arrays
        np.save(os.path.join(batch_output_dir, "bboxes.npy"), np.array(bboxes))

        # Save labels as numpy arrays
        np.save(os.path.join(batch_output_dir, "labels.npy"), np.array(labels))
        
    @staticmethod
    def correct_bbox_format(bbox):
        x, y, width, height = bbox
        xmin = x
        ymin = y
        xmax = x + width
        ymax = y + height
        return [xmin, ymin, xmax, ymax]
        


In [10]:
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



In [11]:
train_processor = ImagePreprocessor(train_data, output_dir=os.path.join(train_dir))
val_processor = ImagePreprocessor(val_data, output_dir=os.path.join(val_dir))
test_processor = ImagePreprocessor(test_data,output_dir=os.path.join(test_dir))

In [12]:
train_processor.preprocess_and_save_batches()
val_processor.preprocess_and_save_batches()
test_processor.preprocess_and_save_batches()

Processing Batches: 100%|██████████| 5982/5982 [1:14:06<00:00,  1.35it/s]
Processing Batches: 100%|██████████| 748/748 [09:58<00:00,  1.25it/s]
Processing Batches: 100%|██████████| 748/748 [09:41<00:00,  1.29it/s]


748

In [13]:
# following code is probably wrong
# train_num_batches = (len(train_processor.dataset['image']) + train_processor.batch_size - 1)
# val_num_batches = (len(val_processor.dataset['image']) + val_processor.batch_size - 1)
# test_num_batches = (len(test_processor.dataset['image']) + test_processor.batch_size - 1)

# print(train_num_batches)
# print(val_num_batches)
# print(test_num_batches)

In [14]:
stop

NameError: name 'stop' is not defined

In [None]:
#     @staticmethod
#     def adjust_bbox_for_resize_and_pad(bbox, scale, padding): # RESIZE
#         """
#         Adjust the bounding box coordinates based on the image scaling and padding.

#         Args:
#         bbox (list): The bounding box in format [x, y, width, height].
#         scale (tuple): The scaling factors (scale_x, scale_y).
#         padding (tuple): Padding added (pad_left, pad_top).

#         Returns:
#         list: The adjusted bounding box.
#         """
        
#         x, y, width, height = bbox
#         x1 = x
#         y1 = y
#         x2 = x + width
#         y2 = y + height
        
#         scale_x, scale_y = scale
#         pad_x, pad_y = padding

#         # Scale and adjust for padding
#         new_x1 = x1 * scale_x + pad_x
#         new_y1 = y1 * scale_y + pad_y
#         new_x2 = x2 * scale_x + pad_x
#         new_y2 = y2 * scale_y + pad_y

#         return [new_x1, new_y1, new_x2, new_y2]

In [None]:
def resize_image_and_bbox(image, bbox, target_size=224):
    """
    Resize an image and adjust the bounding boxes. Maintain aspect ratio and pad if necessary.

    Args:
    image (PIL.Image): The original image.
    bbox (list): Bounding box with format [x_min, y_min, x_max, y_max].
    target_size (int): The target size to which the longer side of the image will be resized.

    Returns:
    PIL.Image: Resized and padded image.
    list: Adjusted bounding box coordinates.
    """
    original_width, original_height = image.size
    ratio = min(target_size / original_width, target_size / original_height)
    new_width = int(original_width * ratio)
    new_height = int(original_height * ratio)

    # Resize the image
    resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)

    # Calculate padding to make the image square
    pad_width = (target_size - new_width) // 2
    pad_height = (target_size - new_height) // 2

    # Pad the image
    padded_image = Image.new('RGB', (target_size, target_size), (128, 128, 128))
    padded_image.paste(resized_image, (pad_width, pad_height))

    # Adjust bounding box coordinates
    x_min, y_min, x_max, y_max = bbox
    x_min = x_min * ratio + pad_width
    y_min = y_min * ratio + pad_height
    x_max = x_max * ratio + pad_width
    y_max = y_max * ratio + pad_height
    adjusted_bbox = [x_min, y_min, x_max, y_max]

    return padded_image, adjusted_bbox

In [None]:
def resize_and_pad_image(image, target_size=224):
        """
        Resize the image to maintain aspect ratio and pad to make it square (target_size x target_size).

        Args:
        image (PIL.Image): The original image.
        target_size (int): The desired size of the square image (both width and height).

        Returns:
        PIL.Image: The resized and padded image.
        tuple: The scaling factors (scale_x, scale_y).
        tuple: Padding added (pad_left, pad_top).
        """
        # Calculate scaling factors to maintain aspect ratio
        original_width, original_height = image.size
        ratio = min(target_size / original_width, target_size / original_height)
        new_width = int(original_width * ratio)
        new_height = int(original_height * ratio)

        # Resize the image
        resized_image = image.resize((new_width, new_height), Image.ANTIALIAS)

        # Calculate padding to make the image square
        pad_width = (target_size - new_width) // 2
        pad_height = (target_size - new_height) // 2

        # Pad the resized image
        padded_image = Image.new('RGB', (target_size, target_size), (0, 0, 0))
        padded_image.paste(resized_image, (pad_width, pad_height))

        return padded_image, (ratio, ratio), (pad_width, pad_height)