In [1]:
from PIL import Image
import IPython.display as display
import torch
import requests
from transformers import CLIPProcessor, CLIPModel
import matplotlib.pyplot as plt
import torch
import json
from sklearn.model_selection import train_test_split
import random
from torch.utils.data import IterableDataset, DataLoader
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights

from tqdm import tqdm
import torchvision
from torchvision.transforms import functional as F
from torchvision import transforms
from torchinfo import summary
import urllib
import os
import json
import torchvision.transforms as T
import albumentations as A
from albumentations.pytorch import ToTensorV2
import numpy as np
import jsonlines
import cv2
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

import clip
from transformers import DetrForObjectDetection, DetrImageProcessor

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /var/tmp/tmpecm2hxfl
INFO:torch.distributed.nn.jit.instantiator:Writing /var/tmp/tmpecm2hxfl/_remote_module_non_scriptable.py


In [2]:
cur_dir = os.getcwd()
vlm_dir = os.path.dirname(cur_dir)
til_dir = os.path.dirname(vlm_dir)
home_dir = os.path.dirname(til_dir)
test_dir = os.path.join(home_dir, 'novice')
img_dir = os.path.join(test_dir, 'images')
data_dir = os.path.join(cur_dir, 'data')

##training data to be added to tune the models
metadata_path = os.path.join(test_dir, 'vlm.jsonl')

# paths for converting datasets to manifest files
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")
val_dir = os.path.join(data_dir, "val")

for dir in [train_dir, test_dir, val_dir]:
    os.makedirs(dir, exist_ok=True)

In [3]:
def split_data(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    
    random.seed(seed)

    total_examples = len(data['image'])
    indices = list(range(total_examples))
    random.shuffle(indices)
    
    train_end = int(train_ratio * total_examples)
    val_end = train_end + int(val_ratio * total_examples)
    
    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]
    
    train_data = {'image': [data['image'][i] for i in train_indices], 'annotations': [data['annotations'][i] for i in train_indices]}
    val_data = {'image': [data['image'][i] for i in val_indices], 'annotations': [data['annotations'][i] for i in val_indices]}
    test_data = {'image': [data['image'][i] for i in test_indices], 'annotations': [data['annotations'][i] for i in test_indices]}
    
    return train_data, val_data, test_data

MAX_FILE_COUNT = None # Set if only want max files

data = {'image': [], 'annotations': []}
data_path = os.path.join(test_dir, "vlm.jsonl")
with jsonlines.open(metadata_path) as reader:
    for obj in reader:
        if MAX_FILE_COUNT and len(data['image']) >= MAX_FILE_COUNT:
            break
        for annotation in obj['annotations']:
            data['image'].append(os.path.join(img_dir, obj['image']))
            data['annotations'].append(annotation)
            
captions_set = set()

# Loop through the annotations and extract the text_features
for annotation in data['annotations']:
    captions_set.add(annotation['caption'])

# Print the resulting set of text_features
print(captions_set)

{'white and blue fighter plane', 'blue, yellow, and black helicopter', 'black fighter plane', 'green and brown camouflage helicopter', 'blue and red commercial aircraft', 'white and blue light aircraft', 'blue missile', 'grey and black helicopter', 'blue helicopter', 'white and black fighter plane', 'red fighter plane', 'white and blue fighter jet', 'black and yellow drone', 'blue and red light aircraft', 'red light aircraft', 'white fighter jet', 'grey and white fighter plane', 'green and yellow fighter plane', 'blue commercial aircraft', 'blue and white commercial aircraft', 'grey commercial aircraft', 'green fighter plane', 'white and red commercial aircraft', 'white commercial aircraft', 'black fighter jet', 'black cargo aircraft', 'yellow helicopter', 'grey drone', 'grey and black fighter plane', 'black and brown camouflage helicopter', 'white and blue commercial aircraft', 'white and red light aircraft', 'black and white cargo aircraft', 'green helicopter', 'white and orange ligh

In [4]:
train_data, val_data, test_data = split_data(data)

In [5]:
def write_to_jsonl(data, file_name):
    with jsonlines.open(file_name, mode='w') as writer:
        for img, ann in zip(data['image'], data['annotations']):
            writer.write({'image': img, 'annotations': ann})

# Write each dataset to a separate JSONL file
write_to_jsonl(train_data, os.path.join(train_dir, "train.jsonl"))
write_to_jsonl(val_data, os.path.join(val_dir, "val.jsonl"))
write_to_jsonl(test_data, os.path.join(test_dir, "test.jsonl"))

In [6]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the custom matcher for multi-word expressions
matcher = Matcher(nlp.vocab)

# Add patterns for multi-word expressions (objects)
multi_word_patterns = [
    {"label": "OBJECT", "pattern": [{"LOWER": "fighter"}, {"LOWER": "jet"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "light"}, {"LOWER": "aircraft"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "commercial"}, {"LOWER": "aircraft"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "fighter"}, {"LOWER": "plane"}]},
    {"label": "OBJECT", "pattern": [{"LOWER": "cargo"}, {"LOWER": "aircraft"}]}
]

for pattern in multi_word_patterns:
    matcher.add(pattern["label"], [pattern["pattern"]])

# Add patterns for single-word objects
single_word_objects = ["missile", "helicopter", "drone"]
for obj in single_word_objects:
    matcher.add("OBJECT", [[{"LOWER": obj}]])

# Define custom colors
valid_colors = {"white", "blue", "green", "black", "red", "yellow", "grey", "orange", "silver", "camouflage"}

excluded_adjectives = {"light", "commercial"}

# Function to identify custom colors
def is_valid_color(token):
    return token.text.lower() in valid_colors

def filter_colors(token):
    return (token.pos_ == "ADJ" or is_valid_color(token)) and token.text.lower() not in excluded_adjectives

# Function to filter out overlapping spans
def filter_overlapping_spans(spans):
    sorted_spans = sorted(spans, key=lambda span: (span.start, span.end))
    filtered_spans = []
    last_end = -1
    for span in sorted_spans:
        if span.start >= last_end:
            filtered_spans.append(span)
            last_end = span.end
    return filtered_spans

# Process each string
results = {}
colors_set = set()
objects_set = set()

for string in captions_set:
    doc = nlp(string)
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=label) for match_id, start, end in matches for label in [nlp.vocab.strings[match_id]]]
    
    # Filter out overlapping spans
    spans = filter_overlapping_spans(spans)
    
    doc.ents = spans  # Set the identified multi-word expressions as named entities
    
    colors = set([token.text for token in doc if filter_colors(token)])
    objects = set([ent.text for ent in doc.ents if ent.label_ == "OBJECT"])
    
    colors_set.update(colors)
    objects_set.update(objects)
    
    results[string] = {"colors": list(colors), "objects": list(objects)}

first_two_pairs = {k: results[k] for k in list(results.keys())[:2]}
print(json.dumps(first_two_pairs, indent=2))

{
  "white and blue fighter plane": {
    "colors": [
      "white",
      "blue"
    ],
    "objects": [
      "fighter plane"
    ]
  },
  "blue, yellow, and black helicopter": {
    "colors": [
      "black",
      "blue",
      "yellow"
    ],
    "objects": [
      "helicopter"
    ]
  }
}


In [7]:
# {
#     "img_paths": []
#     "labels": []
#     "bbox": []
#     "text_features": []
# }

objects_set

{'cargo aircraft',
 'commercial aircraft',
 'drone',
 'fighter jet',
 'fighter plane',
 'helicopter',
 'light aircraft',
 'missile'}

In [8]:
weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
preprocess = weights.transforms()

In [9]:
label_to_id_mapping = {
    'padding': 0,
    'cargo aircraft': 1,
    'commercial aircraft': 2,
    'drone': 3,
    'fighter jet': 4,
    'fighter plane': 5,
    'helicopter': 6,
    'light aircraft': 7,
    'missile': 8
}

class ImagePreprocessor:
    def __init__(self, dataset, model, processor, output_dir, batch_size=7, device="cuda", max_caption_length=77):
        self.dataset = dataset
        self.output_dir = output_dir
        self.batch_size = batch_size
        self.device = device
        self.transform = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
        ])
        self.clip_model = model.to(device)
        self.clip_processor = processor
        self.max_caption_length = max_caption_length
        self.padding_image = self.create_dummy_image()
        self.dummy_text = self.create_dummy_text()

    def preprocess_and_save_batches(self):
        images = self.dataset['image']
        annotations = self.dataset['annotations']
        num_batches = (len(images) + self.batch_size - 1) // self.batch_size

        for batch_idx in tqdm(range(num_batches), desc="Processing Batches"):
            batch_images = images[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_annotations = annotations[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]
            batch_data = list(zip(batch_images, batch_annotations))
            img_paths, cropped_images, text_features, labels, bboxes = self.process_batch(batch_data)
            self.pad_batch(img_paths, cropped_images, text_features, labels, bboxes)
            self.save_batch(batch_idx, img_paths, cropped_images, text_features, labels, bboxes)
        return num_batches

    def process_batch(self, batch_data):
        img_paths = []
        cropped_images = []
        text_features = []
        labels = []
        bboxes = []
        
        prev_image_path = None
        prev_image = None
        for image_path, annotation in batch_data:
            try:
                if image_path != prev_image_path:
                    prev_image = Image.open(image_path).convert("RGB")
                    prev_image_path = image_path
                    
                image = prev_image
                bbox = annotation['bbox']
                x, y, w, h = bbox
                # Crop the image using the bounding box coordinates
                cropped_image = image.crop((x, y, x + w, y + h))
                bbox = self.correct_bbox_format(bbox)
                # Apply the transformation
                transformed_image = self.transform(cropped_image).to(self.device)
            except Exception as e:
                print(f"Skipping invalid image: {image_path}. Error: {e}")
                continue
            
            caption = annotation['caption']
            text_input = self.clip_processor(caption, return_tensors="pt", padding=True, truncation=True, max_length=self.max_caption_length).to(self.device)
            text_feature = self.clip_model.get_text_features(**text_input).to(self.device)
            
            json_path = image_path.replace('/novice/images/', '/til-24-base/vlm/src/data/imgs/').replace('.jpg', '.npy')
            img_paths.append(json_path)
            cropped_images.append(transformed_image)
            text_features.append(text_feature)
            true_label = results.get(caption)
            # if not true_label:
            #     print(f"Caption not in results for {caption}")
            # if not true_label['objects'][0]:
            #     print(f"No object for {caption}")
            labels.append(label_to_id_mapping[true_label['objects'][0]])
            bboxes.append(bbox)

        return img_paths, cropped_images, text_features, labels, bboxes

    def pad_batch(self, img_paths, cropped_images, text_features, labels, bboxes):
        while len(cropped_images) < self.batch_size:
            cropped_images.append(self.padding_image)
            text_features.append(self.dummy_text)
            labels.append(label_to_id_mapping['padding'])
            bboxes.append([0,0,0,0])
            file_path = os.path.join(data_dir, 'imgs', 'dummy.npy')
            if not os.path.exists(file_path):
                image_array = np.array(self.padding_image)
                np.save(file_path, image_array)
                print(f"Image saved to {file_path}")
            img_paths.append(file_path)
            
        return img_paths, cropped_images, text_features, labels, bboxes
    
    def create_dummy_image(self):
        # Create a dummy image with the target size and filled with zeros
        dummy_image = Image.new('RGB', (224, 224), (0, 0, 0))
        transformed_dummy = self.transform(dummy_image).to(self.device)
        if transformed_dummy.dim() == 3:
            transformed_dummy = transformed_dummy.permute(1, 2, 0)
            array = transformed_dummy.cpu().numpy().astype(np.uint8)
            pil_image = Image.fromarray(array)
            preprocessed_image = preprocess(pil_image)  # Implement this method based on your needs  
            return preprocessed_image

    def create_dummy_text(self):
        # Create a padding caption tokenized and padded to the max length
        padding_text = self.clip_processor.tokenizer.pad_token
        text_input = self.clip_processor(
            padding_text, 
            padding="max_length",    # Ensures all tokens are padded to the same length
            truncation=True,         # Ensures inputs do not exceed the model's maximum length
            max_length=77,           # Optional: specify max length if different from the default
            return_tensors="pt"      # Return PyTorch tensors
        ).to(self.device)
        text_feature = self.clip_model.get_text_features(**text_input)
        return text_feature

    def save_batch(self, batch_idx, img_paths, cropped_images, text_features, labels, bboxes):
        batch_output_dir = os.path.join(self.output_dir, f"batch_{batch_idx}")
        os.makedirs(batch_output_dir, exist_ok=True)

        # Save cropped images as numpy arrays
        image_arrays = [img.cpu().numpy() for img in cropped_images]
        np.save(os.path.join(batch_output_dir, "cropped_images.npy"), np.array(image_arrays))

        # Save text_features as numpy arrays
        text_feature_arrays = [text.detach().cpu().numpy() for text in text_features]
        np.save(os.path.join(batch_output_dir, "text_features.npy"), np.array(text_feature_arrays))

        # Save bounding boxes as numpy arrays
        np.save(os.path.join(batch_output_dir, "bboxes.npy"), np.array(bboxes))

        # Save labels as numpy arrays
        np.save(os.path.join(batch_output_dir, "labels.npy"), np.array(labels))

        # Save image paths as JSON
        with open(os.path.join(batch_output_dir, "img_paths.json"), 'w') as f:
            json.dump(img_paths, f)

        # print(f"Batch {batch_idx} saved successfully.")
        
    @staticmethod
    def correct_bbox_format(bbox):
        x, y, width, height = bbox
        xmin = x
        ymin = y
        xmax = x + width
        ymax = y + height
        return [xmin, ymin, xmax, ymax]
        

In [10]:
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



In [11]:
train_processor = ImagePreprocessor(train_data, model=clip_model, processor=clip_processor, output_dir=os.path.join(train_dir))
val_processor = ImagePreprocessor(val_data, model=clip_model, processor=clip_processor, output_dir=os.path.join(val_dir))
test_processor = ImagePreprocessor(test_data, model=clip_model, processor=clip_processor, output_dir=os.path.join(test_dir))

In [12]:
train_processor.preprocess_and_save_batches()
val_processor.preprocess_and_save_batches()
test_processor.preprocess_and_save_batches()

Processing Batches: 100%|██████████| 1709/1709 [13:57<00:00,  2.04it/s]
Processing Batches: 100%|██████████| 214/214 [01:27<00:00,  2.45it/s]
Processing Batches: 100%|██████████| 214/214 [01:24<00:00,  2.55it/s]


214

In [13]:
# following code is probably wrong
# train_num_batches = (len(train_processor.dataset['image']) + train_processor.batch_size - 1)
# val_num_batches = (len(val_processor.dataset['image']) + val_processor.batch_size - 1)
# test_num_batches = (len(test_processor.dataset['image']) + test_processor.batch_size - 1)

# print(train_num_batches)
# print(val_num_batches)
# print(test_num_batches)

In [14]:
stop

NameError: name 'stop' is not defined

In [None]:
from PIL import Image
import os
import torch
from torch.utils.data import IterableDataset, DataLoader
from torchvision import transforms

class CustomImageCaptionDataset(IterableDataset):
    def __init__(self, image_paths, text_features):
        self.image_paths = image_paths
        self.text_features = text_features
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
        ])

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
    
        return image, caption
    
class ASRDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer, train_data, val_data, test_data, augmentations=None, collate_fn=None, num_workers=0, transform=None):
        super().__init__()
        self.tokenizer = tokenizer # can just use the global one?
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.augmentations = augmentations
        self.collate_fn = collate_fn
        self.num_workers = num_workers
        self.transform = transform

    def setup(self, stage=None):
        self.train_dataset = ASRIterableDataset(self.train_data, self.tokenizer, self.augmentations, shuffle=True, transform=self.transform)
        self.val_dataset = ASRIterableDataset(self.val_data, self.tokenizer, self.augmentations, transform=self.transform)
        self.test_dataset = ASRIterableDataset(self.test_data, self.tokenizer, self.augmentations, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, collate_fn=self.collate_fn, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, collate_fn=self.collate_fn, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, collate_fn=self.collate_fn, num_workers=self.num_workers)