# EEE 197z Project 1 - Zero Shot Object Detection
Use SAM (Segment Anything Model) to perform zero-shot object detection on the COCO 2017 validation split.

*Author: Sean Red Mendoza | 2020-01751 | scmendoza5@up.edu.ph*

## Tools/ References
- [SegmentAnything](https://github.com/facebookresearch/segment-anything)
- [OpenClip](https://github.com/mlfoundations/open_clip)
- [Coco 2017 Validation Dataset](https://cocodataset.org/#home)
- [roatienza/mlops](https://github.com/roatienza/mlops)
- [roatienza/Deep-Learning-Experiments](https://github.com/roatienza/Deep-Learning-Experiments)
- [Google Cloud G2 GPU VM (Nvidia L4)](https://cloud.google.com/blog/products/compute/introducing-g2-vms-with-nvidia-l4-gpus)

## Goals

## Approach

## Notes


In [None]:
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import numpy as np
import torch
import torchvision
import sys
from skimage import io
import matplotlib.pyplot as plt
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator
import os
import open_clip
from PIL import Image
from pprint import pprint
import json
import shutil
import re

In [None]:

# Configure PyTorch's CUDA caching allocator through its environment variable
# (maximum split size of 512 MB).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Report the library versions in use and whether CUDA can be used.
print("[setup]: determining CUDA support...")
for label, value in (
    ("PyTorch version:", torch.__version__),
    ("Torchvision version:", torchvision.__version__),
    ("CUDA is available:", torch.cuda.is_available()),
):
    print(label, value)

In [None]:
# clean output directory
# Remove results from a previous run (if any), then start from an empty
# directory. Deleting only when the directory exists avoids creating it just
# to delete it again.
if os.path.exists("../output"):
    shutil.rmtree("../output/")
os.mkdir("../output/")

## Images
This program supports manual entry of input image links or random sampling from the COCO 2017 Validation Dataset.
*Please select your desired input method below:*

- [1] Input Images
- [2] CoCo Images

After selecting the input image source, you will be asked to input the following:
- [a] List of image URL strings, or
- [b] the number of random images to sample from COCO 2017\*

*\*A hard limit of 10 images is set to prevent overloading the system.*

In [None]:
def get_valid_input():
    """Prompt repeatedly until the user types ``1`` or ``2``.

    Returns:
        int: The accepted choice, either 1 or 2.
    """
    valid_choices = ('1', '2')
    choice = input("Enter 1 or 2: ")
    # Keep asking until the answer is exactly one of the accepted strings.
    while choice not in valid_choices:
        print("Invalid input. Please enter 1 or 2.")
        choice = input("Enter 1 or 2: ")
    return int(choice)
            
def get_valid_number(n):
    """Prompt repeatedly until the user enters a whole number from 1 to ``n``.

    Args:
        n: Largest accepted value (inclusive).

    Returns:
        int: The entered number, which satisfies ``1 <= number <= n``.
    """
    while True:
        user_input = input("Enter an integer from 1 to {}: ".format(n))
        # str.isdigit() also accepts characters such as superscript digits
        # ("²") that int() cannot parse, which would abort the prompt with a
        # ValueError. str.isdecimal() only accepts decimal digit characters,
        # all of which int() can convert.
        if user_input.isdecimal():
            number = int(user_input)
            if 1 <= number <= n:
                return number
        print("Invalid input. Please enter an integer from 1 to {}.".format(n))

# Ask which image source to use: 1 = manually entered URLs, 2 = random COCO images.
input_type = get_valid_input()

# Always define the sample size so that code reading it works in either mode;
# it stays 0 unless random sampling is selected below.
input_image_count = 0

if input_type == 1:
    print("You have selected MANUAL_INPUT, please enter the image URLs in the following block")
elif input_type == 2:
    # The upper bound of 10 is the hard limit on the number of random images.
    input_image_count = get_valid_number(10)
    print(f"You have selected RANDOM_INPUT of {input_image_count} images from the CoCo 2017 dataset")

In [None]:

print("[0]: loading coco annotations and captions...")
dataDir = '../coco'
dataType = 'val2017'
# initialize COCO api for instance annotations + COCO ground truth api
annFile = '{}/annotations/instances_{}.json'.format(dataDir, dataType)
cocoGt = COCO(annFile)
# initialize COCO api for caption annotations
annFile = '{}/annotations/captions_{}.json'.format(dataDir, dataType)
coco_caps = COCO(annFile)
catIDs = cocoGt.getCatIds()

# get all captions (no image or category filter is applied)
annIds = coco_caps.getAnnIds(imgIds=[], catIds=[])
anns = coco_caps.loadAnns(annIds)

# Build the label vocabulary from the caption text: every word is lower-cased
# and stripped of characters other than letters and digits.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')  # compiled once, reused for every word
coco_labels_words = []
for ann in anns:
    for word in ann['caption'].split():
        clean_word = non_alnum.sub('', word.lower())
        # Tokens made only of punctuation (e.g. "-" or "&") clean down to an
        # empty string, which is not a usable label, so they are skipped.
        if clean_word:
            coco_labels_words.append(clean_word)

# De-duplicate and sort: iterating a set of strings yields an order that can
# change between interpreter runs, so sorting keeps label indices reproducible.
coco_labels_words_values = sorted(set(coco_labels_words))

In [None]:
# Parallel per-image records: position i in each list describes the same image.
(
    input_image_IDs,     # COCO image id
    input_image_catIDs,  # category id the image was picked through
    input_image_URLs,    # the image's 'coco_url' value
    input_image_areas,   # image height * width
    input_image_labels,  # list of caption strings for the image
) = ([] for _ in range(5))

## RANDOM INPUT
def get_random_coco_image(n):
    """Pick ``n`` random COCO images and record their metadata.

    Each pick draws a random category id from ``catIDs`` and then a random
    image among those returned for that category. The results are appended to
    the module-level ``input_image_*`` lists; nothing is returned. Picks are
    independent of each other, so the same image can be drawn more than once.

    Args:
        n: Number of images to draw.
    """
    for _ in range(n):
        # draw a category first, then one of the images listed for it
        category_id = catIDs[np.random.randint(0, len(catIDs))]
        candidate_ids = cocoGt.getImgIds(catIds=[category_id])
        image_id = candidate_ids[np.random.randint(0, len(candidate_ids))]

        # image metadata taken from the instance annotations
        image_info = cocoGt.loadImgs(image_id)[0]
        area = image_info['height'] * image_info['width']
        url = image_info['coco_url']

        # the image's captions are stored as its ground-truth labels
        caption_ids = coco_caps.getAnnIds(imgIds=image_id)
        caption_anns = coco_caps.loadAnns(caption_ids)
        captions = [entry['caption'] for entry in caption_anns]

        # record everything in the parallel accumulator lists
        input_image_IDs.append(image_id)
        input_image_catIDs.append(category_id)
        input_image_URLs.append(url)
        input_image_areas.append(area)
        input_image_labels.append(captions)

# Only sample when RANDOM_INPUT (2) was selected: the sample size is only
# requested from the user in that mode.
# get_random_coco_image() fills the input_image_* lists and returns None, so
# random_images is None in either mode.
random_images = None
if input_type == 2:
    random_images = get_random_coco_image(input_image_count)

## MODIFY FOR MANUAL INPUT
if input_type == 1:
    # Put the image URL strings for manual mode in this list.
    input_images_links = []

In [None]:

print("[1]: loading sam model")

# registry key of the SAM variant and the checkpoint file that goes with it
model_type = "vit_h"
sam_checkpoint = os.path.join("../checkpoints", "sam_vit_h_4b8939.pth")

# run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# build the model from the checkpoint and move it to the chosen device
build_sam = sam_model_registry[model_type]
sam = build_sam(checkpoint=sam_checkpoint)
sam.to(device=device)

# settings passed to the automatic mask generator
mask_generator_settings = dict(
    points_per_side=32,
    points_per_batch=64,
    pred_iou_thresh=0.8,
    box_nms_thresh=0.3,
    stability_score_thresh=0.9,
    min_mask_region_area=1000,  # Requires open-cv to run post-processing
    crop_n_layers=1,
    crop_n_points_downscale_factor=2,
)
mask_generator = SamAutomaticMaskGenerator(model=sam, **mask_generator_settings)

In [None]:

print("[2]: creating open clip model...")
# OpenCLIP architecture name and the pretrained weight tag to load for it
modelType = 'ViT-B-32-quickgelu'
modelDataset = "laion400m_e31"
model, _, preprocess = open_clip.create_model_and_transforms(
    modelType, pretrained=modelDataset)
tokenizer = open_clip.get_tokenizer(modelType)

# run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# The open_clip README notes that models are returned in train mode by
# default; this notebook uses the model for zero-shot labelling, so switch it
# to eval mode explicitly.
model.eval()

print("[2]: loading coco categories as labels...")
# NOTE: the label list holds the unique words collected from the COCO
# captions, not the COCO category names.
text = tokenizer(coco_labels_words_values)
text = text.to(device)
