# Setup

The cells in this section handle setup: installing packages and downloading/preprocessing the dataset from Kaggle.

### Verify CUDA is available

In [None]:
# Ask the NVIDIA driver to list the GPUs attached to this runtime; a populated
# table (driver version, CUDA version, GPU name, memory) means a GPU is visible.
!nvidia-smi

Tue Dec 10 00:50:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.120                Driver Version: 550.120        CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5500               Off |   00000000:9C:00.0 Off |                  Off |
| 30%   34C    P8             14W /  230W |      11MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Prepare GroundingDINO

Install the GroundingDINO package and its dependencies using pip, then download the pre-trained weights from the official GitHub releases.

In [None]:
# @title Configure HOME path

import os

# Root directory of the project; every other path in this notebook (weights,
# config, data, results) is derived from it.
# Defaults to the Colab location. Set the PROJECT_HOME environment variable to
# run the notebook from a different location without editing this cell.
HOME = os.environ.get("PROJECT_HOME") or os.path.join("/content", "5561 Final Project")
print(HOME)

/content/5561 Final Project


In [None]:
# @title Install GroundingDINO's official PyTorch implementation from PyPi
!pip install groundingdino-py



In [None]:
# @title Download pre-trained weights (no training code is available for GroundingDINO)
import os

weights_path = os.path.join(HOME, "weights")

if not os.path.isdir(weights_path):
  !mkdir "{weights_path}"

%cd "{weights_path}"
!wget -q --show-progress https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

/content/5561 Final Project/weights


# Full Dataset

## Prepare Data

In [None]:
# @title Activate autoreload
# Load IPython's autoreload extension and select mode 2 so that edits to the
# project's .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# @title Add project directory to PATH so we can import from it
import os
import sys

# Look HOME up through globals() instead of referencing it directly: when the
# Setup section has not been run the name does not exist, and a direct
# reference would raise NameError before the hint below could be printed.
project_home = globals().get("HOME")

if project_home:
  # Guard against duplicate entries when this cell is re-run.
  if project_home not in sys.path:
    sys.path.append(project_home)
else:
  print("Please run the Setup section of cells before proceeding")

In [None]:
# @title Import symbols from dataloader.py
from dataloader import TACODownloader, TACODataset



### Download Images

In [None]:
# @title Verify Downloader imported successfully
# Smoke test: calls the downloader's test_import hook, which prints a
# confirmation message when dataloader.py was imported correctly.
TACODownloader.test_import()

DataLoader imported successfully!


In [None]:
# @title Initialize Downloader
DATA_PATH = os.path.join(HOME, "data")

# Location of the TACO repository checkout and of the downloaded images.
TACO_PATH = os.path.join(DATA_PATH, "taco")
TACO_DATA_PATH = os.path.join(DATA_PATH, "taco-full")

# Creates DATA_PATH and TACO_DATA_PATH in one call. exist_ok=True keeps the cell
# idempotent: a plain mkdir reports "File exists" every time the cell is re-run.
os.makedirs(TACO_DATA_PATH, exist_ok=True)

tl = TACODownloader(
    repo_path=TACO_PATH,
    download_dir=TACO_DATA_PATH,
    use_full_resolution=True
)

# check_params() is the downloader's own validation of the two paths above.
if tl.check_params():
  print("Path checks successful! :)")
  print(TACO_PATH)
  print(TACO_DATA_PATH)
else:
  print("Path checks failed! :(")

mkdir: cannot create directory ‘/content/5561 Final Project/data/taco-full’: File exists
Path checks successful! :)
/content/5561 Final Project/data/taco
/content/5561 Final Project/data/taco-full


In [None]:
# @title Download images
# Any existing entry in the download directory is treated as "already
# downloaded"; the images are only fetched into an empty directory.
if os.listdir(TACO_DATA_PATH):
  print("Images already downloaded")
else:
  tl.download_images()

Images already downloaded


### Load dataset

In [None]:
# Annotation file shipped inside the TACO repository checkout.
annotations_json = os.path.join(TACO_PATH, "data", "annotations.json")

# Build the dataset from the annotations and the downloaded images.
dataset = TACODataset(json_path=annotations_json, imgs_path=TACO_DATA_PATH)

Processing Categories:   0%|          | 0/60 [00:00<?, ?it/s]

Processing Annotations:   0%|          | 0/4784 [00:00<?, ?it/s]

## Prepare Predictor

In [None]:
# @title Ensure that the weights and model config are present
import os

WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)
print(WEIGHTS_PATH, "; exist:", os.path.isfile(WEIGHTS_PATH))

CONFIG_PATH = os.path.join(HOME, "config", "GroundingDINO_SwinT_OGC.py")
if os.path.isfile(CONFIG_PATH):
  print(CONFIG_PATH, "; exist:", os.path.isfile(CONFIG_PATH))
else:
  print(CONFIG_PATH, "; does not exist. Downloading from GitHub")
  !wget -q --show-progress https://github.com/IDEA-Research/GroundingDINO/blob/856dde20aee659246248e20734ef9ba5214f5e44/groundingdino/config/GroundingDINO_SwinT_OGC.py?raw=true -O "{CONFIG_PATH}"

print("--------------------")

print("✅ Good to go" if os.path.isfile(WEIGHTS_PATH) and os.path.isfile(CONFIG_PATH) else "❌ Missing files")

/content/5561 Final Project/weights/groundingdino_swint_ogc.pth ; exist: True
/content/5561 Final Project/config/GroundingDINO_SwinT_OGC.py ; exist: True
--------------------
✅ Good to go


In [None]:
# @title Import GroundingDINOPredictor
from predictor import Predictor
from groundingdino.util.inference import load_image

## Perform Inference

In [None]:
# @title Create a DataLoader for the TACODataset
from torch.utils.data import DataLoader

# One image per batch, visited in dataset order (no shuffling).
dl = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
)

In [None]:
# @title Prepare prompts
# Two hand-written prompts, from least to most specific.
very_generic_prompt = "objects"
generic_prompt = "pieces of garbage. pieces of trash. recyclable objects."

# Two prompts generated from the dataset's label names, joined with "." as
# the separator between names.
supercategory_names = dataset.supercategories
supercategories_prompt = ".".join(supercategory_names)
print(f"{len(supercategory_names)} supercategories: {supercategories_prompt}")

category_names = dataset.categories
categories_prompt = ".".join(category_names)
print(f"{len(category_names)} categories: {categories_prompt}")

28 supercategories: Aluminium foil.Battery.Blister pack.Bottle.Bottle cap.Broken glass.Can.Carton.Cup.Food waste.Glass jar.Lid.Other plastic.Paper.Paper bag.Plastic bag & wrapper.Plastic container.Plastic glooves.Plastic utensils.Pop tab.Rope & strings.Scrap metal.Shoe.Squeezable tube.Straw.Styrofoam piece.Unlabeled litter.Cigarette
60 categories: Aluminium foil.Battery.Aluminium blister pack.Carded blister pack.Other plastic bottle.Clear plastic bottle.Glass bottle.Plastic bottle cap.Metal bottle cap.Broken glass.Food Can.Aerosol.Drink can.Toilet tube.Other carton.Egg carton.Drink carton.Corrugated carton.Meal carton.Pizza box.Paper cup.Disposable plastic cup.Foam cup.Glass cup.Other plastic cup.Food waste.Glass jar.Plastic lid.Metal lid.Other plastic.Magazine paper.Tissues.Wrapping paper.Normal paper.Paper bag.Plastified paper bag.Plastic film.Six pack rings.Garbage bag.Other plastic wrapper.Single-use carrier bag.Polypropylene bag.Crisp packet.Spread tub.Tupperware.Disposable food c

In [None]:
# @title Create directory to save inference results
import os
import locale

RESULTS_PATH = os.path.join(HOME, "results")
print(RESULTS_PATH)

# Number of threshold levels swept for each of box_threshold / text_threshold.
# NOTE(review): must stay equal to len(thresholds) in the inference cell.
NUM_THRESHOLD_LEVELS = 4

# One sub-directory per (box_threshold index, text_threshold index) pair, named
# "<i><j>". os.makedirs creates RESULTS_PATH (and HOME) on the way when missing
# and is a no-op for directories that already exist, so no shell call or
# existence check is needed.
for i in range(NUM_THRESHOLD_LEVELS):
  for j in range(NUM_THRESHOLD_LEVELS):
    results_path = os.path.join(RESULTS_PATH, f"{i}{j}")
    os.makedirs(results_path, exist_ok=True)

/content/5561 Final Project/results


In [None]:
# @title Run inference on all images with each prompt
from predictor import batch_predict


# Prompt texts and their short names, kept in matching order.
prompts = [very_generic_prompt, generic_prompt, supercategories_prompt, categories_prompt]
prompt_names = ["very_generic", "generic", "supercategories", "categories"]

# The same list of values is swept for both box_threshold and text_threshold.
thresholds = [0.25, 0.35, 0.5, 0.75]


for i, box_threshold in enumerate(thresholds):
  for j, text_threshold in enumerate(thresholds):
    # Results of each combination go to the sub-directory named "<i><j>".
    results_path = os.path.join(RESULTS_PATH, f"{i}{j}")

    print(f"box_threshold={box_threshold}, text_threshold={text_threshold}, [{i}{j}]")

    # A non-empty directory is taken to mean this combination was already run.
    if os.listdir(results_path):
      print(f"Results already exist in {results_path}")
      continue

    # A fresh predictor is built for every threshold combination.
    predictor = Predictor(
      config_path=CONFIG_PATH,
      weights_path=WEIGHTS_PATH,
      box_threshold=box_threshold,
      text_threshold=text_threshold
    )

    batch_predict(
      predictor=predictor,
      dataloader=dl,
      prompts=prompts,
      prompt_names=prompt_names,
      results_dir=results_path
    )


box_threshold=0.25, text_threshold=0.25, [00]
Results already exist in /content/5561 Final Project/results/00
box_threshold=0.25, text_threshold=0.35, [01]
final text_encoder_type: bert-base-uncased


  checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
Running Inference for very_generic: 100%|██████████| 1500/1500 [05:38<00:00,  4.43it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:39<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:17<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.65it/s]


box_threshold=0.25, text_threshold=0.5, [02]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:39<00:00,  4.42it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:41<00:00,  4.39it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:20<00:00,  3.95it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:54<00:00,  3.62it/s]


box_threshold=0.25, text_threshold=0.75, [03]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:38<00:00,  4.43it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:18<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.35, text_threshold=0.25, [10]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:38<00:00,  4.44it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:18<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.65it/s]


box_threshold=0.35, text_threshold=0.35, [11]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:36<00:00,  4.45it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:17<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.35, text_threshold=0.5, [12]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:37<00:00,  4.44it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:18<00:00,  3.96it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.35, text_threshold=0.75, [13]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:38<00:00,  4.43it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.40it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:18<00:00,  3.96it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:52<00:00,  3.63it/s]


box_threshold=0.5, text_threshold=0.25, [20]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:36<00:00,  4.45it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:17<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.5, text_threshold=0.35, [21]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:37<00:00,  4.44it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:18<00:00,  3.96it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.5, text_threshold=0.5, [22]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:37<00:00,  4.45it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:39<00:00,  4.42it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:17<00:00,  3.98it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.65it/s]


box_threshold=0.5, text_threshold=0.75, [23]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:37<00:00,  4.45it/s]
Running Inference for generic: 100%|██████████| 1500/1500 [05:40<00:00,  4.41it/s]
Running Inference for supercategories: 100%|██████████| 1500/1500 [06:17<00:00,  3.97it/s]
Running Inference for categories: 100%|██████████| 1500/1500 [06:51<00:00,  3.64it/s]


box_threshold=0.75, text_threshold=0.25, [30]
final text_encoder_type: bert-base-uncased


Running Inference for very_generic: 100%|██████████| 1500/1500 [05:38<00:00,  4.43it/s]
Running Inference for generic:  51%|█████▏    | 772/1500 [03:01<02:37,  4.61it/s]

Buffered data was truncated after reaching the output size limit.

## Evaluate Performance

In [256]:
from evaluator import Evaluator

# Results folder "10": box_threshold index 1, text_threshold index 0.
results_dir_10 = os.path.join(RESULTS_PATH, "10")
ev = Evaluator(dl, results_dir_10)

# The name `evalutation` (sic) is kept because the plotting cell reads it.
evalutation = ev.evaluate('supercategories')


Evaluating Predictions: 0it [00:00, ?it/s]

# Prediction labels matching GT labels: 0


## Plot Results

In [260]:
# @title Plot Confusion Matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Result of the evaluation cell, indexed below as
# confusion_dict[true_label][pred_label].
confusion_dict = evalutation

labels = sorted(confusion_dict.keys())

# Fail fast, before allocating or plotting anything, when there is nothing to show.
if not labels:
    raise ValueError("Empty confusion dictionary")

# Integer dtype is required: the heatmap annotations use fmt='d', which cannot
# format the float64 values that np.zeros produces by default.
# NOTE(review): assumes the confusion entries are integer counts — confirm
# against Evaluator.evaluate.
matrix = np.zeros((len(labels), len(labels)), dtype=int)

# Rows are the actual labels, columns the predicted labels.
for i, true_label in enumerate(labels):
    for j, pred_label in enumerate(labels):
        matrix[i, j] = confusion_dict[true_label][pred_label]

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(matrix, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

ValueError: Empty confusion dictionary