In [1]:
import os
import subprocess
import pandas as pd
import json

import viame2coco.viame2coco as viame2coco
from pycocotools.coco import COCO
import cv2
# import matplotlib.pyplot as plt
import math


This notebook converts VIAME CSV file to COCO format, and extracts identified ROIs to image library folders. This notebook is still fairly ad-hoc, and thus users should review all aspects of the notebook whenever running it.

Current steps, and known pieces that need to happen between deployments:

1) cp annotation files to the "jw-annotations" folder. Currently Jen saves these in the amlr-gliders-imagery-proc-dev folder. For instance `gcloud storage cp gs://amlr-gliders-imagery-proc-dev/REFOCUS/2024/george-20240530/annotations-manual/* jw-annotations/`

2) Update the `deployment` and `directory` variables. Ensure that the `csv_base_path` actually does point to the annotations that you're looking to use, and that `json_base_path` is correct.

3) Update the file names for `csv_path2` and `dir_path` as needed. These updates will depend across deployments

4) Run all cells up to the 'Extract regions from images' section

5) Update `roi_base`, if wanting to write ROIs to a test directory. Otherwise, run the 'Extract regions from images' and watch the ROIs flow

## Mount GCP buckets

In [2]:
home_path = "/home/sam_woodman_noaa_gov"

# Mount relevant buckets
imagery_raw_bucket = "amlr-gliders-imagery-raw-dev"
imagery_raw_mt = f"{home_path}/amlr-gliders-imagery-raw-dev"

lib_bucket = "esd-image-library-dev"
lib_mt = f"{home_path}/esd-image-library-dev"

# Set variables 
deployment = 'george-20240907'
directory = "Dir0001"
csv_base_path = f"{home_path}/jw-annotations"
json_base_path = f"{home_path}/coco-jsons"

# Mount bucket(s)
os.makedirs(imagery_raw_mt, exist_ok=True)
cmd = ["gcsfuse", "--implicit-dirs", "-o", "ro", imagery_raw_bucket, imagery_raw_mt]
subprocess.run(cmd)

os.makedirs(lib_mt, exist_ok=True)
cmd = ["gcsfuse", "--implicit-dirs", lib_bucket, lib_mt]
subprocess.run(cmd)

{"timestamp":{"seconds":1749856734,"nanos":328530621},"severity":"INFO","message":"Start gcsfuse/2.9.0 (Go version go1.23.5) for app \"\" using mount point: /home/sam_woodman_noaa_gov/amlr-gliders-imagery-raw-dev\n"}
{"timestamp":{"seconds":1749856734,"nanos":329153430},"severity":"INFO","message":"GCSFuse config","config":{"AppName":"","CacheDir":"","Debug":{"ExitOnInvariantViolation":false,"Fuse":false,"Gcs":false,"LogMutex":false},"EnableAtomicRenameObject":false,"EnableHns":true,"FileCache":{"CacheFileForRangeRead":false,"DownloadChunkSizeMb":50,"EnableCrc":false,"EnableODirect":false,"EnableParallelDownloads":false,"MaxParallelDownloads":16,"MaxSizeMb":-1,"ParallelDownloadsPerFile":16,"WriteBufferSize":4194304},"FileSystem":{"DirMode":"755","DisableParallelDirops":false,"FileMode":"644","FuseOptions":["ro"],"Gid":-1,"HandleSigterm":true,"IgnoreInterrupts":true,"KernelListCacheTtlSecs":0,"PreconditionErrors":true,"RenameDirLimit":0,"TempDir":"","Uid":-1},"Foreground":false,"GcsAuth

Error: daemonize.Run: readFromProcess: sub-process: Error while mounting gcsfuse: mountWithArgs: mountWithStorageHandle: mount: mount: running /usr/bin/fusermount: exit status 1
Error: daemonize.Run: readFromProcess: sub-process: Error while mounting gcsfuse: mountWithArgs: mountWithStorageHandle: mount: mount: running /usr/bin/fusermount: exit status 1


CompletedProcess(args=['gcsfuse', '--implicit-dirs', 'esd-image-library-dev', '/home/sam_woodman_noaa_gov/esd-image-library-dev'], returncode=1)

## Convert VIAME CSV file to COCO format

In [3]:
# Print all csv options
dirfiles = os.listdir(csv_base_path)
display(sorted(dirfiles))

# print("probably this one:")
# print([deployment in i for i in dirfiles])
# print([directory.lower() in i for i in dirfiles])
# dirfiles[[directory in i for i in dirfiles] and [deployment in i for i in dirfiles]]

['amlr08-20220513-dir0000-annotations-manual.csv',
 'amlr08-20220513-dir0000-dir0001-first110frames-annotations-manual.csv',
 'amlr08-20220513-dir0001-frame111-frame759-annotations-manual.csv',
 'amlr08-20220513-dir0002-annotations-manual.csv',
 'amlr08-20220513-dir0003-annotations-manual.csv',
 'amlr08-20220513-dir0004-annotations-manual.csv',
 'amlr08-20220513-dir0005-annotations-manual.csv',
 'amlr08-20220513-dir0006-annotations-manual.csv',
 'amlr08-20220513-dir0007-annotations-manual.csv',
 'amlr08-20220513-dir0008-annotations-manual.csv',
 'amlr08-20250513-dir0009-annotations-manual.csv',
 'george-20240530-dir0000-annotations-manual.csv',
 'george-20240530-dir0001-annotations-manual-partial.csv',
 'george-20240907-dir0000-annotations-manual-hydrozoans-chaetognaths-siphonophores.csv',
 'george-20240907-dir0001-annotations-manual-hydrozoans-chaetognaths-siphonophores.csv']

In [4]:
# Read in CSV file(s)
# csv_path2 = os.path.join(csv_base_path, 'amlr08-20220513-dir0002-annotations-manual.csv')
csv_path2 = os.path.join(
    csv_base_path, 
    f'{deployment}-{directory.lower()}-annotations-manual-hydrozoans-chaetognaths-siphonophores.csv'
)

print(csv_path2)
d2 = pd.read_csv(csv_path2)
display(d2)

display(d2.iloc[1:, 9].value_counts())

/home/sam_woodman_noaa_gov/jw-annotations/george-20240907-dir0001-annotations-manual-hydrozoans-chaetognaths-siphonophores.csv


Unnamed: 0,# 1: Detection or Track-id,2: Video or Image Identifier,3: Unique Frame Identifier,4-7: Img-bbox(TL_x,TL_y,BR_x,BR_y),8: Detection or Length Confidence,9: Target Length (0 or -1 if invalid),10-11+: Repeated Species,Confidence Pairs or Attributes
0,# metadata,"exported_by: ""dive:python""","exported_time: ""Wed Jun 11 22:09:59 2025""",,,,,,,,
1,0,george 20240907-195237-001.jpg,8,576.0,716.0,640.0,792.0,1.0,-1.0,chaetognath,1.0
2,1,george 20240907-195246-001.jpg,12,969.0,633.0,1025.0,696.0,1.0,-1.0,chaetognath,1.0
3,2,george 20240907-195248-001.jpg,13,1160.0,1312.0,1222.0,1360.0,1.0,-1.0,doliolid,1.0
4,3,george 20240907-195310-001.jpg,23,600.0,836.0,679.0,918.0,1.0,-1.0,doliolid,1.0
5,4,george 20240907-195312-001.jpg,24,653.0,860.0,716.0,917.0,1.0,-1.0,chaetognath,1.0
6,5,george 20240907-195423-001.jpg,56,1416.0,566.0,1528.0,641.0,1.0,-1.0,hydrozoan,1.0
7,6,george 20240907-195933-001.jpg,195,1020.0,968.0,1136.0,1101.0,1.0,-1.0,hydrozoan,1.0
8,7,george 20240907-200028-001.jpg,220,827.0,1381.0,894.0,1446.0,1.0,-1.0,hydrozoan,1.0
9,8,george 20240907-200048-001.jpg,229,655.0,226.0,787.0,334.0,1.0,-1.0,siphonophore,1.0


10-11+: Repeated Species
chaetognath     15
hydrozoan       14
siphonophore     4
doliolid         2
salp             2
ctenophore       1
Name: count, dtype: int64

In [5]:
# viame2coco.viame2coco(csv_path2, "temporary description")
dir_path = os.path.join(
    home_path, 
    imagery_raw_bucket, 
    # f"SANDIEGO/2022/{deployment}/images/{directory}"
    f"REFOCUS/2024/{deployment}/images/{directory}"
)

# This config describes which columns mean which in the VIAME CSV file
config = {
    'filename': 1,
    'label': 9, 
    'bbox_tlbr': {
        'tlx': 3,
        'tly': 4,
        'brx': 5,
        'bry': 6
    }
}

print(dir_path)
c2 = viame2coco.viame2coco(
    csv_path2, "temporary description", 
    filename_base=dir_path, viame_csv_config=config
)
j2_name = f"{json_base_path}/{deployment}-{directory}-coco.json"
print(j2_name)
c2.to_json(j2_name)
display(c2)

/home/sam_woodman_noaa_gov/amlr-gliders-imagery-raw-dev/REFOCUS/2024/george-20240907/images/Dir0001
/home/sam_woodman_noaa_gov/coco-jsons/george-20240907-Dir0001-coco.json


<pycocowriter.coco.COCOData at 0x7fded3985190>

In [None]:
# # Merge pt1 and pt2 of Dir0001 annotation files. Specific to amlr08-20220513
# from pycocowriter import cocomerge
# from pycocowriter import coco
# import json
# import datetime

# j11_file = '/home/sam_woodman_noaa_gov/dir1-coco-pt1.json'
# j12_file = '/home/sam_woodman_noaa_gov/dir1-coco-pt2.json'

# with open(j11_file) as fin:
#     j11 = json.load(fin)
# with open(j12_file) as fin:
#     j12 = json.load(fin)

# print(j11["info"])
# info = coco.COCOInfo(
#     year=j11["info"]["year"], 
#     version=j11["info"]["version"], 
#     description=j11["info"]["description"], 
#     date_created=datetime.datetime.now(datetime.timezone.utc), 
# )

# j_dir1 = cocomerge.coco_merge(j11, j12, info=info)
# display(j_dir1)

# # with open('/home/sam_woodman_noaa_gov/dir1-coco.json', "w") as fp:
# #     json.dump(j_dir1 , fp)

### Explore COCO output

If desired

In [6]:
with open(j2_name) as fin:
    j2 = json.load(fin)

print(j2.keys())
[print(j2[i]) for i in j2.keys()]

j2_images = j2["images"]
print(j2_images)

j2_anno = j2["annotations"]
print(j2_anno)

# # print(j2["info"])
# # print(j2["licenses"])
# # print(j2["categories"])

j2_cat = j2["categories"]
j2_cat

dict_keys(['info', 'images', 'annotations', 'licenses', 'categories'])
{'year': 2025, 'version': '0.1', 'description': 'temporary description', 'date_created': '2025-06-13T23:19:02.074663+00:00'}
[{'id': 1, 'file_name': 'george 20240907-195237-001.jpg', 'width': 2028, 'height': 1520}, {'id': 2, 'file_name': 'george 20240907-195246-001.jpg', 'width': 2028, 'height': 1520}, {'id': 3, 'file_name': 'george 20240907-195248-001.jpg', 'width': 2028, 'height': 1520}, {'id': 4, 'file_name': 'george 20240907-195310-001.jpg', 'width': 2028, 'height': 1520}, {'id': 5, 'file_name': 'george 20240907-195312-001.jpg', 'width': 2028, 'height': 1520}, {'id': 6, 'file_name': 'george 20240907-195423-001.jpg', 'width': 2028, 'height': 1520}, {'id': 7, 'file_name': 'george 20240907-195933-001.jpg', 'width': 2028, 'height': 1520}, {'id': 8, 'file_name': 'george 20240907-200028-001.jpg', 'width': 2028, 'height': 1520}, {'id': 9, 'file_name': 'george 20240907-200048-001.jpg', 'width': 2028, 'height': 1520}, {'

[{'name': 'chaetognath', 'id': 1},
 {'name': 'doliolid', 'id': 2},
 {'name': 'hydrozoan', 'id': 3},
 {'name': 'siphonophore', 'id': 4},
 {'name': 'ctenophore', 'id': 5},
 {'name': 'salp', 'id': 6}]

## Extract regions from images
 
Using COCO output, based on this code: https://forum.image.sc/t/crop-image-and-annotations-to-bbox-coco-format/74520/4

COCO data format: https://cocodataset.org/#format-data

COCO API functions: top of https://github.com/ppwwyyxx/cocoapi/blob/master/PythonAPI/pycocotools/coco.py

In [7]:
### Sam exp - loop through categories and write regions to category folders

roi_base = os.path.join(lib_mt, "esd-shadowgraph-library")
# roi_base = os.path.join(home_path, "zzhope")

# This was tmp code to deal with the split amlr08 annotations
# roi_base = os.path.join("/home/sam_woodman_noaa_gov", "Dir0001-pt2-out")

# Load COCO file for images+annotations+categories
annFile = j2_name
coco=COCO(annFile) #load via cocoapi
cats = coco.loadCats(coco.getCatIds()) #get all categorys in coco file

cats_names = [i["name"] for i in cats]
print(f"Categories: {cats_names}")

for cat in cats:
    # Get category name, and make directory
    cat_name = cat["name"]
    cat_base = os.path.join(roi_base, cat_name)
    os.makedirs(cat_base, exist_ok=True)
    
    catIds = coco.getCatIds(catNms=[cat_name]) #use to pull cat ID from cat
    imgIds = coco.getImgIds(catIds=catIds ) #get all images with the specified category above in

    print(f"For category {cat_name}, there are {len(imgIds)} full images. " +
            f"ROIs from these images are being written to {cat_base}")
    for img_idx in imgIds:
        # Get and read in the current image
        img = coco.loadImgs(img_idx)[0]
        I = cv2.imread(os.path.join(dir_path, img['file_name']))
        # I

        # Get the current annotation(s)
        annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
        anns = coco.loadAnns(annIds)

        for i in range(len(anns)):
            ann = anns[i]
            crop_bbox = ann["bbox"]

            #make cropped image
            cropim = I[math.floor(crop_bbox[1]):math.ceil(crop_bbox[1] + crop_bbox[3]), 
                        math.floor(crop_bbox[0]):math.ceil(crop_bbox[0]+crop_bbox[2])]
            
            #save cropped image
            file_name_curr = os.path.splitext(img["file_name"])
            file_name = (
                f"{deployment}_{file_name_curr[0]}_crop-{(i+1):02}" + 
                file_name_curr[1])
            path_out = os.path.join(cat_base, file_name)
            cv2.imwrite(path_out, cropim)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Categories: ['chaetognath', 'doliolid', 'hydrozoan', 'siphonophore', 'ctenophore', 'salp']
For category chaetognath, there are 15 full images. ROIs from these images are being written to /home/sam_woodman_noaa_gov/esd-image-library-dev/esd-shadowgraph-library/chaetognath
For category doliolid, there are 2 full images. ROIs from these images are being written to /home/sam_woodman_noaa_gov/esd-image-library-dev/esd-shadowgraph-library/doliolid
For category hydrozoan, there are 14 full images. ROIs from these images are being written to /home/sam_woodman_noaa_gov/esd-image-library-dev/esd-shadowgraph-library/hydrozoan
For category siphonophore, there are 4 full images. ROIs from these images are being written to /home/sam_woodman_noaa_gov/esd-image-library-dev/esd-shadowgraph-library/siphonophore
For category ctenophore, there are 1 full images. ROIs from these images are being written to /home/sam_woodman_

In [None]:
# ### Sam experiments - individual files
# q = 2

# # Get and read in the current image
# img = coco.loadImgs(imgIds[q])[0]
# I = cv2.imread(img['file_name'])
# I

# # Get the current annotation(s)
# annIds = coco.getAnnIds(imgIds=img['id'], catIds=catIds, iscrowd=None)
# anns = coco.loadAnns(annIds)

# crop_bbox = anns[0]["bbox"]
# #make cropped image
# cropim = I[math.floor(crop_bbox[1]):math.ceil(crop_bbox[1] + crop_bbox[3]), math.floor(crop_bbox[0]):math.ceil(crop_bbox[0]+crop_bbox[2])]
# #save cropped image
# cv2.imwrite(os.path.join("/home/sam_woodman_noaa_gov", os.path.basename(img["file_name"])), 
#             cropim)