In [1]:
# Importing all the required packages 
import boto3 
import pandas as pd
import numpy as np
import os 

In [2]:
# Configuring the dataset bucket to access the data
s3 = boto3.client('s3')
bucket = 'surgimind01' # Note : later update this to be fet6ch from env


In [3]:
"""
Since the dataset is large (around 35 GB), S3 only returns up to 1000 keys per request. 
To avoid missing files, a paginator is used to iterate through all pages and list every object matching the prefix. 
This allows us to accurately count all frames, videos, or metadata stored under a given folder.
S3 does NOT treat folders as objects. So the solution is: count unique prefixes.
"""
prefix = "Testing/"

paginator = s3.get_paginator("list_objects_v2")

folders = set()

for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
    for folder in page.get("CommonPrefixes", []):
        folders.add(folder["Prefix"])

print("Total video folders:", len(folders))
print("Folders:")
for f in folders:
    print(f)

Total video folders: 8
Folders:
Testing/VID39/
Testing/VID12/
Testing/VID07/
Testing/VID111/
Testing/VID06/
Testing/VID92/
Testing/VID25/
Testing/VID01/


In [8]:
prefix = "Training/"

paginator = s3.get_paginator("list_objects_v2")

folders = set()

for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
    for folder in page.get("CommonPrefixes", []):
        folders.add(folder["Prefix"])

print("Total video folders:", len(folders))
print("Folders:")
for f in folders:
    print(f)

Total video folders: 10
Folders:
Training/VID37/
Training/VID96/
Training/VID31/
Training/VID17/
Training/VID02/
Training/VID23/
Training/VID04/
Training/VID11/
Training/VID103/
Training/VID13/


In [5]:
prefix = "Validation/"

paginator = s3.get_paginator("list_objects_v2")

folders = set()

for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
    for folder in page.get("CommonPrefixes", []):
        folders.add(folder["Prefix"])

print("Total video folders:", len(folders))
print("Folders:")
for f in folders:
    print(f)

Total video folders: 2
Folders:
Validation/VID30/
Validation/VID110/


prefix = "Training/"

paginator = s3.get_paginator("list_objects_v2")
mmmmmmmmmmmmmmmmmm
folders = set()

for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/"):
    for folder in page.get("CommonPrefixes", []):
        folders.add(folder["Prefix"])

print("Total video folders:", len(folders))
print("Folders:")
for f in folders:
    print(f)


**Dataset Summary**

The dataset contains surgical video data divided into three main splits:

**Training Set (10 folders or videos)**
- Extracted PNG frames  
- JSON annotations  
- No `.mp4` files  

Used for model training.

**Testing Set (8 folders or videos)**
- No Extracted PNG frames  
- JSON annotations  
- Includes `.mp4` files  

Used to measure model performance during development.

**Validation Set (2 folders or video)**
- Extracted PNG frames  
- JSON annotations  
- No `.mp4` files  

Used for hyperparameter tuning and checking generalization.


| Split Type | Number of Videos | Contains MP4 | Contains Frames | Contains JSON |
| ---------- | ---------------- | ------------ | --------------- | ------------- |
| Training   | 10               | Yes          | Yes             | Yes           |
| Testing    | 8                | No           | Yes             | Yes           |
| Validation | 2                | No           | Yes             | Yes           |
