# SKU-110K Dataset: Configuring Vertex AI Datasets

### 1. Import Required Libraries

In [1]:
import os
from google.cloud import aiplatform
from google.cloud import storage

### 2. Initialize Vertex AI

In [2]:
# Notebook-wide settings: GCP project, Vertex AI region, and the GCS bucket
PROJECT_ID = "shelfscout"
REGION = "us-central1"  # passed to the SDK as its location
BUCKET_NAME = "sku-110k-dataset"

# Point the Vertex AI SDK at the project/region chosen above
aiplatform.init(location=REGION, project=PROJECT_ID)

### 3. Create the Dataset

In [3]:
# GCS location handed to Vertex AI as the import source.
# NOTE(review): this is a folder prefix rather than an import file -- confirm
# that the import actually ingests items from it.
gcs_source = f"gs://{BUCKET_NAME}/SKU110K_Kaggle"

# Import schema used for this dataset (image bounding boxes)
bbox_schema_uri = aiplatform.schema.dataset.ioformat.image.bounding_box

# Create the dataset and import from gcs_source (sync=True)
dataset = aiplatform.ImageDataset.create(
    display_name="SKU110K-Dataset",
    gcs_source=gcs_source,
    import_schema_uri=bbox_schema_uri,
    sync=True,
)

# Show the resource name so the dataset can be looked up again later
print(f"Dataset created: {dataset.resource_name}")

Creating ImageDataset
Create ImageDataset backing LRO: projects/385790973873/locations/us-central1/datasets/8706296457323347968/operations/7637142448669458432
ImageDataset created. Resource name: projects/385790973873/locations/us-central1/datasets/8706296457323347968
To use this ImageDataset in another session:
ds = aiplatform.ImageDataset('projects/385790973873/locations/us-central1/datasets/8706296457323347968')
Importing ImageDataset data: projects/385790973873/locations/us-central1/datasets/8706296457323347968
Import ImageDataset data backing LRO: projects/385790973873/locations/us-central1/datasets/8706296457323347968/operations/352007101444259840
ImageDataset data imported. Resource name: projects/385790973873/locations/us-central1/datasets/8706296457323347968
Dataset created: projects/385790973873/locations/us-central1/datasets/8706296457323347968


### 4. Configure Dataset Splits

In [4]:
# Create separate datasets for each split
def _create_split_dataset(display_name, split_dir):
    """
    Create one bounding-box ImageDataset for a single split folder.

    Args:
        display_name: Display name given to the new Vertex AI dataset.
        split_dir: Folder name under SKU110K_Kaggle/images in BUCKET_NAME.

    Returns:
        The object returned by aiplatform.ImageDataset.create.
    """
    return aiplatform.ImageDataset.create(
        display_name=display_name,
        gcs_source=f"gs://{BUCKET_NAME}/SKU110K_Kaggle/images/{split_dir}",
        import_schema_uri=aiplatform.schema.dataset.ioformat.image.bounding_box,
        sync=True,
    )


# The three names below are read again when the dataset info file is written,
# so they stay bound as individual variables.
train_dataset = _create_split_dataset("SKU110K-Train", "train")
val_dataset = _create_split_dataset("SKU110K-Validation", "val")
test_dataset = _create_split_dataset("SKU110K-Test", "test")

Creating ImageDataset
Create ImageDataset backing LRO: projects/385790973873/locations/us-central1/datasets/4353567417469763584/operations/4925412523038998528
ImageDataset created. Resource name: projects/385790973873/locations/us-central1/datasets/4353567417469763584
To use this ImageDataset in another session:
ds = aiplatform.ImageDataset('projects/385790973873/locations/us-central1/datasets/4353567417469763584')
Importing ImageDataset data: projects/385790973873/locations/us-central1/datasets/4353567417469763584
Import ImageDataset data backing LRO: projects/385790973873/locations/us-central1/datasets/4353567417469763584/operations/3311997956533518336
ImageDataset data imported. Resource name: projects/385790973873/locations/us-central1/datasets/4353567417469763584
Creating ImageDataset
Create ImageDataset backing LRO: projects/385790973873/locations/us-central1/datasets/6668417625938198528/operations/8909690818378334208
ImageDataset created. Resource name: projects/385790973873/loc

### 5. Add Metadata to Datasets

In [7]:
# Labels attached to the main dataset
main_dataset_labels = {
    "purpose": "retail_object_detection",
    "project": "shelfscout",
    "dataset_name": "sku110k",
    "version": "1_0",
}

# Best-effort update: a failure is reported but does not stop the notebook
try:
    dataset.update(labels=main_dataset_labels)
except Exception as err:
    print(f"Error updating dataset metadata: {err}")
else:
    print("Successfully updated dataset metadata")

Successfully updated dataset metadata


In [12]:
# Label every split dataset, looking each one up by its display name
for split in ["train", "validation", "test"]:
    datasets = aiplatform.ImageDataset.list(
        filter=f"display_name=SKU110K-{split.capitalize()}"
    )

    # Nothing matched this display name: report it and move on
    if not datasets:
        print(f"Dataset for split '{split}' not found")
        continue

    # NOTE(review): only the first match is used; if several datasets share
    # this display name, which one comes first is not controlled here.
    split_dataset = datasets[0]

    # Best-effort update: a failure is reported, the loop keeps going
    try:
        split_dataset.update(
            labels={
                "purpose": "retail_object_detection",
                "project": "shelfscout",
                "dataset_name": "sku110k",
                "split": split,
                "version": "1_0",
            }
        )
    except Exception as err:
        print(f"Error updating metadata for {split} dataset: {err}")
    else:
        print(f"Successfully updated metadata for {split} dataset")

Successfully updated metadata for train dataset
Successfully updated metadata for validation dataset
Successfully updated metadata for test dataset


### 6. Verify Dataset Creation

In [8]:
# Print the display name and resource name of every ImageDataset returned
datasets = aiplatform.ImageDataset.list()
for ds in datasets:
    display_name, resource_name = ds.display_name, ds.resource_name
    print(f"Dataset name: {display_name}, resource: {resource_name}")

Dataset name: SKU110K-Dataset, resource: projects/385790973873/locations/us-central1/datasets/8706296457323347968
Dataset name: SKU110K-Test, resource: projects/385790973873/locations/us-central1/datasets/3750085067402117120
Dataset name: SKU110K-Validation, resource: projects/385790973873/locations/us-central1/datasets/6668417625938198528
Dataset name: SKU110K-Train, resource: projects/385790973873/locations/us-central1/datasets/4353567417469763584


### 7. Handle Annotation Format Conversion

In [9]:
import json
import os
from google.cloud import storage

# Helper that parses the text of one annotation file
def _parse_bounding_boxes(annotation_text, label="product"):
    """
    Turn the text of one annotation file into Vertex AI bounding-box dicts.

    Each line is split on whitespace; lines with fewer than five fields are
    skipped. Field 0 (the class id) is ignored because every box receives the
    same ``label``; fields 1-4 are read as floats.

    Args:
        annotation_text: Decoded contents of a single annotation file.
        label: Display name assigned to every bounding box.

    Returns:
        A list of dicts with keys displayName, xMin, yMin, xMax, yMax.

    Raises:
        ValueError: If any of fields 1-4 cannot be parsed as a float.
    """
    bboxes = []
    for line in annotation_text.splitlines():
        parts = line.split()  # any run of spaces or tabs separates fields
        if len(parts) < 5:
            continue

        x1, y1, width, height = (float(value) for value in parts[1:5])

        # Fields 3 and 4 are treated as width/height when below 1 and as
        # absolute xMax/yMax otherwise.
        # NOTE(review): if the label files are YOLO-style
        # (class x_center y_center width height), fields 1-2 are box centres
        # rather than the top-left corner and every box would be shifted --
        # confirm against the label files.
        x2 = x1 + width if width < 1 else width
        y2 = y1 + height if height < 1 else height

        bboxes.append({
            # The Vertex AI object-detection import schema names the label
            # field "displayName".
            "displayName": label,
            "xMin": x1,
            "yMin": y1,
            "xMax": x2,
            "yMax": y2,
        })
    return bboxes


# Define a function to convert the annotation format
def convert_annotations_to_vertex_format(gcs_bucket_name, annotation_path, image_path, output_path,
                                         splits=("train", "val", "test"), label="product"):
    """
    Convert SKU-110K annotations to Vertex AI compatible format (JSONL).

    For every split, the ``.txt`` objects under
    ``{annotation_path}/{split}/`` are downloaded and parsed, one JSON line
    per annotation file is written to
    ``{output_path}/{split}_annotations.jsonl`` on the local disk, and that
    file is uploaded to the same bucket under the same relative path.

    Args:
        gcs_bucket_name: GCS bucket name containing the dataset
        annotation_path: Path to annotation files in GCS
        image_path: Path to image files in GCS
        output_path: Path to save the converted annotation files; used both
            as the local directory and as the object prefix in the bucket
        splits: Split folder names to process (defaults to train/val/test)
        label: Display name written on every bounding box

    Returns:
        None. Progress and per-file errors are printed.
    """
    # Create a GCS client
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(gcs_bucket_name)

    # The JSONL is staged locally before upload, so the directory must exist.
    os.makedirs(output_path, exist_ok=True)

    for split in splits:
        print(f"Processing {split} annotations...")

        # Only .txt objects are annotation files; the prefix listing may also
        # return folder placeholders or unrelated objects.
        annotation_blobs = [
            blob
            for blob in bucket.list_blobs(prefix=f"{annotation_path}/{split}/")
            if blob.name.endswith(".txt")
        ]

        vertex_annotations = []

        # Process each annotation file (with progress reporting)
        for i, annotation_blob in enumerate(annotation_blobs):
            if i % 100 == 0:
                print(f"  Processed {i}/{len(annotation_blobs)} files")

            try:
                # The image is assumed to share the annotation's base name
                # with a .jpg extension.
                annotation_name = annotation_blob.name.rsplit('/', 1)[-1]
                image_name = annotation_name[:-len(".txt")] + ".jpg"

                annotation_text = annotation_blob.download_as_bytes().decode('utf-8')

                vertex_annotations.append({
                    "imageGcsUri": f"gs://{gcs_bucket_name}/{image_path}/{split}/{image_name}",
                    "boundingBoxAnnotations": _parse_bounding_boxes(annotation_text, label),
                })
            except Exception as e:
                # Best effort: one bad file is reported and skipped.
                print(f"Error processing {annotation_blob.name}: {e}")

        # Save annotations to JSONL file
        output_file = f"{output_path}/{split}_annotations.jsonl"

        with open(output_file, 'w') as f:
            for annotation in vertex_annotations:
                f.write(json.dumps(annotation) + '\n')

        print(f"Saved {len(vertex_annotations)} annotations to {output_file}")

        # Upload the JSONL file to GCS
        output_blob = bucket.blob(f"{output_path}/{split}_annotations.jsonl")
        output_blob.upload_from_filename(output_file)

        print(f"Uploaded {output_file} to gs://{gcs_bucket_name}/{output_path}/{split}_annotations.jsonl")

# Bucket and object prefixes used by the conversion (BUCKET_NAME repeats the
# value from the setup cell)
BUCKET_NAME = "sku-110k-dataset"
ANNOTATION_PATH = "SKU110K_Kaggle/labels"
IMAGE_PATH = "SKU110K_Kaggle/images"
OUTPUT_PATH = "vertex_annotations"

# The converter writes JSONL files under OUTPUT_PATH on local disk, so make
# sure that folder is present first
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Run the conversion
convert_annotations_to_vertex_format(
    gcs_bucket_name=BUCKET_NAME,
    annotation_path=ANNOTATION_PATH,
    image_path=IMAGE_PATH,
    output_path=OUTPUT_PATH,
)

Processing train annotations...
  Processed 0/8185 files
  Processed 100/8185 files
  Processed 200/8185 files
  Processed 300/8185 files
  Processed 400/8185 files
  Processed 500/8185 files
  Processed 600/8185 files
  Processed 700/8185 files
  Processed 800/8185 files
  Processed 900/8185 files
  Processed 1000/8185 files
  Processed 1100/8185 files
  Processed 1200/8185 files
  Processed 1300/8185 files
  Processed 1400/8185 files
  Processed 1500/8185 files
  Processed 1600/8185 files
  Processed 1700/8185 files
  Processed 1800/8185 files
  Processed 1900/8185 files
  Processed 2000/8185 files
  Processed 2100/8185 files
  Processed 2200/8185 files
  Processed 2300/8185 files
  Processed 2400/8185 files
  Processed 2500/8185 files
  Processed 2600/8185 files
  Processed 2700/8185 files
  Processed 2800/8185 files
  Processed 2900/8185 files
  Processed 3000/8185 files
  Processed 3100/8185 files
  Processed 3200/8185 files
  Processed 3300/8185 files
  Processed 3400/8185 files


### 8. Save Dataset Information

In [10]:
# Write the resource name of each dataset to a text file, one per line
with open('dataset_info.txt', 'w') as f:
    f.writelines([
        f"Main Dataset ID: {dataset.resource_name}\n",
        f"Train Dataset ID: {train_dataset.resource_name}\n",
        f"Validation Dataset ID: {val_dataset.resource_name}\n",
        f"Test Dataset ID: {test_dataset.resource_name}\n",
    ])