# SKU110k Dataset: Configuring Vertex AI Datasets

### 1. Import Required Libraries

In [1]:
import os
from google.cloud import aiplatform
from google.cloud import storage
import time

### 2. Initialize Vertex AI

In [2]:
# Settings shared by the cells below: the GCP project, the Vertex AI region,
# and the Cloud Storage bucket that holds the SKU110K files.
PROJECT_ID = "shelfscout"
REGION = "us-central1"
BUCKET_NAME = "sku-110k-dataset"

# Initialize the Vertex AI SDK with the project and region defined above.
aiplatform.init(
    location=REGION,
    project=PROJECT_ID,
)

### 3. Clean up any existing datasets first

In [3]:
# Display names treated as belonging to this notebook. Any existing image
# dataset carrying one of them is deleted before a new dataset is created.
# WARNING: deletion is permanent.
managed_display_names = {
    "SKU110K-Dataset",
    "SKU110K-Train",
    "SKU110K-Validation",
    "SKU110K-Test",
}

print("Checking for existing datasets to clean up...")
datasets = aiplatform.ImageDataset.list()
for ds in datasets:
    if ds.display_name in managed_display_names:
        print(f"Deleting existing dataset: {ds.display_name}")
        # sync=True makes the call block until the delete operation has
        # finished, so no fixed-length sleep is needed afterwards.
        ds.delete(sync=True)

print("Cleanup complete")

Checking for existing datasets to clean up...
Cleanup complete


### 4. Create the Dataset

In [4]:
print("\nCreating main SKU110K dataset...")

# Cloud Storage location handed to Vertex AI as the import source.
# NOTE(review): this is a folder prefix rather than an import file (CSV/JSONL);
# confirm that the import actually picks up the images and annotations.
gcs_source = f"gs://{BUCKET_NAME}/SKU110K_Kaggle"

# Import schema for image bounding-box (object detection) annotations.
bounding_box_schema = aiplatform.schema.dataset.ioformat.image.bounding_box

# Create the dataset and import the data in one synchronous call.
dataset = aiplatform.ImageDataset.create(
    display_name="SKU110K-Dataset",
    gcs_source=gcs_source,
    import_schema_uri=bounding_box_schema,
    sync=True,
)

print(f"Dataset created: {dataset.resource_name}")


Creating main SKU110K dataset...
Creating ImageDataset
Create ImageDataset backing LRO: projects/385790973873/locations/us-central1/datasets/4152031334144933888/operations/1724127244164530176
ImageDataset created. Resource name: projects/385790973873/locations/us-central1/datasets/4152031334144933888
To use this ImageDataset in another session:
ds = aiplatform.ImageDataset('projects/385790973873/locations/us-central1/datasets/4152031334144933888')
Importing ImageDataset data: projects/385790973873/locations/us-central1/datasets/4152031334144933888
Import ImageDataset data backing LRO: projects/385790973873/locations/us-central1/datasets/4152031334144933888/operations/6391545307980627968
ImageDataset data imported. Resource name: projects/385790973873/locations/us-central1/datasets/4152031334144933888
Dataset created: projects/385790973873/locations/us-central1/datasets/4152031334144933888


### 5. Add comprehensive metadata

In [9]:
print("\nAdding metadata to dataset...")
try:
    # Get split counts for metadata
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.get_bucket(BUCKET_NAME)

    def count_files(prefix):
        """Return how many objects in ``bucket`` have a name starting with ``prefix``.

        Object names ending in '/' are skipped so that folder placeholder
        entries are not counted as files.
        """
        # Count while iterating over the listing instead of building the full
        # list of blobs (and a second filtered list) just to take its length.
        return sum(
            1
            for blob in bucket.list_blobs(prefix=prefix)
            if not blob.name.endswith('/')
        )

    # These counts are also read by the "Save Dataset Information" cell.
    train_count = count_files('SKU110K_Kaggle/images/train/')
    val_count = count_files('SKU110K_Kaggle/images/val/')
    test_count = count_files('SKU110K_Kaggle/images/test/')
    total_count = train_count + val_count + test_count

    # Update with rich metadata. The counts are converted with str() because
    # they are stored as label values alongside the other string labels.
    dataset.update(
        labels={
            "purpose": "retail_object_detection",
            "project": "shelfscout",
            "dataset_name": "sku110k",
            "version": "1_0",
            "splits": "train_val_test",
            "train_count": str(train_count),
            "val_count": str(val_count),
            "test_count": str(test_count),
            "total_images": str(total_count)
        }
    )
    print("Successfully updated dataset metadata")
except Exception as e:
    # Best-effort step: report the failure and let the notebook continue.
    # If the failure happened while counting, the count variables above are
    # not defined and the later save cell will fail.
    print(f"Error updating dataset metadata: {e}")


Adding metadata to dataset...
Successfully updated dataset metadata


### 6. Verify the dataset

In [10]:
print("\nVerifying dataset...")

# Look the dataset up again by its display name.
datasets = aiplatform.ImageDataset.list(filter="display_name=SKU110K-Dataset")

if not datasets:
    print("Warning: Dataset not found in verification step")
else:
    # Only the first match returned by the listing is reported.
    verified_dataset = datasets[0]
    print("Dataset verified:")
    for caption, value in (
        ("Name", verified_dataset.display_name),
        ("Resource", verified_dataset.resource_name),
        ("Metadata", verified_dataset.labels),
    ):
        print(f"  {caption}: {value}")


Verifying dataset...
Dataset verified:
  Name: SKU110K-Dataset
  Resource: projects/385790973873/locations/us-central1/datasets/4152031334144933888
  Metadata: {'val_count': '584', 'test_count': '2920', 'purpose': 'retail_object_detection', 'dataset_name': 'sku110k', 'project': 'shelfscout', 'splits': 'train_val_test', 'train_count': '8185', 'version': '1_0', 'total_images': '11689'}


### 7. Save Dataset Information

In [11]:
print("\nSaving dataset information...")

# Write a short plain-text summary of the dataset to the working directory.
# Relies on `dataset` from the creation cell and on the split counts computed
# in the metadata cell.
info_path = 'dataset_info.txt'
with open(info_path, 'w') as info_file:
    print(f"SKU110K Dataset ID: {dataset.resource_name}", file=info_file)
    print(f"GCS Location: gs://{BUCKET_NAME}/SKU110K_Kaggle", file=info_file)
    print(f"Creation Time: {dataset.create_time}", file=info_file)
    print(f"Total Images: {total_count}", file=info_file)
    print(f"Train/Val/Test Split: {train_count}/{val_count}/{test_count}", file=info_file)

print("\nDataset configuration complete!")
print(f"Dataset information saved to {info_path}")


Saving dataset information...

Dataset configuration complete!
Dataset information saved to dataset_info.txt


### 8. Links to Vertex AI console

In [12]:
# Build the console link from the project and region configured at the top.
# NOTE(review): confirm this path resolves in the Cloud console; dataset list
# pages may instead expect the project as a `?project=` query parameter.
console_url = (
    "https://console.cloud.google.com/vertex-ai/datasets"
    f"/projects/{PROJECT_ID}/locations/{REGION}"
)

print("\nYou can view the dataset in the Vertex AI console:")
print(console_url)


You can view the dataset in the Vertex AI console:
https://console.cloud.google.com/vertex-ai/datasets/projects/shelfscout/locations/us-central1
