# SKU110K Dataset: Implement Data Preprocessing Pipeline for Image Normalization

### 1. Install TensorFlow

In [4]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-3.0.1-py3-none-any.whl.metadata (6.1 kB)
Collecting tensorboard~=2

### 2. Import Libraries

In [1]:
import os
import io
import json
import tensorflow as tf
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from google.cloud import storage

2025-04-18 05:50:27.864808: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-18 05:50:28.234203: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-18 05:50:28.527925: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744955428.766803    5536 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744955428.829491    5536 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744955429.422794    5536 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

### 3. Set Up Dependencies and Configuration

In [2]:
# Set up project configuration
# Google Cloud project the storage client is created for.
PROJECT_ID = "shelfscout"
# GCS bucket that holds both the raw dataset and the processed output.
BUCKET_NAME = "sku-110k-dataset"
# Bucket prefix the raw data is read from; later cells look under
# "<INPUT_PATH>/images/<split>/" and "<INPUT_PATH>/labels/<split>/".
INPUT_PATH = "SKU110K_Kaggle"
# Bucket prefix the TFRecord shards and metadata.json are written under.
OUTPUT_PATH = "processed_data"
# Size every image is resized to. The verification cell prints it as
# TARGET_SIZE[1] x TARGET_SIZE[0], i.e. it is treated as (width, height).
TARGET_SIZE = (640, 640)

# Initialize storage client
# `bucket` is the shared handle used by every later cell for listing,
# downloading and uploading blobs.
storage_client = storage.Client(project=PROJECT_ID)
bucket = storage_client.get_bucket(BUCKET_NAME)

### 4. Function for Pre-processing Data and Creating TFRecords

In [3]:
# Core preprocessing function
def _parse_annotation_boxes(annotation_text):
    """Parse annotation text into a list of [x1, y1, x2, y2] boxes.

    Lines with fewer than five whitespace-separated fields are ignored.
    For every other line the first four fields are read as
    x1, y1, width, height and converted to corner form.
    """
    boxes = []
    for line in annotation_text.strip().split('\n'):
        parts = line.strip().split()
        if len(parts) >= 5:
            # NOTE(review): the guard requires 5 fields but only parts[:4] are
            # read. If the label files are class-first ("class cx cy w h"),
            # this reads the class id as x1 and the values are centre-based,
            # not corner-based. Confirm the label format before training.
            x1, y1, width, height = map(float, parts[:4])
            boxes.append([x1, y1, x1 + width, y1 + height])
    return boxes


def preprocess_and_create_tfrecord(split, num_shards=10):
    """Preprocess one dataset split and write it to GCS as sharded TFRecords.

    Image blobs are listed under ``{INPUT_PATH}/images/{split}/`` and paired
    with ``{INPUT_PATH}/labels/{split}/{image_id}.txt``. Each image is
    converted to RGB, resized to ``TARGET_SIZE`` with LANCZOS resampling and
    scaled to the 0-1 range, then serialized with ``create_tf_example``.
    Every shard is written to a local file, uploaded to
    ``{OUTPUT_PATH}/{split}/`` and removed locally afterwards.

    Images without an annotation file are skipped. A failure on a single
    image is reported and does not abort the shard.

    Args:
        split: Name of the split sub-directory (e.g. "train").
        num_shards: Number of TFRecord files to produce; must be >= 1.

    Returns:
        The ``gs://`` URI prefix the shards were uploaded under.

    Raises:
        ValueError: If ``num_shards`` is smaller than 1.
        NameError: If ``create_tf_example`` has not been defined in the kernel.
    """
    if num_shards < 1:
        raise ValueError(f"num_shards must be >= 1, got {num_shards}")

    # Fail fast: create_tf_example is not defined in this cell. Resolving it
    # here stops a missing helper from being swallowed by the per-image
    # handler below, which would silently produce empty shards.
    serialize_example = create_tf_example

    # Create output path
    output_dir = f"{OUTPUT_PATH}/{split}"
    bucket.blob(f"{output_dir}/").upload_from_string('')

    # List images
    image_blobs = [
        blob
        for blob in bucket.list_blobs(prefix=f"{INPUT_PATH}/images/{split}/")
        if blob.name.lower().endswith(('.jpg', '.jpeg', '.png'))
    ]
    num_images = len(image_blobs)

    # Calculate sharding
    images_per_shard = int(np.ceil(num_images / num_shards))
    print(f"Processing {num_images} images from {split} split into {num_shards} shards")

    # Process each shard
    for shard_id in range(num_shards):
        # Set shard range (trailing shards may be empty when images are scarce)
        start_idx = shard_id * images_per_shard
        end_idx = min((shard_id + 1) * images_per_shard, num_images)

        # Create TFRecord file locally
        output_file = f"shard_{split}_{shard_id:03d}.tfrecord"

        try:
            with tf.io.TFRecordWriter(output_file) as writer:
                # Process each image in shard
                for idx in tqdm(range(start_idx, end_idx), desc=f"Shard {shard_id+1}/{num_shards}"):
                    # Resolve the name outside the try block so the error
                    # message below can always reference it.
                    image_blob = image_blobs[idx]
                    image_name = os.path.basename(image_blob.name)
                    try:
                        image_id = os.path.splitext(image_name)[0]

                        # Get corresponding annotation
                        annotation_path = f"{INPUT_PATH}/labels/{split}/{image_id}.txt"
                        annotation_blob = bucket.blob(annotation_path)

                        if not annotation_blob.exists():
                            print(f"Skipping {image_name}: no annotation")
                            continue

                        # Read image
                        image_data = image_blob.download_as_bytes()
                        img = Image.open(io.BytesIO(image_data))
                        if img.mode != 'RGB':
                            img = img.convert('RGB')

                        # Read annotation and parse bounding boxes
                        # (download_as_string is a deprecated alias of download_as_bytes)
                        annotation_text = annotation_blob.download_as_bytes().decode('utf-8')
                        boxes = _parse_annotation_boxes(annotation_text)

                        # Preprocess image - resize and normalize
                        img_resized = img.resize(TARGET_SIZE, Image.LANCZOS)
                        img_array = np.array(img_resized, dtype=np.float32) / 255.0

                        # Boxes are passed through unchanged. The previous
                        # pixel conversion and rescale cancelled out
                        # algebraically (x * W * T / W / T == x).
                        # NOTE(review): this is only right if the annotation
                        # values are already fractions of the image size.

                        # Create TF Example
                        tf_example = serialize_example(img_array, boxes, image_id)
                        writer.write(tf_example.SerializeToString())

                    except Exception as e:
                        # Deliberate best-effort: report and move on.
                        print(f"Error processing {image_name}: {e}")

            # Upload to GCS
            print(f"Uploading shard {shard_id+1}/{num_shards} to GCS...")
            bucket.blob(f"{output_dir}/{output_file}").upload_from_filename(output_file)
        finally:
            # Always remove the local shard, even if writing or upload failed.
            if os.path.exists(output_file):
                os.remove(output_file)

    return f"gs://{BUCKET_NAME}/{output_dir}/"

### 5. Function to generate a metadata file

In [5]:
# Create a metadata file with preprocessing info
def create_preprocessing_metadata(split_shards=None, created=None):
    """Write a JSON description of the preprocessing run locally and to GCS.

    The file is saved as ``preprocessing_metadata.json`` in the working
    directory and uploaded to ``{OUTPUT_PATH}/metadata.json`` in the bucket.

    Args:
        split_shards: Optional mapping of split name to shard count. Defaults
            to ``{"train": 10, "val": 5, "test": 5}``.
        created: Optional ISO date string recorded as the creation date.
            Defaults to today's date, so a rerun does not record a stale
            hard-coded date.

    Returns:
        The string "Metadata created successfully".
    """
    # Local import keeps this cell self-contained; only needed for the default date.
    from datetime import date

    if split_shards is None:
        split_shards = {"train": 10, "val": 5, "test": 5}
    if created is None:
        created = date.today().isoformat()

    metadata = {
        "dataset": "SKU-110K",
        "preprocessing": {
            # JSON has no tuple type; serialize explicitly as a list.
            "image_size": list(TARGET_SIZE),
            "normalization": "0-1 scale",
            "resize_method": "LANCZOS",
            "format": "TFRecord"
        },
        "splits": {
            name: {"shards": count} for name, count in split_shards.items()
        },
        "created": created,
        "version": "1.0"
    }

    # Save metadata locally and to GCS
    local_path = 'preprocessing_metadata.json'
    with open(local_path, 'w') as f:
        json.dump(metadata, f, indent=2)

    bucket.blob(f"{OUTPUT_PATH}/metadata.json").upload_from_filename(local_path)

    return "Metadata created successfully"

### 6. Executing both Functions

In [None]:
# Convert every split to TFRecords; the mapping gives the shard count per split
shards_per_split = {"train": 10, "val": 5, "test": 5}
for split, shards in shards_per_split.items():
    output_path = preprocess_and_create_tfrecord(split, num_shards=shards)
    print(f"Processed {split} split: {output_path}")

# Record how the data was produced; the returned status string is the cell output
create_preprocessing_metadata()

Processing 8185 images from train split into 10 shards


Shard 1/10:   0%|          | 0/819 [00:00<?, ?it/s]

2025-04-17 17:09:27.886290: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Uploading shard 1/10 to GCS...


Shard 2/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 2/10 to GCS...


Shard 3/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 3/10 to GCS...


Shard 4/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 4/10 to GCS...


Shard 5/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 5/10 to GCS...


Shard 6/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 6/10 to GCS...


Shard 7/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 7/10 to GCS...


Shard 8/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 8/10 to GCS...


Shard 9/10:   0%|          | 0/819 [00:00<?, ?it/s]

Uploading shard 9/10 to GCS...


Shard 10/10:   0%|          | 0/814 [00:00<?, ?it/s]

Uploading shard 10/10 to GCS...
Processed train split: gs://sku-110k-dataset/processed_data/train/
Processing 584 images from val split into 5 shards


Shard 1/5:   0%|          | 0/117 [00:00<?, ?it/s]

Uploading shard 1/5 to GCS...


Shard 2/5:   0%|          | 0/117 [00:00<?, ?it/s]

Uploading shard 2/5 to GCS...


Shard 3/5:   0%|          | 0/117 [00:00<?, ?it/s]

Uploading shard 3/5 to GCS...


Shard 4/5:   0%|          | 0/117 [00:00<?, ?it/s]

Uploading shard 4/5 to GCS...


Shard 5/5:   0%|          | 0/116 [00:00<?, ?it/s]

Uploading shard 5/5 to GCS...
Processed val split: gs://sku-110k-dataset/processed_data/val/
Processing 2920 images from test split into 5 shards


Shard 1/5:   0%|          | 0/584 [00:00<?, ?it/s]

Uploading shard 1/5 to GCS...


Shard 2/5:   0%|          | 0/584 [00:00<?, ?it/s]

Uploading shard 2/5 to GCS...


Shard 3/5:   0%|          | 0/584 [00:00<?, ?it/s]

Uploading shard 3/5 to GCS...


Shard 4/5:   0%|          | 0/584 [00:00<?, ?it/s]

Uploading shard 4/5 to GCS...


Shard 5/5:   0%|          | 0/584 [00:00<?, ?it/s]

### 7. Verifying the Processed Dataset

In [None]:
# Dataset verification: sample one TFRecord shard per split and inspect
# its images, bounding boxes and example count.

import random
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Banner marking the start of the verification output
print("\n" + "=" * 50, "DATASET VERIFICATION", "=" * 50, sep="\n")

def parse_tfrecord_example(example_proto):
    """Decode one serialized tf.train.Example into a dict of tensors.

    Scalar image fields are parsed as fixed-length features. The per-object
    fields (box corners, class text, class label) are variable-length and are
    converted from sparse to dense tensors before the dict is returned.
    """
    bbox_keys = (
        'image/object/bbox/xmin',
        'image/object/bbox/xmax',
        'image/object/bbox/ymin',
        'image/object/bbox/ymax',
    )
    class_keys = ('image/object/class/text', 'image/object/class/label')

    # Fixed-length (one value per image) features
    schema = {
        'image/height': tf.io.FixedLenFeature([], tf.int64),
        'image/width': tf.io.FixedLenFeature([], tf.int64),
    }
    for string_key in ('image/filename', 'image/source_id',
                       'image/encoded', 'image/format'):
        schema[string_key] = tf.io.FixedLenFeature([], tf.string)

    # Variable-length (one value per object) features
    for box_key in bbox_keys:
        schema[box_key] = tf.io.VarLenFeature(tf.float32)
    schema['image/object/class/text'] = tf.io.VarLenFeature(tf.string)
    schema['image/object/class/label'] = tf.io.VarLenFeature(tf.int64)

    parsed = tf.io.parse_single_example(example_proto, schema)

    # VarLenFeature yields sparse tensors; densify them for easier downstream use
    for sparse_key in bbox_keys + class_keys:
        parsed[sparse_key] = tf.sparse.to_dense(parsed[sparse_key])

    return parsed

def visualize_sample(example):
    """Plot one parsed example with its bounding boxes and print sanity checks.

    Args:
        example: A dict as returned by ``parse_tfrecord_example``; the
            ``image/encoded`` entry is decoded as JPEG.

    Returns:
        A tuple ``(image_shape, num_objects)``: the decoded image tensor's
        shape and the number of bounding boxes in the example.
    """
    # Decode the image
    image = tf.io.decode_jpeg(example['image/encoded'])
    pixels = image.numpy()
    # Boxes are drawn on the decoded image, so scale them by its real size
    # rather than by the height/width stored in the record. The two can
    # differ, e.g. if the record keeps the pre-resize dimensions.
    img_height, img_width = pixels.shape[:2]

    # Get image info
    stored_height = int(example['image/height'])
    stored_width = int(example['image/width'])
    filename = example['image/filename'].numpy().decode('utf-8')

    # Get bounding box coordinates
    xmins = example['image/object/bbox/xmin'].numpy()
    xmaxs = example['image/object/bbox/xmax'].numpy()
    ymins = example['image/object/bbox/ymin'].numpy()
    ymaxs = example['image/object/bbox/ymax'].numpy()

    # Plot the image
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(pixels)

    # Plot bounding boxes (limit to 25 for visibility)
    max_boxes = min(25, len(xmins))
    for i in range(max_boxes):
        # Convert normalized coordinates to actual pixels
        xmin, ymin = xmins[i] * img_width, ymins[i] * img_height
        box_width = (xmaxs[i] - xmins[i]) * img_width
        box_height = (ymaxs[i] - ymins[i]) * img_height

        # Create rectangle patch
        rect = patches.Rectangle(
            (xmin, ymin), box_width, box_height,
            linewidth=1, edgecolor='r', facecolor='none'
        )
        ax.add_patch(rect)

    # Add title with information
    ax.set_title(f"Image: {filename}\nTotal objects: {len(xmins)} (showing {max_boxes})")
    ax.axis('on')
    plt.show()

    # Check if image dimensions match target size
    print(f"Image dimensions: {img_height}x{img_width}")
    print(f"Expected dimensions: {TARGET_SIZE[1]}x{TARGET_SIZE[0]}")
    if (stored_height, stored_width) != (img_height, img_width):
        print(f"Warning: stored image/height x image/width is "
              f"{stored_height}x{stored_width}, which differs from the decoded image")

    # Check normalization (pixel values should be 0-255 for JPEG encoded images)
    print(f"Pixel value range: {pixels.min()} - {pixels.max()}")

    # Check number of bounding boxes
    print(f"Number of objects: {len(xmins)}")

    # Display bounding box details for a few boxes
    if len(xmins) > 0:
        print("\nSample bounding boxes (normalized coordinates):")
        for i in range(min(3, len(xmins))):
            print(f"  Box {i+1}: xmin={xmins[i]:.4f}, ymin={ymins[i]:.4f}, xmax={xmaxs[i]:.4f}, ymax={ymaxs[i]:.4f}")

    return image.shape, len(xmins)

def verify_tfrecords():
    """Spot-check one randomly chosen TFRecord shard from each split.

    For each of the 'train', 'val' and 'test' splits, the TFRecord blobs under
    ``{OUTPUT_PATH}/{split}/`` are listed and one is downloaded to a temporary
    local file. Its examples are counted and two shuffled samples are
    visualized with ``visualize_sample``. The temporary file is removed even
    if parsing or plotting fails.

    Returns:
        A dict keyed by split name. Each value holds ``file_count``,
        ``sample_count`` (examples in the sampled shard), ``dimensions`` and
        ``object_counts`` (one entry per visualized sample). Splits without
        TFRecord files are omitted.
    """
    results = {}

    for split in ['train', 'val', 'test']:
        print(f"\n{'-'*30}")
        print(f"Verifying {split} split")
        print(f"{'-'*30}")

        # Get list of TFRecord files for this split
        blobs = list(bucket.list_blobs(prefix=f"{OUTPUT_PATH}/{split}/"))
        tfrecord_files = [blob for blob in blobs if blob.name.endswith('.tfrecord')]

        if not tfrecord_files:
            print(f"No TFRecord files found for {split} split")
            continue

        print(f"Found {len(tfrecord_files)} TFRecord files")

        # Sample a random TFRecord file
        sample_file = random.choice(tfrecord_files)
        print(f"Sampling from: {sample_file.name}")

        local_file = f"temp_{split}.tfrecord"
        dimensions = []
        object_counts = []

        try:
            # Download the file temporarily
            sample_file.download_to_filename(local_file)

            # Create TFRecord dataset; tf.data datasets can be iterated more
            # than once, so the same object serves counting and sampling.
            parsed_dataset = tf.data.TFRecordDataset(local_file).map(parse_tfrecord_example)

            # Count total examples (this also checks that every record parses)
            count = sum(1 for _ in parsed_dataset)
            print(f"Total examples in sampled file: {count}")

            # Take a random sample
            samples = parsed_dataset.shuffle(buffer_size=100).take(2)

            # Visualize samples
            for i, sample in enumerate(samples):
                print(f"\nSample {i+1}:")
                dim, obj_count = visualize_sample(sample)
                dimensions.append(dim)
                object_counts.append(obj_count)
        finally:
            # Clean up the temporary download even when verification fails
            if os.path.exists(local_file):
                os.remove(local_file)

        # Store results
        results[split] = {
            'file_count': len(tfrecord_files),
            'sample_count': count,
            'dimensions': dimensions,
            'object_counts': object_counts
        }

    return results

# Sample and inspect one shard per split
verification_results = verify_tfrecords()

# Print a per-split recap of what was found
summary_rule = "=" * 50
print("\n" + summary_rule)
print("VERIFICATION SUMMARY")
print(summary_rule)

for split, result in verification_results.items():
    # Render each sampled image shape as "HxW"
    dimension_labels = [f'{d[0]}x{d[1]}' for d in result['dimensions']]
    print(f"\n{split.upper()} SPLIT:")
    print(f"  TFRecord files: {result['file_count']}")
    print(f"  Examples in sampled file: {result['sample_count']}")
    print(f"  Image dimensions: {dimension_labels}")
    print(f"  Objects per image: {result['object_counts']}")

print("\nVerification complete!")
print("\nNext steps:")
for next_step in (
    "1. Begin Week 2 model development with the processed data",
    "2. Create a baseline model using these TFRecord files",
    "3. Set up Vertex AI Experiments for model tracking",
):
    print(next_step)
print(f"\nProcessed data location: gs://{BUCKET_NAME}/{OUTPUT_PATH}/")