# **Fashion MNIST: Ingest Raw Data**

***
***

### **Import Libraries**

In [1]:
# Import necessary libraries
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import json
from PIL import Image
from datetime import datetime

2025-04-27 14:52:35.816835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745765555.827071   71150 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745765555.830115   71150 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745765555.838817   71150 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745765555.838833   71150 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745765555.838835   71150 computation_placer.cc:177] computation placer alr

### **Download and Prepare Data for Multiple Formats**

In [2]:
# Build the local output tree: one root, one folder per export format,
# and a train/test pair under the Vertex AI Datasets folder.
# Parents are listed before children; exist_ok makes the cell re-runnable.
for directory in (
    './fashion_mnist_data',
    './fashion_mnist_data/custom_jobs',
    './fashion_mnist_data/vertex_datasets',
    './fashion_mnist_data/vertex_datasets/train',
    './fashion_mnist_data/vertex_datasets/test',
):
    os.makedirs(directory, exist_ok=True)

In [3]:
# Fetch Fashion MNIST through Keras; load_data() returns a
# (train, test) pair of (images, labels) tuples.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()

# Human-readable class names, positioned so that class_names[label]
# resolves an integer label. Names avoid spaces and slashes because they
# are later used as folder names.
class_names = [
    'T-shirt_top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle_boot',
]

# Report the array shapes of both splits.
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: (60000, 28, 28)
Test data shape: (10000, 28, 28)


### **Save in Multiple Formats**

##### **Format 1: NumPy Arrays (Best for Vertex AI Custom Jobs)**

In [4]:
# Export for Custom Jobs: a single compressed .npz archive holding all
# four arrays, keyed by the names a training script will look up.
np.savez_compressed(
    './fashion_mnist_data/custom_jobs/fashion_mnist.npz',
    **{
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    },
)

# Store the label-index -> class-name list next to the archive.
with open('./fashion_mnist_data/custom_jobs/class_names.json', 'w') as f:
    f.write(json.dumps(class_names))

print("NumPy format for Custom Jobs saved successfully")

NumPy format for Custom Jobs saved successfully


##### **Format 2: Images with CSV (Best for Vertex AI Datasets)**

In [5]:
# Function to save images and create CSV for Vertex AI Datasets
def create_vertex_dataset_format(
    images,
    labels,
    split='train',
    output_root='./fashion_mnist_data/vertex_datasets',
    gcs_prefix='gs://fashion-mnist-datasets/vertex_datasets',
):
    """Write each image as a JPEG under a per-class folder and build the import table.

    Files are written to ``{output_root}/{split}/{class_name}/{split}_{idx:05d}.jpg``
    and each row of the returned table points at the matching object under
    ``{gcs_prefix}/{split}/{class_name}/``. Nothing is uploaded here; the GCS
    path is only the string recorded in the table.

    Args:
        images: Iterable of 2-D image arrays accepted by ``Image.fromarray``.
        labels: Iterable of integer labels used to index the module-level
            ``class_names`` list; paired with ``images`` positionally.
        split: Name of the split; used as sub-folder and filename prefix.
        output_root: Local directory that receives the ``{split}`` folder.
        gcs_prefix: ``gs://`` prefix recorded in the table for each image.

    Returns:
        pandas.DataFrame with columns ``gcs_path`` and ``label`` (the class
        name), one row per image. The columns are present even when the
        input is empty.
    """
    csv_data = []

    # Create class folders
    for class_name in class_names:
        os.makedirs(f'{output_root}/{split}/{class_name}', exist_ok=True)

    for idx, (image, label) in enumerate(zip(images, labels)):
        class_name = class_names[label]
        image_filename = f"{split}_{idx:05d}.jpg"

        # Save as JPEG (better compression for storage).
        # NOTE: JPEG is lossy, so saved pixels can differ slightly from the array.
        local_path = f'{output_root}/{split}/{class_name}/{image_filename}'
        Image.fromarray(image).convert('L').save(local_path, 'JPEG', quality=95)

        # GCS path format for Vertex AI Datasets (always '/'-separated)
        gcs_path = f'{gcs_prefix}/{split}/{class_name}/{image_filename}'

        # Add to CSV data
        csv_data.append({
            'gcs_path': gcs_path,
            'label': class_name
        })

        if idx % 1000 == 0:
            print(f"Processed {idx} {split} images...")

    # Pin the columns so an empty input still yields the expected schema.
    return pd.DataFrame(csv_data, columns=['gcs_path', 'label'])

In [6]:
# Write the JPEGs for each split and collect the (gcs_path, label) tables.
train_df = create_vertex_dataset_format(X_train, y_train, 'train')
test_df = create_vertex_dataset_format(X_test, y_test, 'test')

# Stack both splits (train rows first) for an optional single-file import.
combined_df = pd.concat((train_df, test_df))

# Emit the import CSVs without index or header row: each line is just
# "<gcs_path>,<label>".
train_df.to_csv(
    './fashion_mnist_data/vertex_datasets/train.csv',
    header=False,
    index=False,
)
test_df.to_csv(
    './fashion_mnist_data/vertex_datasets/test.csv',
    header=False,
    index=False,
)
combined_df.to_csv(
    './fashion_mnist_data/vertex_datasets/all_data.csv',
    header=False,
    index=False,
)

print("Vertex AI Datasets format saved successfully")

Processed 0 train images...
Processed 1000 train images...
Processed 2000 train images...
Processed 3000 train images...
Processed 4000 train images...
Processed 5000 train images...
Processed 6000 train images...
Processed 7000 train images...
Processed 8000 train images...
Processed 9000 train images...
Processed 10000 train images...
Processed 11000 train images...
Processed 12000 train images...
Processed 13000 train images...
Processed 14000 train images...
Processed 15000 train images...
Processed 16000 train images...
Processed 17000 train images...
Processed 18000 train images...
Processed 19000 train images...
Processed 20000 train images...
Processed 21000 train images...
Processed 22000 train images...
Processed 23000 train images...
Processed 24000 train images...
Processed 25000 train images...
Processed 26000 train images...
Processed 27000 train images...
Processed 28000 train images...
Processed 29000 train images...
Processed 30000 train images...
Processed 31000 train

##### **Create README and Manifest**

In [10]:
# Create README file
# The text is Markdown. The usage snippet is wrapped in a fenced block that is
# both opened AND closed, and the notes under step 2 are real list items, so
# the file renders correctly in Markdown viewers.
readme_content = f"""# Fashion MNIST Dataset for Vertex AI

This directory contains the Fashion MNIST dataset optimized for Google Cloud Vertex AI.

## Directory Structure:
- `/custom_jobs`: NumPy compressed arrays for Vertex AI Custom Training Jobs
  - `fashion_mnist.npz`: Contains X_train, y_train, X_test, y_test
  - `class_names.json`: List of class names

- `/vertex_datasets`: Images with CSV for Vertex AI Datasets
  - `/train`: Training images organized by class
  - `/test`: Test images organized by class
  - `train.csv`: CSV for training data import
  - `test.csv`: CSV for test data import
  - `all_data.csv`: Combined CSV for full dataset import

## Dataset Details:
- Training samples: {len(X_train)}
- Test samples: {len(X_test)}
- Image dimensions: 28x28 grayscale
- Classes: {', '.join(class_names)}

## Usage:
1. For Vertex AI Custom Training: Load the NPZ file
   ```python
   data = np.load('fashion_mnist.npz')
   X_train, y_train = data['X_train'], data['y_train']
   X_test, y_test = data['X_test'], data['y_test']
   ```
2. For Vertex AI Datasets: Upload the CSV files to create Image Datasets
   - Use all_data.csv for the complete dataset
   - Use separate train/test CSVs for split datasets

Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
with open('./fashion_mnist_data/README.md', 'w') as f:
    f.write(readme_content)

In [11]:
# Machine-readable description of the export: which formats were written,
# which files belong to each, and basic dataset statistics.
manifest = {
    'dataset_name': 'Fashion MNIST',
    'creation_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'formats': {
        'custom_jobs': {
            'type': 'numpy_compressed',
            'files': ['fashion_mnist.npz', 'class_names.json'],
        },
        'vertex_datasets': {
            'type': 'images_with_csv',
            'files': ['train.csv', 'test.csv', 'all_data.csv'],
            'image_format': 'JPEG',
        },
    },
    'statistics': {
        'train_samples': len(X_train),
        'test_samples': len(X_test),
        # Per-image shape: every axis after the leading sample axis.
        'image_shape': list(X_train.shape[1:]),
        'classes': class_names,
    },
    'gcs_bucket': 'fashion-mnist-datasets',
}

# Serialise with 4-space indentation so the file stays human-readable.
with open('./fashion_mnist_data/manifest.json', 'w') as f:
    f.write(json.dumps(manifest, indent=4))

print("README and manifest created successfully")

README and manifest created successfully


### **Verify Local Files**

In [12]:
# Verify local files
def verify_local_files():
    """Print a report on the files produced by this notebook.

    Three checks are performed, all reported via ``print``:

    1. Each expected top-level artefact (README, manifest, NPZ, class names,
       CSVs) is checked for existence and its size in MB is shown.
    2. ``.jpg`` files are counted across the per-class folders of the train
       and test splits, using the module-level ``class_names`` list.
    3. The NPZ archive is opened and the shape of every stored array is
       listed. Any failure while loading is reported instead of raised.

    Returns:
        None.
    """
    print("\nVerifying local files...")

    files_to_check = [
        './fashion_mnist_data/README.md',
        './fashion_mnist_data/manifest.json',
        './fashion_mnist_data/custom_jobs/fashion_mnist.npz',
        './fashion_mnist_data/custom_jobs/class_names.json',
        './fashion_mnist_data/vertex_datasets/train.csv',
        './fashion_mnist_data/vertex_datasets/test.csv',
        './fashion_mnist_data/vertex_datasets/all_data.csv'
    ]

    for file_path in files_to_check:
        if os.path.exists(file_path):
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"✅ {file_path} ({size_mb:.2f} MB)")
        else:
            print(f"❌ {file_path} missing")

    # Count images
    train_count = 0
    test_count = 0

    for class_name in class_names:
        train_path = f'./fashion_mnist_data/vertex_datasets/train/{class_name}'
        test_path = f'./fashion_mnist_data/vertex_datasets/test/{class_name}'

        # isdir (not exists): listdir would raise if the path were a regular file.
        if os.path.isdir(train_path):
            train_count += sum(1 for f in os.listdir(train_path) if f.endswith('.jpg'))
        if os.path.isdir(test_path):
            test_count += sum(1 for f in os.listdir(test_path) if f.endswith('.jpg'))

    print("\nImage counts:")
    print(f"Training images: {train_count}")
    print(f"Test images: {test_count}")

    # Verify NPZ file
    try:
        # np.load on an .npz returns a lazily-read archive that holds an open
        # file handle; the context manager guarantees it is closed.
        with np.load('./fashion_mnist_data/custom_jobs/fashion_mnist.npz') as data:
            print("\nNPZ file contents:")
            for key in data.files:
                print(f"- {key}: shape {data[key].shape}")
    except Exception as e:
        # Best-effort check: report the problem rather than abort the cell.
        print(f"Error loading NPZ file: {e}")

verify_local_files()


Verifying local files...
✅ ./fashion_mnist_data/README.md (0.00 MB)
✅ ./fashion_mnist_data/manifest.json (0.00 MB)
✅ ./fashion_mnist_data/custom_jobs/fashion_mnist.npz (29.44 MB)
✅ ./fashion_mnist_data/custom_jobs/class_names.json (0.00 MB)
✅ ./fashion_mnist_data/vertex_datasets/train.csv (4.65 MB)
✅ ./fashion_mnist_data/vertex_datasets/test.csv (0.76 MB)
✅ ./fashion_mnist_data/vertex_datasets/all_data.csv (5.40 MB)

Image counts:
Training images: 60000
Test images: 10000

NPZ file contents:
- X_train: shape (60000, 28, 28)
- y_train: shape (60000,)
- X_test: shape (10000, 28, 28)
- y_test: shape (10000,)


***
***