In [1]:
!pip install transformers torch numpy google-cloud-storage



In [None]:
import os
import json
import numpy as np
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
from google.cloud import storage
import uuid

# --- 1. initialization ---
# GCP project used when creating the Cloud Storage client.
PROJECT_ID = "629242692180"
# Destination object used when exporting in 'gcp' mode.
GCS_OUTPUT_PATH = "gs://axmt_equipment_profile/siglip_vectors/local_image_vectors.json"

# Alternative configuration kept for reference (swap both values together):
#   LOCAL_OUTPUT_FILENAME = "jsonl/local_image_vectors.jsonl"
#   equipment_base = "dataset/equipment_train"
LOCAL_OUTPUT_FILENAME = "jsonl/local_image_vectors_test.jsonl"
equipment_base = "dataset/equipment_test"

# Every non-hidden sub-directory of `equipment_base` is picked up automatically,
# so the directories no longer need to be listed by hand.
LOCAL_DATA_PATHS = []
if os.path.isdir(equipment_base):
    LOCAL_DATA_PATHS = [
        os.path.join(equipment_base, entry)
        for entry in sorted(os.listdir(equipment_base))
        # Skip hidden entries like .DS_Store and anything that is not a directory.
        if not entry.startswith('.')
        and os.path.isdir(os.path.join(equipment_base, entry))
    ]
else:
    # Previously a missing base directory was silently reported as "Found 0".
    print(f"Warning: Directory not found - {equipment_base}")

print(f"Found {len(LOCAL_DATA_PATHS)} directories in {equipment_base}:")
for path in LOCAL_DATA_PATHS:
    print(f"  - {path}")

# Load SigLIP Model (run once)
SIGLIP_MODEL_ID = "google/siglip-base-patch16-224"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

try:
    processor = AutoProcessor.from_pretrained(SIGLIP_MODEL_ID)
    model = AutoModel.from_pretrained(SIGLIP_MODEL_ID).to(DEVICE)
    model.eval()  # inference only
    print("SigLIP model loaded successfully.")
except Exception as e:
    print(f"Error loading SigLIP model: {e}")
    # `exit()` is an interactive-shell helper and reports success (status 0).
    # Raising SystemExit stops execution here with a failing status instead.
    raise SystemExit(1) from e

# --- 2. Helper function: Upload Local file to GCS ---
def upload_to_gcs(local_path: str, gcs_path: str):
    """Copy the file at ``local_path`` to the Cloud Storage object ``gcs_path``.

    ``gcs_path`` is interpreted as ``gs://<bucket>/<object path>``: the text up
    to the first ``/`` (after the ``gs://`` marker is stripped) is the bucket
    name and everything after it is the object name.

    Returns True when the upload completes. Any exception raised along the way
    is printed and reported as False rather than propagated.
    """
    try:
        client = storage.Client(project=PROJECT_ID)

        # "bucket/some/object" -> ("bucket", "/", "some/object")
        without_scheme = gcs_path.replace("gs://", "")
        bucket_name, _, object_name = without_scheme.partition("/")

        target = client.bucket(bucket_name).blob(object_name)
        target.upload_from_filename(local_path)

        print(f"File uploaded to GCS successfully: {gcs_path}")
        return True
    except Exception as e:
        print(f"!!! ERROR: GCS upload failed: {e}")
        return False

# --- 3. Main function to generate Embedding with Export options ---
def _collect_image_paths(local_dirs: list) -> list:
    """Return paths of the .jpg/.jpeg/.png files directly inside each directory."""
    image_paths = []
    for local_dir in local_dirs:
        if not os.path.isdir(local_dir):
            print(f"Warning: Directory not found - {local_dir}")
            continue
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # record order in the output file reproducible between runs.
        image_paths.extend(
            os.path.join(local_dir, filename)
            for filename in sorted(os.listdir(local_dir))
            if filename.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
    return image_paths


def generate_siglip_embeddings(
    local_dirs: list,
    temp_local_filename: str,
    export_mode: str,
    gcs_output_path: str = None
):
    """
    Create SigLIP image embeddings and write them as JSON Lines.

    One record is written per image with the keys ``id``, ``embedding``,
    ``original_path`` and ``label_class``. The label is the name of the
    directory that contains the image; the id is that label plus a random
    UUID, so ids differ between runs. Images that fail to load or embed are
    reported and skipped.

    :param local_dirs: directories to scan (non-recursively) for images
    :param temp_local_filename: path of the JSONL file to write; its parent
        directory is created if it does not exist
    :param export_mode: 'local' keeps the file on disk; 'gcp' uploads it to
        ``gcs_output_path`` and deletes the local copy if the upload succeeds
    :param gcs_output_path: destination ``gs://`` path, required for 'gcp'
    :raises ValueError: if ``export_mode`` is neither 'local' nor 'gcp'
    :return: None
    """
    if export_mode not in ['local', 'gcp']:
        raise ValueError("export_mode must be 'local' or 'gcp'")

    image_paths = _collect_image_paths(local_dirs)
    if not image_paths:
        print("No image files found. Exiting.")
        return

    # Fail fast: without a destination the 'gcp' export cannot succeed, so do
    # not spend time embedding every image before reporting it.
    if export_mode == 'gcp' and not gcs_output_path:
        print("ERROR: specific GCS Path is required for 'gcp' mode")
        return

    # open(..., "w") does not create missing parent directories.
    output_dir = os.path.dirname(temp_local_filename)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    count = 0

    # --- A. create the JSONL file ---
    with open(temp_local_filename, "w", encoding="utf-8") as f_out:
        for full_path in image_paths:
            # The class label is the name of the image's parent directory.
            label_class = os.path.basename(os.path.dirname(full_path))
            image_id = f"{label_class}_{uuid.uuid4()}"

            try:
                # 1. Load and prepare the image. The context manager closes the
                #    underlying file right away instead of leaving it open
                #    until garbage collection.
                with Image.open(full_path) as raw_image:
                    image = raw_image.convert("RGB")
                inputs = processor(images=image, return_tensors="pt").to(DEVICE)

                # 2. Compute the embedding and scale it to unit L2 norm.
                with torch.no_grad():
                    outputs = model.get_image_features(**inputs)
                    image_embedding = outputs / outputs.norm(p=2, dim=-1, keepdim=True)

                # One image per call, so dim 0 is presumably a batch of size 1.
                embedding_vector = image_embedding.squeeze(0).tolist()

                # 3. Build the JSONL record.
                jsonl_record = {
                    "id": image_id,
                    "embedding": embedding_vector,
                    "original_path": full_path,
                    "label_class": label_class
                }

                # 4. Append it to the local file.
                f_out.write(json.dumps(jsonl_record) + "\n")
                count += 1

            except Exception as e:
                # Best effort: one unreadable image must not abort the whole run.
                print(f"Error processing {full_path}: {e}")
                continue

    print(f"\n--- Processing complete: {count} items ---")

    # --- B. export according to the selected mode ---
    if export_mode == 'gcp':
        # If the upload is successful, delete the local file to save space.
        if upload_to_gcs(temp_local_filename, gcs_output_path):
            os.remove(temp_local_filename)
            print("Local file has been deleted.")

    elif export_mode == 'local':
        print(f"File has been saved to Local Path: {temp_local_filename}")


# --- 4. Example usage (select one mode) ---
# Visual separator between the setup output above and the run output below.
print("\n#####################################################")

# Example 1: export to local disk.
# Embeds the images found under LOCAL_DATA_PATHS and keeps the resulting
# JSONL file at LOCAL_OUTPUT_FILENAME.
generate_siglip_embeddings(
    local_dirs=LOCAL_DATA_PATHS,
    temp_local_filename=LOCAL_OUTPUT_FILENAME,
    export_mode='local'
)

# Example 2: export to GCP Cloud Storage (uncomment to use instead of Example 1).
# generate_siglip_embeddings(
#     local_dirs=LOCAL_DATA_PATHS,
#     temp_local_filename=LOCAL_OUTPUT_FILENAME,
#     export_mode='gcp',
#     gcs_output_path=GCS_OUTPUT_PATH
# )

Found 21 directories in dataset/equipment_test:
  - dataset/equipment_test/AI1
  - dataset/equipment_test/AI10
  - dataset/equipment_test/AI11
  - dataset/equipment_test/AI12
  - dataset/equipment_test/AI13
  - dataset/equipment_test/AI14
  - dataset/equipment_test/AI15
  - dataset/equipment_test/AI16
  - dataset/equipment_test/AI17
  - dataset/equipment_test/AI18
  - dataset/equipment_test/AI19
  - dataset/equipment_test/AI2
  - dataset/equipment_test/AI21
  - dataset/equipment_test/AI22
  - dataset/equipment_test/AI3
  - dataset/equipment_test/AI4
  - dataset/equipment_test/AI5
  - dataset/equipment_test/AI6
  - dataset/equipment_test/AI7
  - dataset/equipment_test/AI8
  - dataset/equipment_test/AI9
Using device: cpu
SigLIP model loaded successfully.

#####################################################
SigLIP model loaded successfully.

#####################################################


KeyboardInterrupt: 