In [1]:
import h5py
import numpy as np
from pathlib import Path

def _summarize_dataset(dset, name, max_chunk_bytes=256 * 1024 * 1024):
    """Stream over ``dset`` along its first axis and return summary statistics.

    Only one slice of at most roughly ``max_chunk_bytes`` is held in memory at
    a time, so the whole dataset never has to fit in RAM.

    Args:
        dset: Array-like exposing ``shape``, ``dtype`` and slice indexing
            (an ``h5py`` dataset or a NumPy array).
        name: Dataset name, used only in error messages.
        max_chunk_bytes: Approximate upper bound on the size of each slice.

    Returns:
        dict with keys ``nonzero``, ``mean``, ``std``, ``min`` and ``max``.
        ``std`` is the population standard deviation (same as ``np.std``).

    Raises:
        ValueError: If a NaN or infinite value is found, or if the dataset
            contains no elements.
    """
    n_rows = dset.shape[0]
    # Bytes per entry along axis 0; clamped to 1 so a zero-sized trailing
    # dimension cannot cause a division by zero below.
    row_bytes = max(1, int(np.prod(dset.shape[1:], dtype=np.int64)) * dset.dtype.itemsize)
    rows_per_chunk = max(1, int(max_chunk_bytes) // row_bytes)

    count = 0
    nonzero = 0
    mean = 0.0
    m2 = 0.0  # running sum of squared deviations from the running mean
    lowest = None
    highest = None

    for start in range(0, n_rows, rows_per_chunk):
        block = np.asarray(dset[start:start + rows_per_chunk])
        if block.size == 0:
            continue

        # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
        if np.isnan(block).any():
            raise ValueError(f"NaN values found in {name}")
        if np.isinf(block).any():
            raise ValueError(f"Infinite values found in {name}")

        values = block.astype(np.float64, copy=False)
        block_count = values.size
        block_mean = values.mean()
        block_m2 = np.square(values - block_mean).sum()

        # Merge this block's mean/variance into the running totals
        # (pairwise update; avoids a second pass over the data).
        delta = block_mean - mean
        merged = count + block_count
        mean += delta * block_count / merged
        m2 += block_m2 + delta * delta * count * block_count / merged
        count = merged

        nonzero += int(np.count_nonzero(block))
        block_min = block.min()
        block_max = block.max()
        lowest = block_min if lowest is None else min(lowest, block_min)
        highest = block_max if highest is None else max(highest, block_max)

    if count == 0:
        raise ValueError(f"Dataset {name} is empty")

    return {
        "nonzero": nonzero,
        "mean": float(mean),
        "std": float(np.sqrt(m2 / count)),
        "min": lowest,
        "max": highest,
    }


def load_and_verify_h5(save_dir, model_name, dataset_name, train_val,
                       layer_types=("hook_mlp_out", "hook_resid_post"),
                       max_chunk_bytes=256 * 1024 * 1024):
    """Open one HDF5 file per layer type and sanity-check every dataset in it.

    For each entry of ``layer_types`` the file path is built with
    ``create_save_path``. Every key except ``'image_indices'`` is summarised
    (shape, dtype, non-zero count, mean, std, min, max), checked for NaN and
    infinite values, and read once at a random index along the first axis.
    Statistics are computed chunk by chunk so the dataset is never loaded
    into memory in full.

    Args:
        save_dir: Root directory passed to ``create_save_path``.
        model_name: Model directory name passed to ``create_save_path``.
        dataset_name: Dataset directory name passed to ``create_save_path``.
        train_val: Split directory name passed to ``create_save_path``.
        layer_types: Iterable of layer-type names; one file is opened per name.
        max_chunk_bytes: Approximate upper bound on the amount of data read
            from a dataset at once while computing statistics.

    Returns:
        True if every file was opened and every check passed; False if any
        exception was raised (the message is printed, not re-raised).
    """
    try:
        # Load each layer type file
        for layer_type in layer_types:
            file_path = create_save_path(save_dir, model_name, dataset_name, train_val, layer_type)
            print(f"\nLoading {file_path}")

            with h5py.File(file_path, 'r') as f:
                # Print basic file info
                print(f"\nFile: {layer_type}")
                print("Keys:", list(f.keys()))

                # Check each layer's activations
                for key in f.keys():
                    if key == 'image_indices':
                        continue

                    dset = f[key]
                    print(f"\nLayer: {key}")
                    print(f"Shape: {dset.shape}")
                    print(f"Data type: {dset.dtype}")

                    stats = _summarize_dataset(dset, key, max_chunk_bytes)
                    print(f"Non-zero elements: {stats['nonzero']}")
                    print(f"Mean: {stats['mean']:.6f}")
                    print(f"Std: {stats['std']:.6f}")
                    print(f"Min: {stats['min']:.6f}")
                    print(f"Max: {stats['max']:.6f}")

                    # Test random access against the dataset itself, so the
                    # read really goes through the file rather than a copy.
                    random_idx = int(np.random.randint(0, dset.shape[0]))
                    _ = dset[random_idx]
                    print(f"Successfully accessed random index {random_idx}")

        print("\nAll files loaded and verified successfully!")
        return True

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure to the caller.
        print(f"\nError during verification: {str(e)}")
        return False

def create_save_path(save_dir, model_name, dataset_name, train_val, layer_type):
    """Return the path of the HDF5 file for one layer type, as a string.

    The result is ``<save_dir>/<model_name>/<dataset_name>/<train_val>/<layer_type>.h5``.
    The four trailing components are first joined with ``/`` into a single
    relative segment and then appended to ``save_dir`` through ``pathlib``.
    """
    relative_part = "/".join(
        [f"{model_name}", f"{dataset_name}", f"{train_val}", f"{layer_type}.h5"]
    )
    return str(Path(save_dir).joinpath(relative_part))

# Example usage:
if __name__ == "__main__":
    # Replace these with your actual values
    save_dir = "/network/scratch/s/sonia.joseph/CLIP_AUDIT/"
    model_name = "open-clip_laion_CLIP-ViT-B-32-DataComp.XL-s13B-b90K/"
    dataset_name = "imagenet21k"
    train_val = "train"  # or "val"

    success = load_and_verify_h5(
        save_dir=save_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        train_val=train_val
    )

    if success:
        # Optional: Load specific data for further testing
        layer_type = "hook_mlp_out"
        file_path = create_save_path(save_dir, model_name, dataset_name, train_val, layer_type)

        with h5py.File(file_path, 'r') as f:
            # Select the first activation dataset by name rather than by
            # position: the key listing puts 'image_indices' last, so a fixed
            # index of 1 silently skips the first layer.
            layer_name = next((key for key in f.keys() if key != 'image_indices'), None)

            if layer_name is None:
                print(f"\nNo activation datasets found in {file_path}")
            else:
                # Example: Load first 100 activations from a specific layer
                sample_data = f[layer_name][:100]
                print(f"\nSample data shape from {layer_name}: {sample_data.shape}")

                # Example: Get corresponding image indices (if the file stores them)
                if 'image_indices' in f:
                    sample_indices = f['image_indices'][:100]
                    print(f"Corresponding image indices: {sample_indices}")


Loading /network/scratch/s/sonia.joseph/CLIP_AUDIT/open-clip_laion_CLIP-ViT-B-32-DataComp.XL-s13B-b90K/imagenet21k/train/hook_mlp_out.h5

File: hook_mlp_out
Keys: ['blocks.0.hook_mlp_out', 'blocks.1.hook_mlp_out', 'blocks.10.hook_mlp_out', 'blocks.11.hook_mlp_out', 'blocks.2.hook_mlp_out', 'blocks.3.hook_mlp_out', 'blocks.4.hook_mlp_out', 'blocks.5.hook_mlp_out', 'blocks.6.hook_mlp_out', 'blocks.7.hook_mlp_out', 'blocks.8.hook_mlp_out', 'blocks.9.hook_mlp_out', 'image_indices']


In [2]:
import h5py
import numpy as np
import os
import h5py
import numpy as np
from pathlib import Path

import h5py
import numpy as np
import os

def _last_nonempty_index(ids, chunk_size):
    """Return the index of the last truthy entry of ``ids``, or -1 if there is none.

    The sequence is read in slices of ``chunk_size`` starting from the end, so
    a long tail of empty entries costs a handful of bulk reads instead of one
    read per element.
    """
    for stop in range(len(ids), 0, -chunk_size):
        start = max(0, stop - chunk_size)
        chunk = ids[start:stop]
        for offset in range(len(chunk) - 1, -1, -1):
            if chunk[offset]:
                return start + offset
    return -1


def verify_image_ids(save_dir, model_name, dataset_name, train_val):
    """Print a summary of the ``image_ids`` dataset stored in ``image_ids.h5``.

    ``model_name`` has ``/`` and ``:`` replaced by ``_`` before the path
    ``<save_dir>/<model_name>/<dataset_name>/<train_val>/image_ids.h5`` is
    built. An ID counts as valid when it is non-empty and starts with
    ``b'n'``. The function reports the total number of entries, the number of
    valid IDs, the dtype, the first and last ten valid IDs and the file size.

    All reads go through fixed-size slices; the dataset is scanned once from
    the back (to skip trailing empty entries) and once from the front.

    Args:
        save_dir: Root directory containing the per-model folders.
        model_name: Model identifier; ``/`` and ``:`` are replaced by ``_``.
        dataset_name: Dataset directory name.
        train_val: Split directory name.

    Returns:
        True if the file was read and summarised; False if any exception was
        raised (the message is printed, not re-raised).
    """
    model_name = model_name.replace('/', '_')
    model_name = model_name.replace(':', '_')

    file_path = os.path.join(save_dir, model_name, dataset_name, train_val, "image_ids.h5")

    chunk_size = 10000  # entries fetched per read
    preview = 10        # how many IDs to show at each end

    try:
        with h5py.File(file_path, 'r') as f:
            print(f"\nChecking image IDs file: {file_path}")
            print("Keys in file:", list(f.keys()))

            ids = f['image_ids']

            # Find last non-empty ID
            last_valid_idx = _last_nonempty_index(ids, chunk_size)

            # Single forward pass: count valid IDs (starting with 'n') and
            # collect the first/last few of them along the way.
            valid_count = 0
            first_valid = []
            last_valid = []
            for start in range(0, last_valid_idx + 1, chunk_size):
                chunk = ids[start:min(start + chunk_size, last_valid_idx + 1)]
                chunk_valid = [
                    image_id for image_id in chunk
                    if image_id and image_id.startswith(b'n')
                ]
                valid_count += len(chunk_valid)
                if len(first_valid) < preview:
                    first_valid.extend(chunk_valid[:preview - len(first_valid)])
                if chunk_valid:
                    last_valid = (last_valid + chunk_valid[-preview:])[-preview:]

            print("\nDataset Info:")
            print(f"Total entries: {len(ids)}")
            print(f"Valid IDs (starting with 'n'): {valid_count}")
            print(f"Data type: {ids.dtype}")

            # Show first valid IDs
            print("\nFirst 10 valid IDs:")
            print(first_valid)

            # Show last valid IDs
            print("\nLast 10 valid IDs:")
            print(last_valid)

            # File size
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # Convert to MB
            print(f"\nFile size: {file_size:.2f} MB")

            return True

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure to the caller.
        print(f"Error checking image IDs file: {str(e)}")
        return False

if __name__ == "__main__":
    # Replace these with your actual values
    save_dir = "/network/scratch/s/sonia.joseph/CLIP_AUDIT/"
    model_name = "open-clip_laion_CLIP-ViT-B-32-DataComp.XL-s13B-b90K/"
    dataset_name = "imagenet21k"
    train_val = "train"  # or "val"

    # verify_image_ids replaces every '/' in the model name with '_', so a
    # trailing separator would turn into a trailing underscore and point at a
    # directory that does not exist. Strip it before the call.
    success = verify_image_ids(
        save_dir=save_dir,
        model_name=model_name.rstrip('/'),
        dataset_name=dataset_name,
        train_val=train_val
    )


Checking image IDs file: /network/scratch/s/sonia.joseph/CLIP_AUDIT/open-clip_laion_CLIP-ViT-B-32-DataComp.XL-s13B-b90K/imagenet21k/train/image_ids.h5
Keys in file: ['image_ids']

Dataset Info:
Total number of IDs: 14000000
Data type: object

First 10 IDs:
[b'n02689434_3333' b'n02689434_9260' b'n02689434_4489' b'n02689434_5322'
 b'n02689434_406' b'n02689434_420' b'n02689434_519' b'n02689434_810'
 b'n02689434_833' b'n02689434_958']

Last 10 IDs:
[b'' b'' b'' b'' b'' b'' b'' b'' b'' b'']
Error checking image IDs file: Indexing elements must be in increasing order
