In [1]:
import h5py
import numpy as np
from pathlib import Path

def _verify_layer_dataset(key, data):
    """Print summary statistics for one loaded dataset and sanity-check it.

    Args:
        key: Name of the dataset inside the HDF5 file (used in messages only).
        data: The dataset contents, already read into a NumPy array.

    Raises:
        ValueError: If the array is empty or contains NaN / infinite values.
    """
    print(f"\nLayer: {key}")
    print(f"Shape: {data.shape}")
    print(f"Data type: {data.dtype}")

    # Reductions such as np.min and the random-index draw below both fail on
    # zero-size arrays with unhelpful messages, so report emptiness up front.
    if data.size == 0:
        raise ValueError(f"Dataset {key} is empty")

    print(f"Non-zero elements: {np.count_nonzero(data)}")
    print(f"Mean: {np.mean(data):.6f}")
    print(f"Std: {np.std(data):.6f}")
    print(f"Min: {np.min(data):.6f}")
    print(f"Max: {np.max(data):.6f}")

    # Basic sanity checks. Explicit raises instead of `assert` so the checks
    # still run when Python is started with -O.
    if np.any(np.isnan(data)):
        raise ValueError(f"NaN values found in {key}")
    if np.any(np.isinf(data)):
        raise ValueError(f"Infinite values found in {key}")

    # Test random access along the first axis.
    random_idx = np.random.randint(0, len(data))
    _ = data[random_idx]
    print(f"Successfully accessed random index {random_idx}")


def load_and_verify_h5(save_dir, model_name, dataset_name, train_val, layer_types=("hook_mlp_out", "hook_resid_post")):
    """Open one HDF5 file per layer type and run basic sanity checks on it.

    For each entry of ``layer_types`` the file path is built with
    ``create_save_path`` and the file is opened read-only. If the file has an
    ``'image_indices'`` dataset, its length and value range are printed.
    Every other top-level key is read fully into memory, summarised on stdout
    and checked for emptiness, NaN and infinite values.

    Args:
        save_dir: Directory passed through to ``create_save_path``.
        model_name: Model identifier passed through to ``create_save_path``.
        dataset_name: Dataset identifier passed through to ``create_save_path``.
        train_val: Split identifier passed through to ``create_save_path``.
        layer_types: Iterable of layer-type names; one file is checked per
            entry. Defaults to ``("hook_mlp_out", "hook_resid_post")``.

    Returns:
        ``True`` if every file was opened and passed all checks, ``False`` if
        any exception was raised along the way (its message is printed).
    """
    try:
        # Load each layer type file
        for layer_type in layer_types:
            file_path = create_save_path(save_dir, model_name, dataset_name, train_val, layer_type)
            print(f"\nLoading {file_path}")

            with h5py.File(file_path, 'r') as f:
                # Print basic file info
                print(f"\nFile: {layer_type}")
                print("Keys:", list(f.keys()))

                # Check image indices
                if 'image_indices' in f:
                    image_indices = f['image_indices'][:]
                    print(f"Number of images: {len(image_indices)}")
                    if image_indices.size == 0:
                        # .min()/.max() would raise an obscure reduction
                        # error on an empty array; fail with a clear one.
                        raise ValueError(f"image_indices is empty in {file_path}")
                    print(f"Image indices range: {image_indices.min()} to {image_indices.max()}")

                # Check each layer's activations
                for key in f:
                    if key == 'image_indices':
                        continue
                    # NOTE: `[:]` reads the whole dataset into memory.
                    _verify_layer_dataset(key, f[key][:])

        print("\nAll files loaded and verified successfully!")
        return True

    except Exception as e:
        # Deliberate catch-all: this function reports any failure as a
        # printed message plus a False return value.
        print(f"\nError during verification: {str(e)}")
        return False

def create_save_path(save_dir, model_name, dataset_name, train_val, layer_type):
    """Return the path of the HDF5 file for one model/dataset/split/layer type.

    The file name is the four identifiers joined by underscores with an
    ``.h5`` suffix, placed inside ``save_dir``. Adapt this function if your
    files follow a different naming convention.

    Returns:
        The path as a plain string.
    """
    file_name = "{}_{}_{}_{}.h5".format(model_name, dataset_name, train_val, layer_type)
    target = Path(save_dir).joinpath(file_name)
    return str(target)

# Example usage: verify the files for one model/dataset/split, then load a
# small sample from one of them.
if __name__ == "__main__":
    # Replace these with your actual values
    save_dir = "/network/scratch/s/sonia.joseph/CLIP_AUDIT/open-clip_laion_CLIP-ViT-B-32-DataComp.XL-s13B-b90K/imagenet21k/train"
    model_name = "your_model_name"
    dataset_name = "imagenet21k"
    train_val = "train"  # or "val"

    success = load_and_verify_h5(
        save_dir=save_dir,
        model_name=model_name,
        dataset_name=dataset_name,
        train_val=train_val
    )

    if success:
        # Optional: Load specific data for further testing
        layer_type = "hook_mlp_out"
        file_path = create_save_path(save_dir, model_name, dataset_name, train_val, layer_type)

        with h5py.File(file_path, 'r') as f:
            # Select the first layer dataset by name, not by position: key
            # order is not guaranteed to put 'image_indices' first, and a
            # file with a single key would make a positional [1] lookup fail.
            layer_name = next((key for key in f if key != 'image_indices'), None)

            if layer_name is None:
                print(f"\nNo layer datasets found in {file_path}")
            else:
                # Example: Load first 100 activations from a specific layer
                sample_data = f[layer_name][:100]
                print(f"\nSample data shape from {layer_name}: {sample_data.shape}")

                # Example: Get corresponding image indices (the dataset is
                # optional, matching the check in load_and_verify_h5)
                if 'image_indices' in f:
                    sample_indices = f['image_indices'][:100]
                    print(f"Corresponding image indices: {sample_indices}")


Loading path/to/your/save/directory/your_model_name_imagenet21k_train_hook_mlp_out.h5

Error during verification: [Errno 2] Unable to synchronously open file (unable to open file: name = 'path/to/your/save/directory/your_model_name_imagenet21k_train_hook_mlp_out.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
