In [11]:
import joblib
# ------------------------------------------------------
# sarpyx - SAR Processing and Analysis in Python
# Load the processed array (presumably the azimuth-compressed SAR image, going
# by the file name) and the burst metadata table.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
data = joblib.load(
    '/Data_large/marine/PythonProjects/SAR/sarpyx/data/output/sar_stepped_processing_azimuth_compression.pkl'
)
metadata = joblib.load(
    '/Data_large/marine/PythonProjects/SAR/sarpyx/processed_data/s1a-s1-raw-s-vh-20150107t054312-20150107t054337-004061-004e74_burst_1_metadata.pkl'
)

# Confirm both loads finished and report how many metadata entries came back.
print('Data loaded successfully')
print(len(metadata))

Data loaded successfully
40722


In [12]:
# Display the metadata table loaded from the burst metadata pickle.
metadata

Unnamed: 0,packet_version_number,packet_type,secondary_header_flag,pid,pcat,sequence_flags,packet_sequence_count,packet_data_length,coarse_time,fine_time,...,swath_number,number_of_quads,signal_type_name,data_take_hex,samples_per_line,polarization_name,temp_comp_name,sync_marker_valid,baq_mode_valid,packet_version_valid
0,0,0,True,65,12,3,6021,20329,1104644611,0.543404,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
1,0,0,True,65,12,3,6022,20217,1104644611,0.543938,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
2,0,0,True,65,12,3,6023,20221,1104644611,0.544472,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
3,0,0,True,65,12,3,6024,20213,1104644611,0.545006,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
4,0,0,True,65,12,3,6025,20185,1104644611,0.545540,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40717,0,0,True,65,12,3,14390,16645,1104644633,0.534081,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
40718,0,0,True,65,12,3,14391,16613,1104644633,0.534615,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
40719,0,0,True,65,12,3,14392,16597,1104644633,0.535149,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True
40720,0,0,True,65,12,3,14393,16621,1104644633,0.535683,...,0,12862,echo,0x009CE920,25724,,disabled,True,True,True


In [15]:
# Load the ephemeris table that accompanies the same acquisition.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
ephemeris = joblib.load(
    '/Data_large/marine/PythonProjects/SAR/sarpyx/processed_data/s1a-s1-raw-s-vh-20150107t054312-20150107t054337-004061-004e74_ephemeris.pkl'
)

In [17]:
import zarr
import numcodecs

def _attach_dataframe_attrs(zarr_array, name: str, frame) -> None:
    """Store a DataFrame on ``zarr_array.attrs`` under ``name``-prefixed keys."""
    # Missing values are replaced with the string 'null' before the rows are
    # turned into records, so the stored records contain no NaN values.
    zarr_array.attrs[name] = frame.fillna('null').to_dict('records')
    # Column order and original dtypes are stored next to the records.
    zarr_array.attrs[f'{name}_columns'] = list(frame.columns)
    zarr_array.attrs[f'{name}_dtypes'] = frame.dtypes.astype(str).to_dict()
    print(f'Added {name} with {len(frame)} records as zarr attributes')


def save_array_to_zarr(array: 'np.ndarray', file_path: str, compressor_level: int = 9, metadata_df=None, ephemeris_df=None) -> None:
    """
    Save a numpy array to a Zarr store compressed with Blosc (zstd, bit-shuffle).

    The store is opened with ``mode='w'`` and ``zarr_format=2`` and the whole
    array is written in a single assignment. Optional DataFrames are attached
    to the resulting array as attributes.

    Args:
        array (np.ndarray): The numpy array to save. Its first two dimensions
            (or its only dimension, for 1-D input) determine the chunk edge.
        file_path (str): The path to the output Zarr file.
        compressor_level (int): Blosc compression level, passed through as
            ``clevel`` (default is 9, maximum).
        metadata_df: Optional pandas DataFrame stored under the attributes
            'metadata', 'metadata_columns' and 'metadata_dtypes'.
        ephemeris_df: Optional pandas DataFrame stored under the attributes
            'ephemeris', 'ephemeris_columns' and 'ephemeris_dtypes'.

    Returns:
        None

    Raises:
        AssertionError: If ``array`` is None or ``file_path`` is not a
            non-empty string.
    """
    assert array is not None, 'Input array must not be None'
    assert isinstance(file_path, str) and file_path, 'file_path must be a non-empty string'

    # One edge length is shared by every chunk axis: a quarter of the smallest
    # of the first two dimensions, capped at 512 and floored at 64. Slicing
    # the shape (instead of indexing shape[1]) lets 1-D arrays through; for
    # 2-D and higher input the resulting chunk tuple is the same as before.
    leading_dims = array.shape[:2]
    chunk_size = max(64, min([512] + [dim // 4 for dim in leading_dims]))
    chunks = (chunk_size,) * len(leading_dims)

    # zstd with bit-shuffle; the level comes from the caller rather than being
    # pinned to 9, so the compressor_level argument actually takes effect.
    codec = numcodecs.Blosc(
        cname='zstd',
        clevel=compressor_level,
        shuffle=numcodecs.Blosc.BITSHUFFLE,
    )

    zarr_array = zarr.open(
        file_path,
        mode='w',
        shape=array.shape,
        dtype=array.dtype,
        zarr_format=2,
        compressor=codec,
        chunks=chunks,
    )
    zarr_array[:] = array

    # NOTE(review): every DataFrame row becomes one attribute record, so large
    # tables make the attribute payload large; consider storing them as
    # separate arrays if that becomes a problem.
    if metadata_df is not None:
        _attach_dataframe_attrs(zarr_array, 'metadata', metadata_df)

    if ephemeris_df is not None:
        _attach_dataframe_attrs(zarr_array, 'ephemeris', ephemeris_df)

    # Only call it "maximum" when the level really is Blosc's top level.
    label = 'maximum compression' if compressor_level == 9 else 'compression'
    print(f'Saved array to {file_path} with {label} (zstd-{compressor_level}, chunks={chunks})')

# Write the array together with both tables to a local zarr store.
save_array_to_zarr(
    data,
    'data_high_compression.zarr',
    metadata_df=metadata,
    ephemeris_df=ephemeris,
)

Added metadata with 40722 records as zarr attributes
Added ephemeris with 737 records as zarr attributes
Saved array to data_high_compression.zarr with maximum compression (zstd-9, chunks=(512, 512))


In [10]:
import os

# Location of the zarr store whose footprint is measured
zarr_file_path = 'data_high_compression.zarr'

# On-disk footprint: add up every file found under the store directory,
# skipping any entry that no longer exists when it is inspected.
zarr_size = sum(
    os.path.getsize(candidate)
    for root, _subdirs, names in os.walk(zarr_file_path)
    for candidate in (os.path.join(root, name) for name in names)
    if os.path.exists(candidate)
)

# In-memory footprint of the array, used as the uncompressed baseline
original_size = data.nbytes

# Display sizes in human-readable format
def format_bytes(bytes_size: int) -> str:
    """Render a byte count with two decimals, stepping units by 1024.

    Units run B, KB, MB, GB; anything that is still 1024 or more after the
    GB step is reported in TB.
    """
    value = bytes_size
    for suffix in ('B', 'KB', 'MB', 'GB'):
        if not value < 1024.0:
            # Too large for this unit: scale down and try the next one.
            value = value / 1024.0
            continue
        return f'{value:.2f} {suffix}'
    return f'{value:.2f} TB'

# Report the in-memory size next to the on-disk size of the store.
print(f'Original data size in memory: {format_bytes(original_size)}')
print(f'Compressed zarr file size: {format_bytes(zarr_size)}')
# os.walk yields nothing for a missing or empty directory, which leaves
# zarr_size at 0; say so instead of failing with ZeroDivisionError.
if zarr_size > 0:
    print(f'Compression ratio: {original_size / zarr_size:.2f}x')
else:
    print('Compression ratio: n/a (no files found in the zarr store)')
print(f'Data shape: {data.shape}')
print(f'Data type: {data.dtype}')

Original data size in memory: 15.61 GB
Compressed zarr file size: 13.88 GB
Compression ratio: 1.12x
Data shape: (40722, 25724)
Data type: complex128


In [18]:
from sarpyx.utils.zarr_utils import ZarrManager

# Create a ZarrManager for the store at zarr_file_path.
# NOTE(review): zarr_file_path is assigned in the size-check cell, so that
# cell has to run first.
zarr_manager = ZarrManager(zarr_file_path)

# Print the manager's summary of the array, then its compression statistics.
print("Zarr Info:")
print(zarr_manager.info)
print("\nCompression Info:")
print(zarr_manager.get_compression_info())

Zarr Info:
{'shape': (40722, 25724), 'dtype': dtype('complex128'), 'chunks': (512, 512), 'nbytes': 16760523648, 'size_mb': 15984.080932617188}

Compression Info:
{'uncompressed_size_mb': 15984.080932617188, 'compressed_size_mb': 14276.47688293457, 'compression_ratio': 1.1196096252377088, 'space_saved_percent': 10.68315442646485}


In [21]:
# Display the ephemeris table returned by the ZarrManager.
# NOTE(review): presumably rebuilt from the 'ephemeris' attributes written by
# save_array_to_zarr; confirm in sarpyx.utils.zarr_utils.
zarr_manager.get_ephemeris()

Unnamed: 0,x,y,z,vx,vy,vz,time_stamp,q0,q1,q2,...,tile12_efeh_temperature,tile12_efev_temperature,tile12_ta_temperature,tile13_efeh_temperature,tile13_efev_temperature,tile13_ta_temperature,tile14_efeh_temperature,tile14_efev_temperature,tile14_ta_temperature,tgu_temperature
0,4.851713e+06,1.092964e+06,5.024307e+06,5519.073730,-685.893250,-5166.852539,18532861191651328,0.965929,-0.258806,0.000005,...,71,77,77,73,80,77,68,70,64,89
1,4.851713e+06,1.092964e+06,5.024307e+06,5519.073730,-685.893250,-5166.852539,18532861191651328,0.965929,-0.258806,0.000005,...,71,77,77,73,80,77,68,70,64,89
2,4.851713e+06,1.092964e+06,5.024307e+06,5519.073730,-685.893250,-5166.852539,18532861191651328,0.965929,-0.258806,0.000005,...,71,77,77,73,80,77,68,70,64,89
3,4.851713e+06,1.092964e+06,5.024307e+06,5519.073730,-685.893250,-5166.852539,18532861191651328,0.965929,-0.258806,0.000005,...,71,77,77,73,80,77,68,70,64,89
4,4.851713e+06,1.092964e+06,5.024307e+06,5519.073730,-685.893250,-5166.852539,18532861191651328,0.965929,-0.258806,0.000005,...,71,77,77,73,80,77,68,70,64,89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,4.987943e+06,1.075187e+06,4.893378e+06,5378.638184,-736.137329,-5306.817383,18532861611081728,0.965889,-0.258957,0.000002,...,71,77,77,73,80,77,68,75,65,89
733,4.987943e+06,1.075187e+06,4.893378e+06,5378.638184,-736.137329,-5306.817383,18532861611081728,0.965889,-0.258957,0.000002,...,71,77,77,73,80,77,68,75,65,89
734,4.987943e+06,1.075187e+06,4.893378e+06,5378.638184,-736.137329,-5306.817383,18532861611081728,0.965889,-0.258957,0.000002,...,71,77,77,73,80,77,68,75,65,89
735,4.987943e+06,1.075187e+06,4.893378e+06,5378.638184,-736.137329,-5306.817383,18532861611081728,0.965889,-0.258957,0.000002,...,71,77,77,73,80,77,68,75,65,89
