In [1]:
import os
import pandas as pd
import humanfriendly
import numpy as np
from pathlib import Path

In [2]:
def parse_size(values):
    """Convert human-readable size strings (e.g. "2.26 GiB") to byte counts.

    Parameters
    ----------
    values : array-like of str
        Size strings understood by ``humanfriendly.parse_size``. Any
        array-like is accepted (ndarray, list, pandas ``.values``).

    Returns
    -------
    numpy.ndarray
        Array of ``int64`` byte counts with the same shape as ``values``.

    Notes
    -----
    The result dtype is pinned to ``np.int64``. The builtin ``int`` dtype maps
    to the platform C long on older NumPy releases, which is 32-bit on Windows
    and would overflow for sizes above 2 GiB.
    """
    # dtype=object keeps each element as a plain Python string and lets
    # lists or other sequences be passed in as well as ndarrays.
    values = np.asarray(values, dtype=object)
    parsed = [humanfriendly.parse_size(text) for text in values.ravel()]
    return np.array(parsed, dtype=np.int64).reshape(values.shape)


In [3]:
# CSV summary of the Zarr store: one row per array.
fn = "zarr_inspect2.csv"

# The "stored" column holds human-readable sizes (e.g. "2.26 GiB"); add a
# numeric byte count next to it and order arrays from largest to smallest.
df_zarr = (
    pd.read_csv(fn)
    .assign(stored_bytes=lambda frame: parse_size(frame["stored"].values))
    .sort_values("stored_bytes", ascending=False)
)
df_zarr

Unnamed: 0,name,dtype,stored,size,ratio,nchunks,chunk_size,avg_chunk_stored,shape,chunk_shape,compressor,filters,stored_bytes
0,/call_LAD,int16,2.26 GiB,653.86 GiB,290.0,17900,37.41 MiB,132.17 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,2426656522
1,/call_GQ,int8,1.89 GiB,163.47 GiB,87.0,17900,9.35 MiB,110.55 KiB,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,2029372047
2,/call_LA,int8,1.24 GiB,326.93 GiB,260.0,17900,18.7 MiB,72.64 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,1331439861
3,/call_genotype,int8,914.22 MiB,326.93 GiB,370.0,17900,18.7 MiB,52.3 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,958629150
4,/call_RGQ,int16,729.57 MiB,326.93 GiB,460.0,17900,18.7 MiB,41.74 KiB,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,765009592
5,/call_genotype_mask,bool,606.02 MiB,326.93 GiB,550.0,17900,18.7 MiB,34.67 KiB,"(715256, 245394, 2)","(1000, 10000, 2)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,635458027
6,/call_FT,object,498.61 MiB,1.28 TiB,2700.0,17900,74.81 MiB,28.52 KiB,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",[VLenUTF8()],522830479
7,/call_genotype_phased,bool,17.17 MiB,163.47 GiB,9700.0,17900,9.35 MiB,1005 bytes,"(715256, 245394)","(1000, 10000)","Blosc(cname='zstd', clevel=7, shuffle=BITSHUFF...",,18004049
8,/variant_AF,float32,5.95 MiB,256.48 MiB,43.0,716,366.81 KiB,8.51 KiB,"(715256, 94)","(1000, 94)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,6239027
9,/variant_homozygote_count,int32,4.78 MiB,259.21 MiB,54.0,716,370.71 KiB,6.83 KiB,"(715256, 95)","(1000, 95)","Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFL...",,5012193


In [4]:
# Total stored size across all arrays, in bytes.
total = df_zarr["stored_bytes"].sum()

# Show the total using binary units (KiB/MiB/GiB).
humanfriendly.format_size(total, binary=True)

'8.12 GiB'

In [5]:
# Total number of chunks across all arrays.
df_zarr["nchunks"].sum()

151821

In [9]:
# Arrays contributing less than this fraction of the total are left out.
threshold = 0.01 / 100 # 0.01% 

# Build the publication table: select and rename columns, compute each
# array's share of the total, drop negligible rows, sort by share, and only
# then turn the numeric columns into display strings.
df_display_table = (
    pd.DataFrame(
        {
            "Field": df_zarr["name"],
            "type": df_zarr["dtype"],
            "storage": df_zarr["stored"],
            "compress": df_zarr["ratio"],
            "percentage": df_zarr["stored_bytes"] / total,
        }
    )
    .loc[lambda table: table["percentage"] >= threshold]
    .sort_values("percentage", ascending=False)
    .assign(
        percentage=lambda table: table["percentage"].map('{:.2%}'.format),
        compress=lambda table: table["compress"].map('{:.1f}'.format),
    )
)
df_display_table

Unnamed: 0,Field,type,storage,compress,percentage
0,/call_LAD,int16,2.26 GiB,290.0,27.85%
1,/call_GQ,int8,1.89 GiB,87.0,23.29%
2,/call_LA,int8,1.24 GiB,260.0,15.28%
3,/call_genotype,int8,914.22 MiB,370.0,11.00%
4,/call_RGQ,int16,729.57 MiB,460.0,8.78%
5,/call_genotype_mask,bool,606.02 MiB,550.0,7.29%
6,/call_FT,object,498.61 MiB,2700.0,6.00%
7,/call_genotype_phased,bool,17.17 MiB,9700.0,0.21%
8,/variant_AF,float32,5.95 MiB,43.0,0.07%
9,/variant_homozygote_count,int32,4.78 MiB,54.0,0.06%


In [10]:
# Emit the table as LaTeX. The pandas "object" dtype label is shown as "str";
# the relabel is applied to exact values in the "type" column only, so a field
# name or any other cell that contains the substring "object" is not rewritten
# (which a blanket string replace on the rendered LaTeX would do).
print(
    df_display_table
    .assign(type=lambda table: table["type"].replace("object", "str"))
    .to_latex(index=False, escape=True)
)

\begin{tabular}{lllll}
\toprule
Field & type & storage & compress & percentage \\
\midrule
/call\_LAD & int16 & 2.26 GiB & 290.0 & 27.85\% \\
/call\_GQ & int8 & 1.89 GiB & 87.0 & 23.29\% \\
/call\_LA & int8 & 1.24 GiB & 260.0 & 15.28\% \\
/call\_genotype & int8 & 914.22 MiB & 370.0 & 11.00\% \\
/call\_RGQ & int16 & 729.57 MiB & 460.0 & 8.78\% \\
/call\_genotype\_mask & bool & 606.02 MiB & 550.0 & 7.29\% \\
/call\_FT & str & 498.61 MiB & 2700.0 & 6.00\% \\
/call\_genotype\_phased & bool & 17.17 MiB & 9700.0 & 0.21\% \\
/variant\_AF & float32 & 5.95 MiB & 43.0 & 0.07\% \\
/variant\_homozygote\_count & int32 & 4.78 MiB & 54.0 & 0.06\% \\
/variant\_allele & str & 4.74 MiB & 110.0 & 0.06\% \\
/variant\_AC & int32 & 4.24 MiB & 61.0 & 0.05\% \\
/variant\_filter & bool & 2.87 MiB & 0.9 & 0.03\% \\
/variant\_AN & int32 & 904.75 KiB & 3.1 & 0.01\% \\
\bottomrule
\end{tabular}

