In [1]:
from os.path import join
import bnr.azrael as azrael

import pandas as pd
import numpy as np
import sidetable

In [18]:
def _named_percentile(q):
    """Return a function computing the q-th percentile, named ``perc_XX``.

    pandas uses an aggregator's ``__name__`` as the output column label, so
    naming the function avoids depending on auto-generated ``<lambda_N>``
    labels.
    """
    def _stat(values):
        return np.percentile(values, q)
    _stat.__name__ = f"perc_{q:02d}"
    return _stat


def get_stats_by_level(df, level):
    """Compute size distribution statistics for one grouping depth.

    The frame is grouped by the first ``level + 1`` columns of
    ``path0, path1, path2, mimetype``. For each group the min, the 5th, 25th,
    50th (median), 75th and 95th percentiles and the max of ``size`` are
    computed, then passed through ``azrael.convert_size(..., to_size='mo')``
    and stored in ``<stat>_mo`` columns (the unconverted columns are dropped).

    For levels 0 to 2 the first non-grouped hierarchy column receives a
    sub-total label built from the grouped columns joined with ``" | "`` and
    suffixed with ``" - Sous-total"``; any deeper hierarchy column is filled
    with NaN. Level 3 groups on every hierarchy column, so no label is added.
    NOTE(review): the label format looks designed to match the sub-total rows
    produced elsewhere in the notebook so both tables can be joined - confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the grouped hierarchy columns and a ``size`` column.
    level : int
        Grouping depth, 0 to 3.

    Returns
    -------
    pandas.DataFrame
        One row per group with the hierarchy columns and the ``*_mo`` stats.

    Raises
    ------
    ValueError
        If ``level`` is not one of 0, 1, 2, 3.
    """
    hierarchy = ['path0', 'path1', 'path2', 'mimetype']
    # Fail early with a clear message instead of hitting an unbound `grby`.
    if level not in range(len(hierarchy)):
        raise ValueError(
            f"level must be an integer between 0 and {len(hierarchy) - 1}, got {level!r}"
        )
    depth = int(level)
    grby = hierarchy[:depth + 1]

    level_gr_stat = (
        df.groupby(grby)['size']
        .agg(['min',
              _named_percentile(5),
              _named_percentile(25),
              'median',
              _named_percentile(75),
              _named_percentile(95),
              'max'])
        .reset_index()
    )

    stat_columns = ['min', 'perc_05', 'perc_25', 'median', 'perc_75', 'perc_95', 'max']
    for c in stat_columns:
        nc = f"{c}_mo"
        level_gr_stat[nc] = level_gr_stat[c].apply(lambda x: azrael.convert_size(x, to_size='mo'))

    level_gr_stat = level_gr_stat.drop(columns=stat_columns)

    if depth + 1 < len(hierarchy):
        # Label goes in the first column that was not grouped on.
        label = level_gr_stat[grby[0]]
        for col in grby[1:]:
            label = label + " | " + level_gr_stat[col]
        level_gr_stat[hierarchy[depth + 1]] = label + " - Sous-total"
        # Deeper hierarchy columns carry no information at this depth.
        for col in hierarchy[depth + 2:]:
            level_gr_stat[col] = np.nan

    return level_gr_stat

In [3]:
# Location of the compressed inventory export to analyse.
data_folder = "data"
az_path = join(data_folder, "bnr_d001_20231218.csv.gz")

# Build the analysis object and run its enrichment steps in the original order.
az2a = azrael.Azrael2analysis()
az2a.create_az(path_az=az_path)
az2a.split_path()
az2a.get_extension_mimetype()

# Work on the frame exposed by the analysis object; `nb` is a constant 1 per
# row so that summing it later yields a row count.
az = az2a.az
az['nb'] = 1

# Keep only rows whose top-level path component is one of these values.
kept_roots = ['BNR_VERIF', 'BNR_TAMPON', 'BNR_SAUV']
az = az[az['path0'].isin(kept_roots)]
az.columns

Index(['name', 'path', 'md5', 'size', 'last_content_modification_date',
       'last_metadata_modification_date', 'path0', 'path1', 'path2', 'path3',
       'path4', 'extension', 'mimetype', 'guessed_extension', 'nb'],
      dtype='object')

In [None]:
#az.stb.missing()

In [None]:
#az.stb.freq(['path0'])

In [None]:
#az.stb.freq(['path1'])

In [None]:
#az.stb.freq(['mimetype'])

In [19]:
# Sum the row counter and the sizes for every
# (path0, path1, path2, mimetype) combination.
size_by_group = az.groupby(['path0', 'path1', 'path2', 'mimetype']).agg(
    {'nb': 'sum', 'size': 'sum'}
)

# NOTE(review): `.stb.subtotal` comes from the sidetable accessor; it presumably
# adds "Sous-total" rows per index level and a "Total" row - confirm.
main_gr = size_by_group.stb.subtotal(
    sub_level=None,
    grand_label='Total',
    sub_label='Sous-total',
    show_sep=True,
    sep=' | ',
).reset_index()

# One converted column per target unit code, in the order mo, go, to.
for unit in ('mo', 'go', 'to'):
    main_gr[f'size_{unit}'] = main_gr['size'].apply(
        lambda x, unit=unit: azrael.convert_size(x, to_size=unit)
    )

# Force the final column labels.
main_gr.columns = [
    'path0', 'path1', 'path2', 'mimetype',
    'nb', 'size',
    'size_mo', 'size_go', 'size_to',
]

In [20]:
# Compute the statistics for each grouping depth (0 to 3), then stack them with
# a single concat. Concatenating once avoids re-copying the accumulated frame
# on every iteration and avoids seeding the result with an empty DataFrame.
level_stats = [get_stats_by_level(az, level) for level in range(4)]
main_gr_stats = pd.concat(level_stats)

In [21]:
# Columns shared by both tables and used as the join key.
key_cols = ['path0', 'path1', 'path2', 'mimetype']
# Matches empty or whitespace-only strings.
blank_pattern = r'^\s*$'
key_as_str = {col: str for col in key_cols}

# Turn blank cells into NaN, then cast the key columns to str in both frames so
# that missing keys compare equal (both become the same string) during the merge.
main_gr_stats = main_gr_stats.replace(blank_pattern, np.nan, regex=True).astype(key_as_str)
main_gr = main_gr.replace(blank_pattern, np.nan, regex=True).astype(key_as_str)

# Left join: keep every row of main_gr and attach the statistics where the key matches.
main_gr_ = main_gr.merge(main_gr_stats, on=key_cols, how='left')
main_gr_.set_index(key_cols).to_excel("results/azrael_analysis.xlsx")