In [None]:
from utz import *
from humanize import naturalsize

[Papermill](https://papermill.readthedocs.io/en/latest/) parameters:

In [None]:
# Run name: the input HAR is read from `{name}.har` and outputs go under `{name}/`.
name = 'fetch-1'
# Output format used by `save`: 'csv', or 'pqt' / 'parquet'.
out_fmt = 'csv'

In [None]:
# Every output of this notebook is written under a directory named after the run.
makedirs(name, exist_ok=True)


def save(df, df_name, out_fmt=out_fmt):
    """Write `df` to `{name}/{df_name}.<ext>` in the requested format.

    Parameters
    ----------
    df : object exposing `to_csv` / `to_parquet` (a DataFrame here).
    df_name : file stem, without extension.
    out_fmt : 'csv', or 'pqt' / 'parquet'. Defaults to the notebook-level
        `out_fmt` as it was when this cell was run.

    Raises
    ------
    ValueError
        If `out_fmt` is not one of the recognized formats.
    """
    if out_fmt == 'csv':
        df.to_csv(f'{name}/{df_name}.csv')
    elif out_fmt in ('pqt', 'parquet'):
        df.to_parquet(f'{name}/{df_name}.parquet')
    else:
        raise ValueError(f'Unrecognized out_fmt: {out_fmt}')

In [None]:
# Path of the HAR capture to analyze, derived from the run name.
har_path = f'{name}.har'
# NOTE(review): `read_json` comes from the `utz` star-import — presumably pandas' `read_json`; confirm.
har = read_json(har_path)
har

In [None]:
# Build one row per HAR entry from the list stored at `har.loc['entries', 'log']`.
# NOTE(review): `DF` comes from the `utz` star-import — presumably `pandas.DataFrame`; confirm.
entries = DF(har.loc['entries', 'log'])
entries

In [None]:
# Flatten the nested request / response records of each entry into flat
# DataFrames (nested keys become dotted column names such as 'content.size').
req = pd.json_normalize(entries['request'])
res = pd.json_normalize(entries['response'])

In [None]:
# Turn each request's list of {'name': ..., 'value': ...} header records into
# one row with a column per header name. If a request repeats a header name,
# the dict comprehension keeps the last value.
headers = (
    req['headers']
    .apply(lambda pairs: { pair['name']: pair['value'] for pair in pairs })
    .apply(Series)
)
headers

In [None]:
# Parse the `Range` header (`bytes=<start>-[<last>]`) of every request that sent one.
ranges = headers.Range.dropna()
# `last` is optional in the pattern, so headers of the form `bytes=<start>-` leave it missing.
ranges = ranges.str.extract(r'bytes=(?P<start>\d+)-(?P<last>\d+)?')
# NOTE(review): a header that does not match the pattern yields a missing `start`,
# which this int cast would reject — confirm no such requests are expected.
starts = ranges.start.astype(int)
# Convert the header's `last` byte into an exclusive `end` (last + 1); missing values stay NaN.
ends = ranges['last'].apply(lambda last: nan if isna(last) else int(last) + 1).rename('end')
# NOTE(review): `sxs` comes from `utz` — presumably a column-wise ("side by side") concat; confirm.
ranges = sxs(starts, ends)
ranges

In [None]:
# Requests whose Range header has a start but no end (`bytes=<start>-`).
# NOTE(review): despite the name, these are open-ended ranges; an HTTP suffix
# range (`bytes=-<n>`) has no start and so can never be selected by this mask.
suffix_range_mask = ranges.start.notna() & ranges.end.isna()

In [None]:
# One row per captured request: HTTP method, URL, the response's
# `content.size` (renamed to `size`), and the parsed byte range.
e1 = sxs(
    req['method'],
    req['url'],
    res['content.size'].rename('size'),
    ranges,
)
e1

In [None]:
# Keep only the requests whose URL ends with '.duckdb'.
d1 = e1.loc[e1['url'].str.endswith('.duckdb')]
d1

In [None]:
# How many '.duckdb' requests were made with each HTTP method.
d1['method'].value_counts()

In [None]:
# From each URL, capture the part after a `/` that starts with a
# `<digit>e<digit>` token: the whole match is `name`, the token is `data_idx`.
name_idx = d1.url.str.extract(r'.*/(?P<name>(?P<data_idx>\de\d).*)')
# Files whose name contains '-idx' are flagged as `indexed`.
indexed = name_idx['name'].str.contains('-idx').rename('indexed')
name_idx = sxs(name_idx, indexed)
# Per-request file name, aligned with `d1`'s index.
names = name_idx['name']
# One row per distinct (name, data_idx, indexed) combination, keyed by name.
name_idxs = name_idx.drop_duplicates().set_index('name')
# Read `data_idx` as scientific notation to get an integer row count (e.g. '1e3' -> 1000).
nrows = name_idxs.data_idx.apply(lambda s: int(float(s))).rename('nrows')
name_idxs = sxs(name_idxs, nrows).sort_values(['nrows', 'indexed'])
name_idxs

In [None]:
# Attach the parsed file name to each '.duckdb' request and drop zero-size responses.
gets = sxs(d1, names)
gets = gets[gets['size'] > 0]
# Everything left must be a GET (this also fails if nothing is left).
assert gets.method.unique().tolist() == ['GET']
gets = gets[['name', 'size']]
# Attach each request's byte range by index. The int cast requires every
# remaining request to have both a `start` and an `end`.
# NOTE(review): `e1` was built with `ranges`, so `start` / `end` were presumably
# already columns before the selection above, making this merge redundant — confirm.
gets = (
    gets
    .merge(
        ranges,
        how='left',
        left_index=True,
        right_index=True,
    )
    .astype({ 'start': int, 'end': int })
    [[ 'name', 'start', 'end', 'size', ]]
)
gets

In [None]:
# Sanity check: every response's size equals the length of its requested range.
# (`size` needs bracket access because `gets.size` is the DataFrame's element count.)
assert gets['start'].add(gets['size']).eq(gets['end']).all()

In [None]:
# Persist the per-request table (name, start, end, size) as `{name}/gets.<ext>`.
save(gets, 'gets')

In [None]:
def get_stats(df):
    """Summarize a numeric Series as a dict with keys `num`, `max` and `min`.

    Despite the parameter name, this is applied to a Series (one group's
    `size` values).

    `num` is the number of elements, missing values included. `max` / `min`
    use the pandas reductions, which skip missing values; the Python builtins
    iterate element by element and return NaN or not depending on where the
    NaN sits in the data.
    """
    return dict(
        num=len(df),
        max=df.max(),
        min=df.min(),
    )
# Per-file request statistics. Applying the dict-returning `get_stats` per group
# yields a Series indexed by (name, stat key); the steps below pivot it into
# one row per file with one column per stat.
req_stats = gets.groupby('name')['size'].apply(get_stats)
# Name the second index level (the dict keys) so it can be pivoted on.
req_stats.index = req_stats.index.set_names('stat', level=1)
req_stats = req_stats.reset_index(level=1).pivot(columns='stat', values='size')
# Blank out the columns-axis label ('stat') left behind by the pivot.
req_stats.columns.name = ''
req_stats

In [None]:
# Total bytes fetched per file, plus a human-readable rendering of that total.
fetched = gets.groupby('name')['size'].sum().rename('fetched')
# `naturalsize` defaults to decimal units (kB, MB); `binary=True` selects the
# IEC units (KiB, MiB) that the `fetched_iec` column name promises.
fetched_iec = (
    fetched
    .apply(lambda num_bytes: naturalsize(num_bytes, binary=True))
    .rename('fetched_iec')
)
fetched = sxs(fetched, fetched_iec)
# Join with the per-file metadata only to order rows by row count and indexed
# flag; the trailing selection keeps just the two `fetched` columns.
fetched = (
    sxs(
        fetched,
        name_idxs,
    )
    .sort_values(['nrows', 'indexed'])
    [fetched.columns]
)
fetched

In [None]:
import boto3
# S3 client used below to look up the stored size of each database file.
s3 = boto3.client('s3')

In [None]:
# Bucket holding the objects whose sizes are looked up below.
bkt = 'duckdb-repl'


def get_object_size(name):
    """Return the `ContentLength` reported by S3 for key `name` in bucket `bkt`.

    Note: the `name` parameter shadows the notebook-level `name` inside this
    function; here it is an object key, not the run name.
    """
    head = s3.head_object(Bucket=bkt, Key=name)
    return head['ContentLength']


# Full stored size of every file that was fetched from (index values are used as S3 keys).
s3_size = fetched.index.to_series().apply(get_object_size).rename('size')
# `naturalsize` defaults to decimal units (kB, MB); `binary=True` selects the
# IEC units (KiB, MiB) that the `size_iec` column name promises.
s3_size_iec = (
    s3_size
    .apply(lambda num_bytes: naturalsize(num_bytes, binary=True))
    .rename('size_iec')
)

In [None]:
# Combine fetched totals, S3 object sizes and per-file metadata, then attach the
# per-file request stats. `validate='1:1'` makes pandas raise if a file name is
# duplicated on either side of the merge.
stats = (
    sxs(
        fetched,
        s3_size,
        s3_size_iec,
        name_idxs[['nrows', 'indexed']],
    )
    .merge(
        req_stats,
        how='left',
        left_index=True,
        right_index=True,
        validate='1:1',
    )
)
stats

In [None]:
# Persist the per-file summary table as `{name}/stats.<ext>`.
save(stats, 'stats')

In [None]:
import plotly.express as px
from IPython.display import Image
# Figure width and height (2:1), passed to both `write_image` and `to_image` below.
W = 800
# True division: H is the float 400.0.
H = W / 2
dims = dict(width=W, height=H)

In [None]:
# Log-log scatter of bytes fetched against dataset rows, one point per file,
# colored by whether the file is indexed.
fig = px.scatter(
    stats,
    x='nrows',
    y='fetched',
    log_x=True,
    log_y=True,
    color='indexed',
    labels=dict(
        fetched="Bytes fetched",
        nrows="Dataset rows",
        indexed="Indexed?",
    ),
)
# The `update_*` methods modify the figure in place, so they can be applied
# one statement at a time instead of as a single chain.
fig.update_layout(
    title={
        'text': 'Data fetched: `select * from crashes limit 1`',
        'x': 0.5,
    },
    plot_bgcolor='white',
)
fig.update_xaxes(gridcolor='#ccc')
fig.update_yaxes(gridcolor='#ccc')
fig.update_traces(marker_size=12)
# Write the PNG next to the other outputs, then show the same static image inline.
fig.write_image(f'{name}/fetched.png', **dims)
Image(fig.to_image(**dims))