In [None]:
from df_cereal.df_widget import DFWidget, BaseWidget
from df_cereal.arrow_utils import df_to_arrow_bytes, df_to_base64
from traitlets import Unicode, Dict, observe, Any, Bytes, observe, Bool
import pandas as pd
import numpy as np
import json
import time
from buckaroo import BuckarooWidget

In [None]:
def second_convert(ser):
    """Convert a timedelta Series to float seconds.

    Uses ``total_seconds`` so that multi-day spans and negative durations
    are represented correctly. Combining the ``seconds`` and ``microseconds``
    components by hand drops the ``days`` component, which makes a slightly
    negative difference (e.g. from clock skew between two timestamp sources)
    come out as roughly +86400 seconds.

    Parameters
    ----------
    ser : pandas.Series
        Series with a timedelta64 dtype.

    Returns
    -------
    pandas.Series
        Float seconds, same index as ``ser``.
    """
    return ser.dt.total_seconds()

class BaseBenchmarkWidget(BaseWidget):
    """Benchmark a DataFrame serialization format across a range of sizes.

    Subclasses provide:

    * ``convert_df(df)`` returning ``(payload, payload_size)``
    * ``target_prop``: the name of the synced trait that receives the payload

    For each size the widget generates a frame with ``df_gen_func``,
    serializes it, assigns it to ``target_prop`` and flips ``do_calc``.
    When ``timing_info`` is subsequently set to a non-empty dict, the
    measurements are recorded and the next size is started.
    NOTE(review): ``timing_info`` is presumably filled by the frontend view
    with timestamps under keys ``t1``/``t2``/``t3`` (``format_results`` reads
    those keys) -- confirm against the JS side.
    """
    _view_name = Unicode('BytesBenchmarkWidgetView').tag(sync=True)

    # timing_info must be empty and do_calc must be True to trigger the benchmark.
    # This prevents loops, and allows measuring only the calc time rather than
    # the widget instantiation time.
    timing_info = Dict({}).tag(sync=True)
    do_calc = Bool(True).tag(sync=True)

    # Name of the synced trait that receives the serialized payload; set by subclasses.
    target_prop = None

    def convert_df(self, df):
        """Serialize ``df``; return ``(payload, payload_size)``.

        Must be overridden. The base implementation raises so that a missing
        override fails with a clear message instead of an unpacking error in
        ``set_from_gen``.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement convert_df")

    def __init__(self, df_gen_func, min_n, max_n, **kwargs):
        """Create the widget, display the results table, and start the first run.

        Parameters
        ----------
        df_gen_func : callable
            Called as ``df_gen_func(n)``; must return a DataFrame.
        min_n, max_n : int
            First and last size argument passed to ``df_gen_func``.
        **kwargs
            Forwarded to the parent widget constructor.
        """
        self.results_arr = []
        self.max_n = max_n
        self.current_n = min_n
        self.df_gen_func = df_gen_func
        # Results table; starts empty and is filled in by format_results().
        self.bw = BuckarooWidget(pd.DataFrame(), pinned_rows=[])
        display(self.bw)
        super().__init__(**kwargs)
        self.set_from_gen()

    def __del__(self):
        """Close the results widget (if still attached) on disposal."""
        print("in __del__")
        # ``bw`` may already have been removed by the caller (``del widget.bw``)
        # or never assigned if __init__ failed early.
        bw = getattr(self, 'bw', None)
        if bw is not None:
            bw.close()
            del self.bw
        # Not every base class defines __del__; only chain up when it exists.
        parent_del = getattr(super(), '__del__', None)
        if parent_del is not None:
            return parent_del()

    def format_results(self):
        """Turn the collected timestamps into per-phase durations and show them.

        Builds a DataFrame from ``results_arr``, derives the phase columns
        (in seconds, via ``second_convert``) and assigns the selected columns
        to ``self.bw.raw_df``.
        """
        tdf = pd.DataFrame(self.results_arr)
        tdf['klass'] = self.__class__.__name__.replace("BenchmarkWidget", "")
        tdf['py_serialize'] = second_convert(tdf['serial_t2'] - tdf['serial_t1'])
        tdf['widget_xfer'] = second_convert(tdf['t1'] - tdf['serial_t2'])
        tdf['js_deserialize'] = second_convert(tdf['t2'] - tdf['t1'])
        tdf['js_iterate'] = second_convert(tdf['t3'] - tdf['t2'])
        tdf['py_notify'] = second_convert(tdf['t4'] - tdf['t3'])
        tdf['roundtrip'] = second_convert(tdf['t4'] - tdf['serial_t3'])
        self.bw.raw_df = tdf[['rows', 'bytes', 'elements', 'klass', 'py_serialize', 'widget_xfer', 'js_deserialize', 'js_iterate', 'py_notify', 'roundtrip']]

    def set_from_gen(self):
        """Generate, serialize and publish the frame for ``self.current_n``.

        Records the Python-side timestamps in ``self.current_timing``.
        """
        self.do_calc = False # this will trigger view re-evaluate, while doing nothing
        self.timing_info = {} # this will trigger view re-evaluate while doing nothing
        self.current_timing = {'gen_t1': pd.Timestamp.now('UTC')}
        gen_df = self.df_gen_func(self.current_n)
        self.current_timing['serial_t1'] = pd.Timestamp.now('UTC') # START PY_SERIALIZE
        self.current_timing['rows'] = len(gen_df)
        self.current_timing['elements'] = len(gen_df) * len(gen_df.columns)
        byts, self.current_timing['bytes'] = self.convert_df(gen_df)
        self.current_timing['serial_t2'] = pd.Timestamp.now('UTC')  #END PY_SERIALIZE
        setattr(self, self.target_prop, byts) # trigger transfer
        self.do_calc = True # trigger js_timing loop (t1, t2, t3)

        self.current_timing['serial_t3'] = pd.Timestamp.now('UTC')

    @observe('timing_info')
    def _timing_info_change(self, change):
        """Record one finished run and either start the next size or finish."""
        if not self.timing_info:
            # Resetting timing_info to {} also fires this observer; ignore it.
            print("exit early because empty timing_info")
            return
        new_res = {k: pd.Timestamp(v) for k, v in self.timing_info.items()}
        new_res['t4'] = pd.Timestamp.now('UTC')
        new_res.update(self.current_timing)
        self.results_arr.append(new_res)
        if self.current_n > self.max_n:
            print("done")
            self.format_results()
            return
        self.do_calc = False
        self.timing_info = {}
        # NOTE(review): current_n is incremented only after set_from_gen(), so
        # the first size (already run from __init__) is measured a second time.
        # This may be an intentional warm-up run -- confirm before changing.
        self.set_from_gen()
        self.current_n += 1

In [None]:
class Base64BenchmarkWidget(BaseBenchmarkWidget):
    """Benchmark variant whose payload is the output of ``df_to_base64``."""
    _view_name = Unicode('Base64BenchmarkWidgetView').tag(sync=True)
    # Synced trait that carries the encoded payload.
    df_base64 = Any("").tag(sync=True)
    target_prop = 'df_base64'

    def convert_df(self, df):
        """Encode ``df`` and return ``(payload, len(payload))``."""
        encoded = df_to_base64(df)
        size = len(encoded)
        return encoded, size
class BytesBenchmarkWidget(BaseBenchmarkWidget):
    """Benchmark variant whose payload is the output of ``df_to_arrow_bytes``."""
    _view_name = Unicode('BytesBenchmarkWidgetView').tag(sync=True)
    # Synced trait that carries the serialized payload.
    df_arrow_bytes = Bytes().tag(sync=True)
    target_prop = 'df_arrow_bytes'

    def convert_df(self, df):
        """Serialize ``df`` and return ``(payload, len(payload))``."""
        payload = df_to_arrow_bytes(df)
        size = len(payload)
        return payload, size

class JSONDFBenchmarkWidget(BaseBenchmarkWidget):
    """Benchmark variant whose payload is the frame as parsed JSON records."""
    _view_name = Unicode('DFDataBenchmarkWidgetView').tag(sync=True)
    # Synced trait that carries the parsed records.
    df_data = Any([]).tag(sync=True)
    target_prop = 'df_data'

    def convert_df(self, df):
        """Return ``(records, size)``.

        ``records`` is the parsed ``orient='records'`` JSON of ``df``;
        ``size`` is the character length of that JSON text.
        """
        serialized = df.to_json(orient='records')
        records = json.loads(serialized)
        return records, len(serialized)

In [None]:
def df_gen_func(exp):
    """Build a synthetic benchmark DataFrame with ``10 * 10**exp`` rows.

    Columns: ``str`` (cycling through four fixed words), ``log_normal``
    (random floats), ``bool`` (alternating True/False) and ``randint``
    (random ints in ``[0, 100)``).

    Parameters
    ----------
    exp : int
        Non-negative power of ten controlling the row count.

    Returns
    -------
    pandas.DataFrame
    """
    N = 20 * (10**exp)//2
    words = ["foo", "barr", "bazz", "bofffff"]
    flags = [True, False]
    return pd.DataFrame({
        # Repeat one time more than needed and trim to N, so every column has
        # exactly N entries even when N is not a multiple of the pattern
        # length (exp=0 gives N=10, which is not divisible by 4).
        'str': (words * (N // len(words) + 1))[:N],
        'log_normal': np.random.lognormal(25, .3, N),
        'bool': (flags * (N // len(flags) + 1))[:N],
        'randint': np.random.randint(0, 100, N),
    })

In [None]:
# Collects one results table per serialization format (appended in later cells).
all_obs_dfs = []
# Constructing the widget starts the Base64 benchmark: __init__ calls set_from_gen().
b64_bn = Base64BenchmarkWidget(df_gen_func, min_n=1, max_n=5)
# Bare expression as the last line of the cell so the notebook renders the widget.
b64_bn

In [None]:
# Save the Base64 results table (format_results() assigns it to bw.raw_df),
# then close and drop both widgets before starting the next benchmark.
all_obs_dfs.append(b64_bn.bw.raw_df)
b64_bn.bw.close()
del b64_bn.bw
b64_bn.close()
del b64_bn
# Constructing the widget starts the Arrow-bytes benchmark.
byt_bn = BytesBenchmarkWidget(df_gen_func, min_n=1, max_n=6)
# Bare expression as the last line of the cell so the notebook renders the widget.
byt_bn

In [None]:
# Save the Arrow-bytes results table, then close and drop both widgets
# before starting the next benchmark.
all_obs_dfs.append(byt_bn.bw.raw_df)
byt_bn.bw.close()
del byt_bn.bw
byt_bn.close()
del byt_bn
# Constructing the widget starts the JSON-records benchmark.
jsdf_bn = JSONDFBenchmarkWidget(df_gen_func, min_n=1, max_n=4)
# Bare expression as the last line of the cell so the notebook renders the widget.
jsdf_bn

In [None]:
# Save the JSON-records results table, then close and drop both widgets.
all_obs_dfs.append(jsdf_bn.bw.raw_df)
jsdf_bn.bw.close()
del jsdf_bn.bw
jsdf_bn.close()
del jsdf_bn

In [None]:
def without(arr, removs):
    """Return a list of the items of ``arr`` that are not in ``removs``.

    Order and duplicates of the kept items are preserved.

    Parameters
    ----------
    arr : iterable
        Items to filter.
    removs : container
        Items to drop; only needs to support ``in``.

    Returns
    -------
    list
    """
    return [k for k in arr if k not in removs]
def reorder(df, first_cols):
    """Return ``df`` with ``first_cols`` moved to the front.

    The remaining columns follow in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
    first_cols : iterable of column labels
        Columns to place first, in the given order. Any iterable is accepted
        (list, tuple, Index, ...); it is not modified.

    Returns
    -------
    pandas.DataFrame
        Column-reordered selection of ``df``.
    """
    # Materialize once: supports tuples/generators and avoids mutating the argument.
    leading = list(first_cols)
    trailing = [col for col in df.columns if col not in leading]
    return df[leading + trailing]

In [None]:
def df_gen_func(exp):  # redefining here for nice screenshots
    """Build a synthetic benchmark DataFrame with ``10 * 10**exp`` rows.

    Columns: ``str`` (cycling through four fixed words), ``log_normal``
    (random floats), ``bool`` (alternating True/False) and ``randint``
    (random ints in ``[0, 100)``).

    Parameters
    ----------
    exp : int
        Non-negative power of ten controlling the row count.

    Returns
    -------
    pandas.DataFrame
    """
    N = 20 * (10**exp)//2
    words = ["foo", "barr", "bazz", "bofffff"]
    flags = [True, False]
    return pd.DataFrame({
        # Repeat one time more than needed and trim to N, so every column has
        # exactly N entries even when N is not a multiple of the pattern
        # length (exp=0 gives N=10, which is not divisible by 4).
        'str': (words * (N // len(words) + 1))[:N],
        'log_normal': np.random.lognormal(25, .3, N),
        'bool': (flags * (N // len(flags) + 1))[:N],
        'randint': np.random.randint(0, 100, N),
    })

In [None]:
# Stack the per-format results tables and derive throughput columns:
# elements and bytes per second over the full roundtrip, and bytes per
# second over the widget transfer phase alone.
full_df = pd.concat(all_obs_dfs).assign(
    els_per_second=lambda t: t['elements'] / t['roundtrip'],
    bytes_per_second_full=lambda t: t['bytes'] / t['roundtrip'],
    bytes_per_second_xfer=lambda t: t['bytes'] / t['widget_xfer'],
)

# Last expression of the cell: show the table with the key columns first.
reorder(full_df, ['bytes', 'elements', 'klass', 'els_per_second', 'bytes_per_second_xfer'])