In [1]:
import pandas as pd
import numpy as np
from numpy.random import randint, shuffle
import random

def get_multiple_dfs(data_frame_rows, seed=None):
    """
    Create two random DataFrames whose inner join on 'id' has ``data_frame_rows`` rows.

    Both frames hold ``int(data_frame_rows * 1.1)`` rows. ``df_a`` uses the ids
    ``0 .. size - 1`` and ``df_b`` uses the same number of ids shifted upwards by
    ``int(data_frame_rows * 0.1)``, so the two id ranges overlap in exactly
    ``data_frame_rows`` values. The ids of each frame are shuffled independently.

    Parameters
    ----------
    data_frame_rows : int
        Number of ids shared by both frames.
    seed : int, optional
        When given, every random value is drawn from private generators seeded with
        this value, so repeated calls return identical frames. When ``None`` (the
        default) the global ``numpy.random`` and ``random`` state is used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_a`` with the columns id, A, B, group_col_1, group_col_2, group_col_3 and
        ``df_b`` with the columns id, C, D.

    Raises
    ------
    AssertionError
        If the truncated sizes do not leave exactly ``data_frame_rows`` shared ids.
    """
    sizes_before_join = int(data_frame_rows * 1.1)
    start_with_offset = int(data_frame_rows * 0.1)
    end_with_offset = start_with_offset + sizes_before_join
    assert sizes_before_join - start_with_offset == data_frame_rows

    # The numpy.random and random modules expose the same method names as their
    # generator classes, so one code path serves both the global and the seeded case.
    np_rng = np.random if seed is None else np.random.RandomState(seed)
    py_rng = random if seed is None else random.Random(seed)

    id_a = np.arange(sizes_before_join)
    np_rng.shuffle(id_a)
    a = np_rng.randint(0, 100, size=sizes_before_join)
    b = np_rng.randint(0, 100, size=sizes_before_join)
    categories = ['cat_a', 'cat_b', 'cat_c']
    group_col_1 = py_rng.choices(categories, k=sizes_before_join)
    group_col_2 = py_rng.choices(categories, k=sizes_before_join)
    group_col_3 = py_rng.choices(categories, k=sizes_before_join)

    id_b = np.arange(start_with_offset, end_with_offset)
    np_rng.shuffle(id_b)
    c = np_rng.randint(0, 100, size=sizes_before_join)
    d = np_rng.randint(0, 100, size=sizes_before_join)

    # Build the frames column by column instead of materialising one tuple per row.
    df_a = pd.DataFrame({
        'id': id_a,
        'A': a,
        'B': b,
        'group_col_1': group_col_1,
        'group_col_2': group_col_2,
        'group_col_3': group_col_3,
    })
    df_b = pd.DataFrame({'id': id_b, 'C': c, 'D': d})

    return df_a, df_b

In [2]:
# Benchmark inputs: two frames that share exactly 1,000,000 ids.
df_a, df_b = get_multiple_dfs(data_frame_rows=1_000_000)

# Without instrumentation

In [3]:
%%time
# Baseline: plain inner join on 'id' without any lineage tracking.
test = df_a.merge(right=df_b, how='inner', on='id')

CPU times: user 322 ms, sys: 70.6 ms, total: 392 ms
Wall time: 392 ms


In [4]:
from mlinspect.inspections._lineage import LineageId, JoinLineageId

# One LineageId per row of each input frame, stored as a single object-dtype column
# named "RowLineage(10)".
# NOTE(review): both frames pass 0 as the first LineageId argument, so ids built for
# df_a and df_b with the same row number use identical arguments - confirm this is intended.
lineage_id_list_a = [LineageId(0, position) for position in range(df_a.shape[0])]
lineage_ids_a = pd.Series(lineage_id_list_a, dtype="object").to_frame(name="RowLineage(10)")
lineage_id_list_b = [LineageId(0, position) for position in range(df_b.shape[0])]
lineage_ids_b = pd.Series(lineage_id_list_b, dtype="object").to_frame(name="RowLineage(10)")

In [5]:
from mlinspect.backends._pandas_backend import PandasBackend, execute_inspection_visits_join
from mlinspect.backends._pandas_backend_frame_wrapper import MlinspectDataFrame
from mlinspect.inspections import RowLineage
from mlinspect.inspections._inspection_input import OperatorContext
from mlinspect.instrumentation._dag_node import OperatorType, CodeReference

# Operator metadata handed to the backend for the join.
# NOTE(review): the CodeReference arguments look like placeholder values - confirm.
code_reference = CodeReference(0, 0, 0, 10)
function_info = ('pandas.core.frame', 'merge')
operator_context = OperatorContext(OperatorType.JOIN, function_info)

# Backend configured with a single RowLineage(10) inspection.
pandas_backend = PandasBackend()
pandas_backend.inspections = [RowLineage(10)]

# Wrap copies of the inputs so that df_a / df_b themselves stay untouched, then attach
# the lineage annotations to the wrappers.
mlinspect_df_a = MlinspectDataFrame(df_a.copy())
mlinspect_df_b = MlinspectDataFrame(df_b.copy())
mlinspect_df_a.annotations = lineage_ids_a
mlinspect_df_b.annotations = lineage_ids_b

# With current instrumentation

In [40]:
%%time
# Give every input row a positional index column before joining.
# NOTE(review): presumably execute_inspection_visits_join uses these columns to map
# joined rows back to their input rows - not visible from this notebook, confirm.
mlinspect_df_a['mlinspect_index_x'] = range(len(mlinspect_df_a))
mlinspect_df_b['mlinspect_index_y'] = range(len(mlinspect_df_b))
original_return_value = mlinspect_df_a.merge(mlinspect_df_b, on='id')

# Run the inspection for the join with both inputs, their annotations and the join result.
join_visit_args = (
    pandas_backend, operator_context, code_reference,
    mlinspect_df_a, mlinspect_df_a.annotations,
    mlinspect_df_b, mlinspect_df_b.annotations,
    original_return_value,
)
return_value = execute_inspection_visits_join(*join_visit_args)

# list(pandas_backend.dag_node_identifier_to_inspection_output.values())
# return_value.annotations

CPU times: user 7.05 s, sys: 763 ms, total: 7.82 s
Wall time: 7.85 s


# Calculating directly using apply (apply is not a fast function)

In [13]:
%%time
# Carry each side's lineage ids through the merge as an ordinary column.
# `assign` returns new frames, so the shared df_a / df_b are not modified and the
# un-instrumented baseline cell still measures the plain frames when it is re-run.
df_a_with_lineage = df_a.assign(lineage_left=lineage_ids_a.iloc[:, 0])
df_b_with_lineage = df_b.assign(lineage_right=lineage_ids_b.iloc[:, 0])

join_result = df_a_with_lineage.merge(df_b_with_lineage, on='id')

# Row-wise apply: one JoinLineageId per joined row. This is the approach being timed here.
join_result['annotations'] = join_result.apply(
    lambda row: JoinLineageId([row.lineage_left, row.lineage_right]), axis=1)
join_result = join_result.drop(columns=['lineage_left', 'lineage_right'])
# Snapshot of the first 10 rows, taken while the 'annotations' column is still present.
dag_annotation = join_result.head(10)
# pop removes the column from join_result and returns it as a Series.
annotation = join_result.pop('annotations')

# dag_annotation
# annotation

CPU times: user 22.8 s, sys: 425 ms, total: 23.2 s
Wall time: 23.3 s


In [35]:
def lineage_iter(lineage_left, lineage_right):
    """
    Pair up the lineage ids of both join sides element by element.

    Returns a list with one ``JoinLineageId([left, right])`` per position; the result
    is as long as the shorter of the two inputs.
    """
    def _combine(left_id, right_id):
        return JoinLineageId([left_id, right_id])

    # map over both iterables at once, which pairs elements exactly like zip does
    return list(map(_combine, lineage_left, lineage_right))

# Calculating directly using zip and map on numpy arrays

In [37]:
%%time
# Carry each side's lineage ids through the merge as an ordinary column.
# `assign` returns new frames, so the shared df_a / df_b are not modified and the
# un-instrumented baseline cell still measures the plain frames when it is re-run.
df_a_with_lineage = df_a.assign(lineage_left=lineage_ids_a.iloc[:, 0])
df_b_with_lineage = df_b.assign(lineage_right=lineage_ids_b.iloc[:, 0])

join_result = df_a_with_lineage.merge(df_b_with_lineage, on='id')

# Combine the two lineage columns via lineage_iter on the underlying numpy arrays
# instead of a row-wise DataFrame.apply.
join_result['annotations'] = lineage_iter(
    join_result['lineage_left'].values,
    join_result['lineage_right'].values,
)
join_result = join_result.drop(columns=['lineage_left', 'lineage_right'])
# Snapshot of the first 10 rows, taken while the 'annotations' column is still present.
dag_annotation = join_result.head(10)
# pop removes the column from join_result and returns it as a Series.
annotation = join_result.pop('annotations')

# dag_annotation
# annotation

CPU times: user 6.13 s, sys: 242 ms, total: 6.37 s
Wall time: 6.43 s


0         JoinLineageId(lineage_ids=[LineageId(operator_...
1         JoinLineageId(lineage_ids=[LineageId(operator_...
2         JoinLineageId(lineage_ids=[LineageId(operator_...
3         JoinLineageId(lineage_ids=[LineageId(operator_...
4         JoinLineageId(lineage_ids=[LineageId(operator_...
                                ...                        
999995    JoinLineageId(lineage_ids=[LineageId(operator_...
999996    JoinLineageId(lineage_ids=[LineageId(operator_...
999997    JoinLineageId(lineage_ids=[LineageId(operator_...
999998    JoinLineageId(lineage_ids=[LineageId(operator_...
999999    JoinLineageId(lineage_ids=[LineageId(operator_...
Name: annotations, Length: 1000000, dtype: object

# Todos

* Think about an alternative lineage representation that does not rely on custom Python objects
* Then try to find a pandas/numpy way to work with that representation in order to benefit from vectorization
* Test how fast this can be executed in other ways, e.g., with DuckDB