In [2]:
import math
import os
from collections import Counter

import numpy as np
import pandas as pd
import pyarrow as pa


In [4]:
def verify_groupby_result_in_k(row, k=10):
    """Find where the expected source file sits in a groupby query's result list.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``'src_filename'`` (the expected source) and ``'results'``
            (the ranked candidates returned for the query).
        k: Accepted for signature compatibility; not used by this function.

    Returns:
        pd.Series of three values, in order:
        the 0-based position of the first candidate equal to the expected
        source (``-1`` if none matched), the number of candidates scanned,
        and ``len(row['results'])``.

    Note:
        Any exception raised while reading or scanning ``row['results']``
        is swallowed on purpose (best effort); the counters keep whatever
        values they had reached at that point.
    """
    expected_src = row['src_filename']
    first_hit = -1
    n_scanned = 0
    n_total = 0

    try:
        candidates = row['results']
        n_total = len(candidates)

        for position, candidate in enumerate(candidates):
            # Only the first match counts; later duplicates are ignored.
            if candidate == expected_src and first_hit == -1:
                first_hit = position
            n_scanned = position + 1
    except Exception:
        # Best effort: rows whose results cannot be read yield the defaults.
        pass

    return pd.Series([first_hit, n_scanned, n_total])

def verify_pivot_result_in_k(row, k=10):
    """Find where the expected source file sits in a pivot query's result list.

    Unlike the groupby variant in this notebook, errors are not caught here:
    a missing or unsized ``row['results']`` propagates to the caller.

    Args:
        row: Mapping-like record providing ``'src_filename'`` and ``'results'``.
        k: Accepted for signature compatibility; not used by this function.

    Returns:
        pd.Series of three values, in order:
        the 0-based position of the first candidate equal to the expected
        source (``-1`` if none matched), the number of candidates scanned,
        and ``len(row['results'])``.
    """
    expected_src = row['src_filename']
    candidates = row['results']
    n_total = len(candidates)

    first_hit = -1
    n_scanned = 0
    for position, candidate in enumerate(candidates):
        # Only the first match counts; later duplicates are ignored.
        if candidate == expected_src and first_hit == -1:
            first_hit = position
        n_scanned = position + 1

    return pd.Series([first_hit, n_scanned, n_total])


def compute_recall_rate_at_k(rank_array: pd.Series, k: int = 10) -> float:
    """Fraction of queries whose correct source was ranked within the top ``k``.

    Args:
        rank_array: One rank per query; ``-1`` marks a query whose correct
            source was not found at all. Found ranks are compared with
            ``rank < k``.
        k: Cut-off; a query counts as a hit when its rank is below ``k``.

    Returns:
        ``hits / len(rank_array)`` as a float. For an empty ``rank_array``
        the rate is undefined and NaN is returned instead of raising
        ``ZeroDivisionError``.
    """
    total = len(rank_array)
    if total == 0:
        return float("nan")

    # Vectorised count of entries that were found (!= -1) and ranked below k.
    was_found = rank_array != -1
    hits = int((was_found & (rank_array < k)).sum())
    return hits / total

def get_query_statistics(result_df, col='total_results', rename_col='total_results'):
    """Summarise one column with ``describe()`` as a single labelled row.

    Args:
        result_df: DataFrame holding the column to summarise.
        col: Name of the column passed to ``describe()``.
        rename_col: Label used for the resulting row.

    Returns:
        One-row DataFrame indexed by ``rename_col`` whose columns are the
        statistics produced by ``describe()``.
    """
    summary = result_df[col].describe()
    # Name the single column after the label, then transpose so the label is the row index.
    return summary.to_frame(name=rename_col).T

def generate_top_k_df(result_df, result_col, k_range=(1, 5, 10), op_label='operation'):
    """Build a one-row table of recall rates at several cut-offs.

    Args:
        result_df: DataFrame holding the per-query ranks.
        result_col: Column of ``result_df`` passed to
            ``compute_recall_rate_at_k``.
        k_range: Iterable of cut-offs to evaluate. Previously this argument
            was ignored and ``[1, 5, 10]`` was always used; it is now
            honoured. The default yields the same columns as before.
        op_label: Index label for the single output row.

    Returns:
        DataFrame with one row (``op_label``) and one ``"k=<k>"`` column per
        cut-off in ``k_range``.
    """
    recall_by_k = {
        f"k={k}": compute_recall_rate_at_k(result_df[result_col], k=k)
        for k in k_range
    }
    return pd.DataFrame(recall_by_k, index=[op_label])




In [3]:
# Groupby (2-col): load the query results and derive the expected source file name
# from the first entry of each row's 'src_labels'.
gb_results = pd.read_parquet('../src/gb_results_2col_MATE_256.parquet')
gb_results['src_filename'] = gb_results['src_labels'].apply(
    lambda labels: os.path.basename(labels[0])
)

# Per query: rank of the expected source, entries scanned, and result-list length.
gb_results[['correct_src_rank', 'ranked_results', 'total_results']] = gb_results.apply(
    verify_groupby_result_in_k,
    axis=1,
)

# Side-by-side summary: result-count statistics and recall at each k.
groupby_qstats = get_query_statistics(
    gb_results, col='total_results', rename_col='groupby (2-col)'
)
groupby_acc = generate_top_k_df(
    gb_results, result_col='correct_src_rank', op_label='groupby (2-col)'
)
groupby_nums = pd.concat(
    [groupby_qstats, groupby_acc],
    axis=1,
    keys=['Total Query Results', 'Ranking Rate @ k'],
)
groupby_nums

Unnamed: 0_level_0,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Ranking Rate @ k,Ranking Rate @ k,Ranking Rate @ k
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,k=1,k=5,k=10
groupby (2-col),100.0,1870.35,1442.200349,1.0,137.75,1809.0,3368.0,3499.0,0.15,0.19,0.2


In [4]:
# Pivot (2-col): load the query results and derive the expected source file name
# from the first entry of each row's 'src_labels'.
pivot_results = pd.read_parquet('../src/pivot_results_2col_MATE_256.parquet')
pivot_results['src_filename'] = pivot_results['src_labels'].apply(
    lambda labels: os.path.basename(labels[0])
)

# Per query: rank of the expected source, entries scanned, and result-list length.
pivot_results[['correct_src_rank', 'ranked_results', 'total_results']] = pivot_results.apply(
    verify_pivot_result_in_k,
    axis=1,
)

# Side-by-side summary: result-count statistics and recall at each k.
pivot_qstats = get_query_statistics(
    pivot_results, col='total_results', rename_col='pivot (2-col)'
)
pivot_acc = generate_top_k_df(
    pivot_results, result_col='correct_src_rank', op_label='pivot (2-col)'
)
pivot_nums = pd.concat(
    [pivot_qstats, pivot_acc],
    axis=1,
    keys=['Total Query Results', 'Ranking Rate @ k'],
)
pivot_nums

Unnamed: 0_level_0,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Ranking Rate @ k,Ranking Rate @ k,Ranking Rate @ k
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,k=1,k=5,k=10
pivot (2-col),100.0,24.71,179.326028,1.0,1.0,1.0,8.0,1795.0,0.69,0.89,0.92


In [5]:
# Join Results


def verify_join_column_in_original(ranking_keys, src_labels):
    """Work out which source file supplied each side of a join.

    Column names in ``ranking_keys`` containing ``'__LEFT'`` / ``'__RIGHT'``
    are stripped of that marker and collected per side. Each source file is
    then read, and a file whose columns cover a side's set is recorded for
    that side. When several files qualify, the last one in ``src_labels``
    wins.

    Args:
        ranking_keys: Iterable of column names from the joined table.
        src_labels: Iterable of parquet paths for the candidate sources.

    Returns:
        Tuple ``(left_basename, right_basename)``.

    Raises:
        KeyError: If no source file covers the left or the right column set.
    """
    left_colset = {key.replace('__LEFT', '') for key in ranking_keys if '__LEFT' in key}
    right_colset = {key.replace('__RIGHT', '') for key in ranking_keys if '__RIGHT' in key}

    matched_source = {}
    for label in src_labels:
        source_columns = set(pd.read_parquet(label).columns)
        if source_columns >= left_colset:
            matched_source['left'] = os.path.basename(label)
        if source_columns >= right_colset:
            matched_source['right'] = os.path.basename(label)

    return matched_source['left'], matched_source['right']


def verify_join_columns(row):
    """Resolve the left/right source files for one join query row.

    Reads the parquet file at ``row['dst_label']`` to obtain its column
    names, then delegates to ``verify_join_column_in_original`` together
    with ``row['src_labels']``.

    Returns:
        Whatever ``verify_join_column_in_original`` returns for this row
        (a ``(left, right)`` pair of source base names).
    """
    joined_frame = pd.read_parquet(row['dst_label'])
    joined_columns = set(joined_frame.columns)
    return verify_join_column_in_original(joined_columns, row['src_labels'])


def verify_join_result_in_k(row, k=10, side=0):
    """Find where the expected source sits in one side's join result list.

    Args:
        row: Record providing ``'results'`` (indexable by ``side``) plus the
            fields ``verify_join_columns`` needs.
        k: Accepted for signature compatibility; not used by this function.
        side: Index into both the pair returned by ``verify_join_columns``
            and ``row['results']``. Defaults to ``0``.

    Returns:
        pd.Series of three values, in order:
        the 0-based position of the first candidate equal to the expected
        source (``-1`` if none matched), the number of candidates scanned,
        and ``len(row['results'][side])``.

    Note:
        Resolving the expected source happens outside the ``try`` and may
        raise. Exceptions while reading or scanning the results are
        swallowed on purpose, leaving the counters at the values reached.
    """
    expected_src = verify_join_columns(row)[side]
    first_hit = -1
    n_scanned = 0
    n_total = 0

    try:
        candidates = row['results'][side]
        n_total = len(candidates)

        for position, candidate in enumerate(candidates):
            # Only the first match counts; later duplicates are ignored.
            if candidate == expected_src and first_hit == -1:
                first_hit = position
            n_scanned = position + 1
    except Exception:
        # Best effort: rows whose results cannot be read yield the defaults.
        pass

    return pd.Series([first_hit, n_scanned, n_total])
    

# Join (2-col), side 0: load the query results; 'src_filename' is taken from the
# first entry of each row's 'src_labels'.
join_results = pd.read_parquet('../src/join_results_2col_MATE_256.parquet')
join_results['src_filename'] = join_results['src_labels'].apply(lambda x: os.path.basename(x[0]))

# `side=0` is passed explicitly (DataFrame.apply forwards extra keyword arguments
# to the function) so this cell does not depend on the default of whichever
# `verify_join_result_in_k` definition is currently live in the kernel.
join_results[['correct_src_rank', 'ranked_results', 'total_results']] = join_results.apply(
    verify_join_result_in_k, axis=1, side=0
)

# Side-by-side summary: result-count statistics and recall at each k.
join_qstats = get_query_statistics(join_results, col='total_results', rename_col='join (2-col)')
join_acc = generate_top_k_df(join_results, result_col='correct_src_rank', op_label='join (2-col)')
join_nums = pd.concat([join_qstats, join_acc], axis=1, keys=['Total Query Results', 'Ranking Rate @ k'])
join_nums

Unnamed: 0_level_0,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Ranking Rate @ k,Ranking Rate @ k,Ranking Rate @ k
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,k=1,k=5,k=10
join (2-col),100.0,1256.21,1452.572669,0.0,2.0,607.0,3011.0,3454.0,0.33,0.36,0.37


In [10]:
#Other Side
def verify_join_result_in_k(row, k=10, side=1):
    """Find where the expected source sits in one side's join result list.

    This redefinition replaces the earlier function of the same name in the
    notebook; the only difference is that ``side`` defaults to ``1``.

    Args:
        row: Record providing ``'results'`` (indexable by ``side``) plus the
            fields ``verify_join_columns`` needs.
        k: Accepted for signature compatibility; not used by this function.
        side: Index into both the pair returned by ``verify_join_columns``
            and ``row['results']``. Defaults to ``1``.

    Returns:
        pd.Series of three values, in order:
        the 0-based position of the first candidate equal to the expected
        source (``-1`` if none matched), the number of candidates scanned,
        and ``len(row['results'][side])``.

    Note:
        Resolving the expected source happens outside the ``try`` and may
        raise. Exceptions while reading or scanning the results are
        swallowed on purpose, leaving the counters at the values reached.
    """
    expected_src = verify_join_columns(row)[side]
    match_rank, scanned, available = -1, 0, 0

    try:
        side_results = row['results'][side]
        available = len(side_results)

        for idx, item in enumerate(side_results):
            # Only the first match counts; later duplicates are ignored.
            if item == expected_src and match_rank == -1:
                match_rank = idx
            scanned = idx + 1
    except Exception:
        # Best effort: rows whose results cannot be read yield the defaults.
        pass

    return pd.Series([match_rank, scanned, available])
    


# Join (2-col), side 1: reload the query results; 'src_filename' is taken from the
# second entry of each row's 'src_labels'.
join_results = pd.read_parquet('../src/join_results_2col_MATE_256.parquet')
join_results['src_filename'] = join_results['src_labels'].apply(lambda x: os.path.basename(x[1]))

# `side=1` is passed explicitly (DataFrame.apply forwards extra keyword arguments
# to the function) so this cell does not depend on the default of whichever
# `verify_join_result_in_k` definition is currently live in the kernel.
join_results[['correct_src_rank', 'ranked_results', 'total_results']] = join_results.apply(
    verify_join_result_in_k, axis=1, side=1
)

# Side-by-side summary: result-count statistics and recall at each k.
join_qstats = get_query_statistics(join_results, col='total_results', rename_col='join_right (2-col)')
join_acc = generate_top_k_df(join_results, result_col='correct_src_rank', op_label='join_right (2-col)')
join_nums = pd.concat([join_qstats, join_acc], axis=1, keys=['Total Query Results', 'Ranking Rate @ k'])
join_nums

Unnamed: 0_level_0,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Total Query Results,Ranking Rate @ k,Ranking Rate @ k,Ranking Rate @ k
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,k=1,k=5,k=10
join_right (2-col),100.0,1506.66,1512.411074,0.0,9.0,723.5,3240.0,3464.0,0.27,0.39,0.4
