In [1]:
import pandas as pd
import re
import os

RESULTS_DIR = './../datasets_reduced/revision/results/'

RESULTS_CHAABEN = 'dataset_chaaben_baseline_with_completions.csv'
RESULTS_RAMC = 'results_with_comparable_info.csv' 
RESULTS_RANDOM_RAC = 'dataset_random_baseline_with_completions.csv'
OUTPUT_FILE = 'benchmark_results.csv' #'results_with_comparable_info.csv'

# cd with jupyter into the directory with the results
os.chdir(RESULTS_DIR)


DATASET_RESULTS_FILES = [RESULTS_RANDOM_RAC, RESULTS_RAMC]
dataset_name = ['random', 'ramc']


# Load Dataset

We load all datasets.

In [4]:
datasets =[pd.read_csv(file) for file in DATASET_RESULTS_FILES]

columns = ['dataset', 'same_class', 'same_name', 'same_concept', 'same_anchor_node', 'same_association']

# dataset index given by the following constraint (these are the only ones for which we can perform the comparison with Chaaben et al. )
is_add_object = datasets[1]['completion'].str.contains("'changeType': 'Add', 'type': 'object'")

index = is_add_object[is_add_object].index

# we compute same_concept = same_class==True and same_name==True
for i, dataset in enumerate(datasets):
    dataset['same_concept'] = dataset['same_class'] & dataset['same_name']
    
# same association is just a renaming of 'simplified_correct'
for i, dataset in enumerate(datasets):
    dataset['same_association'] = dataset['simplified_correct']
    
# restrict all datasets to the index
for i, dataset in enumerate(datasets):
    datasets[i] = dataset.loc[index]


# Stastical evaluation
We compute the correctness (probability of correct completion) for each class. We furthermore compare statistical significance (p-value), w.r.t. to 'random' via binomial test.

In [14]:
# import binomial test from scipy
from scipy.stats import binom_test

# same class
results_df = pd.DataFrame(columns=columns)

for i, dataset in enumerate(datasets):
    # compute all probabilities
    results_df.loc[i, columns[0]] = dataset_name[i] # set the dataset name
    results_df.loc[i+len(datasets), columns[0]] = dataset_name[i] + "_p" # set the dataset name
    for column in columns[1:]:
        p = dataset[column].sum()/len(dataset)
        results_df.loc[i, column] = p    
        # compute binomial test with random baseline = dataset
        # alternative
        if dataset_name[i] == 'random':
            alternative = 'two-sided' # actually we compare to the random, so this will be p-value = 1.0 anyways
        elif dataset_name[i] == 'ramc':
            alternative = 'greater' # we assume that semantical retrieval outperforms random retrieval
        elif dataset_name[i] == 'chaaben':
            alternative = 'less' # our hypothesis is that our random approach is better because it takes into account more similar examples via retrieval of few-shots
        p_values = binom_test(x=dataset[column].sum(), n=len(dataset), p=datasets[0][column].sum()/len(dataset), alternative=alternative)
        results_df.loc[i+len(datasets), column] = p_values

In [16]:
# print the results_df formatted
latex_table = results_df.to_latex(index=False, float_format="%.3f")

In [18]:
# \begin{tabular}{llllll}
# \toprule
# dataset & same_class & same_name & same_concept & same_anchor_node & same_association \\
# \midrule
# random & 0.784 & 0.804 & 0.765 & 0.706 & 0.686 \\
# random_p & 1.000 & 1.000 & 1.000 & 1.000 & 1.000 \\
# ramc & 0.941 & 0.961 & 0.941 & 0.843 & 0.804 \\
# ramc_p & 0.002 & 0.001 & 0.001 & 0.019 & 0.044 \\
# \bottomrule
# \end{tabular}

# in the latex table above, add two stars if p-value < 0.01, one star if p-value < 0.05 for the corresponding approach, remove the columns with p-values
latex_table_p = """
\\begin{tabular}{llllll}
\\toprule
dataset & same_class & same_name & same_concept & same_anchor_node & same_association \\
\\midrule
random & 0.784 & 0.804 & 0.765 & 0.706 & 0.686 \\
ramc & 0.941** & 0.961** & 0.941** & 0.843** & 0.804* \\
\\bottomrule
\\end{tabular}
"""
print(latex_table_p)



\begin{tabular}{llllll}
\toprule
dataset & same_class & same_name & same_concept & same_anchor_node & same_association \
\midrule
random & 0.784 & 0.804 & 0.765 & 0.706 & 0.686 \
ramc & 0.941** & 0.961** & 0.941** & 0.843** & 0.804* \
\bottomrule
\end{tabular}

