In [1]:
import os
from utils import myconfig
import v6.data_io, v6.feature_selection, v6.label_debugger
import pandas as pd
import numpy as np
import time
from IPython.display import display, HTML
from pprint import pprint


In [2]:
## debug iterations
def _lookup_row(table, raw_id):
    """Return the first row of ``table`` whose 'id' column equals ``raw_id``.

    The id is tried as an integer first and then in its raw form, so the lookup
    succeeds whether the id column holds numbers or strings.

    Raises:
        IndexError: if no row matches under either interpretation.
    """
    keys = [raw_id]
    try:
        keys.insert(0, int(raw_id))
    except (TypeError, ValueError):
        pass  # non-numeric id: only the raw value can match

    for key in keys:
        try:
            rows = table.loc[table['id'] == key]
        except TypeError:
            # older pandas releases may refuse to compare mismatched types
            # (e.g. an int column with a str) instead of returning all-False
            continue
        if len(rows) > 0:
            return rows.iloc[0]
    raise IndexError("no row with id " + repr(raw_id) + " found in table")


def _ask_analyst_for_labels(expected_count):
    """Prompt until the analyst enters exactly ``expected_count`` 0/1 characters.

    Returns the answer as a list of ints, one per inspected pair.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    prompt = ("Inspect the table and enter 0/1 to signify non-match/match. The input length should be: "
              + str(expected_count) + '\n')
    while True:
        answer = read_line(prompt).strip()
        if len(answer) == expected_count and all(ch in '01' for ch in answer):
            return [int(ch) for ch in answer]
        print("Invalid input: expected exactly " + str(expected_count)
              + " characters, each 0 or 1. Please try again.")


def debug_labels(debugger, index2pair, table_A, table_B, top_k, max_iter, file_prefix):
    """Run the interactive label-debugging loop.

    Every iteration asks ``debugger`` for its ``top_k`` most suspicious labels,
    displays the corresponding record pairs, asks the analyst for the correct
    labels, writes the pairs together with the original and corrected labels to
    ``<file_prefix>_iter_<debugger.iter_count>.csv`` and hands the answers to
    ``debugger.analyze``. Unless the loop stops, the corrections are then applied
    with ``debugger.correct_labels`` and explanations are printed.

    The loop stops after three consecutive iterations without errors, or once
    ``max_iter`` iterations have been completed; at least one iteration always runs.

    NOTE: the original labels are read from the notebook-level variable
    ``labels``, which must be defined before this function is called.

    Args:
        debugger: object exposing ``find_suspicious_labels(top_k)``,
            ``analyze(mapping)``, ``correct_labels(mapping)``,
            ``explain_errors(flag)`` and an ``iter_count`` attribute.
        index2pair: maps a label index to a pair whose first element is the id
            in ``table_A`` and whose second element is the id in ``table_B``.
        table_A: DataFrame with an 'id' column (left records).
        table_B: DataFrame with an 'id' column (right records).
        top_k: number of suspicious labels requested per iteration.
        max_iter: maximum number of iterations.
        file_prefix: path prefix of the per-iteration CSV files.

    Returns:
        Tuple ``(all_detected_errors, match_detected_errors, iter_times)``: the
        accumulated ``error_indices`` and ``match_error_indices`` reported by
        ``debugger.analyze``, and the time measured for each iteration. The time
        covers the debugger calls only, not the wait for the analyst.

    Raises:
        IndexError: if a pair refers to an id missing from its table.
    """
    # time.clock() was removed in Python 3.8. Keep using it where it still exists
    # so timings stay comparable, and fall back to time.perf_counter() elsewhere.
    timer = getattr(time, 'clock', None) or time.perf_counter

    iter_times = []
    correct_label_attr = 'correct_label'
    hidden_prefixes = ('ltable.id', 'rtable.id', 'ltable.assembled', 'rtable.assembled')
    num_iter_without_errors = 0
    all_detected_errors = []
    match_detected_errors = []
    total_num_iters = 0
    start = timer()

    while True:
        top_suspicious_indices = debugger.find_suspicious_labels(top_k)
        iter_times.append(timer() - start)

        # labels currently assigned to the suspicious pairs (notebook-level `labels`)
        original_labels = [labels[i] for i in top_suspicious_indices]

        # combine those suspicious pairs into a dataframe and save to file
        all_pairs_with_label = []
        for index in top_suspicious_indices:
            pair = index2pair[index]
            left = _lookup_row(table_A, pair[0])
            right = _lookup_row(table_B, pair[1])

            record = {}
            for col in left.index:
                record['ltable.' + col] = left[col]
            for col in right.index:
                record['rtable.' + col] = right[col]
            all_pairs_with_label.append(record)

        ## ask analyst if the labels are correct
        df = pd.DataFrame(all_pairs_with_label)
        # hide id/assembled columns and columns that are entirely null
        display_df = df[[c for c in df.columns
                         if not c.startswith(hidden_prefixes)
                         and not df[c].isnull().values.all()]]
        display(HTML(display_df.to_html(index=False)))
        correct_labels = _ask_analyst_for_labels(len(top_suspicious_indices))

        df['label'] = original_labels
        df[correct_label_attr] = correct_labels

        output_file = file_prefix + '_iter_' + str(debugger.iter_count) + '.csv'
        df.to_csv(output_file, index=False)

        index2correct_label = dict(zip(top_suspicious_indices, correct_labels))
        iter_count, num_errors, error_indices, match_error_indices, det_error_poses = debugger.analyze(index2correct_label)
        print('Iteration: ', iter_count)
        print('Number of suspicious labels found: ', len(top_suspicious_indices))
        print("Number of errors found: ", num_errors)
        print("Error indices in the table: ", [top_suspicious_indices.index(i) for i in error_indices])
        print("Detector performance: ")
        for n, (count, pos) in enumerate(det_error_poses):
            print("Detector ", n, "found ", count, " errors")

        all_detected_errors.extend(error_indices)
        match_detected_errors.extend(match_error_indices)

        if num_errors == 0:
            num_iter_without_errors += 1
        else:
            num_iter_without_errors = 0

        if num_iter_without_errors >= 3:
            break

        # restart the clock here so the next measurement excludes the analyst's wait
        start = timer()
        debugger.correct_labels(index2correct_label)
        ## mayuresh: explanations
        print('---Explanations for wrong matches and wrong non-matches follow---')
        debugger.explain_errors(False)
        debugger.explain_errors(True)

        total_num_iters += 1

        if total_num_iters >= max_iter:
            break

    return all_detected_errors, match_detected_errors, iter_times


In [6]:
# Get data from LabelDebugger config files

def read_config(config_file):
    """Load the dataset configuration and overlay the fixed debugger settings.

    The settings below are written after the file has been read, so they
    replace any same-named entries coming from ``config_file``.

    Args:
        config_file: path handed to ``myconfig.read_config``.

    Returns:
        The parameter mapping returned by ``myconfig.read_config`` with the
        fixed settings added.
    """
    params = myconfig.read_config(config_file)

    # other config params
    fixed_settings = (
        ('fs_alg', 'model'),
        ('max_list_len', 20),
        ('detectors', 'fpfn'),
        ('confusion', True),
        ('num_cores', 1),
        ('num_folds', 5),
        ('min_con_dim', 1),
        ('counting_only', True),
        ('top_k', 20),
        ('max_iter', 10),
    )
    for key, value in fixed_settings:
        params[key] = value

    return params

def read_table(basedir, tab_name):
    """Read the CSV file ``tab_name`` located under ``basedir`` into a DataFrame."""
    csv_path = os.path.join(basedir, tab_name)
    table = pd.read_csv(csv_path)
    return table

In [None]:
# Specify username; the output path will use the name
username = 'fake'

# Specify the configuration file path of your dataset
config_path = '/export/da/mkunjir/LabelDebugger/config/tools.config'
params = read_config(config_path)

# Optional: Change the budget parameters by uncommenting and setting desired values
#params['top_k'] = 10 # The number of examples inspected in each iterations
params['max_iter'] = 10 # The maximum number of debugging iterations

# read features and labels
basedir = params['basedir']
hpath = os.path.join(basedir, params['hpath'])
exclude_attrs = ['_id', 'ltable.id', 'rtable.id']
features, labels, feature_labels, pair2index, index2pair = v6.data_io.read_feature_file(hpath, exclude_attrs)

# read tables
table_A = read_table(basedir, params['apath'])
table_B = read_table(basedir, params['bpath'])

# select features
# By default any pre-selected feature file named in the config ('spath') is
# ignored and feature selection is re-run. Set the flag to False to reuse it.
# The delete is guarded so a config without 'spath' no longer raises KeyError.
force_feature_selection = True
if force_feature_selection and 'spath' in params:
    del params['spath']

if 'spath' in params:
    selected_features_path = os.path.join(basedir, params['spath'])
    selected_features = pd.read_csv(selected_features_path).to_numpy()
else:
    selected_features, selected_feature_indexes = v6.feature_selection.select_features(features, labels, params['fs_alg'])
    print('Selected features: ', [feature_labels[i] for i in selected_feature_indexes])
    # keep a copy of the selection next to the dataset
    selected_features_path = os.path.join(basedir, 'selected_features.csv')
    pd.DataFrame(selected_features).to_csv(selected_features_path, index=False)
print('Selected features of dim: ', selected_features.shape, ' from the original features: ', features.shape)
params['fs_alg'] = 'none' # disabling feature selection now that we are done selecting

# set debugger
debugger = v6.label_debugger.LabelDebugger(selected_features, labels, params)

# start debugging
output_prefix = '/export/da/mkunjir/LabelDebugger/analyst/' + username + '_' + params['dataset_name']
all_detected_errors, match_detected_errors, iter_times = debug_labels(
    debugger, index2pair, table_A, table_B, params['top_k'], params['max_iter'], output_prefix)

# print() with parentheses parses on both Python 2 and Python 3
print('\n\n')
print("Number of iterations: ", debugger.iter_count)
print("Number of checked pairs: ", len(debugger.verified_indices))
print("Number of detected errors: ", len(all_detected_errors))
print("Of which false non-matches: ", len(match_detected_errors))
print("First iteration (secs): ", iter_times[0])
print("For other iterations: ")
# With a single iteration there is nothing to aggregate; min()/max() of an empty
# list and the division by len() would otherwise raise.
later_iter_times = iter_times[1:]
if later_iter_times:
    print("Min (secs): ", min(later_iter_times))
    print("Max (secs): ", max(later_iter_times))
    print("Ave (secs): ", sum(later_iter_times)/len(later_iter_times))
else:
    print("No further iterations were run.")

