In [1]:
import os
import sys
import random

import pandas as pd

import v6.data_io

import utils.myconfig

from deep_blocker import DeepBlocker 
from tuple_embedding_models import  AutoEncoderTupleEmbedding, CTTTupleEmbedding, HybridTupleEmbedding
from vector_pairing_models import ExactTopKVectorPairing
import blocking_utils

In [7]:
tuple_embedding_model = AutoEncoderTupleEmbedding()
vector_pairing_model = ExactTopKVectorPairing(K=50)
blocker = DeepBlocker(tuple_embedding_model, vector_pairing_model)


Loading FastText model




In [13]:
def blocking_candidates(tab_A, tab_B, pair2index):
    # find common attributes for blocking
    cols_to_block = tab_A.columns & tab_B.columns
    print('Common attributes used for blocking: ', cols_to_block[:2])
    candidate_set_df = blocker.block_datasets(tab_A, tab_B, cols_to_block[:2])
    print('Candidates set: ', candidate_set_df)
    
    # lookup candidates in given indexes
    candidates = []
    for i, row in candidate_set_df.iterrows():
        pair = (row['ltable_id'], row['rtable_id'])
        if pair in pair2index:
            candidates.append(pair)
            print('Found candidate pair: ', pair)

    print('Number of candidates found: ', len(candidates))
    return candidates

def read_table(basedir, tab_name):
    return pd.read_csv(os.path.join(basedir, tab_name))

In [14]:
def inject_errors(errors=0.2):
    config_file=r'/export/da/mkunjir/LabelDebugger/config/bike.config'

    params = utils.myconfig.read_config(config_file)

    basedir = params['basedir']
    hpath = os.path.join(basedir, params['hpath'])
    gpath = os.path.join(basedir, 'golden.csv')

    exclude_attrs = ['_id', 'ltable.id', 'rtable.id']

    features, labels, pair2index, index2pair = v6.data_io.read_feature_file(hpath, exclude_attrs)
    pair2golden = v6.data_io.read_golden_label_file(gpath)

    #print('pair2index: ', pair2index)
    # label errors
    all_errors = []
    for index, p in index2pair.items():
        if labels[index]!=pair2golden[p]:
            #print('Error found at: ', index, ' the pair is: ', p)
            all_errors.append(index)

    # randomly insert errors
    seed = 0 
    rng = random.Random(seed)
    print("Seed was:", seed)

    perc = errors #rng.randint(5, 15)/100.0
    print("Error rate:", perc)

    num_err = int(len(labels)*perc)
    if num_err < len(all_errors):
        print("Existing errors larger than the error rate specified!")
        exit()
    
    # get candidates by blocking
    table_A = read_table(basedir, params['apath'])
    table_B = read_table(basedir, params['bpath'])
    eligible_indices = blocking_candidates(table_A, table_B, pair2index)
    
    error_indices = set(all_errors)
    new_error_indices = set()
    for _ in range(num_err*10):
        if len(new_error_indices) >= num_err-len(error_indices):
            break
        num = rng.randint(0, len(eligible_indices)-1)
        index = eligible_indices[num] # candidate pair index
        if index in error_indices:
            continue
        new_error_indices.add(index)
        labels[index] = 0 if labels[index]==1 else 1
        
    # write new labels along with features
    fv_df = pd.read_csv(hpath)
    new_hpath = os.path.join(basedir, 'feature_vector_errors-' + str(errors) + '.csv')
    print('New errors inserted: ', len(new_error_indices))
    for cnt, index in enumerate(new_error_indices):
      pair = index2pair[index]
      fv_df.at[index, 'label'] = labels[index]
    fv_df.to_csv(new_hpath, index=False)

    print("Total number of errors: ", len(error_indices) + len(new_error_indices))
    all_errors = list(error_indices)
    all_errors.extend(list(new_error_indices))


inject_errors()

Seed was: 0
Error rate: 0.2
Common attributes used for blocking:  Index(['id', 'bike_name'], dtype='object')


  cols_to_block = tab_A.columns & tab_B.columns
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Performing pre-processing for tuple embeddings 
Training AutoEncoder model
Obtaining tuple embeddings for left table
Obtaining tuple embeddings for right table
Indexing the embeddings from the right dataset
Querying the embeddings from left dataset
Candidates set:          ltable_id  rtable_id
0               0       8274
1               1       4113
2               2       7061
3               3       7354
4               4       5835
...           ...        ...
239295       4781       7132
239296       4782       2473
239297       4783       3936
239298       4784       5401
239299       4785       7018

[239300 rows x 2 columns]
Number of candidates found:  0


ValueError: empty range for randrange() (0, 0, 0)