In [1]:
import os
import pickle
import pandas as pd
from config import config
from func import load_corpus
from util import utils, sbert_utils, doduo_utils
from check import embed_check, sherlock_check, doduo_check, pattern_check, sbert_check, pyfunc_check, validator_check

unable to import 'smart_open.gcs', disabling that module


In [2]:
# Load the benchmark corpus; `benchmark` is the first argument of every
# preprocessing and check call in the cells below.
BENCHMARK_NAME = 'st_bench'
benchmark = load_corpus.load_corpus(BENCHMARK_NAME)

In [3]:
# --- Rule-selection parameters -------------------------------------------
# These values are only used to build the name of the rule file below.
RULE_CORPUS = 'Tablib_Sample_Large'
cohenh_thres = 0.8
conf_thres = 0.1
num_rule_thres = 500

# --- Cache locations ------------------------------------------------------
# Everything lives under the configured storage root directory.
storage_root_dir = config.dir.storage_root_dir

# SentenceBERT embeddings cache for the currently loaded corpus.
sbert_cache_dir = os.path.join(storage_root_dir, config.dir.storage_root.sbert)
sbert_dist_val_embeddings_fname = os.path.join(
    sbert_cache_dir,
    f'{load_corpus.CORPUS_NAME}_dist_val_embeddings.pkl',
)

# Doduo working directory and the scores file stored inside it.
doduo_intermediate_result_dir = os.path.join(storage_root_dir, config.dir.storage_root.doduo)
doduo_dist_val_scores_fname = os.path.join(
    doduo_intermediate_result_dir,
    f'{load_corpus.CORPUS_NAME}_dist_val_scores.pickle',
)

# Pickled rule list; the path component encodes the parameters chosen above.
rule_list_relpath = f'rule_{RULE_CORPUS}_cohen_h_{cohenh_thres}_wilson_{conf_thres}_num_rule_{num_rule_thres}/0.pickle'
rule_list_fname = os.path.join(
    storage_root_dir,
    config.dir.storage_root.fine_select_rule,
    rule_list_relpath,
)

In [4]:
# Read the selected rules from disk.
# NOTE: unpickling can execute arbitrary code, so only load trusted rule files.
with open(rule_list_fname, 'rb') as rule_file:
    rule_list = pickle.load(rule_file)

# Distinct first components of the rules. De-duplication goes through a set,
# so the order of `pre_list` is arbitrary.
pre_list = list({rule[0] for rule in rule_list})

In [5]:
# Precomputed model outputs. Each stays None unless some rule refers to the
# corresponding model.
sbert_dist_val_embeddings = None
doduo_dist_val_scores = None

# A rule names a model in rule[0][0] and/or rule[1][0]. The check cell passes
# these objects to the rules selected by rule[1][0], and the matching step
# receives them for the rule[0] entries, so load when either side needs them.
needs_sbert = any('sbert' in (rule[0][0], rule[1][0]) for rule in rule_list)
needs_doduo = any('doduo' in (rule[0][0], rule[1][0]) for rule in rule_list)

if needs_sbert:
    if os.path.exists(sbert_dist_val_embeddings_fname):
        # NOTE: unpickling can execute arbitrary code; only load trusted cache files.
        with open(sbert_dist_val_embeddings_fname, 'rb') as file:
            sbert_dist_val_embeddings = pickle.load(file)
    else:
        print("SentenceBERT embedding file not found, computing ...")
        sbert_dist_val_embeddings = sbert_utils.dist_val_embeddings_parallel(benchmark, n_proc = 8)
        # Persist for later runs; the in-memory result is used directly
        # instead of being read back from the file that was just written.
        with open(sbert_dist_val_embeddings_fname, 'wb') as file:
            pickle.dump(sbert_dist_val_embeddings, file)

if needs_doduo:
    if not os.path.exists(doduo_dist_val_scores_fname):
        print("Doduo preprocessing result not found, computing ...")
        # The helper is given the output file name and its return value is not
        # used here, so the scores are read from that file afterwards.
        doduo_utils.dist_val_scores_parallel(benchmark, doduo_intermediate_result_dir, doduo_dist_val_scores_fname, n_proc = 15)
    doduo_dist_val_scores = pd.read_pickle(doduo_dist_val_scores_fname)

In [None]:
# Build `test_matching_dict` from the benchmark and the distinct rule[0]
# entries; every check in the next cell takes it as its second argument.
# (Presumably it maps each entry of `pre_list` to the benchmark items it
# matches -- confirm in `utils`.)
precomputed_model_outputs = {
    'sbert_dist_val_embeddings': sbert_dist_val_embeddings,
    'doduo_dist_val_scores': doduo_dist_val_scores,
}
test_matching_dict = utils.build_matching_idx_dict_from_pre_list_parallel(
    benchmark,
    pre_list,
    n_proc=32,
    **precomputed_model_outputs,
)

In [None]:
# Run every rule through the checker selected by its rule[1][0] tag.
# Each table entry is (tag, checker function, extra keyword arguments); the
# entries are listed in the order the checkers must run so that `results`
# keeps the same ordering as before.
check_dispatch = [
    ('cta', sherlock_check.sherlock_check_parallel, {'n_proc': 48}),
    ('doduo', doduo_check.doduo_check_parallel, {'n_proc': 15, 'doduo_dist_val_scores': doduo_dist_val_scores}),
    ('embed', embed_check.embed_check_parallel, {'n_proc': 48}),
    ('sbert', sbert_check.sbert_check_parallel, {'n_proc': 8, 'sbert_dist_val_embeddings': sbert_dist_val_embeddings}),
    ('pattern', pattern_check.pattern_check, {}),
    ('pyfunc', pyfunc_check.pyfunc_check, {}),
    ('validator', validator_check.validator_check, {}),
]

results = []
for check_tag, run_check, extra_kwargs in check_dispatch:
    # One pass over the rules per tag; a checker is skipped when nothing matches.
    sub_rule_list = [rule for rule in rule_list if rule[1][0] == check_tag]
    if sub_rule_list:
        results += run_check(benchmark, test_matching_dict, sub_rule_list, **extra_kwargs)

In [8]:
# Merge the per-check result frames into one table with a single row per
# index label, keeping for each label the row with the smallest 'conf'
# (the first one seen wins on ties).
# Rows are collected in a dict and the frame is built once: DataFrame.append
# no longer exists in pandas 2.x, and growing a frame row by row is quadratic.
DISPLAY_COLUMNS = ['header', 'ground_truth', 'ground_truth_debatable', 'dist_val', 'outlier', 'conf', 'SDC']

best_rows = {}  # index label -> best row so far; insertion order = first appearance
for check_result in results:
    for idx, row in check_result.iterrows():
        kept_row = best_rows.get(idx)
        if kept_row is None or row['conf'] < kept_row['conf']:
            best_rows[idx] = row

if best_rows:
    # Each Series carries its index label as its name, so the labels are preserved.
    final_res = pd.DataFrame(list(best_rows.values()))
    # The displayed score is the complement of the stored 'conf' value.
    final_res['conf'] = 1 - final_res['conf']
    final_res = final_res.rename(columns={"rule": "SDC"})
else:
    # No check produced any row: show an empty table instead of failing on a
    # missing 'conf' column.
    final_res = pd.DataFrame(columns=DISPLAY_COLUMNS)

final_res.sort_values('conf', ascending = False)[DISPLAY_COLUMNS]

  arr_value = np.asarray(value)


Unnamed: 0,header,ground_truth,ground_truth_debatable,dist_val,outlier,conf,SDC
651,region,[xoutlets],[],"[arizona, atlanta, colorado, connecticut, flor...",xoutlets,0.984932,"((sbert, 1, indiana, 0.8, 1.1), (sbert, 1.35),..."
1003,f,[indonasia],[],"[uae, malaysia, indonasia, kenya, qatar, iraq,...",indonasia,0.983402,"((sbert, 1, canada, 0.8, 1.1), (sbert, 1.35), ..."
701,last inspection date,[new facility],[],"[12/3/2020, 11/5/2020, 2/5/2021, new facility,...",new facility,0.965059,"((pattern, 1, 99/08/27, ^[\+\-\d\.,]+/[\+\-\d\..."
619,clean country,[self-employed],[],"[pakistan, india, ethiopia, cambodia, sri lank...",self-employed,0.960564,"((embed, 1, australia, 0.8, 6.0), (embed, 8.0)..."
668,country,[],[],"[india, iran (islamic republic of), kyrgyzstan...",eswatini,0.942635,"((embed, 1, mexico, 0.8, 6.0), (embed, 9.5), 1..."
840,mail,[],[],"[cp.avellino@cri.it, cl.bagnoaripoli@cri.it, c...",area6@criferrara.it; cp.ferrara@cri.it,0.929262,"((pyfunc, email, 0.98), (pyfunc, email), 2.680..."
154,gene name,[-],[],"[-, yro2, akl1, mrs5, met8, srb6, mrpl27, sps2...",-,0.919412,"((pattern, 1, n7, ^[a-zA-Z]+[\+\-\d\.,]+$, 0.9..."
411,f,[ngpnwg],[],"[ngp05162, ngp05241, ngp05211, ngp02131, ngp03...",ngpnwg,0.919412,"((pattern, 1, n7, ^[a-zA-Z]+[\+\-\d\.,]+$, 0.9..."
574,code produit,[],[],"[mel003, mel004, mel005, mel006, mel007, mel00...",transp 1,0.919412,"((pattern, 1, n7, ^[a-zA-Z]+[\+\-\d\.,]+$, 0.9..."
897,ホームページ,[],[http://www.plasmashower.com/?napm=ct%3djhsmux...,"[www.ewhamask.com, www.cellreturn.com, www.hit...",http://www.plasmashower.com/?napm=ct%3djhsmuxe...,0.90162,"((pyfunc, url, 0.99), (pyfunc, url), 2.5115441..."
