In [1]:
import os
import subprocess

import numpy as np
import pandas as pd

# Render every column and never truncate cell text when displaying DataFrames.
# display.max_rows and display.width are intentionally left at their defaults.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [2]:
# Root working directory for every path built in this notebook.
# Set the TABLE_LINKER_HOME environment variable to run on another machine;
# the original author's local path is kept as the default.
HOME_DIR = os.environ.get(
    'TABLE_LINKER_HOME',
    '/Users/summ7t/dev/novartis/table-linker/t2dv2-candidates-april-28/train',
)

### Generate lof-graph-embedding-score for any table

Required datasets:
- candidate file
- candidate feature file
- graph_embedding_complex.tsv (generated and stored during candidate generation)

Script used: `lof-script.sh` (every path inside it is relative to the directory the script is run from)

```
filename=$1
tsv_postfix=_graph_embedding_complex

tl smallest-qnode-number train-candidates/candidates-$filename.csv \
/ align-page-rank \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard \
/ vote-by-classifier --prob-threshold 0.995 --model weighted_lr.pkl \
> model-voted/$filename.csv

tl score-using-embedding model-voted/$filename.csv \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o graph-embedding-score \
--embedding-file train-graph-embeddings/$filename$tsv_postfix.tsv \
--embedding-url http://ckg07:9200/wikidatadwd-augmented/ \
> lof-score/$filename.csv
```

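Command: `bash {HOME_DIR}/lof-script.sh {fid}`, where `{fid}` is the candidate file name without its `candidates-` prefix and `.csv` extension.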

Output: `lof-score/$filename.csv`, which contains the `is_lof` and `graph-embedding-score` (centroid-of-lof) columns.

In [3]:
# Create one output folder per pipeline stage (no-op when it already exists).
# os.makedirs replaces `!mkdir -p $HOME_DIR/...` so that a HOME_DIR containing
# spaces is handled correctly and no POSIX shell is required.
for _stage_dir in ('model-voted', 'lof-score', 'merged-lof-score', 'final-features'):
    os.makedirs(os.path.join(HOME_DIR, _stage_dir), exist_ok=True)

In [4]:
# List every non-empty candidate file under train-candidates/ and derive its
# file id: candidates-<file id>.csv -> <file id>.
candidate_prefix = 'candidates-'
candidate_suffix = '.csv'

file_names = []
file_ids = []

for (dirpath, dirnames, filenames) in os.walk(f'{HOME_DIR}/train-candidates/'):
    for fn in filenames:
        # Match the real prefix/suffix: a bare "csv" substring test also accepts
        # unrelated names, and a CSV without the prefix would raise IndexError
        # when the file id is extracted.
        if not (fn.startswith(candidate_prefix) and fn.endswith(candidate_suffix)):
            continue
        # os.walk only yields a trailing separator for the root we passed in;
        # os.path.join keeps the path valid for nested directories as well.
        abs_fn = os.path.join(dirpath, fn)
        assert os.path.isfile(abs_fn)
        if os.path.getsize(abs_fn) == 0:
            # Empty candidate files carry nothing to score.
            continue
        file_names.append(abs_fn)
        file_ids.append(fn[len(candidate_prefix):-len(candidate_suffix)])
len(file_names), file_ids[:3]

(44,
 ['69537082_0_7789694313271016902',
  '60319454_0_3938426910282115527',
  '16767252_0_2409448375013995751'])

In [12]:
# Make sure VPN is on
# Run lof-script.sh once per table and verify that both result files exist.
for idx, fid in enumerate(file_ids):
    print(f"Generating score for {idx}th file: {fid}...")
    # An argument list (no shell) means characters in fid are never interpreted
    # by a shell, and the exit status is available to check.
    completed = subprocess.run(['bash', f'{HOME_DIR}/lof-script.sh', fid])
    assert completed.returncode == 0, f"lof-script.sh exited with status {completed.returncode}: {idx}th file: {fid}"
    # The script writes its results with `>` redirections, which create the
    # file even when the command fails, so also require non-empty output.
    model_voted_f = f'{HOME_DIR}/model-voted/{fid}.csv'
    lof_score_f = f'{HOME_DIR}/lof-score/{fid}.csv'
    assert os.path.isfile(model_voted_f) and os.path.getsize(model_voted_f) > 0, f"Something wrong with model-voted result: {idx}th file: {fid}"
    assert os.path.isfile(lof_score_f) and os.path.getsize(lof_score_f) > 0, f"Something wrong with lof-score result: {idx}th file: {fid}"

Generating score for 0th file: 69537082_0_7789694313271016902...
Generating score for 1th file: 60319454_0_3938426910282115527...
Generating score for 2th file: 16767252_0_2409448375013995751...
Generating score for 3th file: 84548468_0_5955155464119382182...
Generating score for 4th file: 80588006_0_6965325215443683359...
Generating score for 5th file: 39650055_5_7135804139753401681...
Generating score for 6th file: 8468806_0_4382447409703007384...
Generating score for 7th file: 1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2...
Generating score for 8th file: 25404227_0_2240631045609013057...
Generating score for 9th file: 63450419_0_8012592961815711786...
Generating score for 10th file: 53822652_0_5767892317858575530...
Generating score for 11th file: 22864497_0_8632623712684511496...
Generating score for 12th file: 37856682_0_6818907050314633217...
Generating score for 13th file: 26310680_0_5150772059999313798...
Generating score for 14th file: 29414811_12_25115247

In [13]:
# Spot-check one table: show the model-voted rows whose vote_by_classifier
# value is positive.
fid = '38428277_0_1311643810102462607'
model_voted_df = pd.read_csv(f'{HOME_DIR}/model-voted/{fid}.csv')
model_voted_df.loc[model_voted_df['vote_by_classifier'].gt(0)]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier
3,1,0,M,111|1931|Fritz Lang|75,M,Q127021,M,,exact-match,1931 German drama-thriller directed by Fritz Lang,4.462731e-09,15.978438,0,4.462731e-09,1.0,0.285714,0.571429,1
71,1,0,M,111|1931|Fritz Lang|75,M,Q9933,M,Mike|m|em,fuzzy-augmented,letter in the Latin alphabet,1.525629e-06,8.985167,1,0.000000e+00,1.0,0.000000,0.000000,1
98,1,0,M,111|1931|Fritz Lang|75,M,Q127021,M,,fuzzy-augmented,1931 German drama-thriller directed by Fritz Lang,4.462731e-09,8.823359,0,0.000000e+00,1.0,0.285714,0.571429,1
431,1,3,The Magnificent Seven,346|1960|John Sturges|318,The Magnificent Seven,Q19069,The Magnificent Seven,Magnificent Seven,exact-match,1960 American western film directed by John Sturges,4.667529e-09,19.657589,1,4.667529e-09,1.0,0.250000,0.500000,1
441,1,3,The Magnificent Seven,346|1960|John Sturges|318,The Magnificent Seven,Q19069,The Magnificent Seven,Magnificent Seven,fuzzy-augmented,1960 American western film directed by John Sturges,4.667529e-09,25.419687,1,0.000000e+00,1.0,0.250000,0.500000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14075,1,110,The Rules of the Game,192|1939|Jean Renoir|70,The Rules of the Game,Q748599,The Rules of the Game,La Regle du jeu|La Règle du jeu|Rules of the Game,fuzzy-augmented,1939 French film directed by Jean Renoir,3.914503e-09,22.951918,0,0.000000e+00,1.0,0.285714,0.571429,1
14081,1,111,Run Lola Run,191|1998|Tom Tykwer|494,Run Lola Run,Q468697,Run Lola Run,,exact-match,1998 film by Tom Tykwer,4.821743e-09,21.693314,1,4.821743e-09,1.0,0.400000,0.800000,1
14082,1,111,Run Lola Run,191|1998|Tom Tykwer|494,Run Lola Run,Q468697,Run Lola Run,,fuzzy-augmented,1998 film by Tom Tykwer,4.821743e-09,38.246160,1,0.000000e+00,1.0,0.400000,0.800000,1
14186,1,112,Rushmore,238|1998|Wes Anderson|286,Rushmore,Q1347393,Rushmore,,exact-match,1998 film by Wes Anderson,4.691608e-09,19.499273,0,4.691608e-09,1.0,0.400000,0.800000,1


In [14]:
# Spot-check the same table's lof-score file: ten highest graph-embedding-score rows.
fid = '38428277_0_1311643810102462607'
score_df = pd.read_csv(f'{HOME_DIR}/lof-score/{fid}.csv')
score_df.sort_values('graph-embedding-score', ascending=False).iloc[:10]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier,singleton,is_lof,graph-embedding-score
10179,1,7,The Man Who Shot Liberty Valance,305|1962|John Ford|288,The Man Who Shot Liberty Valance,Q555687,The Man Who Shot Liberty Valance,Man Who Shot Liberty Valance,exact-match,1962 film by John Ford,4.198485e-09,20.621593,0,4.198485e-09,1.0,0.4,0.8,1,0,1,0.898658
10183,1,7,The Man Who Shot Liberty Valance,305|1962|John Ford|288,The Man Who Shot Liberty Valance,Q555687,The Man Who Shot Liberty Valance,Man Who Shot Liberty Valance,fuzzy-augmented,1962 film by John Ford,4.198485e-09,43.213043,0,0.0,1.0,0.4,0.8,1,0,1,0.898658
10080,1,69,The Player,289|1992|Robert Altman|492,The Player,Q1551573,The Player,Player,fuzzy-augmented,1992 film by Robert Altman,4.266846e-09,17.465298,0,0.0,1.0,0.4,0.8,1,0,1,0.884048
10027,1,69,The Player,289|1992|Robert Altman|492,The Player,Q1551573,The Player,Player,exact-match,1992 film by Robert Altman,4.266846e-09,18.660294,0,4.266846e-09,1.0,0.4,0.8,1,0,1,0.884048
7641,1,52,The Outlaw Josey Wales,448|1976|Clint Eastwood|645,The Outlaw Josey Wales,Q477630,The Outlaw Josey Wales,Outlaw Josey Wales,exact-match,1976 film by Clint Eastwood,4.088294e-09,21.693314,0,4.088294e-09,1.0,0.4,0.8,1,1,1,0.883297
7642,1,52,The Outlaw Josey Wales,448|1976|Clint Eastwood|645,The Outlaw Josey Wales,Q477630,The Outlaw Josey Wales,Outlaw Josey Wales,fuzzy-augmented,1976 film by Clint Eastwood,4.088294e-09,38.130527,0,0.0,1.0,0.4,0.8,1,0,1,0.883297
11497,1,8,The Manchurian Candidate,92|1962|John Frankenheimer|124,The Manchurian Candidate,Q500672,The Candidate,Candidate,fuzzy-augmented,American political comedy-drama film,3.539613e-09,28.223623,0,0.0,0.93642,0.0,0.0,0,0,-1,0.878668
1688,1,109,The Royal Tenenbaums,266|2001|Wes Anderson|516,The Royal Tenenbaums,Q935105,The Royal Tenenbaums,Royal Tenenbaums,fuzzy-augmented,2001 film by Wes Anderson,3.855798e-09,34.257706,0,0.0,1.0,0.4,0.8,1,0,1,0.875843
1687,1,109,The Royal Tenenbaums,266|2001|Wes Anderson|516,The Royal Tenenbaums,Q935105,The Royal Tenenbaums,Royal Tenenbaums,exact-match,2001 film by Wes Anderson,3.855798e-09,21.047188,0,3.855798e-09,1.0,0.4,0.8,1,0,1,0.875843
12580,1,88,Rear Window,21|1954|Alfred Hitchcock|34,Rear Window,Q34414,Rear Window,,exact-match,1954 American suspense film directed by Alfred Hitchcock,4.041058e-09,20.621593,0,4.041058e-09,1.0,0.25,0.5,0,0,-1,0.87551


In [17]:
# Attach the lof-score columns (renamed graph-embedding-score, plus is_lof) to
# each table's candidate feature file and write the result to merged-lof-score/.
merge_keys = ['column', 'row', 'kg_id', 'method']
for idx, fid in enumerate(file_ids):
    print(f"Merging embedding score for {idx}th file: {fid}...")
    features_df = pd.read_csv(f'{HOME_DIR}/train-features/{fid}.csv')
    # Rename so the score does not clash with the feature file's own
    # graph-embedding-score column.
    lof_score_df = pd.read_csv(f'{HOME_DIR}/lof-score/{fid}.csv').rename(
        columns={'graph-embedding-score': 'lof-graph-embedding-score'}
    )
    trimmed_lof_score_df = lof_score_df[merge_keys + ['lof-graph-embedding-score', 'is_lof']]

    # Join on the candidate key, then drop fully duplicated rows; the merged
    # frame must end up with exactly one row per feature row.
    final_df = pd.merge(features_df, trimmed_lof_score_df, on=merge_keys).drop_duplicates()
    assert len(final_df) == len(features_df), f"{len(features_df)}, {len(final_df)}"

    merged_path = f"{HOME_DIR}/merged-lof-score/{fid}.csv"
    final_df.to_csv(merged_path, index=False)
    assert os.path.isfile(merged_path), f"Something wrong with merged score result: {idx}th file: {fid}"

Merging embedding score for 0th file: 69537082_0_7789694313271016902...
Merging embedding score for 1th file: 60319454_0_3938426910282115527...
Merging embedding score for 2th file: 16767252_0_2409448375013995751...
Merging embedding score for 3th file: 84548468_0_5955155464119382182...
Merging embedding score for 4th file: 80588006_0_6965325215443683359...
Merging embedding score for 5th file: 39650055_5_7135804139753401681...
Merging embedding score for 6th file: 8468806_0_4382447409703007384...
Merging embedding score for 7th file: 1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2...
Merging embedding score for 8th file: 25404227_0_2240631045609013057...
Merging embedding score for 9th file: 63450419_0_8012592961815711786...
Merging embedding score for 10th file: 53822652_0_5767892317858575530...
Merging embedding score for 11th file: 22864497_0_8632623712684511496...
Merging embedding score for 12th file: 37856682_0_6818907050314633217...
Merging embedding score for

In [18]:
# Spot-check one merged file: show the rows flagged is_lof == 1.
fid = '50245608_0_871275842592178099'
merged_score_df = pd.read_csv(f'{HOME_DIR}/merged-lof-score/{fid}.csv')
# Alternative view, kept for reference:
# merged_score_df.sort_values(by=['lof-graph-embedding-score'], ascending=False).head(10)
merged_score_df.loc[merged_score_df['is_lof'].eq(1)]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof
106,0,1,12 Angry Men,1957|DVD|Sidney Lumet|Reginald Rose|96|Widescreen,12 Angry Men,Q2345,12 Angry Men,Twelve Angry Men,fuzzy-augmented,1957 American drama film by Sidney Lumet,1.424518e-05,27.113117,Q2345,12 Angry Men,1,1.000000,0.285714,1.000000,0.776624,0,0.200000,12,3,0.170207,0.497526,0.816226,1
204,0,1,12 Angry Men,1957|DVD|Sidney Lumet|Reginald Rose|96|Widescreen,12 Angry Men,Q2345,12 Angry Men,Twelve Angry Men,exact-match,1957 American drama film by Sidney Lumet,1.424518e-05,20.049892,Q2345,12 Angry Men,1,1.000000,0.285714,1.000000,0.776624,0,0.166667,12,3,0.170207,0.497526,0.816226,1
326,0,10,American History X,1998|DVD|Tony Kaye|David McKenna|119|Widescreen,American History X,Q208572,American History X,,exact-match,1998 drama film directed by Tony Kaye,5.971302e-09,21.693314,Q208572,American History X,1,1.000000,0.285714,1.000000,0.794793,1,0.500000,18,3,0.170207,0.408187,0.819788,1
460,0,100,Hackers,1995|DVD|Iain Softley|Rafael Moreu|107|Widescreen,Hackers,Q13908,Hackers,,exact-match,1995 American thriller film by Iain Softley,3.539613e-09,20.049892,Q13908,Hackers,1,1.000000,0.285714,1.000000,0.825205,0,0.500000,7,1,0.170207,0.328429,0.832763,1
465,0,101,Happy Gilmore,1996|DVD|Dennis Dugan|Tim Herlihy|92|Widescreen,Happy Gilmore,Q1313063,Happy Gilmore,,fuzzy-augmented,1996 film by Dennis Dugan,5.264195e-09,23.724293,Q1313063,Happy Gilmore,1,1.000000,0.400000,1.000000,0.788162,0,1.000000,13,2,0.170207,0.330090,0.779795,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32409,0,94,Goodfellas,1990|DVD|Martin Scorsese|Nicholas Pileggi|145|Widescreen,Goodfellas,Q42047,Goodfellas,Morrie\s Wig Shop|Morrie\s Wigs,exact-match,1990 film by Martin Scorsese,3.841212e-09,19.838593,Q42047,Goodfellas,1,1.000000,0.400000,1.000000,0.806880,0,0.500000,10,1,0.170207,0.450226,0.844393,1
32414,0,95,Gran Torino,2008|Blu-Ray|Clint Eastwood|Nick Schenk|116|Widescreen,Gran Torino,Q126699,Gran Torino,,fuzzy-augmented,2008 film by Clint Eastwood,3.840480e-09,29.181086,Q126699,Gran Torino,1,1.000000,0.400000,1.000000,0.805404,0,1.000000,11,2,0.170207,0.384093,0.804078,1
32520,0,95,Gran Torino,2008|Blu-Ray|Clint Eastwood|Nick Schenk|116|Widescreen,Gran Torino,Q126699,Gran Torino,,exact-match,2008 film by Clint Eastwood,3.840480e-09,21.693314,Q126699,Gran Torino,1,1.000000,0.400000,1.000000,0.805404,1,0.500000,11,2,0.170207,0.384093,0.804078,1
32812,0,98,"Great Gatsby, The",1974|DVD|Jack Clayton|Francis Ford Coppola|144|Widescreen,"Great Gatsby, The",Q1198799,The Great Gatsby,Great Gatsby,fuzzy-augmented,1974 film by Jack Clayton,3.925254e-09,28.070555,Q1198799,The Great Gatsby,1,0.990476,0.400000,0.732843,0.810695,0,1.000000,16,3,0.170207,0.365816,0.837175,1


In [31]:
# Generate lof-reciprocal-rank
# For each table, run a tl pipeline on the merged file that adds
# lof-reciprocal-rank plus class/property tf-idf scores computed with is_lof as
# the singleton column, and write the result to final-features/.
for idx, fid in enumerate(file_ids):
    print(f"generating final feature for {idx}th file: {fid}")
    class_count_f = f'{HOME_DIR}/train-class-count/{fid}_class_count.tsv'
    property_count_f = f'{HOME_DIR}/train-prop-count/{fid}_prop_count.tsv'
    merged_lof_f = f'{HOME_DIR}/merged-lof-score/{fid}.csv'
    final_features_f = f'{HOME_DIR}/final-features/{fid}.csv'
    # Argument list instead of a shell string: paths need no quoting, and the
    # literal "/" arguments separate the chained tl sub-commands.
    tl_cmd = [
        'tl', 'generate-reciprocal-rank', merged_lof_f,
        '-c', 'lof-graph-embedding-score',
        '-o', 'lof-reciprocal-rank',
        '/', 'compute-tf-idf',
        '--feature-file', class_count_f,
        '--feature-name', 'class_count',
        '--singleton-column', 'is_lof',
        '-o', 'lof_class_count_tf_idf_score',
        '/', 'compute-tf-idf',
        '--feature-file', property_count_f,
        '--feature-name', 'property_count',
        '--singleton-column', 'is_lof',
        '-o', 'lof_property_count_tf_idf_score',
    ]
    with open(final_features_f, 'w') as out_f:
        completed = subprocess.run(tl_cmd, stdout=out_f)
    # The output file exists as soon as it is opened, so existence alone cannot
    # reveal a failed run: check the exit status and that something was written.
    assert completed.returncode == 0, f"tl exited with status {completed.returncode}: {idx}th file: {fid}"
    assert os.path.isfile(final_features_f) and os.path.getsize(final_features_f) > 0, f"Something wrong with final feature result: {idx}th file: {fid}"

generating final feature for 0th file: 69537082_0_7789694313271016902
generating final feature for 1th file: 60319454_0_3938426910282115527
generating final feature for 2th file: 16767252_0_2409448375013995751
generating final feature for 3th file: 84548468_0_5955155464119382182
generating final feature for 4th file: 80588006_0_6965325215443683359
generating final feature for 5th file: 39650055_5_7135804139753401681
generating final feature for 6th file: 8468806_0_4382447409703007384
generating final feature for 7th file: 1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2
generating final feature for 8th file: 25404227_0_2240631045609013057
generating final feature for 9th file: 63450419_0_8012592961815711786
generating final feature for 10th file: 53822652_0_5767892317858575530
generating final feature for 11th file: 22864497_0_8632623712684511496
generating final feature for 12th file: 37856682_0_6818907050314633217
generating final feature for 13th file: 26310680_0_51

In [None]:
# Disabled tl pipeline fragment (not executed): two compute-tf-idf stages that
# use `singleton` as the singleton column and write cos_*_tf_idf_score columns.
#     / compute-tf-idf \
#     --feature-file {class_count_f} \
#     --feature-name class_count \
#     --singleton-column singleton \
#     -o cos_class_count_tf_idf_score \
#     / compute-tf-idf \
#     --feature-file {property_count_f} \
#     --feature-name property_count \
#     --singleton-column singleton \
#     -o cos_property_count_tf_idf_score \

In [54]:
# Spot-check one final feature file: tf-idf scores (original and lof variants)
# for the rows with evaluation_label == 1.
fid = '9567241_0_5666388268510912770'
final_feature_df = pd.read_csv(f'{HOME_DIR}/final-features/{fid}.csv')
final_feature_df.loc[
    final_feature_df['evaluation_label'] == 1,
    [
        'property_count_tf_idf_score', 'lof_property_count_tf_idf_score',
        'class_count_tf_idf_score', 'lof_class_count_tf_idf_score',
    ],
]

Unnamed: 0,property_count_tf_idf_score,lof_property_count_tf_idf_score,class_count_tf_idf_score,lof_class_count_tf_idf_score
0,0.103461,0.324091,0.215613,0.560593
100,0.103461,0.324091,0.215613,0.560593
101,0.136712,0.428248,0.235488,0.612268
201,0.136712,0.428248,0.235488,0.612268
202,0.365876,0.711858,0.235488,0.612268
362,0.867718,0.807535,0.852363,0.698913
458,0.867718,0.807535,0.852363,0.698913
459,0.145132,0.454624,0.341726,0.888485
559,0.145132,0.454624,0.341726,0.888485
704,0.082228,0.18975,0.147975,0.296759


In [55]:
# Number of distinct (column, row) groups in the final feature file.
final_feature_df.groupby(['column', 'row']).ngroups

21

In [56]:
# Row counts: is_lof == 1, is_lof == 1 with evaluation_label == 1,
# singleton == 1, singleton == 1 with evaluation_label == 1.
lof_mask = final_feature_df['is_lof'] == 1
singleton_mask = final_feature_df['singleton'] == 1
correct_mask = final_feature_df['evaluation_label'] == 1
(
    len(final_feature_df[lof_mask]),
    len(final_feature_df[lof_mask & correct_mask]),
    len(final_feature_df[singleton_mask]),
    len(final_feature_df[singleton_mask & correct_mask]),
)

(7, 6, 11, 8)

In [61]:
# min-max scaling on tfidf score
def _min_max_scale(scores, lowest, highest):
    """Linearly rescale ``scores`` so ``lowest`` maps to 0 and ``highest`` to 1.

    When ``highest == lowest`` (a constant column) the plain formula is 0/0 and
    yields NaN for every row; in that case the shifted values (all zeros) are
    returned instead.
    """
    score_range = highest - lowest
    if score_range == 0:
        return scores - lowest
    return (scores - lowest) / score_range


cos_class_tfidf_max = final_feature_df['class_count_tf_idf_score'].max()
cos_class_tfidf_min = final_feature_df['class_count_tf_idf_score'].min()
cos_property_tfidf_max = final_feature_df['property_count_tf_idf_score'].max()
cos_property_tfidf_min = final_feature_df['property_count_tf_idf_score'].min()
lof_class_tfidf_max = final_feature_df['lof_class_count_tf_idf_score'].max()
lof_class_tfidf_min = final_feature_df['lof_class_count_tf_idf_score'].min()
lof_property_tfidf_max = final_feature_df['lof_property_count_tf_idf_score'].max()
lof_property_tfidf_min = final_feature_df['lof_property_count_tf_idf_score'].min()
# Overwrites the four tf-idf columns of final_feature_df with their scaled values.
final_feature_df['class_count_tf_idf_score'] = _min_max_scale(final_feature_df['class_count_tf_idf_score'], cos_class_tfidf_min, cos_class_tfidf_max)
final_feature_df['property_count_tf_idf_score'] = _min_max_scale(final_feature_df['property_count_tf_idf_score'], cos_property_tfidf_min, cos_property_tfidf_max)
final_feature_df['lof_class_count_tf_idf_score'] = _min_max_scale(final_feature_df['lof_class_count_tf_idf_score'], lof_class_tfidf_min, lof_class_tfidf_max)
final_feature_df['lof_property_count_tf_idf_score'] = _min_max_scale(final_feature_df['lof_property_count_tf_idf_score'], lof_property_tfidf_min, lof_property_tfidf_max)
final_feature_df

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
0,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q1318465,Dainik Jagran,,fuzzy-augmented,newspaper,4.597725e-09,39.010326,Q1318465,Dainik Jagran,1,1.000000,0.0,1.000000,0.773640,0,0.250000,13,2,0.252959,0.119233,0.878575,-1,1.000000,0.630953,0.401334
1,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q20984008,Dainik Asam,,fuzzy-augmented,,3.539613e-09,20.664627,Q1318465,Dainik Jagran,-1,0.805556,0.0,0.902098,0.681579,0,0.041667,11,2,0.110739,0.037485,0.745750,-1,0.052632,0.276216,0.126171
2,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q15240907,Kutton,,fuzzy-augmented,"village in Azad Kashmir, Pakistan",4.099829e-09,20.076506,Q1318465,Dainik Jagran,-1,0.444444,0.0,0.414530,0.537725,0,0.021277,6,1,0.046999,0.010682,0.525195,-1,0.021739,0.111370,0.035954
3,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q55614827,Jagran Prakashan Limited,Jagran Prakashan,fuzzy-augmented,Indian newspaper publisher and mass media company,1.493805e-08,20.076506,Q1318465,Dainik Jagran,-1,0.748898,0.0,0.566239,0.729553,0,0.062500,24,3,0.161634,0.087082,0.760178,-1,0.066667,0.403162,0.207393
4,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q14632383,Jagran,,fuzzy-augmented,village in Indonesia,7.991804e-09,18.884026,Q1318465,Dainik Jagran,-1,0.888889,0.0,0.495726,0.569860,0,0.024390,6,1,0.046999,0.010372,0.514971,-1,0.020833,0.111370,0.012742
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,1,9,Dinamalar,10|3.500,Dinamalar,Q69536690,Allahabad Bank Ssm College Dinanagar branch,,fuzzy-augmented,Bank in India,3.539613e-09,8.612831,Q3246100,Dinamalar,-1,0.675110,0.0,0.429156,0.621177,0,0.111111,43,6,0.043232,0.010279,0.624439,-1,0.142857,0.101973,0.012742
2060,1,9,Dinamalar,10|3.500,Dinamalar,Q35336,Djimini,Djimini Senoufo|Djimini language,fuzzy-augmented,language,3.183702e-07,6.524282,Q3246100,Dinamalar,-1,0.588624,0.0,0.588624,0.485838,0,0.052632,7,1,0.013840,0.004998,0.504227,-1,0.058824,0.007505,0.016824
2061,1,9,Dinamalar,10|3.500,Dinamalar,Q97448743,,,fuzzy-augmented,,3.539613e-09,5.136879,Q3246100,Dinamalar,-1,0.000000,0.0,0.000000,0.659981,0,0.200000,0,0,0.209321,0.000026,0.620545,-1,0.125000,0.164232,0.000087
2062,1,9,Dinamalar,10|3.500,Dinamalar,Q5277834,Dina Azar,,fuzzy-augmented,Lebanese model,3.539613e-09,15.405266,Q3246100,Dinamalar,-1,0.780093,0.0,0.911111,0.273327,0,0.047619,9,2,0.016704,0.003346,0.254219,-1,0.047619,0.041665,0.011263


In [66]:
# tf-idf columns for the rows with evaluation_label == 1.
final_feature_df.loc[
    final_feature_df['evaluation_label'] == 1,
    [
        'kg_id', 'method',
        'class_count_tf_idf_score', 'property_count_tf_idf_score',
        'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score',
    ],
]

Unnamed: 0,kg_id,method,class_count_tf_idf_score,property_count_tf_idf_score,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
0,Q1318465,fuzzy-augmented,0.252959,0.119233,0.630953,0.401334
100,Q1318465,exact-match,0.252959,0.119233,0.630953,0.401334
101,Q1872524,fuzzy-augmented,0.276277,0.157553,0.689114,0.530315
201,Q1872524,exact-match,0.276277,0.157553,0.689114,0.530315
202,Q164746,fuzzy-augmented,0.276277,0.421653,0.689114,0.88152
362,Q9684,fuzzy-augmented,1.0,1.0,0.786635,1.0
458,Q9684,exact-match,1.0,1.0,0.786635,1.0
459,Q2129720,fuzzy-augmented,0.400916,0.167257,1.0,0.562977
559,Q2129720,exact-match,0.400916,0.167257,1.0,0.562977
704,Q1023924,fuzzy-augmented,0.173605,0.094764,0.334006,0.234974


In [62]:
# Rows with evaluation_label == 1 whose lof property tf-idf score is higher
# than the original property tf-idf score.
final_feature_df.loc[
    final_feature_df['evaluation_label'].eq(1)
    & final_feature_df['property_count_tf_idf_score'].lt(final_feature_df['lof_property_count_tf_idf_score'])
]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
0,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q1318465,Dainik Jagran,,fuzzy-augmented,newspaper,4.597725e-09,39.010326,Q1318465,Dainik Jagran,1,1.0,0.0,1.0,0.77364,0,0.25,13,2,0.252959,0.119233,0.878575,-1,1.0,0.630953,0.401334
100,1,0,Dainik Jagran,1|15.400,Dainik Jagran,Q1318465,Dainik Jagran,,exact-match,newspaper,4.597725e-09,21.693314,Q1318465,Dainik Jagran,1,1.0,0.0,1.0,0.77364,1,0.2,13,2,0.252959,0.119233,0.878575,1,0.5,0.630953,0.401334
101,1,1,Dainik Bhaskar,2|14.000,Dainik Bhaskar,Q1872524,Dainik Bhaskar,,fuzzy-augmented,Hindi Newspaper,3.539613e-09,30.801334,Q1872524,Dainik Bhaskar,1,1.0,0.0,1.0,0.730027,0,0.090909,14,2,0.276277,0.157553,0.821949,-1,0.2,0.689114,0.530315
201,1,1,Dainik Bhaskar,2|14.000,Dainik Bhaskar,Q1872524,Dainik Bhaskar,,exact-match,Hindi Newspaper,3.539613e-09,21.693314,Q1872524,Dainik Bhaskar,1,1.0,0.0,1.0,0.730027,1,0.083333,14,2,0.276277,0.157553,0.821949,1,0.166667,0.689114,0.530315
202,1,10,WALL STREET JOURNAL USA,11|3.400,WALL STREET JOURNAL USA,Q164746,The Wall Street Journal,Wall Street Journal|WSJ|The Journal,fuzzy-augmented,American daily newspaper,6.512986e-08,29.858479,Q164746,The Wall Street Journal,1,0.899306,0.0,0.746377,0.747756,0,0.333333,23,4,0.276277,0.421653,0.638817,-1,0.090909,0.689114,0.88152
459,1,13,Gujarat Samachar,14|3.000,Gujarat Samachar,Q2129720,Gujarat Samachar,Lok Prakashan Limited,fuzzy-augmented,Gujarati language daily newspaper in India,3.94256e-09,29.747366,Q2129720,Gujarat Samachar,1,1.0,0.0,1.0,0.768198,0,0.5,16,2,0.400916,0.167257,0.81908,-1,1.0,1.0,0.562977
559,1,13,Gujarat Samachar,14|3.000,Gujarat Samachar,Q2129720,Gujarat Samachar,Lok Prakashan Limited,exact-match,Gujarati language daily newspaper in India,3.94256e-09,21.693314,Q2129720,Gujarat Samachar,1,1.0,0.0,1.0,0.768198,1,0.333333,16,2,0.400916,0.167257,0.81908,1,0.5,1.0,0.562977
704,1,15,IBN live,16|2.800,IBN live,Q1023924,CNN-News18,CNN-IBN|CNNNews18,fuzzy-augmented,Indian English-language news television channel,4.834308e-08,38.062836,Q1023924,CNN-News18,1,0.470833,0.0,0.483333,0.688494,0,0.5,10,1,0.173605,0.094764,0.677521,-1,0.5,0.334006,0.234974
723,1,16,USA Today,17|2.525,USA Today,Q39681,USA Today,usatoday,fuzzy-augmented,American national daily newspaper,4.171684e-08,23.440456,Q39681,USA Today,1,1.0,0.0,1.0,0.718286,0,0.25,9,2,0.276277,0.351597,0.652207,-1,0.142857,0.689114,0.934676
828,1,16,USA Today,17|2.525,USA Today,Q39681,USA Today,usatoday,exact-match,American national daily newspaper,4.171684e-08,20.303715,Q39681,USA Today,1,1.0,0.0,1.0,0.718286,0,0.2,9,2,0.276277,0.351597,0.652207,-1,0.125,0.689114,0.934676


In [63]:
# evaluation_label of the ten rows with the highest class_count_tf_idf_score.
final_feature_df.sort_values('class_count_tf_idf_score', ascending=False)['evaluation_label'].iloc[:10]

362     1
458     1
940    -1
1751   -1
1750   -1
1260    0
1447    0
1940    1
1687   -1
1919    1
Name: evaluation_label, dtype: int64

In [64]:
# evaluation_label of the ten rows with the highest lof_class_count_tf_idf_score.
final_feature_df.sort_values('lof_class_count_tf_idf_score', ascending=False)['evaluation_label'].iloc[:10]

559     1
459     1
458     1
362     1
1821   -1
1127    0
529    -1
286    -1
835    -1
723     1
Name: evaluation_label, dtype: int64

In [60]:
# Largest class_count_tf_idf_score currently held in final_feature_df.
final_feature_df.loc[:, 'class_count_tf_idf_score'].max()

0.8523632229755529

In [48]:
# Disabled inspection (not executed): cos_* vs lof_* tf-idf score columns for
# rows with evaluation_label == 1.
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_property_count_tf_idf_score', 'lof_property_count_tf_idf_score',
#     'cos_class_count_tf_idf_score', 'lof_class_count_tf_idf_score'
# ]]

In [49]:
# Disabled inspection (not executed): *_tf and *_idf columns of the class-count
# scores for rows with evaluation_label == 1.
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_class_count_tf_idf_score_tf', 'cos_class_count_tf_idf_score_idf',
#     'lof_class_count_tf_idf_score_tf', 'lof_class_count_tf_idf_score_idf'
# ]]

In [50]:
# Disabled inspection (not executed): *_tf and *_idf columns of the
# property-count scores for rows with evaluation_label == 1.
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_property_count_tf_idf_score_tf', 'cos_property_count_tf_idf_score_idf',
#     'lof_property_count_tf_idf_score_tf', 'lof_property_count_tf_idf_score_idf'
# ]]

### Evaluation of lof-graph-embedding-score
- Baseline: `graph-embedding-score` (centroid-of-singleton), compared against `lof-graph-embedding-score` (centroid-of-lof)

In [38]:
# use top 1/5 accuracy
def _top_k_hits(group, score_column):
    """Rank one (column, row) group by ``score_column`` (descending) and return
    ``(top1_hit, top5_hit)`` as 0/1 integers, where a hit means a row with
    ``evaluation_label == 1`` is first / among the first five."""
    eval_labels = group.sort_values(by=[score_column], ascending=False)['evaluation_label']
    return int(eval_labels.iloc[0] == 1), int(1 in eval_labels.iloc[:5].values)


def embedding_eval(eval_file):
    """Compare top-1 / top-5 accuracy of the two embedding scores.

    Parameters
    ----------
    eval_file : pandas.DataFrame
        Must contain the columns ``column``, ``row``, ``evaluation_label``,
        ``graph-embedding-score`` and ``lof-graph-embedding-score``.

    Returns
    -------
    dict
        ``cos_top1_accuracy`` / ``cos_top5_accuracy`` (ranking by
        ``graph-embedding-score``), ``lof_top1_accuracy`` /
        ``lof_top5_accuracy`` (ranking by ``lof-graph-embedding-score``) and
        ``all_count``, the number of (column, row) groups. With no groups the
        accuracies are reported as 0.0 instead of raising ZeroDivisionError.

    Raises
    ------
    AssertionError
        If either score column is missing.
    """
    assert "graph-embedding-score" in eval_file
    assert "lof-graph-embedding-score" in eval_file

    cos_top1_count = 0
    cos_top5_count = 0
    lof_top1_count = 0
    lof_top5_count = 0
    all_count = 0

    for _, group in eval_file.groupby(['column', 'row']):
        all_count += 1

        # centroid-of-singleton embedding score
        top1, top5 = _top_k_hits(group, 'graph-embedding-score')
        cos_top1_count += top1
        cos_top5_count += top5

        # centroid-of-lof embedding score
        top1, top5 = _top_k_hits(group, 'lof-graph-embedding-score')
        lof_top1_count += top1
        lof_top5_count += top5

    def _accuracy(hits):
        # Guard the empty-input case (no groups at all).
        return hits / all_count if all_count else 0.0

    return {
        'cos_top1_accuracy': _accuracy(cos_top1_count),
        'cos_top5_accuracy': _accuracy(cos_top5_count),
        'lof_top1_accuracy': _accuracy(lof_top1_count),
        'lof_top5_accuracy': _accuracy(lof_top5_count),
        'all_count': all_count
    }

In [39]:
# Evaluate every merged file and collect the accuracy dict per file id.
res_top_accuracy = {}
for fid in file_ids:
    merged_csv = f"{HOME_DIR}/merged-lof-score/{fid}.csv"
    final_df = pd.read_csv(merged_csv)
    res_top_accuracy[fid] = embedding_eval(final_df)
res_top_accuracy

{'69537082_0_7789694313271016902': {'cos_top1_accuracy': 0.4560669456066946,
  'cos_top5_accuracy': 0.7154811715481172,
  'lof_top1_accuracy': 0.8493723849372385,
  'lof_top5_accuracy': 0.9414225941422594,
  'all_count': 239},
 '60319454_0_3938426910282115527': {'cos_top1_accuracy': 0.2978723404255319,
  'cos_top5_accuracy': 0.5957446808510638,
  'lof_top1_accuracy': 0.46808510638297873,
  'lof_top5_accuracy': 0.6595744680851063,
  'all_count': 47},
 '16767252_0_2409448375013995751': {'cos_top1_accuracy': 0.5681818181818182,
  'cos_top5_accuracy': 0.8295454545454546,
  'lof_top1_accuracy': 0.6477272727272727,
  'lof_top5_accuracy': 0.8636363636363636,
  'all_count': 88},
 '84548468_0_5955155464119382182': {'cos_top1_accuracy': 0.48484848484848486,
  'cos_top5_accuracy': 0.7878787878787878,
  'lof_top1_accuracy': 0.5252525252525253,
  'lof_top5_accuracy': 0.8181818181818182,
  'all_count': 99},
 '80588006_0_6965325215443683359': {'cos_top1_accuracy': 0.19047619047619047,
  'cos_top5_acc

In [40]:
# One row per file id; then count the files where the lof score is worse than
# the baseline at top-1 and at top-5, alongside the total number of files.
top_accuracy_df = pd.DataFrame(res_top_accuracy).T
(
    len(top_accuracy_df[top_accuracy_df['lof_top1_accuracy'].lt(top_accuracy_df['cos_top1_accuracy'])]),
    len(top_accuracy_df[top_accuracy_df['lof_top5_accuracy'].lt(top_accuracy_df['cos_top5_accuracy'])]),
    len(top_accuracy_df),
)

(12, 10, 44)

In [41]:
# visualize embedding-score difference
def highlight_greaterthan_1(x):
    """Row-wise styling callback.

    Returns one CSS string per element of ``x``: a yellow background when the
    row's ``lof_top1_accuracy`` is lower than its ``cos_top1_accuracy``, white
    otherwise. The list length follows ``len(x)`` rather than a fixed 5, so the
    callback keeps working when the table has a different number of columns.
    """
    colour = 'yellow' if x.lof_top1_accuracy < x.cos_top1_accuracy else 'white'
    return [f'background-color: {colour}'] * len(x)
    
# Apply the highlighter to each row of the accuracy table.
top_accuracy_df.style.apply(highlight_greaterthan_1, axis="columns")

Unnamed: 0,cos_top1_accuracy,cos_top5_accuracy,lof_top1_accuracy,lof_top5_accuracy,all_count
69537082_0_7789694313271016902,0.456067,0.715481,0.849372,0.941423,239.0
60319454_0_3938426910282115527,0.297872,0.595745,0.468085,0.659574,47.0
16767252_0_2409448375013995751,0.568182,0.829545,0.647727,0.863636,88.0
84548468_0_5955155464119382182,0.484848,0.787879,0.525253,0.818182,99.0
80588006_0_6965325215443683359,0.190476,0.52381,0.333333,0.47619,21.0
39650055_5_7135804139753401681,0.34,0.66,0.31,0.66,100.0
8468806_0_4382447409703007384,0.245455,0.536364,0.318182,0.581818,110.0
1438042989043_35_20150728002309-00287-ip-10-236-191-2_875026214_2,0.0,0.0,0.0,0.0,18.0
25404227_0_2240631045609013057,0.44,0.85,0.55,0.9,100.0
63450419_0_8012592961815711786,0.0,0.020305,0.005076,0.020305,197.0
