In [65]:
import pandas as pd
import numpy as np
import os

# Show frames in full when inspecting results: no limit on displayed rows,
# columns, or the width of a cell's text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [6]:
# Root directory that holds the inputs read by this notebook and every output
# folder it writes. Set the TABLE_LINKER_HOME_DIR environment variable to run
# on another machine; the original author's path is kept as the fallback.
HOME_DIR = os.environ.get(
    'TABLE_LINKER_HOME_DIR',
    '/Users/summ7t/dev/novartis/table-linker/t2dv2-candidates-april-28/dev',
)

### Generate lof-graph-embedding-score for any table

Required datasets
- candidate file
- candidate feature file
- graph_embedding_complex.tsv (generated and stored during candidate generation)

Script used: `lof-script.sh`

```
filename=$1
tsv_postfix=_graph_embedding_complex

tl smallest-qnode-number train-candidates/candidates-$filename.csv \
/ align-page-rank \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard \
/ vote-by-classifier --prob-threshold 0.995 --model weighted_lr.pkl \
> model-voted/$filename.csv

tl score-using-embedding model-voted/$filename.csv \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o graph-embedding-score \
--embedding-file train-graph-embeddings/$filename$tsv_postfix.tsv \
--embedding-url http://ckg07:9200/wikidatadwd-augmented/ \
> lof-score/$filename.csv
```

cmd: `bash {HOME_DIR}/lof-script.sh {fid}`

output: `lof-score/$filename.csv`, which contains the `is_lof` and `graph-embedding-score` (centroid-of-lof) columns

In [6]:
# Create the output folders written by the cells below; `-p` makes this a
# no-op when a folder already exists.
!mkdir -p $HOME_DIR/model-voted
!mkdir -p $HOME_DIR/lof-score
!mkdir -p $HOME_DIR/merged-lof-score
!mkdir -p $HOME_DIR/final-features

In [3]:
# List every non-empty candidate CSV under dev-candidates/ and derive its table id.
# Candidate files are named `candidates-<table id>.csv`; anything else is skipped
# instead of raising IndexError when the id is extracted.
file_names = []
file_ids = []

for dirpath, _dirnames, filenames in os.walk(f'{HOME_DIR}/dev-candidates/'):
    for fn in filenames:
        if not (fn.startswith('candidates-') and fn.endswith('.csv')):
            continue
        # os.walk only keeps the trailing slash on the root directory, so plain
        # string concatenation would build wrong paths for files in sub-folders.
        abs_fn = os.path.join(dirpath, fn)
        assert os.path.isfile(abs_fn)
        # Empty candidate files are skipped.
        if os.path.getsize(abs_fn) == 0:
            continue
        file_names.append(abs_fn)
        file_ids.append(fn[len('candidates-'):-len('.csv')])
len(file_names), file_ids[:3]

(9,
 ['14380604_4_3329235705746762392',
  '29414811_2_4773219892816395776',
  '84575189_0_6365692015941409487'])

In [12]:
# Run lof-script.sh once per table id and verify its two output files.
# NOTE(review): the script listing above uses relative paths, so this presumably
# has to run with HOME_DIR as the working directory — confirm.
for idx, fid in enumerate(file_ids):
    print(f"Generating score for {idx}th file: {fid}...")
    status = os.system(f'bash {HOME_DIR}/lof-script.sh {fid}')
    # os.system never raises on a failing command, so check the exit status explicitly.
    assert status == 0, f"lof-script.sh exited with status {status}: {idx}th file: {fid}"
    # The script writes its results with `>` redirection, which leaves an empty
    # file behind even when the command fails, so require non-empty outputs.
    for out_dir in ('model-voted', 'lof-score'):
        out_f = f'{HOME_DIR}/{out_dir}/{fid}.csv'
        assert os.path.isfile(out_f) and os.path.getsize(out_f) > 0, \
            f"Something wrong with {out_dir} result: {idx}th file: {fid}"

Generating score for 0th file: 14380604_4_3329235705746762392...
Generating score for 1th file: 29414811_2_4773219892816395776...
Generating score for 2th file: 84575189_0_6365692015941409487...
Generating score for 3th file: 52299421_0_4473286348258170200...
Generating score for 4th file: 28086084_0_3127660530989916727...
Generating score for 5th file: 39759273_0_1427898308030295194...
Generating score for 6th file: 45073662_0_3179937335063201739...
Generating score for 7th file: 14067031_0_559833072073397908...
Generating score for 8th file: 50270082_0_444360818941411589...


In [13]:
# Inspect the candidates of one table that received a positive classifier vote.
fid = '14380604_4_3329235705746762392'
model_voted_df = pd.read_csv(f'{HOME_DIR}/model-voted/{fid}.csv')
model_voted_df.loc[model_voted_df['vote_by_classifier'].gt(0)]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier
405,1,4,JPMorgan Chase & Co.,"5|USA|Banking|99.30|14.44|1,351.52|170.97",JPMorgan Chase Co.,Q192314,JPMorgan Chase,Bank of the Manhattan Company|JPMorgan Chase &...,fuzzy-augmented,American multinational banking and financial s...,1.038304e-07,34.48898,1,0.0,0.751111,0.125,0.208333,1
574,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q8774,International Airlines Group,International Consolidated Airlines Group SA|I...,fuzzy-augmented,British-Spanish multinational airline holding ...,4.579354e-08,15.741833,1,0.0,0.894444,0.0,0.0,1
606,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,exact-match,American multinational oil and gas corporation,7.353359e-08,21.693314,1,7.353359e-08,1.0,0.5,1.0,1
607,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,fuzzy-augmented,American multinational oil and gas corporation,7.353359e-08,21.561049,1,0.0,1.0,0.5,0.833333,1
1124,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,1.013292e-07,18.212986,0,1.013292e-07,1.0,0.5,1.0,1
1152,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,1.013292e-07,16.17667,0,0.0,1.0,0.5,0.833333,1
2028,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial se...,4.450382e-08,20.621593,1,4.450382e-08,1.0,0.142857,0.285714,1
2031,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial se...,4.450382e-08,18.98323,1,0.0,1.0,0.142857,0.238095,1


In [14]:
# Show the ten highest-scoring rows of one table's lof-score file.
fid = '14380604_4_3329235705746762392'
score_df = pd.read_csv(f'{HOME_DIR}/lof-score/{fid}.csv')
score_df.sort_values(by='graph-embedding-score', ascending=False).iloc[:10]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier,singleton,is_lof,graph-embedding-score
171,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,...,18.212986,0,1.013292e-07,1.0,0.5,1.0,1,0,1,0.885915
199,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,...,16.17667,0,0.0,1.0,0.5,0.833333,1,0,1,0.885915
1075,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial se...,...,20.621593,1,4.450382e-08,1.0,0.142857,0.285714,1,0,1,0.863573
1078,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial se...,...,18.98323,1,0.0,1.0,0.142857,0.238095,1,0,1,0.863573
1898,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,fuzzy-augmented,American multinational oil and gas corporation,...,21.561049,1,0.0,1.0,0.5,0.833333,1,0,1,0.843475
1897,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,exact-match,American multinational oil and gas corporation,...,21.693314,1,7.353359e-08,1.0,0.5,1.0,1,1,1,0.843475
0,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,exact-match,American investment bank and financial service...,...,21.693314,0,8.027813e-08,1.0,0.0,0.0,0,1,1,0.811528
1,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,fuzzy-augmented,American investment bank and financial service...,...,19.26316,0,0.0,1.0,0.0,0.0,0,0,-1,0.811528
1696,1,4,JPMorgan Chase & Co.,"5|USA|Banking|99.30|14.44|1,351.52|170.97",JPMorgan Chase Co.,Q192314,JPMorgan Chase,Bank of the Manhattan Company|JPMorgan Chase &...,fuzzy-augmented,American multinational banking and financial s...,...,34.48898,1,0.0,0.751111,0.125,0.208333,1,0,-1,0.792447
1797,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q212235,American International Group,"AIG|American International Group, Inc.",fuzzy-augmented,American multinational insurance corporation,...,28.054087,0,0.0,1.0,0.25,0.416667,0,0,-1,0.789462


In [None]:
# Attach the centroid-of-lof score (renamed to lof-graph-embedding-score) and the
# is_lof flag from each lof-score file to the matching candidate feature file.
merge_keys = ['column', 'row', 'kg_id', 'method']
for idx, fid in enumerate(file_ids):
    print(f"Merging embedding score for {idx}th file: {fid}...")
    merged_f = f"{HOME_DIR}/merged-lof-score/{fid}.csv"
    features_df = pd.read_csv(f'{HOME_DIR}/dev-features/{fid}.csv')
    lof_score_df = pd.read_csv(f'{HOME_DIR}/lof-score/{fid}.csv').rename(
        columns={'graph-embedding-score': 'lof-graph-embedding-score'}
    )
    trimmed_lof_score_df = lof_score_df[merge_keys + ['lof-graph-embedding-score', 'is_lof']]

    # Inner join on column, row, kg_id and method; exact duplicate rows are dropped afterwards.
    final_df = pd.merge(features_df, trimmed_lof_score_df, on=merge_keys).drop_duplicates()
    # The merged frame must have exactly as many rows as the feature file.
    assert len(final_df) == len(features_df), f"{len(features_df)}, {len(final_df)}"

    final_df.to_csv(merged_f, index=False)
    assert os.path.isfile(merged_f), f"Something wrong with merged score result: {idx}th file: {fid}"

In [27]:
# Inspect the rows flagged is_lof == 1 in one merged feature file.
fid = '14380604_4_3329235705746762392'
merged_score_df = pd.read_csv(f'{HOME_DIR}/merged-lof-score/{fid}.csv')
# Alternative view:
# merged_score_df.sort_values(by=['lof-graph-embedding-score'], ascending=False).head(10)
merged_score_df.loc[merged_score_df['is_lof'].eq(1)]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof
57,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,exact-match,American investment bank and financial services corporation,8.027813e-08,21.693314,Q219508,Citigroup,1,1.0,0.0,1.0,0.84192,1,0.5,9,1,0.445131,0.575829,0.811528,1
176,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,1.013292e-07,16.17667,Q152057,bp,1,1.0,0.5,1.0,0.74522,0,1.0,2,1,0.349661,0.531144,0.885915,1
311,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,1.013292e-07,18.212986,Q152057,bp,1,1.0,0.5,1.0,0.74522,0,0.5,2,1,0.349661,0.531144,0.885915,1
1075,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial services company,4.450382e-08,18.98323,Q245343,Barclays,1,1.0,0.142857,1.0,0.770165,0,1.0,8,1,0.598094,0.43189,0.863573,1
1220,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial services company,4.450382e-08,20.621593,Q245343,Barclays,1,1.0,0.142857,1.0,0.770165,0,0.5,8,1,0.598094,0.43189,0.863573,1
1896,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q212235,American International Group,"AIG|American International Group, Inc.",exact-match,American multinational insurance corporation,1.831754e-08,21.693314,Q212235,American International Group,1,1.0,0.25,1.0,0.814121,1,0.5,28,3,0.349661,0.335018,0.789462,1
1897,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobil Corporation,fuzzy-augmented,American multinational oil and gas corporation,7.353359e-08,21.561049,Q156238,ExxonMobil,1,1.0,0.5,1.0,0.82417,0,1.0,10,1,0.620813,0.550966,0.843475,1
1997,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobil Corporation,exact-match,American multinational oil and gas corporation,7.353359e-08,21.693314,Q156238,ExxonMobil,1,1.0,0.5,1.0,0.82417,1,0.5,10,1,0.620813,0.550966,0.843475,1


In [53]:
# Generate lof-reciprocal-rank plus the class/property tf-idf scores that use
# is_lof as the singleton column, writing one final-features CSV per table.
for idx, fid in enumerate(file_ids):
    print(f"generating final feature for {idx}th file: {fid}")
    class_count_f = f'{HOME_DIR}/dev-class-count/{fid}_class_count.tsv'
    property_count_f = f'{HOME_DIR}/dev-prop-count/{fid}_prop_count.tsv'
    merged_lof_f = f'{HOME_DIR}/merged-lof-score/{fid}.csv'
    final_features_f = f'{HOME_DIR}/final-features/{fid}.csv'
    script = f"""
    tl generate-reciprocal-rank {merged_lof_f} \
    -c lof-graph-embedding-score \
    -o lof-reciprocal-rank \
    / compute-tf-idf \
    --feature-file {class_count_f} \
    --feature-name class_count \
    --singleton-column is_lof \
    -o lof_class_count_tf_idf_score \
    / compute-tf-idf \
    --feature-file {property_count_f} \
    --feature-name property_count \
    --singleton-column is_lof \
    -o lof_property_count_tf_idf_score \
    > {final_features_f}
    """
    status = os.system(script)
    # os.system never raises on a failing command, so check the exit status explicitly.
    assert status == 0, f"tl pipeline exited with status {status}: {idx}th file: {fid}"
    # `>` creates the output file even when the command fails, so an existence
    # check alone cannot detect a failure; require a non-empty file.
    assert os.path.isfile(final_features_f) and os.path.getsize(final_features_f) > 0, \
        f"Something wrong with final feature result: {idx}th file: {fid}"

generating final feature for 0th file: 14380604_4_3329235705746762392
generating final feature for 1th file: 29414811_2_4773219892816395776
generating final feature for 2th file: 84575189_0_6365692015941409487
generating final feature for 3th file: 52299421_0_4473286348258170200
generating final feature for 4th file: 28086084_0_3127660530989916727
generating final feature for 5th file: 39759273_0_1427898308030295194
generating final feature for 6th file: 45073662_0_3179937335063201739
generating final feature for 7th file: 14067031_0_559833072073397908
generating final feature for 8th file: 50270082_0_444360818941411589


In [None]:
#     / compute-tf-idf \
#     --feature-file {class_count_f} \
#     --feature-name class_count \
#     --singleton-column singleton \
#     -o cos_class_count_tf_idf_score \
#     / compute-tf-idf \
#     --feature-file {property_count_f} \
#     --feature-name property_count \
#     --singleton-column singleton \
#     -o cos_property_count_tf_idf_score \

In [69]:
# Compare the original and LOF-based tf-idf scores of the correct candidates of one table.
fid = '28086084_0_3127660530989916727'
final_feature_df = pd.read_csv(f'{HOME_DIR}/final-features/{fid}.csv')
final_feature_df.loc[
    final_feature_df['evaluation_label'] == 1,
    [
        'property_count_tf_idf_score', 'lof_property_count_tf_idf_score',
        'class_count_tf_idf_score', 'lof_class_count_tf_idf_score'
    ],
]

Unnamed: 0,property_count_tf_idf_score,lof_property_count_tf_idf_score,class_count_tf_idf_score,lof_class_count_tf_idf_score
6,0.016894,0.018686,0.092391,0.1147
101,0.095265,0.098831,0.118282,0.150839
204,0.076145,0.073464,0.093663,0.106514
303,0.076145,0.073464,0.093663,0.106514
304,0.023844,0.026374,0.074322,0.069656
404,0.023844,0.026374,0.074322,0.069656
405,0.103333,0.114299,0.188452,0.233957
505,0.103333,0.114299,0.188452,0.233957
509,0.248082,0.274409,0.230741,0.286457
607,0.115195,0.127419,0.193542,0.240276


In [70]:
# Number of distinct (column, row) cells in the table.
final_feature_df.groupby(['column', 'row']).ngroups

224

In [71]:
# Counts: is_lof rows, correct is_lof rows, singleton rows, correct singleton rows.
lof_rows = final_feature_df['is_lof'] == 1
singleton_rows = final_feature_df['singleton'] == 1
correct_rows = final_feature_df['evaluation_label'] == 1
(
    int(lof_rows.sum()),
    int((lof_rows & correct_rows).sum()),
    int(singleton_rows.sum()),
    int((singleton_rows & correct_rows).sum()),
)

(89, 84, 111, 104)

In [72]:
# Min-max scaling on the tf-idf scores: each column of final_feature_df listed
# below is overwritten with (value - min) / (max - min).
tf_idf_score_columns = [
    'class_count_tf_idf_score',
    'property_count_tf_idf_score',
    'lof_class_count_tf_idf_score',
    'lof_property_count_tf_idf_score',
]
for score_column in tf_idf_score_columns:
    column_min = final_feature_df[score_column].min()
    column_max = final_feature_df[score_column].max()
    value_range = column_max - column_min
    # A constant column has a zero range; dividing by it would turn every value
    # into NaN, so divide by 1 instead, which maps the column to 0.0.
    scale = value_range if value_range != 0 else 1.0
    final_feature_df[score_column] = (final_feature_df[score_column] - column_min) / scale
# final_feature_df

In [73]:
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'kg_id', 'method', 'class_count_tf_idf_score', 'property_count_tf_idf_score', 'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score'
# ]]

In [76]:
# tf-idf scores of the correct candidates, with their kg_id and retrieval method.
correct_candidates = final_feature_df['evaluation_label'] == 1
final_feature_df.loc[
    correct_candidates,
    [
        'kg_id', 'method',
        'class_count_tf_idf_score', 'property_count_tf_idf_score',
        'lof_class_count_tf_idf_score', 'lof_property_count_tf_idf_score',
    ],
]

Unnamed: 0,kg_id,method,class_count_tf_idf_score,property_count_tf_idf_score,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
6,Q1292819,fuzzy-augmented,0.304966,0.032547,0.29146,0.029453
101,Q77394,fuzzy-augmented,0.390429,0.183536,0.383292,0.155774
204,Q132148,fuzzy-augmented,0.309165,0.1467,0.270659,0.115792
303,Q132148,exact-match,0.309165,0.1467,0.270659,0.115792
304,Q2469151,fuzzy-augmented,0.245325,0.045938,0.177,0.04157
404,Q2469151,exact-match,0.245325,0.045938,0.177,0.04157
405,Q172377,fuzzy-augmented,0.622049,0.199081,0.594501,0.180155
505,Q172377,exact-match,0.622049,0.199081,0.594501,0.180155
509,Q133063,fuzzy-augmented,0.761638,0.477952,0.727907,0.432515
607,Q160931,fuzzy-augmented,0.638851,0.221932,0.610559,0.200834


In [57]:
# Ten candidates with the highest class_count_tf_idf_score.
final_feature_df.sort_values(by=['class_count_tf_idf_score'], ascending=False).head(10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
7838,2,74,Genes & Development,75|93,Genes Development,Q112067,"Genes, Brain and Behavior",G2B|Genes Brain Behav.,fuzzy-augmented,Scientific journal,3.539613e-09,15.829847,Q1524533,Genes & Development,-1,0.501199,0.0,0.642679,0.830504,0,0.166667,25,4,0.922595,0.182474,0.811784,-1,0.333333,0.76351,0.136446
8108,2,76,Antimicrobial Agents and Chemotherapy,77|91,Antimicrobial Agents and Chemotherapy,Q578004,Antimicrobial Agents and Chemotherapy,Antimicrobial agents and chemotherapy|Antimicrob Agents Chemother (Bethesda)|Antimicrob Agents Chemother|Antimicrob. Agents Chemother.,exact-match,scientific journal,3.539613e-09,21.693314,Q578004,Antimicrobial Agents and Chemotherapy,1,1.0,0.0,1.0,0.825053,1,0.111111,37,4,0.905342,0.218164,0.776517,1,0.166667,0.749232,0.163133
8008,2,76,Antimicrobial Agents and Chemotherapy,77|91,Antimicrobial Agents and Chemotherapy,Q578004,Antimicrobial Agents and Chemotherapy,Antimicrobial agents and chemotherapy|Antimicrob Agents Chemother (Bethesda)|Antimicrob Agents Chemother|Antimicrob. Agents Chemother.,fuzzy-augmented,scientific journal,3.539613e-09,50.660416,Q578004,Antimicrobial Agents and Chemotherapy,1,1.0,0.0,1.0,0.825053,0,0.125,37,4,0.905342,0.218164,0.776517,-1,0.2,0.749232,0.163133
3709,2,39,Journal of Molecular Biology,40|162,Journal of Molecular Biology,Q1524048,Journal of Leukocyte Biology,J. Leukoc. Biol.|J Leukoc Biol,fuzzy-augmented,scientific journal,4.445894e-09,22.634863,Q925779,Journal of Molecular Biology,-1,0.894874,0.0,0.909524,0.829085,0,0.033333,28,4,0.899549,0.193976,0.749735,-1,0.030303,0.744438,0.145046
7551,2,71,The Journal of Pharmacology And Experimental Therapeutics,72|98,The Journal of Pharmacology And Experimental Therapeutics,Q176044,British Journal of Clinical Pharmacology,Br. J. Clin. Pharmacol.|Br J Clin Pharmacol.,fuzzy-augmented,scientific journal,4.492359e-09,23.825903,Q1500272,Journal of Pharmacology and Experimental Therapeutics,-1,0.739087,0.0,0.630885,0.775199,0,0.016129,40,5,0.899549,0.208187,0.685951,-1,0.014286,0.744438,0.155673
9311,2,84,The Journal of Clinical Endocrinology & Metabolism,85|85,The Journal of Clinical Endocrinology Metabolism,Q15745208,European Journal of Clinical Investigation,,fuzzy-augmented,journal,3.539613e-09,21.430408,Q3186902,The Journal of Clinical Endocrinology and Metabolism,-1,0.710816,0.0,0.713492,0.872266,0,0.045455,42,5,0.899549,0.15667,0.799611,-1,0.071429,0.744438,0.117151
7222,2,69,Human Immunology,70|102,Human Immunology,Q13731918,Immunology & Cell Biology,Immunology and Cell Biology,fuzzy-augmented,scientific journal,3.539613e-09,15.896554,Q15709955,Human Immunology,-1,0.607937,0.0,0.557652,0.897541,0,0.333333,25,4,0.899549,0.13752,0.833978,-1,0.5,0.744438,0.102831
3213,2,36,The Journal of Bacteriology,37|165,The Journal of Bacteriology,Q400296,The Journal of Pathology,Journal of Pathology,fuzzy-augmented,peer-reviewed scientific journal,3.539613e-09,26.566677,Q478419,Journal of Bacteriology,-1,0.946759,0.0,0.946296,0.868306,0,0.142857,24,4,0.899549,0.194082,0.801616,-1,0.333333,0.744438,0.145125
9248,2,84,The Journal of Clinical Endocrinology & Metabolism,85|85,The Journal of Clinical Endocrinology Metabolism,Q3186904,Journal of Clinical Investigation,The journal of clinical investigation|J. Clin. Invest.|J Clin Invest,fuzzy-augmented,peer-reviewed scientific journal,4.542504e-09,24.363552,Q3186902,The Journal of Clinical Endocrinology and Metabolism,-1,0.736237,0.0,0.648066,0.839285,0,0.02381,33,4,0.899549,0.215837,0.767367,-1,0.027778,0.744438,0.161394
2464,2,29,The American Journal of Clinical Nutrition,30|185,The American Journal of Clinical Nutrition,Q2842959,The American Journal of Medicine,American Journal of Medicine,fuzzy-augmented,journal,3.631214e-09,22.533007,Q7713500,The American Journal of Clinical Nutrition,-1,0.905886,0.0,0.894048,0.806349,0,0.017241,32,5,0.899549,0.208783,0.719791,-1,0.016667,0.744438,0.156119


In [58]:
# Ten candidates with the highest lof_class_count_tf_idf_score.
final_feature_df.sort_values(by=['lof_class_count_tf_idf_score'], ascending=False).head(10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,monge_elkan,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,is_lof,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
7838,2,74,Genes & Development,75|93,Genes Development,Q112067,"Genes, Brain and Behavior",G2B|Genes Brain Behav.,fuzzy-augmented,Scientific journal,3.539613e-09,15.829847,Q1524533,Genes & Development,-1,0.501199,0.0,0.642679,0.830504,0,0.166667,25,4,0.922595,0.182474,0.811784,-1,0.333333,0.76351,0.136446
8008,2,76,Antimicrobial Agents and Chemotherapy,77|91,Antimicrobial Agents and Chemotherapy,Q578004,Antimicrobial Agents and Chemotherapy,Antimicrobial agents and chemotherapy|Antimicrob Agents Chemother (Bethesda)|Antimicrob Agents Chemother|Antimicrob. Agents Chemother.,fuzzy-augmented,scientific journal,3.539613e-09,50.660416,Q578004,Antimicrobial Agents and Chemotherapy,1,1.0,0.0,1.0,0.825053,0,0.125,37,4,0.905342,0.218164,0.776517,-1,0.2,0.749232,0.163133
8108,2,76,Antimicrobial Agents and Chemotherapy,77|91,Antimicrobial Agents and Chemotherapy,Q578004,Antimicrobial Agents and Chemotherapy,Antimicrobial agents and chemotherapy|Antimicrob Agents Chemother (Bethesda)|Antimicrob Agents Chemother|Antimicrob. Agents Chemother.,exact-match,scientific journal,3.539613e-09,21.693314,Q578004,Antimicrobial Agents and Chemotherapy,1,1.0,0.0,1.0,0.825053,1,0.111111,37,4,0.905342,0.218164,0.776517,1,0.166667,0.749232,0.163133
9317,2,84,The Journal of Clinical Endocrinology & Metabolism,85|85,The Journal of Clinical Endocrinology Metabolism,Q176044,British Journal of Clinical Pharmacology,Br. J. Clin. Pharmacol.|Br J Clin Pharmacol.,fuzzy-augmented,scientific journal,4.492359e-09,21.430408,Q3186902,The Journal of Clinical Endocrinology and Metabolism,-1,0.710099,0.0,0.666717,0.775199,0,0.013699,40,5,0.899549,0.208187,0.685951,-1,0.012821,0.744438,0.155673
9248,2,84,The Journal of Clinical Endocrinology & Metabolism,85|85,The Journal of Clinical Endocrinology Metabolism,Q3186904,Journal of Clinical Investigation,The journal of clinical investigation|J. Clin. Invest.|J Clin Invest,fuzzy-augmented,peer-reviewed scientific journal,4.542504e-09,24.363552,Q3186902,The Journal of Clinical Endocrinology and Metabolism,-1,0.736237,0.0,0.648066,0.839285,0,0.02381,33,4,0.899549,0.215837,0.767367,-1,0.027778,0.744438,0.161394
5067,2,50,Biochimica et Biophysica Acta,51|128,Biochimica et Biophysica Acta,Q4676737,Acta Obstetricia et Gynecologica Scandinavica,,fuzzy-augmented,peer-reviewed scientific journal,3.539613e-09,21.003864,Q864239,Biochimica et Biophysica Acta,-1,0.78851,0.0,0.597655,0.802834,0,0.034483,45,5,0.899549,0.171279,0.727653,-1,0.055556,0.744438,0.128075
7668,2,72,Journal of Clinical Microbiology,73|96,Journal of Clinical Microbiology,Q176044,British Journal of Clinical Pharmacology,Br. J. Clin. Pharmacol.|Br J Clin Pharmacol.,fuzzy-augmented,scientific journal,4.492359e-09,21.430408,Q4041880,Journal of Clinical Microbiology,-1,0.88244,0.0,0.740278,0.775199,0,0.013514,40,5,0.899549,0.208187,0.685951,-1,0.012195,0.744438,0.155673
7662,2,72,Journal of Clinical Microbiology,73|96,Journal of Clinical Microbiology,Q15745208,European Journal of Clinical Investigation,,fuzzy-augmented,journal,3.539613e-09,21.430408,Q4041880,Journal of Clinical Microbiology,-1,0.865871,0.0,0.66949,0.872266,0,0.043478,42,5,0.899549,0.15667,0.799611,-1,0.0625,0.744438,0.117151
3213,2,36,The Journal of Bacteriology,37|165,The Journal of Bacteriology,Q400296,The Journal of Pathology,Journal of Pathology,fuzzy-augmented,peer-reviewed scientific journal,3.539613e-09,26.566677,Q478419,Journal of Bacteriology,-1,0.946759,0.0,0.946296,0.868306,0,0.142857,24,4,0.899549,0.194082,0.801616,-1,0.333333,0.744438,0.145125
4719,2,48,The Journal of Clinical Investigation,49|129,The Journal of Clinical Investigation,Q15745208,European Journal of Clinical Investigation,,fuzzy-augmented,journal,3.539613e-09,31.789219,Q3186904,Journal of Clinical Investigation,-1,0.911651,0.0,0.774045,0.872266,0,0.043478,42,5,0.899549,0.15667,0.799611,-1,0.0625,0.744438,0.117151


In [48]:
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_property_count_tf_idf_score', 'lof_property_count_tf_idf_score',
#     'cos_class_count_tf_idf_score', 'lof_class_count_tf_idf_score'
# ]]

In [49]:
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_class_count_tf_idf_score_tf', 'cos_class_count_tf_idf_score_idf',
#     'lof_class_count_tf_idf_score_tf', 'lof_class_count_tf_idf_score_idf'
# ]]

In [50]:
# final_feature_df[final_feature_df['evaluation_label'] == 1].loc[:, [
#     'cos_property_count_tf_idf_score_tf', 'cos_property_count_tf_idf_score_idf',
#     'lof_property_count_tf_idf_score_tf', 'lof_property_count_tf_idf_score_idf'
# ]]

### Evaluation of lof-graph-embedding-score
- baseline: graph-embedding-score (centroid-of-singleton)

In [14]:
# use top 1/5 accuracy
def embedding_eval(eval_file):
    """Compare top-1 / top-5 accuracy of the two embedding scores.

    Candidates are grouped by table cell (``column``, ``row``). Within each
    cell they are ranked by score, highest first: once by
    ``graph-embedding-score`` (centroid-of-singleton, reported as ``cos_*``)
    and once by ``lof-graph-embedding-score`` (centroid-of-lof, reported as
    ``lof_*``). A cell is a top-k hit when a candidate with
    ``evaluation_label == 1`` is among its first k ranked candidates.

    Args:
        eval_file: pandas DataFrame with the columns ``column``, ``row``,
            ``evaluation_label``, ``graph-embedding-score`` and
            ``lof-graph-embedding-score``.

    Returns:
        dict with ``cos_top1_accuracy``, ``cos_top5_accuracy``,
        ``lof_top1_accuracy`` and ``lof_top5_accuracy`` (hits divided by the
        number of cells) plus ``all_count`` (the number of cells). When the
        frame has no cells, every accuracy is 0.0.

    Raises:
        AssertionError: if one of the two score columns is missing.
    """
    score_columns = {
        'cos': 'graph-embedding-score',
        'lof': 'lof-graph-embedding-score',
    }
    for score_column in score_columns.values():
        assert score_column in eval_file, f"missing column: {score_column}"

    cutoffs = (1, 5)
    hits = {(prefix, k): 0 for prefix in score_columns for k in cutoffs}
    all_count = 0

    for _, group in eval_file.groupby(['column', 'row']):
        all_count += 1
        for prefix, score_column in score_columns.items():
            # Labels of this cell's candidates, best score first.
            eval_labels = group.sort_values(by=[score_column], ascending=False)['evaluation_label']
            for k in cutoffs:
                if 1 in eval_labels.iloc[:k].values:
                    hits[(prefix, k)] += 1

    # Avoid ZeroDivisionError on a frame without any cell.
    denominator = all_count if all_count else 1
    result = {
        f'{prefix}_top{k}_accuracy': hits[(prefix, k)] / denominator
        for prefix in score_columns
        for k in cutoffs
    }
    result['all_count'] = all_count
    return result

In [15]:
# Evaluate both embedding scores on every merged-lof-score file;
# the results are keyed by table id.
res_top_accuracy = {}
for fid in file_ids:
    final_df = pd.read_csv(f"{HOME_DIR}/merged-lof-score/{fid}.csv")
    res_top_accuracy[fid] = embedding_eval(final_df)
res_top_accuracy

{'14380604_4_3329235705746762392': {'cos_top1_accuracy': 0.7,
  'cos_top5_accuracy': 0.9,
  'lof_top1_accuracy': 0.75,
  'lof_top5_accuracy': 0.9,
  'all_count': 20},
 '29414811_2_4773219892816395776': {'cos_top1_accuracy': 0.13636363636363635,
  'cos_top5_accuracy': 0.5,
  'lof_top1_accuracy': 0.045454545454545456,
  'lof_top5_accuracy': 0.36363636363636365,
  'all_count': 22},
 '84575189_0_6365692015941409487': {'cos_top1_accuracy': 0.11,
  'cos_top5_accuracy': 0.23,
  'lof_top1_accuracy': 0.15,
  'lof_top5_accuracy': 0.37,
  'all_count': 100},
 '52299421_0_4473286348258170200': {'cos_top1_accuracy': 0.0,
  'cos_top5_accuracy': 0.0,
  'lof_top1_accuracy': 0.8586956521739131,
  'lof_top5_accuracy': 0.8804347826086957,
  'all_count': 92},
 '28086084_0_3127660530989916727': {'cos_top1_accuracy': 0.23660714285714285,
  'cos_top5_accuracy': 0.5223214285714286,
  'lof_top1_accuracy': 0.27232142857142855,
  'lof_top5_accuracy': 0.5535714285714286,
  'all_count': 224},
 '39759273_0_142789830

In [16]:
# One row per table; then count the tables where the LOF score is worse than
# the baseline at top-1 and at top-5, next to the total number of tables.
top_accuracy_df = pd.DataFrame(res_top_accuracy).T
lof_worse_top1 = top_accuracy_df['lof_top1_accuracy'] < top_accuracy_df['cos_top1_accuracy']
lof_worse_top5 = top_accuracy_df['lof_top5_accuracy'] < top_accuracy_df['cos_top5_accuracy']
int(lof_worse_top1.sum()), int(lof_worse_top5.sum()), len(top_accuracy_df)

(2, 2, 9)

In [17]:
# visualize embedding-score difference
def highlight_greaterthan_1(x):
    """Row styler: mark rows whose LOF top-1 accuracy is below the baseline.

    Args:
        x: one row of the accuracy table; must provide ``lof_top1_accuracy``
            and ``cos_top1_accuracy``.

    Returns:
        A list with one CSS string per cell of the row: a yellow background
        when ``lof_top1_accuracy < cos_top1_accuracy``, white otherwise.
    """
    color = 'yellow' if x.lof_top1_accuracy < x.cos_top1_accuracy else 'white'
    # One style per cell of the row, so the function keeps working when the
    # table gains or loses columns.
    return [f'background-color: {color}'] * len(x)
    
# Render the accuracy table, applying the highlight function to each row (axis=1).
top_accuracy_df.style.apply(highlight_greaterthan_1, axis=1)

Unnamed: 0,cos_top1_accuracy,cos_top5_accuracy,lof_top1_accuracy,lof_top5_accuracy,all_count
14380604_4_3329235705746762392,0.7,0.9,0.75,0.9,20.0
29414811_2_4773219892816395776,0.136364,0.5,0.045455,0.363636,22.0
84575189_0_6365692015941409487,0.11,0.23,0.15,0.37,100.0
52299421_0_4473286348258170200,0.0,0.0,0.858696,0.880435,92.0
28086084_0_3127660530989916727,0.236607,0.522321,0.272321,0.553571,224.0
39759273_0_1427898308030295194,0.58,0.93,0.65,0.96,100.0
45073662_0_3179937335063201739,0.407407,0.888889,0.444444,0.888889,27.0
14067031_0_559833072073397908,0.886792,0.943396,0.943396,0.943396,53.0
50270082_0_444360818941411589,0.547619,0.922619,0.529762,0.916667,168.0
