In [4]:
import pandas as pd
import numpy as np
import os

In [5]:
# Root directory holding this split's candidate/feature files and all generated outputs.
# Set the TABLE_LINKER_HOME_DIR environment variable to run the notebook on another
# machine without editing it; the fallback is the original author's local path.
HOME_DIR = os.environ.get(
    'TABLE_LINKER_HOME_DIR',
    '/Users/summ7t/dev/novartis/table-linker/t2dv2-candidates-april-28/dev',
)

### Generate lof-graph-embedding-score for any table

Required datasets
- candidate file
- candidate feature file
- graph_embedding_complex.tsv (generated and stored during candidate generation)

Script used: `lof-script.sh`

```
filename=$1
tsv_postfix=_graph_embedding_complex

tl smallest-qnode-number train-candidates/candidates-$filename.csv \
/ align-page-rank \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard \
/ vote-by-classifier --prob-threshold 0.995 --model weighted_lr.pkl \
> model-voted/$filename.csv

tl score-using-embedding model-voted/$filename.csv \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o graph-embedding-score \
--embedding-file train-graph-embeddings/$filename$tsv_postfix.tsv \
--embedding-url http://ckg07:9200/wikidatadwd-augmented/ \
> lof-score/$filename.csv
```

Command: `bash {HOME_DIR}/lof-script.sh {fid}`

Output: `lof-score/$filename.csv`, which contains the `is_lof` and `graph-embedding-score` (centroid-of-lof) columns.

In [6]:
# Create every output directory used by the later cells. os.makedirs is used instead of
# `!mkdir -p` so that paths containing spaces work and no shell is required.
for output_subdir in ('model-voted', 'lof-score', 'merged-lof-score', 'final-features'):
    os.makedirs(os.path.join(HOME_DIR, output_subdir), exist_ok=True)

In [10]:
# list all non-empty candidate files (candidates-<file id>.csv) in the candidates dir
candidate_prefix = 'candidates-'
candidate_suffix = '.csv'

file_names = []
file_ids = []

for (dirpath, dirnames, filenames) in os.walk(f'{HOME_DIR}/dev-candidates/'):
    for fn in filenames:
        # Skip anything that is not a candidate table; a stray csv without the prefix
        # would otherwise break the file-id extraction below.
        if not (fn.startswith(candidate_prefix) and fn.endswith(candidate_suffix)):
            continue
        # os.walk reports sub-directories without a trailing separator, so join
        # explicitly instead of concatenating strings.
        abs_fn = os.path.join(dirpath, fn)
        assert os.path.isfile(abs_fn), f"Not a regular file: {abs_fn}"
        # empty candidate files carry no rows to score
        if os.path.getsize(abs_fn) == 0:
            continue
        file_names.append(abs_fn)
        # the file id is the name with the prefix and the extension stripped
        file_ids.append(fn[len(candidate_prefix):-len(candidate_suffix)])
len(file_names), file_ids[:3]

(9,
 ['14380604_4_3329235705746762392',
  '29414811_2_4773219892816395776',
  '84575189_0_6365692015941409487'])

In [12]:
# Run lof-script.sh once per table and verify that it actually produced both outputs.
for idx, fid in enumerate(file_ids):
    print(f"Generating score for {idx}th file: {fid}...")
    # os.system returns the command's exit status; anything non-zero means failure.
    exit_status = os.system(f'bash {HOME_DIR}/lof-script.sh {fid}')
    assert exit_status == 0, f"lof-script.sh exited with status {exit_status}: {idx}th file: {fid}"
    for output_dir in ('model-voted', 'lof-score'):
        output_f = f'{HOME_DIR}/{output_dir}/{fid}.csv'
        # A `>` redirect (as in the documented script) creates the file even when the
        # command fails, so existence alone is not enough: require non-empty output.
        assert os.path.isfile(output_f) and os.path.getsize(output_f) > 0, \
            f"Something wrong with {output_dir} result: {idx}th file: {fid}"

Generating score for 0th file: 14380604_4_3329235705746762392...
Generating score for 1th file: 29414811_2_4773219892816395776...
Generating score for 2th file: 84575189_0_6365692015941409487...
Generating score for 3th file: 52299421_0_4473286348258170200...
Generating score for 4th file: 28086084_0_3127660530989916727...
Generating score for 5th file: 39759273_0_1427898308030295194...
Generating score for 6th file: 45073662_0_3179937335063201739...
Generating score for 7th file: 14067031_0_559833072073397908...
Generating score for 8th file: 50270082_0_444360818941411589...


In [13]:
# Spot-check one table: show only the candidates that received a classifier vote.
fid = '14380604_4_3329235705746762392'
model_voted_path = os.path.join(HOME_DIR, 'model-voted', fid + '.csv')
model_voted_df = pd.read_csv(model_voted_path)
has_vote = model_voted_df['vote_by_classifier'].gt(0)
model_voted_df.loc[has_vote]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier
405,1,4,JPMorgan Chase & Co.,"5|USA|Banking|99.30|14.44|1,351.52|170.97",JPMorgan Chase Co.,Q192314,JPMorgan Chase,Bank of the Manhattan Company|JPMorgan Chase &...,fuzzy-augmented,American multinational banking and financial s...,1.038304e-07,34.48898,1,0.0,0.751111,0.125,0.208333,1
574,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q8774,International Airlines Group,International Consolidated Airlines Group SA|I...,fuzzy-augmented,British-Spanish multinational airline holding ...,4.579354e-08,15.741833,1,0.0,0.894444,0.0,0.0,1
606,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,exact-match,American multinational oil and gas corporation,7.353359e-08,21.693314,1,7.353359e-08,1.0,0.5,1.0,1
607,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,fuzzy-augmented,American multinational oil and gas corporation,7.353359e-08,21.561049,1,0.0,1.0,0.5,0.833333,1
1124,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,1.013292e-07,18.212986,0,1.013292e-07,1.0,0.5,1.0,1
1152,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,1.013292e-07,16.17667,0,0.0,1.0,0.5,0.833333,1
2028,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial se...,4.450382e-08,20.621593,1,4.450382e-08,1.0,0.142857,0.285714,1
2031,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial se...,4.450382e-08,18.98323,1,0.0,1.0,0.142857,0.238095,1


In [14]:
# Spot-check one table: the ten candidates with the highest centroid-of-lof score.
fid = '14380604_4_3329235705746762392'
lof_score_path = os.path.join(HOME_DIR, 'lof-score', fid + '.csv')
score_df = pd.read_csv(lof_score_path)
score_ranked_df = score_df.sort_values(by=['graph-embedding-score'], ascending=False)
score_ranked_df.head(10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,smallest_qnode_number,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,vote_by_classifier,singleton,is_lof,graph-embedding-score
171,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,...,18.212986,0,1.013292e-07,1.0,0.5,1.0,1,0,1,0.885915
199,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,...,16.17667,0,0.0,1.0,0.5,0.833333,1,0,1,0.885915
1075,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial se...,...,20.621593,1,4.450382e-08,1.0,0.142857,0.285714,1,0,1,0.863573
1078,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial se...,...,18.98323,1,0.0,1.0,0.142857,0.238095,1,0,1,0.863573
1898,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,fuzzy-augmented,American multinational oil and gas corporation,...,21.561049,1,0.0,1.0,0.5,0.833333,1,0,1,0.843475
1897,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,exact-match,American multinational oil and gas corporation,...,21.693314,1,7.353359e-08,1.0,0.5,1.0,1,1,1,0.843475
0,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,exact-match,American investment bank and financial service...,...,21.693314,0,8.027813e-08,1.0,0.0,0.0,0,1,1,0.811528
1,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,fuzzy-augmented,American investment bank and financial service...,...,19.26316,0,0.0,1.0,0.0,0.0,0,0,-1,0.811528
1696,1,4,JPMorgan Chase & Co.,"5|USA|Banking|99.30|14.44|1,351.52|170.97",JPMorgan Chase Co.,Q192314,JPMorgan Chase,Bank of the Manhattan Company|JPMorgan Chase &...,fuzzy-augmented,American multinational banking and financial s...,...,34.48898,1,0.0,0.751111,0.125,0.208333,1,0,-1,0.792447
1797,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q212235,American International Group,"AIG|American International Group, Inc.",fuzzy-augmented,American multinational insurance corporation,...,28.054087,0,0.0,1.0,0.25,0.416667,0,0,-1,0.789462


In [15]:
# merge lof candidate (graph-embedding-score) with candidate feature file
merge_keys = ['column', 'row', 'kg_id']
for idx, fid in enumerate(file_ids):
    print(f"Merging embedding score for {idx}th file: {fid}...")
    train_features_df = pd.read_csv(f'{HOME_DIR}/dev-features/{fid}.csv')
    lof_score_df = pd.read_csv(f'{HOME_DIR}/lof-score/{fid}.csv')

    # Keep only the merge keys plus the score, renamed so it does not collide with the
    # baseline graph-embedding-score column already present in the feature file.
    # The same (column, row, kg_id) can occur several times (e.g. once per retrieval
    # method), so collapse exact duplicates before merging.
    trimmed_lof_score_df = (
        lof_score_df
        .rename(columns={'graph-embedding-score': 'lof-graph-embedding-score'})
        .loc[:, merge_keys + ['lof-graph-embedding-score']]
        .drop_duplicates()
    )
    # After collapsing, a key that still repeats carries conflicting scores.
    assert not trimmed_lof_score_df.duplicated(subset=merge_keys).any(), \
        f"Conflicting lof scores for the same (column, row, kg_id): {idx}th file: {fid}"

    # Left merge keeps every feature row exactly once; `validate` enforces the unique
    # right side and `indicator` exposes feature rows that found no lof score.
    final_df = pd.merge(
        train_features_df,
        trimmed_lof_score_df,
        on=merge_keys,
        how='left',
        validate='many_to_one',
        indicator=True,
    )
    unmatched_count = int((final_df['_merge'] != 'both').sum())
    assert unmatched_count == 0, \
        f"{unmatched_count} feature rows have no lof score: {idx}th file: {fid}"
    final_df = final_df.drop(columns='_merge')
    assert len(final_df) == len(train_features_df), f"{len(train_features_df)}, {len(final_df)}"

    final_df.to_csv(f"{HOME_DIR}/merged-lof-score/{fid}.csv", index=False)
    assert os.path.isfile(f'{HOME_DIR}/merged-lof-score/{fid}.csv'), f"Something wrong with merged score result: {idx}th file: {fid}"

Merging embedding score for 0th file: 14380604_4_3329235705746762392...
Merging embedding score for 1th file: 29414811_2_4773219892816395776...
Merging embedding score for 2th file: 84575189_0_6365692015941409487...
Merging embedding score for 3th file: 52299421_0_4473286348258170200...
Merging embedding score for 4th file: 28086084_0_3127660530989916727...
Merging embedding score for 5th file: 39759273_0_1427898308030295194...
Merging embedding score for 6th file: 45073662_0_3179937335063201739...
Merging embedding score for 7th file: 14067031_0_559833072073397908...
Merging embedding score for 8th file: 50270082_0_444360818941411589...


In [16]:
# Spot-check one merged feature file: the ten rows with the highest lof embedding score.
fid = '14380604_4_3329235705746762392'
merged_score_path = os.path.join(HOME_DIR, 'merged-lof-score', fid + '.csv')
merged_score_df = pd.read_csv(merged_score_path)
merged_ranked_df = merged_score_df.sort_values(by=['lof-graph-embedding-score'], ascending=False)
merged_ranked_df.head(10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score
177,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",fuzzy-augmented,British multinational oil and gas company,...,0.5,1.0,0.74522,0,1.0,2,1,0.349661,0.531144,0.885915
178,1,10,BP,11|UK|Oil and gas|265.91|22.29|217.60|198.14,BP,Q152057,BP,"BP P.L.C.|BP plc,|British Petroleum",exact-match,British multinational oil and gas company,...,0.5,1.0,0.74522,0,0.5,2,1,0.349661,0.531144,0.885915
1075,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,fuzzy-augmented,British multinational banking and financial se...,...,0.142857,1.0,0.770165,0,1.0,8,1,0.598094,0.43189,0.863573
1076,1,17,Barclays,"18|UK|Banking|67.71|8.95|1,949.17|94.79",Barclays,Q245343,Barclays,Barclays (United Kingdom)|Barclays PLC,exact-match,British multinational banking and financial se...,...,0.142857,1.0,0.770165,0,0.5,8,1,0.598094,0.43189,0.863573
1898,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,exact-match,American multinational oil and gas corporation,...,0.5,1.0,0.82417,1,0.5,10,1,0.620813,0.550966,0.843475
1897,1,6,ExxonMobil,7|USA|Oil and gas|335.09|39.50|223.95|410.65,ExxonMobil,Q156238,ExxonMobil,Exxon|Exxon Mobil Corp.|Exxon Mobil|Exxon Mobi...,fuzzy-augmented,American multinational oil and gas corporation,...,0.5,1.0,0.82417,0,1.0,10,1,0.620813,0.550966,0.843475
0,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,fuzzy-augmented,American investment bank and financial service...,...,0.0,1.0,0.84192,0,1.0,9,1,0.445131,0.575829,0.811528
1,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,exact-match,American investment bank and financial service...,...,0.0,1.0,0.84192,1,0.5,9,1,0.445131,0.575829,0.811528
1695,1,4,JPMorgan Chase & Co.,"5|USA|Banking|99.30|14.44|1,351.52|170.97",JPMorgan Chase Co.,Q192314,JPMorgan Chase,Bank of the Manhattan Company|JPMorgan Chase &...,fuzzy-augmented,American multinational banking and financial s...,...,0.125,0.94,0.840305,0,1.0,14,2,0.598094,0.494966,0.792447
1797,1,5,American International Group,6|USA|Insurance|113.19|14.01|979.41|174.47,American International Group,Q212235,American International Group,"AIG|American International Group, Inc.",exact-match,American multinational insurance corporation,...,0.25,1.0,0.814121,1,0.5,28,3,0.349661,0.335018,0.789462


In [17]:
# Generate lof-reciprocal-rank
for idx, fid in enumerate(file_ids):
    print(f"generating final feature for {idx}th file: {fid}")
    merged_lof_f = f'{HOME_DIR}/merged-lof-score/{fid}.csv'
    final_features_f = f'{HOME_DIR}/final-features/{fid}.csv'
    script = (
        f"tl generate-reciprocal-rank {merged_lof_f} "
        "-c lof-graph-embedding-score "
        "-o lof-reciprocal-rank "
        f"> {final_features_f}"
    )
    # os.system returns the command's exit status; anything non-zero means failure.
    exit_status = os.system(script)
    assert exit_status == 0, \
        f"tl generate-reciprocal-rank exited with status {exit_status}: {idx}th file: {fid}"
    # The `>` redirect creates the output file even when tl fails, so an existence
    # check alone can never fail here: also require the file to be non-empty.
    assert os.path.isfile(final_features_f) and os.path.getsize(final_features_f) > 0, \
        f"Something wrong with final feature result: {idx}th file: {fid}"

generating final feature for 0th file: 14380604_4_3329235705746762392
generating final feature for 1th file: 29414811_2_4773219892816395776
generating final feature for 2th file: 84575189_0_6365692015941409487
generating final feature for 3th file: 52299421_0_4473286348258170200
generating final feature for 4th file: 28086084_0_3127660530989916727
generating final feature for 5th file: 39759273_0_1427898308030295194
generating final feature for 6th file: 45073662_0_3179937335063201739
generating final feature for 7th file: 14067031_0_559833072073397908
generating final feature for 8th file: 50270082_0_444360818941411589


In [18]:
# Spot-check one final feature file (now including the lof-reciprocal-rank column).
fid = '14380604_4_3329235705746762392'
final_feature_path = os.path.join(HOME_DIR, 'final-features', fid + '.csv')
final_feature_df = pd.read_csv(final_feature_path)
final_feature_df

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,jaro_winkler,graph-embedding-score,singleton,reciprocal_rank,num_char,num_tokens,class_count_tf_idf_score,property_count_tf_idf_score,lof-graph-embedding-score,lof-reciprocal-rank
0,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,fuzzy-augmented,American investment bank and financial service...,...,1.000000,0.841920,0,1.000000,9,1,0.445131,0.575829,0.811528,1.000000
1,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q219508,Citigroup,City Bank of New York|Citigroup Inc.|Citi,exact-match,American investment bank and financial service...,...,1.000000,0.841920,1,0.500000,9,1,0.445131,0.575829,0.811528,0.500000
2,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q781961,One Court Square,Citigroup Building,fuzzy-augmented,"office skyscraper in Queens, New York",...,0.481481,0.649274,0,0.058824,16,3,0.072752,0.034964,0.614354,0.055556
3,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q867663,Citigroup Centre,Canary Wharf DS5|33 Canada Square|Citigroup Ce...,fuzzy-augmented,"building complex in London, England",...,0.912500,0.645837,0,0.055556,16,2,0.036686,0.016251,0.662709,0.100000
4,1,0,Citigroup,"1|USA|Banking|146.56|21.54|1,884.32|247.42",Citigroup,Q5122510,Citigroup Global Markets Japan,,fuzzy-augmented,,...,0.860000,0.702050,0,0.125000,30,4,0.125893,0.051901,0.688678,0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2406,1,9,ING Group,10|Netherlands|Diversified Financials|153.44|9...,ING Group,Q375181,ingroups and outgroups,ingroup,fuzzy-augmented,sociological notions where in-group is a socia...,...,0.698653,0.564814,0,0.058824,22,3,0.065596,0.010566,0.494376,0.050000
2407,1,9,ING Group,10|Netherlands|Diversified Financials|153.44|9...,ING Group,Q3146595,ID Group,,fuzzy-augmented,French company producing toys and furniture fo...,...,0.852976,0.550410,0,0.050000,8,2,0.125893,0.015766,0.490349,0.047619
2408,1,9,ING Group,10|Netherlands|Diversified Financials|153.44|9...,ING Group,Q5392021,IC Group,,fuzzy-augmented,,...,0.852976,0.583275,0,0.071429,8,2,0.349661,0.125247,0.558739,0.076923
2409,1,9,ING Group,10|Netherlands|Diversified Financials|153.44|9...,ING Group,Q1653679,IHI Corporation,"Ishikawajima-Harima Heavy Industries Co., Ltd|...",fuzzy-augmented,manufacturing company in Japan,...,0.562963,0.670830,0,0.142857,15,2,0.349661,0.216225,0.653480,0.200000


### Evaluation of lof-graph-embedding-score
- baseline: graph-embedding-score (centroid-of-singleton)

In [19]:
# use top 1/5 accuracy
def _top_hits(group, score_col):
    """Return (top-1 hit, top-5 hit) for one group ranked by ``score_col`` descending."""
    ranked_labels = group.sort_values(by=[score_col], ascending=False)['evaluation_label']
    top1_hit = bool(ranked_labels.iloc[0] == 1)
    top5_hit = bool(1 in ranked_labels.iloc[:5].values)
    return top1_hit, top5_hit


def embedding_eval(eval_file):
    """Compare top-1 / top-5 accuracy of the two embedding scores.

    Rows are grouped by (column, row). Within each group the candidates are ranked
    once by ``graph-embedding-score`` ("cos") and once by
    ``lof-graph-embedding-score`` ("lof"). A group counts as a top-1 hit when the
    best-ranked candidate has ``evaluation_label == 1`` and as a top-5 hit when any
    of the five best-ranked candidates does.

    Parameters
    ----------
    eval_file : pandas.DataFrame
        Despite the name this is a data frame, not a path. It must contain the
        columns ``column``, ``row``, ``evaluation_label``, ``graph-embedding-score``
        and ``lof-graph-embedding-score``.

    Returns
    -------
    dict
        ``cos_top1_accuracy``, ``cos_top5_accuracy``, ``lof_top1_accuracy`` and
        ``lof_top5_accuracy`` (fractions of groups), plus ``all_count`` (number of
        groups). All accuracies are 0.0 when there are no groups.

    Raises
    ------
    AssertionError
        If a required column is missing.
    """
    required_cols = (
        'column', 'row', 'evaluation_label',
        'graph-embedding-score', 'lof-graph-embedding-score',
    )
    missing_cols = [col for col in required_cols if col not in eval_file]
    assert not missing_cols, f"missing required columns: {missing_cols}"

    score_cols = {
        'cos': 'graph-embedding-score',      # baseline: centroid-of-singleton
        'lof': 'lof-graph-embedding-score',  # centroid-of-lof
    }
    hit_counts = {f'{prefix}_top{k}': 0 for prefix in score_cols for k in (1, 5)}
    all_count = 0

    for _, group in eval_file.groupby(['column', 'row']):
        all_count += 1
        for prefix, score_col in score_cols.items():
            top1_hit, top5_hit = _top_hits(group, score_col)
            if top1_hit:
                hit_counts[f'{prefix}_top1'] += 1
            if top5_hit:
                hit_counts[f'{prefix}_top5'] += 1

    def _accuracy(hits):
        # an empty frame has no groups; report 0.0 instead of dividing by zero
        return hits / all_count if all_count else 0.0

    return {
        'cos_top1_accuracy': _accuracy(hit_counts['cos_top1']),
        'cos_top5_accuracy': _accuracy(hit_counts['cos_top5']),
        'lof_top1_accuracy': _accuracy(hit_counts['lof_top1']),
        'lof_top5_accuracy': _accuracy(hit_counts['lof_top5']),
        'all_count': all_count
    }

In [20]:
# Evaluate every table's merged feature file; results are keyed by file id.
res_top_accuracy = {
    fid: embedding_eval(pd.read_csv(f"{HOME_DIR}/merged-lof-score/{fid}.csv"))
    for fid in file_ids
}
res_top_accuracy

{'14380604_4_3329235705746762392': {'cos_top1_accuracy': 0.7,
  'cos_top5_accuracy': 0.9,
  'lof_top1_accuracy': 0.75,
  'lof_top5_accuracy': 0.9,
  'all_count': 20},
 '29414811_2_4773219892816395776': {'cos_top1_accuracy': 0.13636363636363635,
  'cos_top5_accuracy': 0.5,
  'lof_top1_accuracy': 0.045454545454545456,
  'lof_top5_accuracy': 0.36363636363636365,
  'all_count': 22},
 '84575189_0_6365692015941409487': {'cos_top1_accuracy': 0.11,
  'cos_top5_accuracy': 0.23,
  'lof_top1_accuracy': 0.15,
  'lof_top5_accuracy': 0.37,
  'all_count': 100},
 '52299421_0_4473286348258170200': {'cos_top1_accuracy': 0.0,
  'cos_top5_accuracy': 0.0,
  'lof_top1_accuracy': 0.8586956521739131,
  'lof_top5_accuracy': 0.8804347826086957,
  'all_count': 92},
 '28086084_0_3127660530989916727': {'cos_top1_accuracy': 0.23660714285714285,
  'cos_top5_accuracy': 0.5223214285714286,
  'lof_top1_accuracy': 0.27232142857142855,
  'lof_top5_accuracy': 0.5535714285714286,
  'all_count': 224},
 '39759273_0_142789830

In [21]:
# One row per table, one column per metric.
top_accuracy_df = pd.DataFrame(res_top_accuracy).T
# Tables where the lof score ranks worse than the baseline, at top-1 and at top-5.
lof_worse_top1 = top_accuracy_df['lof_top1_accuracy'] < top_accuracy_df['cos_top1_accuracy']
lof_worse_top5 = top_accuracy_df['lof_top5_accuracy'] < top_accuracy_df['cos_top5_accuracy']
(int(lof_worse_top1.sum()), int(lof_worse_top5.sum()), len(top_accuracy_df))

(2, 2, 9)

In [22]:
# visualize embedding-score difference
def highlight_greaterthan_1(x):
    """Return one CSS style per cell of row ``x`` for use with ``Styler.apply(axis=1)``.

    The whole row is highlighted yellow when the lof score does worse than the
    baseline at top-1 (``lof_top1_accuracy < cos_top1_accuracy``), white otherwise.
    The list length follows the row length, so the styler keeps working if the
    number of columns changes.
    """
    if x.lof_top1_accuracy < x.cos_top1_accuracy:
        row_style = 'background-color: yellow'
    else:
        row_style = 'background-color: white'
    return [row_style] * len(x)
    
# Render the accuracy table with the highlighter applied row by row (axis=1).
top_accuracy_df.style.apply(highlight_greaterthan_1, axis=1)

Unnamed: 0,cos_top1_accuracy,cos_top5_accuracy,lof_top1_accuracy,lof_top5_accuracy,all_count
14380604_4_3329235705746762392,0.7,0.9,0.75,0.9,20.0
29414811_2_4773219892816395776,0.136364,0.5,0.045455,0.363636,22.0
84575189_0_6365692015941409487,0.11,0.23,0.15,0.37,100.0
52299421_0_4473286348258170200,0.0,0.0,0.858696,0.880435,92.0
28086084_0_3127660530989916727,0.236607,0.522321,0.272321,0.553571,224.0
39759273_0_1427898308030295194,0.58,0.93,0.65,0.96,100.0
45073662_0_3179937335063201739,0.407407,0.888889,0.444444,0.888889,27.0
14067031_0_559833072073397908,0.886792,0.943396,0.943396,0.943396,53.0
50270082_0_444360818941411589,0.547619,0.922619,0.529762,0.916667,168.0
