In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil

# Put every pandas option back to its default so earlier display tweaks do not leak into
# this run; on this pandas version the call also prints deprecation notices for legacy
# options (visible in the cell output).
pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [2]:
# Directory that holds the input table; the final linked table is written here too.
# Set the TL_TABLE_DIR environment variable to point the notebook at another location
# without editing it. The original author's directory stays the default, so existing
# runs behave exactly as before. expanduser lets the override use a leading "~".
table = os.path.expanduser(os.environ.get(
    'TL_TABLE_DIR',
    '/Users/rijulvohra/Documents/work/Novartis-ISI/table_linker_pipeline/',
))
# File name of the CSV to link, resolved relative to `table`.
table_name = 'cricketers.csv'
# Column of that CSV handed to `tl canonicalize -c`.
wikify_column_name = "cricketers"

In [10]:
#intermediate files
# File names of the artefacts the pipeline stages write into temp_dir.
canonical = 'canonical.csv'
candidates = "candidates.csv"
feature_votes = "feature_votes.csv"
score_file = "scores.csv"
# Scratch directory inside the table directory (one join is enough).
temp_dir = os.path.join(table,'temp')
table_path = os.path.join(table,table_name)
# exist_ok keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(temp_dir, exist_ok=True)
# Merged embedding TSV: written by the candidate-generation cell and read by the
# embedding-scoring cell via --embedding-file.
embedding_file = os.path.join(temp_dir,'graph_embedding_complex.tsv')

## Canonicalize

In [4]:
canonical_file = os.path.join(temp_dir,canonical)
!tl canonicalize -c "$wikify_column_name" --add-context $table_path > $canonical_file

In [5]:
# Preview: load only the first 10 rows of the canonicalized table for display.
pd.read_csv(canonical_file, nrows = 10)

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88
5,0,5,Rohit Sharma,mumbai indians|159|30/04/1987
6,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90
7,0,7,Ravindra Jadeja,chennai super kings|132|6/12/88
8,0,8,Rishabh Pant,delhi capitals|136|4/8/97
9,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85


## Candidate Generation

In [20]:
%%time
candidates_file = os.path.join(temp_dir,candidates)
aux_field = 'graph_embedding_complex'
!tl clean -c label -o label_clean $canonical_file \
        / --url http://ckg06:9200 --index wikidatadwd-augmented-01 get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields $aux_field \
        --auxiliary-folder $temp_dir \
        / --url http://ckg06:9200 --index wikidatadwd-augmented-01 get-exact-matches \
        -c label_clean --auxiliary-fields $aux_field \
        --auxiliary-folder $temp_dir > $candidates_file
                
# Merge the auxiliary TSVs found in temp_dir into one de-duplicated file per field.
# The body uses the loop variable `field`; using the whole comma-separated `aux_field`
# string only happened to work while a single field was configured.
for field in aux_field.split(','):
    # Rename the value column per file, before concatenating. On a re-run the previously
    # merged '<field>.tsv' also matches the glob ('*' may match nothing) and already
    # carries an 'embedding' column; renaming first keeps all frames on the same two
    # columns instead of yielding a duplicated 'embedding' column.
    aux_frames = [
        pd.read_csv(aux_path, sep='\t', dtype=object).rename(columns={field: 'embedding'})
        for aux_path in glob.glob(f'{temp_dir}/*{field}.tsv')
    ]
    if not aux_frames:
        # Fail with an actionable message rather than pandas' "No objects to concatenate".
        raise FileNotFoundError(
            f"no auxiliary files matching '*{field}.tsv' found in {temp_dir}"
        )
    # Keep one row per qnode (the first one encountered).
    aux_df = pd.concat(aux_frames).drop_duplicates(subset=['qnode'])
    aux_df.to_csv(f'{temp_dir}/{field}.tsv', sep='\t', index=False)

CPU times: user 908 ms, sys: 321 ms, total: 1.23 s
Wall time: 25 s


In [21]:
# Preview: load only the first 10 rows of the candidate table for display.
pd.read_csv(candidates_file,nrows = 10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628


## Feature Voting

In [22]:
%%time
feature_votes_file = os.path.join(temp_dir,feature_votes)
!tl smallest-qnode-number $candidates_file \
    / string-similarity -i --method monge_elkan:tokenizer=word -o monge_elkan \
    / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
    / feature-voting -c "pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard" > $feature_votes_file

CPU times: user 54.7 ms, sys: 26.8 ms, total: 81.5 ms
Wall time: 3.79 s


In [23]:
# Preview: load only the first 10 rows of the feature-voting output for display.
pd.read_csv(feature_votes_file,nrows = 10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385,0,1.0,0.0,1
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,0,0.733333,0.0,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,0,0.5,0.0,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.5,0.0,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416,0,0.483333,0.0,0
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405,0,0.0,0.0,0
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748,0,0.5,0.0,0
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713,1,0.488889,0.0,1
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628,0,0.685185,0.0,0
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,0,0.733333,0.0,0


## Compute Embedding Score using Column Vector Strategy

In [24]:
%%time
score = os.path.join(temp_dir,score_file)
!tl score-using-embedding $feature_votes_file --column-vector-strategy centroid-of-singletons -o graph-embedding-score \
--embedding-file $embedding_file \
> $score

Qnodes to lookup: 1260
Qnodes from file: 1238
CPU times: user 19.5 ms, sys: 15.6 ms, total: 35.1 ms
Wall time: 1.21 s


In [25]:
# Preview: load only the first 10 rows of the embedding-scored table for display.
pd.read_csv(score,nrows = 10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes,graph-embedding-score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385,0,1.0,0.0,1,0.886983
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,0,0.733333,0.0,0,0.469873
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,0,0.5,0.0,0,0.586232
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.5,0.0,0,0.0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416,0,0.483333,0.0,0,0.513397
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405,0,0.0,0.0,0,0.215469
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748,0,0.5,0.0,0,0.354691
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713,1,0.488889,0.0,1,0.474148
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628,0,0.685185,0.0,0,0.557991
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,0,0.733333,0.0,0,0.826771


## Compute final score

In [26]:
final_score = os.path.join(temp_dir,'final_score.csv')
!tl normalize-scores -c graph-embedding-score -t zscore -o normalized-graph-embedding-score $score \
    / normalize-scores -c pagerank -t zscore -o normalized-pagerank \
    / normalize-scores -c monge_elkan -t zscore -o normalized-monge-elkan \
    / combine-linearly -w normalized-graph-embedding-score:1,normalized-pagerank:1,normalized-monge-elkan:1 \
     -o final_score > $final_score
df = pd.read_csv(final_score)
df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes,graph-embedding-score,normalized-graph-embedding-score,normalized-pagerank,normalized-monge-elkan,final_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,21.138376,0,1.0,0.0,1,0.886983,0.889363,-0.029806,0.166667,1.026224
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,36.39385,0,1.0,0.0,1,0.886983,1.896483,-0.113447,1.707388,3.490425
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,23.48463,0,0.733333,0.0,0,0.469873,-0.217603,-0.094306,-0.186569,-0.498478
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,23.48463,0,0.5,0.0,0,0.586232,0.372149,-0.115848,-1.843782,-1.587481
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,20.582134,0,0.5,0.0,0,0.0,-2.599118,-0.152836,-1.843782,-4.595735


## Get Top-5 KG Links

In [27]:
linked_table_name = "topk-" + table_name
output_path = os.path.join(temp_dir,linked_table_name)
!tl get-kg-links -c final_score -l label -k 5 $final_score > $output_path
pd.read_csv(output_path, nrows = 10)

Unnamed: 0,column,row,label,kg_id,kg_label,ranking_score
0,0,0,Virat Kohli,Q213854|Q4747495|Q84923882|Q7686953|Q65228712,Virat Kohli|Amolak Rathan Kohli|Parth Kohli|Ta...,3.49|1.73|1.67|1.45|1.45
1,0,1,Tendulkar,Q22327439|Q9488|Q7699668|Q27736339|Q24906003,Arjun Tendulkar|Sachin Tendulkar|Tendulkar|Vin...,3.38|3.02|2.94|2.68|2.54
2,0,10,Cheteshwar Pujara,Q142613|Q105044382|Q16225224|Q31321408|Q57047988,Cheteshwar Pujara|List of international cricke...,3.64|1.4|1.21|0.95|0.67
3,0,11,Ishant Sharma,Q3522062|Q2277230|Q2756945|Q4766068|Q16225462,Ishant Sharma|Sharma|Chetan Sharma|Ankit Sharm...,3.81|3.44|2.67|2.47|2.45
4,0,12,Mohammad Shami,Q6891901|Q310247|Q207728|Q44819|Q7487531,Mohammad Bazar community development block|Moh...,6.25|5.4|5.34|4.35|3.14
5,0,2,Dhoni,Q470774|Q65236705|Q65223368|Q63996951|Q58813167,MS Dhoni|Dhani Ram|Dhani Ram|Dhani Ram|Shyam D...,3.21|2.4|2.39|2.19|2.1
6,0,3,Jasprit Bumrah,Q35484|Q7055|Q16227998|Q48195|Q455438,Medina|Buddha|Jasprit Bumrah|Basra|Allan Hume,6.3|4.27|3.83|14.76|1.51
7,0,4,Ajinkya Rahane,Q137669|Q4699748|Q63999029|Q82298759|Q4699751,Ajinkya Rahane|Ajinkya Joshi|Ajinkya Dhondu Ga...,3.73|1.54|1.51|0.89|0.69
8,0,5,Rohit Sharma,Q26436595|Q26837282|Q21622845|Q21622847|Q65233196,Rohit Sharma|Rohit Sharma|Rohit Sharma|Rohit S...,4.06|3.54|3.53|3.38|3.01
9,0,6,Bhuvneshwar Kumar,Q171771|Q2003153|Q4902309|Q1226319|Q5620107,Bhubaneswar|Bhuvneshwar Kumar|Bhuvneshwari Kum...,5.52|3.79|2.21|1.91|1.71


## Join to produce the final result

In [28]:
final_output = 'linked-' + table_name
path = os.path.join(table,final_output)
!tl join -f $table_path --csv -c ranking_score $output_path > $path

## Clean up temporary files

In [29]:
# Delete the scratch directory together with every intermediate file in it. The linked
# table was written into `table`, outside temp_dir, so it is kept.
shutil.rmtree(temp_dir)