In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil

# Restore every pandas option to its default value before the pipeline runs.
# NOTE(review): on a fresh kernel this should be a no-op apart from the
# deprecation notices it prints — confirm it is still needed.
pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [2]:
# Pipeline inputs.
# `table` is the working directory that holds the input table. It can be
# overridden with the TABLE_LINKER_DIR environment variable so the notebook is
# not tied to one machine; when the variable is unset or empty, the original
# local path is used.
table = os.environ.get('TABLE_LINKER_DIR') or \
    '/Users/rijulvohra/Documents/work/Novartis-ISI/table_linker_pipeline/'
# File name of the input table, resolved relative to `table`.
table_name = 'cricketers.csv'
# Column name handed to `tl canonicalize -c`.
wikify_column_name = "cricketers"

In [3]:
# Intermediate file names (each is created inside `temp_dir` by a later cell).
canonical = 'canonical.csv'
candidates = "candidates.csv"
feature_votes = "feature_votes.csv"
score_file = "scores.csv"
# Ground-truth file name; it is joined onto `table` before use.
gt_file = 'cricketers-gt.csv'

# Scratch directory for intermediate outputs, and the full path of the input table.
temp_dir = os.path.join(table, 'temp')
table_path = os.path.join(table, table_name)
# exist_ok=True creates the directory only when missing, without a separate
# (racy) existence check, so the cell is safe to re-run.
os.makedirs(temp_dir, exist_ok=True)

# Embedding TSVs expected inside the scratch directory.
# NOTE(review): presumably written by the candidate-generation step through
# `--auxiliary-folder` — confirm against the `tl` documentation.
fuzzy_embedding_path = os.path.join(temp_dir, 'fuzzy_augmented_graph_embedding_complex.tsv')
exact_embedding_path = os.path.join(temp_dir, 'exact_graph_embedding_complex.tsv')

## Canonicalize

In [4]:
canonical_file = os.path.join(temp_dir,canonical)
!tl canonicalize -c "$wikify_column_name" --add-context $table_path > $canonical_file

In [5]:
# Peek at the first ten rows of the canonicalized table.
pd.read_csv(
    canonical_file,
    nrows=10,
)

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88
5,0,5,Rohit Sharma,mumbai indians|159|30/04/1987
6,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90
7,0,7,Ravindra Jadeja,chennai super kings|132|6/12/88
8,0,8,Rishabh Pant,delhi capitals|136|4/8/97
9,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85


## Candidate Generation

In [6]:
%%time
candidates_file = os.path.join(temp_dir,candidates)
gt_file = os.path.join(table,gt_file)
print(gt_file)
!tl clean -c label -o label_clean $canonical_file \
        / --url http://ckg06:9200 --index wikidatadwd-augmented-01 get-fuzzy-augmented-matches -c label_clean \
        --auxiliary-fields graph_embedding_complex \
        --auxiliary-folder $temp_dir \
        / --url http://ckg06:9200 --index wikidatadwd-augmented-01 get-exact-matches \
        -c label_clean \
        / ground-truth-labeler --gt-file $gt_file > $candidates_file

/Users/rijulvohra/Documents/work/Novartis-ISI/table_linker_pipeline/cricketers-gt.csv
CPU times: user 1.17 s, sys: 441 ms, total: 1.61 s
Wall time: 36.7 s


In [7]:
# Peek at the first ten candidate rows.
pd.read_csv(
    candidates_file,
    nrows=10,
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385,Q213854,Virat Kohli,1
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,Q213854,Virat Kohli,-1
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,Q213854,Virat Kohli,-1
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,Q213854,Virat Kohli,-1
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416,Q213854,Virat Kohli,-1
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405,Q213854,Virat Kohli,-1
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748,Q213854,Virat Kohli,-1
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713,Q213854,Virat Kohli,-1
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628,Q213854,Virat Kohli,-1
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,Q213854,Virat Kohli,-1


## Feature Voting

In [8]:
%%time
feature_votes_file = os.path.join(temp_dir,feature_votes)
!tl smallest-qnode-number $candidates_file \
    / string-similarity -i --method monge_elkan:tokenizer=word -o monge_elkan \
    / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
    / feature-voting -c "pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard" > $feature_votes_file

CPU times: user 58.5 ms, sys: 29.4 ms, total: 87.9 ms
Wall time: 3.95 s


In [9]:
# Peek at the first ten rows after feature voting.
pd.read_csv(
    feature_votes_file,
    nrows=10,
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385,Q213854,Virat Kohli,1,0,1.0,0.0,1
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,Q213854,Virat Kohli,-1,0,0.733333,0.0,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,Q213854,Virat Kohli,-1,0,0.5,0.0,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,Q213854,Virat Kohli,-1,0,0.5,0.0,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416,Q213854,Virat Kohli,-1,0,0.483333,0.0,0
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405,Q213854,Virat Kohli,-1,0,0.0,0.0,0
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748,Q213854,Virat Kohli,-1,0,0.5,0.0,0
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713,Q213854,Virat Kohli,-1,1,0.488889,0.0,1
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628,Q213854,Virat Kohli,-1,0,0.685185,0.0,0
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,Q213854,Virat Kohli,-1,0,0.733333,0.0,0


## Compute Embedding Score using Column Vector Strategy

In [11]:
%%time
score = os.path.join(temp_dir,score_file)
!tl score-using-embedding $feature_votes_file --column-vector-strategy centroid-of-singletons -o graph-embedding-score \
--embedding-file $fuzzy_embedding_path \
> $score

Qnodes to lookup: 1260
Qnodes from file: 1238
CPU times: user 19.3 ms, sys: 15.4 ms, total: 34.7 ms
Wall time: 1.29 s


In [12]:
# Peek at the first ten scored candidate rows.
pd.read_csv(
    score,
    nrows=10,
)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,GT_kg_id,GT_kg_label,evaluation_label,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes,graph-embedding-score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39385,Q213854,Virat Kohli,1,0,1.0,0.0,1,0.886983
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,Q213854,Virat Kohli,-1,0,0.733333,0.0,0,0.469873
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,Q213854,Virat Kohli,-1,0,0.5,0.0,0,0.586232
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,Q213854,Virat Kohli,-1,0,0.5,0.0,0,0.0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416,Q213854,Virat Kohli,-1,0,0.483333,0.0,0,0.513397
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,3.539613e-09,19.623405,Q213854,Virat Kohli,-1,0,0.0,0.0,0,0.215469
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,3.539613e-09,19.601748,Q213854,Virat Kohli,-1,0,0.5,0.0,0,0.354691
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,3.539613e-09,19.233713,Q213854,Virat Kohli,-1,1,0.488889,0.0,1,0.474148
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,3.539613e-09,19.010628,Q213854,Virat Kohli,-1,0,0.685185,0.0,0,0.557991
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,Q213854,Virat Kohli,-1,0,0.733333,0.0,0,0.826771


## Compute final score

In [13]:
final_score = os.path.join(table,'final_score.csv')
!tl normalize-scores -c graph-embedding-score -t zscore -o normalized-graph-embedding-score $score \
    / normalize-scores -c pagerank -t zscore -o normalized-pagerank \
    / normalize-scores -c monge_elkan -t zscore -o normalized-monge-elkan \
    / combine-linearly -w normalized-graph-embedding-score:1,normalized-pagerank:1,normalized-monge-elkan:1 \
     -o final_score > $final_score
df = pd.read_csv(final_score)
df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,evaluation_label,smallest_qnode_number,monge_elkan,des_cont_jaccard,votes,graph-embedding-score,normalized-graph-embedding-score,normalized-pagerank,normalized-monge-elkan,final_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,1,0,1.0,0.0,1,0.886983,0.889363,-0.029806,0.166667,1.026224
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,1,0,1.0,0.0,1,0.886983,1.896483,-0.113447,1.707388,3.490425
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,-1,0,0.733333,0.0,0,0.469873,-0.217603,-0.094306,-0.186569,-0.498478
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,-1,0,0.5,0.0,0,0.586232,0.372149,-0.115848,-1.843782,-1.587481
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,-1,0,0.5,0.0,0,0.0,-2.599118,-0.152836,-1.843782,-4.595735


## Get Top-1 KG Links

In [14]:
linked_table_name = "linked_graph-" + table_name
output_path = os.path.join(table,linked_table_name)
!tl get-kg-links -c final_score -l label_clean -k 1 $final_score > $output_path
pd.read_csv(output_path, nrows = 10)

Unnamed: 0,column,row,label,kg_id,kg_label,ranking_score
0,0,0,Virat Kohli,Q213854,Virat Kohli,3.49
1,0,1,Tendulkar,Q22327439,Arjun Tendulkar,3.38
2,0,10,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,3.64
3,0,11,Ishant Sharma,Q3522062,Ishant Sharma,3.81
4,0,12,Mohammad Shami,Q6891901,Mohammad Bazar community development block,6.25
5,0,2,Dhoni,Q470774,MS Dhoni,3.21
6,0,3,Jasprit Bumrah,Q35484,Medina,6.3
7,0,4,Ajinkya Rahane,Q137669,Ajinkya Rahane,3.73
8,0,5,Rohit Sharma,Q26436595,Rohit Sharma,4.06
9,0,6,Bhuvneshwar Kumar,Q171771,Bhubaneswar,5.52


### Output

In [15]:
linked_table_name = "linked1_graph-" + table_name
final_output = os.path.join(table,linked_table_name)
!tl join -f $table_path --csv -c ranking_score $output_path > $final_output

## Clean up temporary files

In [66]:
# Remove the scratch directory and everything in it. The isdir() guard makes
# the cell safe to re-run: a bare rmtree() raises FileNotFoundError once the
# directory is already gone.
if os.path.isdir(temp_dir):
    shutil.rmtree(temp_dir)