In [1]:
import numpy as np
import pandas as pd

### This pipeline focus on getting LOF-voted graph-embedding score from candidates file
Steps as following:
- generate required 4 features for voting classifier: 'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized'
- generate model-voted candidates result
- in score-using-embedding: use --column-vector-strategy==centroid-of-lof and specified --lof-strategy to obtain lof-voted candidates as centroid and compute respective graph-embedding-score

In [2]:
HOME_DIR = '/Users/summ7t/dev/novartis/table-linker/t2dv2-dev-score/pipeline'
input_candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/apr_test.csv'
model_voted_candidate_file_path = f'{HOME_DIR}/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/score_test.csv'

graph_embedding_complex_file = f'{HOME_DIR}/temp/graph_embedding_complex.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

##### Generate required 4 features for voting classifier

In [3]:
!tl align-page-rank $input_candidate_file_path \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
> $aligned_pagerank_candidate_file_path

In [4]:
features_df = pd.read_csv(aligned_pagerank_candidate_file_path)
features_df.loc[:, ['method', 'pagerank', 'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,3.983031e-09,3.983031e-09,0,1.0,0.0
1,fuzzy-augmented,3.983031e-09,0.0,0,1.0,0.0
2,fuzzy-augmented,5.918546e-09,0.0,0,0.772222,0.0
3,fuzzy-augmented,3.740191e-09,0.0,0,0.640476,0.0
4,fuzzy-augmented,0.0,0.0,0,0.75,0.0


##### Generate model-voted candidates result

In [5]:
!tl vote-by-classifier $aligned_pagerank_candidate_file_path \
--prob-threshold 0.995 \
--model /Users/summ7t/dev/novartis/table-linker/weighted_lr.pkl \
> $model_voted_candidate_file_path

In [6]:
model_voted_df = pd.read_csv(model_voted_candidate_file_path)
model_voted_df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,3.983031e-09,21.693314,3.983031e-09,1.0,0.0,0.0,0,0
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0.0,1.0,0.0,0.0,0,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,0.0,0.772222,0.0,0.0,0,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,0.0,0.640476,0.0,0.0,0,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0.0,0.75,0.0,0.0,0,0


##### Generate graph-embedding-score using centroid-of-lof and lof-strategy

In [7]:
!tl score-using-embedding $model_voted_candidate_file_path \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o graph-embedding-score \
--embedding-file $graph_embedding_complex_file \
--embedding-url $index_url \
> $graph_embedding_file_path

Qnodes to lookup: 1260
Qnodes from file: 1241
Qnodes from server: 0
_centroid_of_lof: Missing 1 of 16
Outlier removal generates 10 lof-voted candidates


In [8]:
score_df = pd.read_csv(graph_embedding_file_path)
score_df.head(10)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier,singleton,is_lof,graph-embedding-score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,21.693314,3.983031e-09,1.0,0.0,0.0,0,0,1,-1,0.849984
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,36.39384,0.0,1.0,0.0,0.0,0,0,0,-1,0.849984
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,23.48463,0.0,0.772222,0.0,0.0,0,0,0,-1,0.464953
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,23.48463,0.0,0.640476,0.0,0.0,0,0,0,-1,0.589845
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,20.582134,0.0,0.75,0.0,0.0,0,0,0,-1,0.0
5,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,...,20.520416,0.0,0.725,0.0,0.0,0,0,0,-1,0.511026
6,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16682735,,,fuzzy-augmented,,...,19.623405,0.0,0.0,0.0,0.0,0,0,0,-1,0.194887
7,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q6426050,Kohli,,fuzzy-augmented,,...,19.601744,0.0,0.75,0.0,0.0,0,0,0,-1,0.335949
8,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q46251,Fränzi Mägert-Kohli,Franziska Kohli|Fraenzi Maegert-Kohli,fuzzy-augmented,Swiss snowboarder,...,19.233713,0.0,0.488889,0.0,0.0,1,0,0,-1,0.473921
9,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16434086,Wirat Wachirarattanawong,,fuzzy-augmented,,...,19.010628,0.0,0.685185,0.0,0.0,0,0,0,-1,0.54734


In [9]:
score_df[score_df['is_lof'] == 1]

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier,singleton,is_lof,graph-embedding-score
140,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,exact-match,Indian cricket player,...,21.693314,3.93661e-09,1.0,0.0,0.0,1,1,1,1,0.94191
141,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,...,40.74549,0.0,1.0,0.0,0.0,1,1,0,1,0.94191
241,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,exact-match,Indian cricket player.,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.957569
578,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93,Jasprit Bumrah,Q16227998,Jasprit Bumrah,Jasprit Jasbirsingh Bumrah|Jasprit Jasbir Sing...,exact-match,cricketer,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.948009
666,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,exact-match,Indian cricketer,...,21.693314,3.539613e-09,1.0,0.0,0.0,1,1,1,1,0.952638
667,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,fuzzy-augmented,Indian cricketer,...,41.210064,0.0,1.0,0.0,0.0,1,1,0,1,0.952638
890,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90,Bhuvneshwar Kumar,Q2003153,Bhuvneshwar Kumar,Bhuvneshwar Kumar Singh,exact-match,Indian cricket player,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.947998
993,0,7,Ravindra Jadeja,chennai super kings|132|6/12/88,Ravindra Jadeja,Q2721457,Ravindra Jadeja,Ravindrasinh Jadeja|Sir Jaddu|Ravindrasinh Ani...,exact-match,Indian cricketer,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.929021
1094,0,8,Rishabh Pant,delhi capitals|136|4/8/97,Rishabh Pant,Q21622311,Rishabh Pant,,exact-match,Indian cricketer,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.938138
1195,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85,Shikhar Dhawan,Q7487024,Shikhar Dhawan,Gabbar,exact-match,Indian cricketer,...,21.693314,3.539613e-09,1.0,0.0,0.0,0,0,1,1,0.935368


In [10]:
score_df.sort_values(by=['graph-embedding-score'], ascending=False).loc[:, ['kg_id', 'kg_labels', 'kg_descriptions', 'method', 'vote_by_classifier', 'graph-embedding-score']].head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,vote_by_classifier,graph-embedding-score
241,Q3522062,Ishant Sharma,Indian cricket player.,exact-match,0,0.957569
242,Q3522062,Ishant Sharma,Indian cricket player.,fuzzy-augmented,0,0.957569
666,Q137669,Ajinkya Rahane,Indian cricketer,exact-match,1,0.952638
667,Q137669,Ajinkya Rahane,Indian cricketer,fuzzy-augmented,1,0.952638
579,Q16227998,Jasprit Bumrah,cricketer,fuzzy-augmented,0,0.948009
578,Q16227998,Jasprit Bumrah,cricketer,exact-match,0,0.948009
891,Q2003153,Bhuvneshwar Kumar,Indian cricket player,fuzzy-augmented,0,0.947998
890,Q2003153,Bhuvneshwar Kumar,Indian cricket player,exact-match,0,0.947998
140,Q142613,Cheteshwar Pujara,Indian cricket player,exact-match,1,0.94191
141,Q142613,Cheteshwar Pujara,Indian cricket player,fuzzy-augmented,1,0.94191
