In [17]:
import numpy as np
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [59]:
# Root directory for the pipeline's input and output files. The default is the
# original machine-specific location; set the TL_PIPELINE_HOME environment
# variable to relocate everything derived from it without editing this cell.
HOME_DIR = (
    os.environ.get('TL_PIPELINE_HOME')
    or '/Users/summ7t/dev/novartis/table-linker/t2dv2-dev-score/pipeline'
)
# Input table and the name of the column handed to `tl canonicalize -c`.
table_path = f'{HOME_DIR}/temp/cricketers.csv'
wikify_column_name = "cricketers"

# Outputs of the canonicalize and candidate-generation cells.
canonical_file_path = f'{HOME_DIR}/temp/canonical.csv'
candidate_file_path = f'{HOME_DIR}/temp/candidates.csv'
# Comma-separated auxiliary fields passed as --auxiliary-fields; `temp_dir` is
# passed as --auxiliary-folder and is where the per-field TSV files are read from.
aux_field = 'graph_embedding_complex,class_count,property_count'
temp_dir = f'{HOME_DIR}/temp/temp'

# Outputs of align-page-rank, vote-by-classifier and score-using-embedding.
# `model_file_path` is the model given to vote-by-classifier via --model; it
# lives outside HOME_DIR, so it has its own override (TL_MODEL_FILE).
aligned_pagerank_candidate_file_path = f'{HOME_DIR}/apr_test.csv'
model_file_path = (
    os.environ.get('TL_MODEL_FILE')
    or '/Users/summ7t/dev/novartis/table-linker/weighted_lr.pkl'
)
model_voted_candidate_file_path = f'{HOME_DIR}/mv_test.csv'
graph_embedding_file_path = f'{HOME_DIR}/score_test.csv'

# Outputs of generate-reciprocal-rank and the compute-tf-idf pipeline.
# NOTE(review): lof_tfidf_file_path is not referenced by any cell in this part
# of the notebook - confirm whether it is still needed.
lof_reciprocal_rank_file_path = f'{HOME_DIR}/lof_rr_test.csv'
lof_tfidf_file_path = f'{HOME_DIR}/lof_tfidf_test.csv'
lof_feature_file = f'{HOME_DIR}/lof_feature.csv'

# Merged per-field auxiliary files (written to `{temp_dir}/{field}.tsv` by the
# aggregation cell) and the index URL passed to score-using-embedding.
# NOTE: the candidate-generation cell hard-codes the same host and index name
# on its command line; keep the two in sync when changing `index_url`.
graph_embedding_complex_file = f'{temp_dir}/graph_embedding_complex.tsv'
class_count_file = f'{temp_dir}/class_count.tsv'
property_count_file = f'{temp_dir}/property_count.tsv'
index_url = 'http://ckg07:9200/wikidatadwd-augmented/'

### Canonicalize

In [11]:
# Run `tl canonicalize` on the input table for the column named by
# `wikify_column_name`, with --add-context, and save stdout to
# `canonical_file_path`.
!tl canonicalize \
-c "$wikify_column_name" \
--add-context \
"$table_path" \
> "$canonical_file_path"

In [13]:
# Preview the first five rows of the canonicalized table.
pd.read_csv(filepath_or_buffer=canonical_file_path, nrows=5)

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88


### Candidate Generation

In [33]:
# Candidate generation: clean the `label` column into `label_clean`, then run
# get-fuzzy-augmented-matches followed by get-exact-matches against the
# wikidatadwd-augmented index. Both retrieval steps request the auxiliary
# fields in `aux_field` and use `temp_dir` as the auxiliary folder.
# The combined output is written to `candidate_file_path`.
!tl clean -c label -o label_clean "$canonical_file_path" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields "$aux_field" --auxiliary-folder "$temp_dir" \
/ --url http://ckg07:9200 --index wikidatadwd-augmented \
get-exact-matches -c label_clean \
--auxiliary-fields "$aux_field" --auxiliary-folder "$temp_dir" \
> "$candidate_file_path"

In [39]:
# Column name to use in the merged file for each auxiliary field listed in
# `aux_field`.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count'
}
# For every auxiliary field, merge all per-method TSV files found in `temp_dir`
# into a single `{temp_dir}/{field}.tsv`, keeping one row per qnode.
for field in aux_field.split(','):
    merged_path = f'{temp_dir}/{field}.tsv'
    # The glob pattern also matches `merged_path` itself once this cell has
    # been run. Re-reading it would mix the already-renamed column (e.g.
    # `embedding`) with the raw one and yield duplicate column names after the
    # rename, so the previous output is excluded to keep the cell idempotent.
    # Sorting makes the "first row wins" rule of drop_duplicates deterministic.
    part_paths = sorted(
        path for path in glob.glob(f'{temp_dir}/*{field}.tsv')
        if os.path.abspath(path) != os.path.abspath(merged_path)
    )
    if not part_paths:
        # pd.concat would fail on an empty list with an unhelpful message.
        raise FileNotFoundError(
            f'No auxiliary files matching "*{field}.tsv" found in {temp_dir}'
        )
    aux_df = (
        pd.concat([pd.read_csv(path, sep='\t', dtype=object) for path in part_paths])
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(merged_path, sep='\t', index=False)

In [40]:
# Preview the first five candidate rows.
pd.read_csv(filepath_or_buffer=candidate_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416


In [41]:
# List the auxiliary files in `temp_dir`. The variable is quoted so that a path
# containing spaces is passed to `ls` as a single argument.
!ls "$temp_dir"

class_count.tsv
exact_matches_class_count.tsv
exact_matches_graph_embedding_complex.tsv
exact_matches_property_count.tsv
fuzzy_augmented_class_count.tsv
fuzzy_augmented_graph_embedding_complex.tsv
fuzzy_augmented_property_count.tsv
graph_embedding_complex.tsv
property_count.tsv


### Generate lof-related features: lof-graph-embedding-score, lof-reciprocal-rank, lof-tfidf
##### Generate the 4 features required by the voting classifier

In [42]:
# Build the feature file for the voting classifier: align-page-rank, two
# string-similarity passes (monge_elkan; jaccard between kg_descriptions and
# context), normalize-scores on des_cont_jaccard, and smallest-qnode-number.
# Path variables are quoted so paths containing spaces stay single arguments.
!tl align-page-rank "$candidate_file_path" \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard / smallest-qnode-number \
> "$aligned_pagerank_candidate_file_path"

In [43]:
# Load the feature file written by the align-page-rank pipeline and preview
# the feature columns next to the retrieval method and raw pagerank.
features_df = pd.read_csv(filepath_or_buffer=aligned_pagerank_candidate_file_path)
features_df[[
    'method',
    'pagerank',
    'aligned_pagerank',
    'smallest_qnode_number',
    'monge_elkan',
    'des_cont_jaccard_normalized',
]].head()

Unnamed: 0,method,pagerank,aligned_pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard_normalized
0,exact-match,3.983031e-09,3.983031e-09,0,1.0,0.0
1,fuzzy-augmented,3.983031e-09,0.0,0,1.0,0.0
2,fuzzy-augmented,5.918546e-09,0.0,0,0.772222,0.0
3,fuzzy-augmented,3.740191e-09,0.0,0,0.640476,0.0
4,fuzzy-augmented,0.0,0.0,0,0.75,0.0


##### Generate model-voted candidates result

In [44]:
# Run vote-by-classifier on the feature file with the model at
# `model_file_path` and a probability threshold of 0.995; the result goes to
# `model_voted_candidate_file_path`.
# Path variables are quoted so paths containing spaces stay single arguments.
!tl vote-by-classifier "$aligned_pagerank_candidate_file_path" \
--prob-threshold 0.995 \
--model "$model_file_path" \
> "$model_voted_candidate_file_path"

In [45]:
# Load the vote-by-classifier output and show its first five rows.
model_voted_df = pd.read_csv(filepath_or_buffer=model_voted_candidate_file_path)
model_voted_df.head(n=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,3.983031e-09,21.693314,3.983031e-09,1.0,0.0,0.0,0,0
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0.0,1.0,0.0,0.0,0,0
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.48463,0.0,0.772222,0.0,0.0,0,0
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.48463,0.0,0.640476,0.0,0.0,0,0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0.0,0.75,0.0,0.0,0,0


##### Generate lof-graph-embedding-score using the centroid-of-lof column-vector strategy and the ems-mv lof-strategy

In [51]:
# Score the model-voted candidates with score-using-embedding, using the
# centroid-of-lof column-vector strategy and the ems-mv lof-strategy. The score
# is written to the `lof-graph-embedding-score` column. Embeddings are taken
# from `graph_embedding_complex_file`, and `index_url` is passed as
# --embedding-url.
# Path/URL variables are quoted so values containing spaces or shell
# metacharacters are passed through as single arguments.
!tl score-using-embedding "$model_voted_candidate_file_path" \
--column-vector-strategy centroid-of-lof \
--lof-strategy ems-mv \
-o lof-graph-embedding-score \
--embedding-file "$graph_embedding_complex_file" \
--embedding-url "$index_url" \
> "$graph_embedding_file_path"

Qnodes to lookup: 1260
Qnodes from file: 1241
Qnodes from server: 0
_centroid_of_lof: Missing 1 of 16
Outlier removal generates 10 lof-voted candidates


In [52]:
# Load the score-using-embedding output and show its first five rows.
score_df = pd.read_csv(filepath_or_buffer=graph_embedding_file_path)
score_df.head()

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,retrieval_score,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier,singleton,is_lof,lof-graph-embedding-score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,21.693314,3.983031e-09,1.0,0.0,0.0,0,0,1,-1,0.849984
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,36.39384,0.0,1.0,0.0,0.0,0,0,0,-1,0.849984
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,23.48463,0.0,0.772222,0.0,0.0,0,0,0,-1,0.464953
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,23.48463,0.0,0.640476,0.0,0.0,0,0,0,-1,0.589845
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,20.582134,0.0,0.75,0.0,0.0,0,0,0,-1,0.0


In [54]:
# Show the 20 candidates with the highest lof-graph-embedding-score, together
# with the singleton / vote_by_classifier / is_lof flags.
score_df[[
    'kg_id', 'kg_labels', 'kg_descriptions', 'method',
    'singleton', 'vote_by_classifier', 'is_lof', 'lof-graph-embedding-score',
]].sort_values(by='lof-graph-embedding-score', ascending=False).head(20)

Unnamed: 0,kg_id,kg_labels,kg_descriptions,method,singleton,vote_by_classifier,is_lof,lof-graph-embedding-score
241,Q3522062,Ishant Sharma,Indian cricket player.,exact-match,1,0,1,0.957569
242,Q3522062,Ishant Sharma,Indian cricket player.,fuzzy-augmented,0,0,-1,0.957569
666,Q137669,Ajinkya Rahane,Indian cricketer,exact-match,1,1,1,0.952638
667,Q137669,Ajinkya Rahane,Indian cricketer,fuzzy-augmented,0,1,1,0.952638
579,Q16227998,Jasprit Bumrah,cricketer,fuzzy-augmented,0,0,-1,0.948009
578,Q16227998,Jasprit Bumrah,cricketer,exact-match,1,0,1,0.948009
891,Q2003153,Bhuvneshwar Kumar,Indian cricket player,fuzzy-augmented,0,0,-1,0.947998
890,Q2003153,Bhuvneshwar Kumar,Indian cricket player,exact-match,1,0,1,0.947998
140,Q142613,Cheteshwar Pujara,Indian cricket player,exact-match,1,1,1,0.94191
141,Q142613,Cheteshwar Pujara,Indian cricket player,fuzzy-augmented,0,1,1,0.94191


##### Generate lof reciprocal rank feature

In [56]:
# Run generate-reciprocal-rank on the lof-graph-embedding-score column, writing
# the result to a new lof-reciprocal-rank column in
# `lof_reciprocal_rank_file_path`.
!tl generate-reciprocal-rank "$graph_embedding_file_path" \
-c lof-graph-embedding-score -o lof-reciprocal-rank \
> "$lof_reciprocal_rank_file_path"

In [57]:
# Preview the first five rows including the new lof-reciprocal-rank column.
pd.read_csv(filepath_or_buffer=lof_reciprocal_rank_file_path, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,aligned_pagerank,monge_elkan,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier,singleton,is_lof,lof-graph-embedding-score,lof-reciprocal-rank
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,3.983031e-09,1.0,0.0,0.0,0,0,1,-1,0.849984,0.5
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,0.0,1.0,0.0,0.0,0,0,0,-1,0.849984,0.333333
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,0.0,0.772222,0.0,0.0,0,0,0,-1,0.464953,0.014925
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,0.0,0.640476,0.0,0.0,0,0,0,-1,0.589845,0.026316
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,0.0,0.75,0.0,0.0,0,0,0,-1,0.0,0.009901


##### Generate lof tf-idf features (class_count and property_count)

In [60]:
# Run compute-tf-idf twice in one pipeline: first over the class_count feature
# file, then over the property_count feature file. Both passes use `is_lof` as
# the singleton column. Outputs: lof_class_count_tf_idf_score and
# lof_property_count_tf_idf_score, saved to `lof_feature_file`.
!tl compute-tf-idf "$lof_reciprocal_rank_file_path" \
--feature-file "$class_count_file" --feature-name class_count \
--singleton-column is_lof -o lof_class_count_tf_idf_score \
/ compute-tf-idf \
--feature-file "$property_count_file" --feature-name property_count \
--singleton-column is_lof -o lof_property_count_tf_idf_score \
> "$lof_feature_file"

In [61]:
# Preview the first five rows of the final lof feature file.
pd.read_csv(filepath_or_buffer=lof_feature_file, nrows=5)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard,des_cont_jaccard_normalized,smallest_qnode_number,vote_by_classifier,singleton,is_lof,lof-graph-embedding-score,lof-reciprocal-rank,lof_class_count_tf_idf_score,lof_property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,exact-match,Indian cricket player,...,0.0,0.0,0,0,1,-1,0.849984,0.5,1.0,0.639741
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,...,0.0,0.0,0,0,0,-1,0.849984,0.333333,1.0,0.639741
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,...,0.0,0.0,0,0,0,-1,0.464953,0.014925,0.544223,0.000124
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),...,0.0,0.0,0,0,0,-1,0.589845,0.026316,0.544223,0.159754
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,...,0.0,0.0,0,0,0,-1,0.0,0.009901,0.0,0.016055


### [Rijul's Ranking Model]