In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil
import pickle

# Reset every pandas option to its default value.
# NOTE(review): the deprecation notices shown under this cell appear to be
# triggered by this call touching deprecated options — confirm, and consider
# dropping the call if no option is changed earlier in the session.
pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [2]:
# Pipeline configuration: edit these values to wikify a different table.
# NOTE(review): input_file_path is an absolute path on one developer's machine;
# point it at a local copy before running.
input_file_path = '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/cricketers.csv'
# Name of the input column to link (passed to `tl canonicalize -c`).
wikify_column_name = "cricketers"
# Root directory for the files this notebook writes.
output_path = '/tmp/cricketers'
# Elasticsearch index and endpoint handed to the `tl` retrieval commands via --index / --url.
es_index = 'wikidatadwd-augmented'
es_url = 'http://ckg07:9200'

In [3]:
# Scratch directory for intermediate files, nested under output_path.
temp_dir = f'{output_path}/temp'

In [4]:
# Create the output and scratch directories with the standard library instead of
# shelling out to `mkdir -p`: the unquoted shell form splits paths that contain
# spaces and only works where a POSIX shell is available.
# exist_ok=True keeps the cell re-runnable, matching `mkdir -p`.
os.makedirs(output_path, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

In [71]:
# Paths of the intermediate files used by the pipeline steps (all under temp_dir
# except final_output, which goes to output_path).
canonical = f'{temp_dir}/canonical.csv'
candidates = f"{temp_dir}/candidates.csv"
# Candidate files extended with one TF-IDF score column each.
feature_class_count = f"{temp_dir}/feature_class_count.csv"
feature_property_count = f"{temp_dir}/feature_property_count.csv"
# NOTE(review): the file name says "property_class" while the variable says
# "class_property" — harmless, but worth aligning.
feature_class_property_count = f"{temp_dir}/feature_property_class_count.csv"
score_file = f"{temp_dir}/scores.csv"
model_name = 'rf_tuned_ranking.pkl'

embedding_file = f'{temp_dir}/graph_embedding_complex.tsv'
# Comma-separated auxiliary fields requested from the candidate-retrieval commands.
aux_field = 'graph_embedding_complex,class_count,property_count'
final_score = f'{temp_dir}/final_score.csv'
# NOTE(review): "hormones" in the two file names below looks like a leftover from
# another notebook (this one processes cricketers) — confirm before renaming.
top_k_file = f"{temp_dir}/topk-hormones.csv" 
final_output = f"{output_path}/linked-hormones.csv" 

## Peek at the input file

In [6]:
# Display the raw input table; missing cells are shown as empty strings.
pd.read_csv(input_file_path).fillna("")

Unnamed: 0,cricketers,teams,weight,dob
0,Virat Kohli,royal challengers bangalore,152,5/11/88
1,Tendulkar,mumbai indians,137,24/04/1973
2,Dhoni,chennai super kings,154,7/7/81
3,Jasprit Bumrah,mumbai indians,154,6/12/93
4,Ajinkya Rahane,rajasthan royals,134,6/6/88
5,Rohit Sharma,mumbai indians,159,30/04/1987
6,Bhuvneshwar Kumar,deccan chargers,154,5/2/90
7,Ravindra Jadeja,chennai super kings,132,6/12/88
8,Rishabh Pant,delhi capitals,136,4/8/97
9,Shikhar Dhawan,delhi capitals,157,5/12/85


## Canonicalize

In [25]:
# Run `tl canonicalize` on the column named by wikify_column_name and write the
# result to the `canonical` file.
# NOTE(review): --add-context presumably packs the remaining columns into the
# pipe-separated `context` column shown in the next cell — confirm in the tl docs.
!tl canonicalize \
-c "$wikify_column_name" \
--add-context \
{input_file_path} > {canonical}

In [26]:
# Load the canonicalized table and display it.
df = pd.read_csv(canonical)
df

Unnamed: 0,column,row,label,context
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88
1,0,1,Tendulkar,mumbai indians|137|24/04/1973
2,0,2,Dhoni,chennai super kings|154|7/7/81
3,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93
4,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88
5,0,5,Rohit Sharma,mumbai indians|159|30/04/1987
6,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90
7,0,7,Ravindra Jadeja,chennai super kings|132|6/12/88
8,0,8,Rishabh Pant,delhi capitals|136|4/8/97
9,0,9,Shikhar Dhawan,delhi capitals|157|5/12/85


## Candidate Generation

In [27]:
# Show the path that the candidate-generation step below writes to.
candidates

'/tmp/cricketers/temp/candidates.csv'

In [28]:
%%time
# Candidate generation as a single `tl` invocation:
#   1. clean                        - writes a cleaned copy of `label` to `label_clean`
#   2. get-fuzzy-augmented-matches  - queries the Elasticsearch index with `label_clean`
#   3. get-exact-matches            - queries the same index with `label_clean`
# Both retrieval steps request the aux_field auxiliary fields and store them under temp_dir.
# NOTE(review): the standalone `/` presumably chains tl sub-commands — confirm in the tl docs.
!tl clean -c label -o label_clean {canonical} / \
--url $es_url --index $es_index \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder $temp_dir / \
--url $es_url --index $es_index \
get-exact-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder {temp_dir} > {candidates}

CPU times: user 440 ms, sys: 150 ms, total: 589 ms
Wall time: 22.6 s


In [29]:
# The retrieval commands leave one auxiliary TSV per method and field in temp_dir
# (e.g. fuzzy_augmented_class_count.tsv and exact_matches_class_count.tsv).
# Merge them into a single, de-duplicated {temp_dir}/{field}.tsv per field.
column_rename_dict = {
    'graph_embedding_complex': 'embedding',
    'class_count': 'class_count',
    'property_count': 'property_count'
}
for field in aux_field.split(','):
    merged_path = f'{temp_dir}/{field}.tsv'
    aux_list = []
    # sorted() makes the "first duplicate wins" choice below deterministic.
    for f in sorted(glob.glob(f'{temp_dir}/*{field}.tsv')):
        # The pattern also matches this cell's own output from a previous run;
        # skip it so re-running the cell does not fold stale rows back in.
        if os.path.abspath(f) == os.path.abspath(merged_path):
            continue
        part = pd.read_csv(f, sep='\t', dtype=object)
        # Skip unrelated files that merely share the suffix
        # (class_property_count.tsv matches *property_count.tsv but has no
        # property_count column).
        if 'qnode' not in part.columns or field not in part.columns:
            continue
        aux_list.append(part)
    if not aux_list:
        # pd.concat([]) would fail with a message that does not say which field is missing.
        raise ValueError(f'no auxiliary files for field {field!r} found in {temp_dir}')
    aux_df = (
        pd.concat(aux_list)
        .drop_duplicates(subset=['qnode'])
        .rename(columns={field: column_rename_dict[field]})
    )
    aux_df.to_csv(merged_path, sep='\t', index=False)

graph_embedding_complex /tmp/cricketers/temp/fuzzy_augmented_graph_embedding_complex.tsv
graph_embedding_complex /tmp/cricketers/temp/exact_matches_graph_embedding_complex.tsv
class_count /tmp/cricketers/temp/exact_matches_class_count.tsv
class_count /tmp/cricketers/temp/fuzzy_augmented_class_count.tsv
property_count /tmp/cricketers/temp/exact_matches_property_count.tsv
property_count /tmp/cricketers/temp/fuzzy_augmented_property_count.tsv


In [30]:
# Preview at most the first 150 candidate rows; missing values are shown as empty strings.
pd.read_csv(candidates, nrows = 150).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.393840
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546e-09,23.484630
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.740191e-09,23.484630
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.000000e+00,20.582134
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.890132e-09,20.520416
...,...,...,...,...,...,...,...,...,...,...,...,...
145,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q31728772,Radio Dhoni,,fuzzy-augmented,radio station in Dhaka,0.000000e+00,17.156477
146,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q102351448,Shahnawaz Dhani,,fuzzy-augmented,Pakistani cricketer,3.539613e-09,17.115705
147,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q2627581,Omar Dhani,Omar Dhani,fuzzy-augmented,Commander of the Indonesian Air Force,3.539613e-09,17.115705
148,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q4695305,Ahmad Dhani,,fuzzy-augmented,Indonesian musician and songwriter,2.755712e-08,17.115705


### Add Class Count TF IDF Feature

In [44]:
# Add the class_count_tf_idf_score column to the candidates file.
# The feature file is addressed through temp_dir (not a hard-coded /tmp path) so
# the cell keeps working when output_path is changed in the configuration cell.
!tl compute-tf-idf \
--feature-file {temp_dir}/class_count.tsv \
--feature-name class_count \
-o class_count_tf_idf_score \
{candidates} > {feature_class_count}

#### Peek at class count TF-IDF feature file

In [45]:
# Preview the first five rows. nrows=5 stops parsing after the rows that are
# displayed instead of loading the whole file just to call .head().
pd.read_csv(feature_class_count, dtype=object, nrows=5).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,1479.121497305729
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,216.2054543689757
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,216.2054543689757
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,0.0
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,12.690497759955488


#### Get top 1 candidate for each cell

In [46]:
# Keep the best candidate per cell (-k 1), ranked by class_count_tf_idf_score.
# NOTE(review): --k-rows presumably emits each kept candidate as its own row — confirm in the tl docs.
!tl get-kg-links -c class_count_tf_idf_score -l label -k 1 --k-rows  $feature_class_count > $temp_dir/class_count_top_k.csv

In [47]:
# Display the top-ranked candidates selected by the class-count score.
pd.read_csv(f"{temp_dir}/class_count_top_k.csv").fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,1479.121497
1,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,1479.121497
2,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,1479.121497
3,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,1479.121497
4,0,12,Mohammad Shami,kings XI punjab|152|3/9/90,Mohammad Shami,Q7487531,Mohammed Shami,Mohammad Shami|Mohammed Shami Ahmed|Mohammad S...,fuzzy-augmented,Indian cricketer,3.539613e-09,28.241823,0,1479.121497
5,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q470774,MS Dhoni,Mr Cool|Mahi|Mahendra Singh Dhoni|Finisher|Cap...,fuzzy-augmented,Indian cricket player,6.350345e-09,21.508753,0,1479.121497
6,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93,Jasprit Bumrah,Q16227998,Jasprit Bumrah,Jasprit Jasbirsingh Bumrah|Jasprit Jasbir Sing...,fuzzy-augmented,cricketer,3.539613e-09,40.825333,0,1479.121497
7,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,fuzzy-augmented,Indian cricketer,3.539613e-09,41.210064,0,1479.121497
8,0,5,Rohit Sharma,mumbai indians|159|30/04/1987,Rohit Sharma,Q3520045,Rohit Sharma,Rohit Gurunath Sharma|Hitman,fuzzy-augmented,Indian cricketer,3.84048e-09,31.262672,0,1479.121497
9,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90,Bhuvneshwar Kumar,Q2003153,Bhuvneshwar Kumar,Bhuvneshwar Kumar Singh,fuzzy-augmented,Indian cricket player,3.539613e-09,43.885715,0,1479.121497


### Add Property Count TF IDF Feature

In [49]:
# Add the property_count_tf_idf_score column to the candidates file.
# The feature file is addressed through temp_dir (not a hard-coded /tmp path) so
# the cell keeps working when output_path is changed in the configuration cell.
!tl compute-tf-idf \
--feature-file {temp_dir}/property_count.tsv \
--feature-name property_count \
-o property_count_tf_idf_score \
{candidates} > {feature_property_count}

#### Peek at property count TF-IDF feature file

In [50]:
# Preview the first five rows. nrows=5 stops parsing after the rows that are
# displayed instead of loading the whole file just to call .head().
pd.read_csv(feature_property_count, dtype=object, nrows=5).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,3081.211367106489
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,0.0077586069205912
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,275.78898873927227
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,27.831375784604106
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,107.924017529278


#### Get top 1 candidate for each cell

In [51]:
# Keep the best candidate per cell (-k 1), ranked by property_count_tf_idf_score.
# NOTE(review): --k-rows presumably emits each kept candidate as its own row — confirm in the tl docs.
!tl get-kg-links -c property_count_tf_idf_score -l label -k 1 --k-rows  $feature_property_count > $temp_dir/property_count_top_k.csv

In [52]:
# Display the top-ranked candidates selected by the property-count score.
pd.read_csv(f"{temp_dir}/property_count_top_k.csv").fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,3081.211367
1,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,3061.350962
2,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,3107.739609
3,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,3094.331175
4,0,12,Mohammad Shami,kings XI punjab|152|3/9/90,Mohammad Shami,Q7487531,Mohammed Shami,Mohammad Shami|Mohammed Shami Ahmed|Mohammad S...,fuzzy-augmented,Indian cricketer,3.539613e-09,28.241823,0,2131.555699
5,0,2,Dhoni,chennai super kings|154|7/7/81,Dhoni,Q470774,MS Dhoni,Mr Cool|Mahi|Mahendra Singh Dhoni|Finisher|Cap...,fuzzy-augmented,Indian cricket player,6.350345e-09,21.508753,0,3384.487501
6,0,3,Jasprit Bumrah,mumbai indians|154|6/12/93,Jasprit Bumrah,Q16227998,Jasprit Bumrah,Jasprit Jasbirsingh Bumrah|Jasprit Jasbir Sing...,fuzzy-augmented,cricketer,3.539613e-09,40.825333,0,2768.780291
7,0,4,Ajinkya Rahane,rajasthan royals|134|6/6/88,Ajinkya Rahane,Q137669,Ajinkya Rahane,Ajinkya Madhukar Rahane|rahane,fuzzy-augmented,Indian cricketer,3.539613e-09,41.210064,0,3121.176672
8,0,5,Rohit Sharma,mumbai indians|159|30/04/1987,Rohit Sharma,Q3520045,Rohit Sharma,Rohit Gurunath Sharma|Hitman,fuzzy-augmented,Indian cricketer,3.84048e-09,31.262672,0,2912.910107
9,0,6,Bhuvneshwar Kumar,deccan chargers|154|5/2/90,Bhuvneshwar Kumar,Q2003153,Bhuvneshwar Kumar,Bhuvneshwar Kumar Singh,fuzzy-augmented,Indian cricket player,3.539613e-09,43.885715,0,2973.657419


## Use the combined property and class counts

In [70]:
# Build one feature string per qnode by concatenating its property counts and
# class counts with "|", and save it as class_property_count.tsv.
# dtype=str keeps the count strings as text so they can always be joined.
pdf = pd.read_csv(f"{temp_dir}/property_count.tsv", sep='\t', dtype=str)
cdf = pd.read_csv(f"{temp_dir}/class_count.tsv", sep='\t', dtype=str)

class_prop_file = f"{temp_dir}/class_property_count.tsv"
# Left join: every qnode that has property counts is kept; missing class counts become "".
merged = pdf.merge(cdf, on='qnode', how='left').fillna("")
# Join only the non-empty parts so an empty property_count or class_count never
# produces a leading or trailing "|" (i.e. an empty token) in the feature string.
merged['class_property_count'] = [
    "|".join(part for part in (prop, cls) if part != "")
    for prop, cls in zip(merged['property_count'], merged['class_count'])
]
df = merged.drop(columns=['class_count', 'property_count'])
df.to_csv(class_prop_file, sep='\t', index=False)
df

Unnamed: 0,qnode,class_property_count
0,Q213854,P106:6339031|P140:361513|P1532:159268|P166:515...
1,Q1711834,P1889:440686|P31:41379394|Q104624828:6675312|Q...
2,Q7699668,P1705:608753|P1889:440686|P31:41379394|Q101352...
3,Q142613,P106:6339031|P1532:159268|P166:515832|P18:3544...
4,Q3522062,P106:6339031|P1532:159268|P166:515832|P18:3544...
...,...,...
1252,Q16045629,P106:6339031|P19:2698945|P214:2562940|P21:6913...
1253,Q4121790,P106:6339031|P18:3544200|P19:2698945|P2020:491...
1254,Q13512643,P106:6339031|P1412:909907|P18:3544200|P19:2698...
1255,Q21064029,P102:392278|P106:6339031|P19:2698945|P21:69137...


In [72]:
# Add the class_property_count_tf_idf_score column to the candidates file, using
# the combined class/property feature file.
# The feature file is addressed through temp_dir (not a hard-coded /tmp path) so
# the cell keeps working when output_path is changed in the configuration cell.
!tl compute-tf-idf \
--feature-file {temp_dir}/class_property_count.tsv \
--feature-name class_property_count \
-o class_property_count_tf_idf_score \
{candidates} > {feature_class_property_count}

#### Peek at class property count TF-IDF feature file

In [73]:
# Preview the first five rows. nrows=5 stops parsing after the rows that are
# displayed instead of loading the whole file just to call .head().
pd.read_csv(feature_class_property_count, dtype=object, nrows=5).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031232217997e-09,36.39384,0,2508.491890173929
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q102354285,Marie Virat,,fuzzy-augmented,Ph. D. 2009,5.918546005357847e-09,23.48463,0,77.29470334383954
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q16027751,Bernard Virat,,fuzzy-augmented,French biologist (1921-2003),3.7401912005599e-09,23.48463,0,254.4889035932415
3,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7907059,VIRAT,,fuzzy-augmented,,0.0,20.582134,0,17.882139301600645
4,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q2978459,Virata,Virat,fuzzy-augmented,character from the epic Mahabharata,6.8901323967569805e-09,20.520416,0,73.8796779096061


#### Get top 3 candidates for each cell

In [76]:
# Keep the three best candidates per cell (-k 3), ranked by class_property_count_tf_idf_score.
# NOTE(review): --k-rows presumably emits each kept candidate as its own row — confirm in the tl docs.
!tl get-kg-links -c class_property_count_tf_idf_score -l label -k 3 --k-rows  $feature_class_property_count > $temp_dir/class_property_count_top_k.csv

In [77]:
# Display the top-ranked candidates selected by the combined class/property score.
pd.read_csv(f"{temp_dir}/class_property_count_top_k.csv").fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,singleton,class_property_count_tf_idf_score
0,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q213854,Virat Kohli,Cheeku,fuzzy-augmented,Indian cricket player,3.983031e-09,36.39384,0,2508.49189
1,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q19899153,Virat Singh,,fuzzy-augmented,Indian cricketer,3.539613e-09,19.010628,0,1228.066355
2,0,0,Virat Kohli,royal challengers bangalore|152|5/11/88,Virat Kohli,Q7686953,Taruwar Kohli,Taruwar Sushil Kohli,fuzzy-augmented,Indian cricketer,3.539613e-09,17.400097,0,1116.052487
3,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q9488,Sachin Tendulkar,Sachin Ramesh Tendulkar|Master Blaster,fuzzy-augmented,Indian former cricketer,1.196003e-08,28.334663,0,2495.731237
4,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q22327439,Arjun Tendulkar,,fuzzy-augmented,cricketer,4.609075e-09,20.530342,0,1035.470736
5,0,1,Tendulkar,mumbai indians|137|24/04/1973,Tendulkar,Q55744,Vijay Tendulkar,Vijay Dhondopant Tendulkar,fuzzy-augmented,Indian writer,1.156015e-08,20.728312,0,328.256882
6,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q142613,Cheteshwar Pujara,Cheteshwar Arvind Pujara,fuzzy-augmented,Indian cricket player,3.93661e-09,40.74549,0,2525.536743
7,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q16225224,Arvind Pujara,,fuzzy-augmented,cricketer,3.736444e-09,21.39463,0,955.226463
8,0,10,Cheteshwar Pujara,deccan chargers|157|25/01/1988,Cheteshwar Pujara,Q5833940,Enrique Wong,Enrique Wong Pujada,fuzzy-augmented,Peruvian politician,3.539613e-09,15.250788,0,537.159524
9,0,11,Ishant Sharma,delhi capitals|168|2/9/88,Ishant Sharma,Q3522062,Ishant Sharma,,fuzzy-augmented,Indian cricket player.,3.539613e-09,30.923111,0,2516.921593
