In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil
import pickle

pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [7]:
# Pipeline configuration: the table to link, the column whose cells are linked,
# where results are written, and the search index queried for candidates.
# NOTE(review): both paths are absolute and machine-specific; adjust before running elsewhere.
input_file_path = '/Users/amandeep/Downloads/mathematician.csv'  # CSV used by the preview, canonicalize and join cells
wikify_column_name = "Mathematician"  # passed to `tl canonicalize -c`
output_path = '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/output'  # parent of the temp dir and of the final output file
es_index = 'wikidatadwd-augmented-02'  # passed as `--index` in the candidate-generation cell (presumably an Elasticsearch index, judging by the `es_` prefix)
es_url = 'http://ckg06:9200'  # passed as `--url` in the candidate-generation cell

In [3]:
# Scratch directory for all intermediate files; it is deleted by the last cell of the notebook.
temp_dir = f'{output_path}/temp'

In [4]:
# Create the output and scratch directories (no error if they already exist).
# os.makedirs replaces `!mkdir -p $var`: the unquoted shell interpolation broke on
# paths containing spaces or shell metacharacters and required a POSIX shell.
os.makedirs(output_path, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

In [5]:
# Intermediate files: each pipeline stage writes one of these and the next stage reads it.
canonical = f'{temp_dir}/canonical.csv'  # output of `tl canonicalize`
candidates = f"{temp_dir}/candidates.csv"  # output of the candidate-generation pipeline
feature_votes = f"{temp_dir}/feature_votes.csv"  # output of the feature-voting pipeline
score_file = f"{temp_dir}/scores.csv"  # output of `tl score-using-embedding`
model_name = 'rf_tuned_ranking.pkl'  # pickled ranking model; relative path, so it is resolved against the working directory

embedding_file = f'{temp_dir}/graph_embedding_complex.tsv'  # merged embeddings, passed as --embedding-file
aux_field = 'graph_embedding_complex'  # passed as --auxiliary-fields; split on ',' when the auxiliary files are merged
final_score = f'{temp_dir}/final_score.csv'  # features plus the model_prediction column
top_k_file = f"{temp_dir}/topk.csv"  # output of `tl get-kg-links`
final_output = f"{output_path}/linked-mathematician.csv"  # output of `tl join`; the only file written outside temp_dir

## Peek at the input file

In [6]:
# Preview only the first 10 rows, like the other preview cells, instead of loading and
# rendering the whole input table. Missing values are shown as empty strings.
pd.read_csv(input_file_path, nrows=10).fillna("")

Unnamed: 0,Mathematician,Year of PhD,Granting Institution,Supervisor,Thesis
0,Keith F. Taylor,1975,University of Alberta,Anthony To-Ming Lau,The Structure of the Regular Representation of...
1,Anthony To-Ming Lau,1969,University of British Columbia,Edmond E. Granirer,Topological Semigroups
2,Edmond E. Granirer,1962,Hebrew University,Harry Kesten,On Amenable Semigroups with a Finite Dimension...
3,Harry Kesten,1958,Cornell University,Mark Kac,Symmetric Random Walks on Groups
4,Mark Kac,1937,University of Lw?w,Hugo Steinhaus,unknown
5,Hugo Steinhaus,1911,Georg-August-Universit?t G?ttingen,David Hilbert,Neue Anwendungen des Dirichlet'schen Prinzips
6,David Hilbert,1885,Universit?t K?nigsberg,C. L. Ferdinand Lindemann,?ber invariante Eigenschaften specieller bin?r...
7,C. L. Ferdinand Lindemann,1873,Friedrich-Alexander-Universit?t Erlangen-N?rnberg,C. Felix Klein,?ber unendlich kleine Bewegungen und ?ber Kraf...
8,C. Felix Klein,1868,Rheinische Friedrich-Wilhelms-Universit?t Bonn,Julius Pl?cker and Rudolf Lipschitz,?ber die Transformation der allgemeinen Gleich...
9,Julius Pl?cker,1823,Philipps - Universit?t Marburg,Christian Gerling,Generalem analyeseos applicationem ad ea quae ...


## Canonicalize

In [8]:
# Run `tl canonicalize` on the input CSV for the column named by wikify_column_name
# (with --add-context) and redirect its output to the `canonical` file.
# IPython substitutes both $name and {name} with the Python variable's value before the shell runs.
!tl canonicalize \
-c "$wikify_column_name" \
--add-context \
{input_file_path} > {canonical}

In [9]:
# Preview the first 10 rows of the canonicalized file.
pd.read_csv(canonical, nrows = 10)

Unnamed: 0,column,row,label,context
0,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...
1,0,1,Anthony To-Ming Lau,1969|University of British Columbia|Edmond E. ...
2,0,2,Edmond E. Granirer,1962|Hebrew University|Harry Kesten|On Amenabl...
3,0,3,Harry Kesten,1958|Cornell University|Mark Kac|Symmetric Ran...
4,0,4,Mark Kac,1937|University of Lw?w|Hugo Steinhaus|unknown
5,0,5,Hugo Steinhaus,1911|Georg-August-Universit?t G?ttingen|David ...
6,0,6,David Hilbert,1885|Universit?t K?nigsberg|C. L. Ferdinand Li...
7,0,7,C. L. Ferdinand Lindemann,1873|Friedrich-Alexander-Universit?t Erlangen-...
8,0,8,C. Felix Klein,1868|Rheinische Friedrich-Wilhelms-Universit?t...
9,0,9,Julius Pl?cker,1823|Philipps - Universit?t Marburg|Christian ...


## Candidate Generation

In [10]:
%%time
# Candidate generation as a single `tl` pipeline whose stages are separated by `/`:
# clean `label` into `label_clean`, then run get-fuzzy-augmented-matches and
# get-exact-matches on `label_clean` against the index at es_url / es_index.
# The combined output is redirected to the `candidates` file.
# Both match stages receive --auxiliary-fields / --auxiliary-folder; the following cell
# merges per-field TSV files from temp_dir, presumably written by these stages (confirm in the tl docs).
!tl clean -c label -o label_clean {canonical} / \
--url $es_url --index $es_index \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder $temp_dir / \
--url $es_url --index $es_index \
get-exact-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder {temp_dir} > {candidates}

CPU times: user 1.05 s, sys: 347 ms, total: 1.4 s
Wall time: 46.8 s


In [11]:
# Merge the auxiliary TSV files found in temp_dir into one de-duplicated file per
# auxiliary field, with the field's column renamed to 'embedding'.
for field in aux_field.split(','):
    # Use the loop variable, not the whole comma-separated aux_field string, so that
    # every field is globbed, renamed and written under its own name.
    # The rename is applied per file: on a re-run the previously merged file also
    # matches the glob and already carries the 'embedding' column, so normalizing
    # before concatenation avoids ending up with two 'embedding' columns.
    aux_list = [
        pd.read_csv(f, sep='\t', dtype=object).rename(columns={field: 'embedding'})
        for f in glob.glob(f'{temp_dir}/*{field}.tsv')
    ]
    if not aux_list:
        # pd.concat([]) would raise a ValueError anyway; keep the type, improve the message.
        raise ValueError(f'no auxiliary TSV files matching *{field}.tsv found in {temp_dir}')
    aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
    aux_df.to_csv(f'{temp_dir}/{field}.tsv', sep='\t', index=False)

In [12]:
# Preview the first 10 candidate rows; missing values are shown as empty strings.
pd.read_csv(candidates, nrows = 10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q98543829,F Keith Taylor,,fuzzy-augmented,UK election candidate,3.539613e-09,23.37343
1,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q22087035,Keith Taylor,,fuzzy-augmented,British political scientist,3.539613e-09,23.102688
2,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385100,Keith Taylor,,fuzzy-augmented,Canadian poet,3.539613e-09,22.614727
3,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5079714,Charles Keith Taylor,,fuzzy-augmented,Canadian politician,3.539613e-09,22.286064
4,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q102164684,Keith Frederick Taylor,,fuzzy-augmented,Ph.D. University of Alberta 1975,1.120288e-08,22.128487
5,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q1738098,Keith Taylor,Keith Richard Taylor,fuzzy-augmented,British politician (born 1953),3.712462e-09,21.732487
6,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385099,Keith Taylor,,fuzzy-augmented,American football player,3.539613e-09,21.732487
7,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385101,Keith Taylor,Dennis More|Cadmus Evans,fuzzy-augmented,Australian science fiction and fantasy writer,3.539613e-09,20.802166
8,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q53567269,Keith A. Taylor,,fuzzy-augmented,,3.539613e-09,19.758533
9,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5541266,George Keith Taylor,,fuzzy-augmented,United States federal judge,3.539613e-09,19.758533


## Feature Voting

In [13]:
%%time
# Feature pipeline over the candidates file: smallest-qnode-number, then three
# string-similarity stages writing the columns monge_elkan, des_cont_jaccard
# (run with `-c kg_descriptions context`) and jaro_winkler, then feature-voting over
# the four listed columns. The output is redirected to the `feature_votes` file.
# jaro_winkler is not in the feature-voting column list; it is used later as a model feature.
!tl smallest-qnode-number {candidates} \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
    / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
    / string-similarity -i --method jaro_winkler -o jaro_winkler \
    / feature-voting -c "pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard" > {feature_votes}

CPU times: user 63.7 ms, sys: 29.8 ms, total: 93.4 ms
Wall time: 4.53 s


In [14]:
# Preview the first 10 rows of the feature-voting output; missing values are shown as empty strings.
pd.read_csv(feature_votes, nrows = 10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes
0,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q98543829,F Keith Taylor,,fuzzy-augmented,UK election candidate,3.539613e-09,23.37343,0,0.95,0.0,0.906349,1
1,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q22087035,Keith Taylor,,fuzzy-augmented,British political scientist,3.539613e-09,23.102688,0,0.833333,0.0,0.96,0
2,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385100,Keith Taylor,,fuzzy-augmented,Canadian poet,3.539613e-09,22.614727,0,0.833333,0.0,0.96,0
3,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5079714,Charles Keith Taylor,,fuzzy-augmented,Canadian politician,3.539613e-09,22.286064,0,0.755291,0.0,0.605556,0
4,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q102164684,Keith Frederick Taylor,,fuzzy-augmented,Ph.D. University of Alberta 1975,1.120288e-08,22.128487,0,0.845679,0.6,0.871082,1
5,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q1738098,Keith Taylor,Keith Richard Taylor,fuzzy-augmented,British politician (born 1953),3.712462e-09,21.732487,0,0.833333,0.0,0.96,0
6,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385099,Keith Taylor,,fuzzy-augmented,American football player,3.539613e-09,21.732487,0,0.833333,0.0,0.96,0
7,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385101,Keith Taylor,Dennis More|Cadmus Evans,fuzzy-augmented,Australian science fiction and fantasy writer,3.539613e-09,20.802166,0,0.833333,0.0,0.96,0
8,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q53567269,Keith A. Taylor,,fuzzy-augmented,,3.539613e-09,19.758533,0,0.888889,0.0,0.944762,0
9,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5541266,George Keith Taylor,,fuzzy-augmented,United States federal judge,3.539613e-09,19.758533,0,0.759259,0.0,0.773369,0


## Compute Embedding Score using Column Vector Strategy

In [15]:
# Run `tl score-using-embedding` on the feature-voting output, reading embeddings from
# embedding_file with the centroid-of-singletons column-vector strategy. The score is
# written to the `graph-embedding-score` column and the result is redirected to score_file.
!tl score-using-embedding $feature_votes \
--column-vector-strategy centroid-of-singletons \
-o graph-embedding-score --embedding-file $embedding_file \
> $score_file

Qnodes to lookup: 2611
Qnodes from file: 2573


In [16]:
# Load the scored candidates (missing values replaced by empty strings) and display
# them ordered by vote count, highest first.
df = pd.read_csv(score_file).fillna("")
df.sort_values(by=['votes'], ascending=False)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score
1280,0,11,Carl Friedrich Gauss,1799|Universit?t Helmstedt|Johann Friedrich Pf...,Carl Friedrich Gauss,Q6722,Carl Friedrich Gauss,Carl Friedrich Gauß|Gauss|Karl Gauss|Johann Ca...,exact-match,German mathematician and physicist,1.180936e-06,20.508780,1,1.000000,0.0,1.000000,3,0.731958
2205,0,21,Leonhard Euler,1726|Universit?t Basel|Johann Bernoulli|unknown,Leonhard Euler,Q7604,Leonhard Euler,"L. Euler|Euler, Leonhard",fuzzy-augmented,Swiss mathematician,3.595727e-06,29.803192,1,1.000000,0.0,1.000000,3,0.810736
640,0,6,David Hilbert,1885|Universit?t K?nigsberg|C. L. Ferdinand Li...,David Hilbert,Q41585,David Hilbert,,fuzzy-augmented,German mathematician,1.301386e-06,24.905128,1,1.000000,0.0,1.000000,3,0.815841
769,0,6,David Hilbert,1885|Universit?t K?nigsberg|C. L. Ferdinand Li...,David Hilbert,Q41585,David Hilbert,,exact-match,German mathematician,1.301386e-06,21.138376,1,1.000000,0.0,1.000000,3,0.815841
1892,0,18,Simeon Denis Poisson,unknown|unknown|Joseph Lagrange|unknown,Simeon Denis Poisson,Q190772,Siméon Denis Poisson,Simeon Denis Poisson,fuzzy-augmented,"French mathematician, mechanician and physicis...",6.675443e-08,35.616287,1,0.974074,0.0,0.952105,3,0.804483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,9,Julius Pl?cker,1823|Philipps - Universit?t Marburg|Christian ...,Julius Pl?cker,Q85908031,Template:Rail-interchange/doc/PL,,fuzzy-augmented,,0.000000e+00,14.887209,0,0.523934,0.0,0.371131,0,0.000000
996,0,9,Julius Pl?cker,1823|Philipps - Universit?t Marburg|Christian ...,Julius Pl?cker,Q12371561,Oskar Cher,,fuzzy-augmented,,3.539613e-09,14.676394,0,0.280952,0.0,0.619048,0,0.562300
997,0,9,Julius Pl?cker,1823|Philipps - Universit?t Marburg|Christian ...,Julius Pl?cker,Q59536428,Sébastien Cher,,fuzzy-augmented,,3.539613e-09,14.676394,0,0.296958,0.0,0.507937,0,0.527238
998,0,9,Julius Pl?cker,1823|Philipps - Universit?t Marburg|Christian ...,Julius Pl?cker,Q95263526,Jean-Claude Caer,,fuzzy-augmented,,3.539613e-09,14.676394,0,0.312410,0.0,0.550595,0,0.589779



## Generate Additional Features required for Model Prediction

In [17]:
## TODO: Need to add these features as cli commands in Table Linker

def create_singleton_feature(df):
    """Flag candidates that belong to a cell with exactly one exact-match candidate.

    A cell is identified by its ('column', 'row') pair. Every candidate of a cell
    (whatever its own 'method') gets singleton=1 when that cell has exactly one
    'exact-match' row with a non-null 'kg_id'; all other candidates get 0.

    Args:
        df: candidate table with 'column', 'row', 'method' and 'kg_id' columns.

    Returns:
        The same DataFrame object, with the integer 'singleton' column added
        in place.
    """
    exact_match_counts = (
        df[df['method'] == 'exact-match']
        .groupby(['column', 'row'])['kg_id']
        .count()
    )
    # A set gives constant-time membership tests; the previous list lookup inside
    # iterrows() made the function quadratic in the number of candidates.
    singleton_cells = set(exact_match_counts[exact_match_counts == 1].index)
    df['singleton'] = [
        int(cell in singleton_cells)
        for cell in zip(df['column'], df['row'])
    ]
    return df
        
def generate_reciprocal_rank(df):
    """Add a 'reciprocal_rank' column computed per cell from 'graph-embedding-score'.

    Candidates are grouped by ('row', 'column'); within each group the value is
    1 / rank of 'graph-embedding-score', using pandas' default ranking
    (ascending, ties averaged).

    NOTE(review): because the rank is ascending, the highest-scoring candidate of a
    cell receives the smallest reciprocal rank. This is kept unchanged because the
    value is fed to the pickled model as a feature -- presumably the model was
    trained on it in this form; confirm before changing.

    Args:
        df: candidate table with 'row', 'column' and 'graph-embedding-score' columns.

    Returns:
        A new DataFrame with the rows ordered group by group (groups sorted by
        row, then column), a fresh 0..n-1 index and the extra 'reciprocal_rank'
        column. The input frame is not modified. For input without any group the
        result has zero rows but still carries all columns.
    """
    ranked_groups = []
    for _, group in df.groupby(['row', 'column']):
        # assign() returns a new frame, so nothing is written into a groupby slice
        # (the previous in-place assignment triggered SettingWithCopyWarning).
        ranked_groups.append(
            group.assign(reciprocal_rank=1 / group['graph-embedding-score'].rank())
        )
    if not ranked_groups:
        # pd.concat cannot take an empty list; keep the schema so later column
        # selections still work.
        return df.iloc[0:0].assign(reciprocal_rank=pd.Series(dtype='float64'))
    return pd.concat(ranked_groups, ignore_index=True)

In [19]:
# Assemble the model features on top of the scored candidates: the singleton flag,
# the character and whitespace-token counts of the KG label (0 when the label is
# missing), and the per-cell reciprocal rank. The last line previews the result.
features_df = create_singleton_feature(pd.read_csv(score_file))
features_df = features_df.assign(
    num_char=features_df['kg_labels'].apply(
        lambda label: 0 if pd.isna(label) else len(label)
    ),
    num_tokens=features_df['kg_labels'].apply(
        lambda label: 0 if pd.isna(label) else len(label.split())
    ),
)
features_df = generate_reciprocal_rank(features_df)
features_df.head().fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score,singleton,num_char,num_tokens,reciprocal_rank
0,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q98543829,F Keith Taylor,,fuzzy-augmented,UK election candidate,...,0,0.95,0.0,0.906349,1,0.497217,0,14,3,0.034483
1,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q22087035,Keith Taylor,,fuzzy-augmented,British political scientist,...,0,0.833333,0.0,0.96,0,0.530489,0,12,2,0.021277
2,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385100,Keith Taylor,,fuzzy-augmented,Canadian poet,...,0,0.833333,0.0,0.96,0,0.622374,0,12,2,0.010638
3,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5079714,Charles Keith Taylor,,fuzzy-augmented,Canadian politician,...,0,0.755291,0.0,0.605556,0,0.622501,0,20,3,0.010526
4,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q102164684,Keith Frederick Taylor,,fuzzy-augmented,Ph.D. University of Alberta 1975,...,0,0.845679,0.6,0.871082,1,0.525196,0,22,3,0.022222


## Final Ranking Score Predicted by Model

In [20]:
# Columns fed to the ranking model, in the order given here.
features = ['pagerank','retrieval_score','monge_elkan',
            'des_cont_jaccard','jaro_winkler','graph-embedding-score',
            'singleton','num_char','num_tokens','reciprocal_rank']

# Load the pickled model inside a context manager so the file handle is closed
# (the previous bare open() was never closed).
# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
with open(model_name, 'rb') as model_file:
    model = pickle.load(model_file)

# Predict one ranking score per candidate and persist features plus prediction.
data = features_df[features]
predicted_score = model.predict(data)
features_df['model_prediction'] = predicted_score
features_df.to_csv(final_score,index=False)



In [21]:
# Preview the first 10 rows of the file with the model predictions; missing values are shown as empty strings.
pd.read_csv(final_score, nrows=10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score,singleton,num_char,num_tokens,reciprocal_rank,model_prediction
0,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q98543829,F Keith Taylor,,fuzzy-augmented,UK election candidate,...,0.95,0.0,0.906349,1,0.497217,0,14,3,0.034483,-0.887403
1,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q22087035,Keith Taylor,,fuzzy-augmented,British political scientist,...,0.833333,0.0,0.96,0,0.530489,0,12,2,0.021277,-0.462113
2,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385100,Keith Taylor,,fuzzy-augmented,Canadian poet,...,0.833333,0.0,0.96,0,0.622374,0,12,2,0.010638,-0.799375
3,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5079714,Charles Keith Taylor,,fuzzy-augmented,Canadian politician,...,0.755291,0.0,0.605556,0,0.622501,0,20,3,0.010526,-0.988095
4,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q102164684,Keith Frederick Taylor,,fuzzy-augmented,Ph.D. University of Alberta 1975,...,0.845679,0.6,0.871082,1,0.525196,0,22,3,0.022222,-0.846921
5,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q1738098,Keith Taylor,Keith Richard Taylor,fuzzy-augmented,British politician (born 1953),...,0.833333,0.0,0.96,0,0.558978,0,12,2,0.015385,-0.4642
6,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385099,Keith Taylor,,fuzzy-augmented,American football player,...,0.833333,0.0,0.96,0,0.571414,0,12,2,0.014286,-0.429756
7,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q6385101,Keith Taylor,Dennis More|Cadmus Evans,fuzzy-augmented,Australian science fiction and fantasy writer,...,0.833333,0.0,0.96,0,0.47518,0,12,2,0.071429,-0.80769
8,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q53567269,Keith A. Taylor,,fuzzy-augmented,,...,0.888889,0.0,0.944762,0,0.535436,0,15,3,0.020408,-0.781714
9,0,0,Keith F. Taylor,1975|University of Alberta|Anthony To-Ming Lau...,Keith F. Taylor,Q5541266,George Keith Taylor,,fuzzy-augmented,United States federal judge,...,0.759259,0.0,0.773369,0,0.651823,0,19,3,0.010101,-0.99


## Get Top 3 KG Links

In [22]:
# Run `tl get-kg-links` on the scored file with `-c model_prediction`, `-l label` and
# `-k 3`, redirecting the output to top_k_file (the preview below shows three
# pipe-separated links per cell).
!tl get-kg-links -c model_prediction -l label -k 3 $final_score > $top_k_file

In [23]:
# Preview the first 10 rows of the top-k links file.
pd.read_csv(top_k_file, nrows = 10)

Unnamed: 0,column,row,label,kg_id,kg_label,kg_description,ranking_score
0,0,0,Keith F. Taylor,Q6385099|Q60614647|Q22087035,Keith Taylor|Keith Taylor|Keith Taylor,American football player|astronomer|British po...,-0.43|-0.45|-0.46
1,0,1,Anthony To-Ming Lau,Q5033813|Q89783524|Q24835859,Canti Lau|Anthony Lau|Lau Ka Ming,Hong Kong actor|mathematician|association foot...,-0.24|-0.36|-0.5
2,0,10,Christian Ludwig Gerling,Q72995|Q1080895|Q1579806,Christian Ludwig Gerling|Christian Ludwig Gerl...,German astronomer|German theologian|German ent...,0.06|-0.31|-0.39
3,0,11,Carl Friedrich Gauss,Q6722|Q5669038|Q20986428,Carl Friedrich Gauss|Harry Gauss|Walter Gauss,German mathematician and physicist|German foot...,0.39|-0.39|-0.43
4,0,12,Johann Friedrich Pfaff,Q77361|Q94917687|Q105314,Johann Friedrich Pfaff|Johann Friedrich Pfaff|...,German mathematician||Agriculturalist,0.34|-0.1|-0.32
5,0,13,Abraham Gotthelf Kaestner,Q61813|Q4730637|Q24231370,Abraham Gotthelf Kästner|Allan Gotthelf|Kaestn...,German mathematician|American philosopher|ency...,0.73|0.13|-0.21
6,0,14,Christian August Hausen,Q1078821|Q91657|Q4820760,Christian August Hausen|Christian August Hause...,German historian and theologian|German mathema...,0.15|-0.32|-0.45
7,0,15,Christian Andreas Siber,Q102233334|Q1078783|Q350894,Christian Andreas Siber|Christian Andreas Sibe...,Dr. theol. Martin-Luther-Universität Halle-Wit...,0.79|0.09|-0.26
8,0,16,Rudolf Lipschitz,Q77322|Q1782725|Q105466753,Rudolf Lipschitz|Konstantin Lifšits|Adolf Lips...,German mathematician|Russian pianist and unive...,0.92|-0.57|-0.68
9,0,17,Gustav Peter Lejeune Dirichlet,Q29193|Q2070240|Q95404,Johann Peter Gustav Lejeune Dirichlet|Florian ...,German mathematician|French association footba...,0.67|-0.08|-0.31


## Join to Produce Final Result

In [23]:
# Run `tl join` with the original input file (-f), --csv and `-c ranking_score` on the
# top-k links file, redirecting the joined table to final_output.
!tl join -f $input_file_path --csv -c ranking_score $top_k_file  > $final_output

In [24]:
# Display the joined result; missing values are shown as empty strings.
# NOTE(review): the output saved with this cell lists columns such as Name/Abbreviation/Tissue,
# which do not match the mathematician input -- it looks stale; re-run the notebook to refresh it.
pd.read_csv(final_output).fillna("")

Unnamed: 0,Name,Abbreviation,Tissue,Cells,Receptor,Target_Tissue,Effect,kg_id,kg_label,score
0,Amylin (or Islet Amyloid Polypeptide),IAPP,pancreas,pancreatic β-cells,amylin receptor,,"slowing down gastric emptying, inhibition of d...",Q9618|Q15762014|Q11939959,pancreas|Pancreas|Pancreas,0.99|0.24|0.23
1,Anti-Müllerian hormone (or Müllerian inhibitin...,AMH,testes,Sertoli cell,AMHR2,,Inhibit release of prolactin and TRH from ante...,Q27318|Q7247798|Q907375,test|product testing|unit testing,-0.55|-0.64|-0.64
2,Adiponectin,Acrp30,adipose tissue,,adiponectin receptors,,,Q193583|Q40397|Q12898553,adipose tissue|tissue|adipose tissue macrophages,0.95|0.16|-0.09
3,Adrenocorticotropic hormone (or corticotropin),ACTH,anterior pituitary,corticotrope,ACTH receptor → cAMP,,synthesis of corticosteroids ( glucocorticoids...,,,
4,Angiotensinogen and angiotensin,AGT,liver,,angiotensin receptor → IP 3,,vasoconstriction release of aldosterone from a...,,,
5,"Antidiuretic hormone (or vasopressin, arginine...",ADH,posterior pituitary,Parvocellular neurosecretory neurons in hypoth...,"AVPRs , VACM-1",,retention of water in kidneys \nmoderate vaso...,,,
6,Atrial-natriuretic peptide (or atriopeptin),ANP,heart,,ANP receptor → cGMP,,,Q1072|Q5223785|Q3129019,heart|Heart|Heart,0.7|0.57|0.57
7,Brain natriuretic peptide,BNP,heart,Cardiac myocytes,NPR,,(To a minor degree than ANP) reduce blood pres...,,,
8,Calcitonin,CT,thyroid gland,parafollicular cell,CT receptor → cAMP,,"Construct bone, reduce blood Ca 2+",Q16399|Q66564199|Q6673122,thyroid gland|parafollicular cell of thyroid g...,0.88|-0.4|-0.5
9,Cholecystokinin,CCK,duodenum,,CCK receptor,,Release of digestive enzymes from pancreas \n...,,,


## Clean up temporary files

In [25]:
# Remove the scratch directory. The guard makes the cell safe to re-run: without it,
# shutil.rmtree raises FileNotFoundError once the directory has already been deleted.
if os.path.isdir(temp_dir):
    shutil.rmtree(temp_dir)