In [1]:
import pandas as pd
import os
import glob
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import sklearn.metrics
from collections import defaultdict
import shutil
import pickle

pd.reset_option('all')

As the xlwt package is no longer maintained, the xlwt engine will be removed in a future version of pandas. This is the only engine in pandas that supports writing in the xls format. Install openpyxl and write to an xlsx file instead.

: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.



In [2]:
# Input CSV to wikify and the name of the column whose cells will be linked.
# NOTE(review): these are absolute paths on one developer's machine; point
# them at your own files before running.
input_file_path = '/Users/amandeep/Downloads/List_of_human_hormones_dbpv.csv'
wikify_column_name = "Tissue"
# Directory that receives the final linked file and the temp working folder.
output_path = '/Users/amandeep/Github/wikidata-wikifier/wikifier/sample_files/output'
# Index name and server URL passed to the `tl` candidate-generation commands
# through their --index / --url options.
es_index = 'wikidatadwd-augmented-02'
es_url = 'http://ckg06:9200'

In [3]:
# Working directory for intermediate pipeline files; it is deleted by the
# clean-up cell at the end of the notebook.
temp_dir = f'{output_path}/temp'

In [27]:
# Create the output and temp directories if they do not exist yet.
# os.makedirs is used instead of `!mkdir -p $var` so that paths containing
# spaces or shell metacharacters are handled correctly on any platform.
os.makedirs(output_path, exist_ok=True)
os.makedirs(temp_dir, exist_ok=True)

In [5]:
# Intermediate files, all kept under temp_dir and listed in pipeline order.
canonical = f'{temp_dir}/canonical.csv'          # output of `tl canonicalize`
candidates = f"{temp_dir}/candidates.csv"        # output of candidate generation
feature_votes = f"{temp_dir}/feature_votes.csv"  # output of the feature-voting pipeline
score_file = f"{temp_dir}/scores.csv"            # output of `tl score-using-embedding`
# Pickled ranking model, resolved relative to the notebook's working directory.
model_name = 'rf_tuned_ranking.pkl'

# Combined embedding file read by `tl score-using-embedding`; its name is
# built from aux_field, the auxiliary field requested during candidate generation.
embedding_file = f'{temp_dir}/graph_embedding_complex.tsv'
aux_field = 'graph_embedding_complex'
final_score = f'{temp_dir}/final_score.csv'      # features plus model_prediction
top_k_file = f"{temp_dir}/topk-hormones.csv" 
# Final result: the input table joined with the selected links.
final_output = f"{output_path}/linked-hormones.csv" 

## Peek at the input file

In [6]:
pd.read_csv(input_file_path).fillna("")

Unnamed: 0,Name,Abbreviation,Tissue,Cells,Receptor,Target_Tissue,Effect
0,Amylin (or Islet Amyloid Polypeptide),IAPP,pancreas,pancreatic β-cells,amylin receptor,,"slowing down gastric emptying, inhibition of d..."
1,Anti-Müllerian hormone (or Müllerian inhibitin...,AMH,testes,Sertoli cell,AMHR2,,Inhibit release of prolactin and TRH from ante...
2,Adiponectin,Acrp30,adipose tissue,,adiponectin receptors,,
3,Adrenocorticotropic hormone (or corticotropin),ACTH,anterior pituitary,corticotrope,ACTH receptor → cAMP,,synthesis of corticosteroids ( glucocorticoids...
4,Angiotensinogen and angiotensin,AGT,liver,,angiotensin receptor → IP 3,,vasoconstriction release of aldosterone from a...
5,"Antidiuretic hormone (or vasopressin, arginine...",ADH,posterior pituitary,Parvocellular neurosecretory neurons in hypoth...,"AVPRs , VACM-1",,retention of water in kidneys \nmoderate vaso...
6,Atrial-natriuretic peptide (or atriopeptin),ANP,heart,,ANP receptor → cGMP,,
7,Brain natriuretic peptide,BNP,heart,Cardiac myocytes,NPR,,(To a minor degree than ANP) reduce blood pres...
8,Calcitonin,CT,thyroid gland,parafollicular cell,CT receptor → cAMP,,"Construct bone, reduce blood Ca 2+"
9,Cholecystokinin,CCK,duodenum,,CCK receptor,,Release of digestive enzymes from pancreas \n...


## Canonicalize

In [28]:
# Run `tl canonicalize` on the column named by wikify_column_name and write
# the result to the `canonical` file. The preview below shows one row per
# cell (column, row, label) plus a `context` column, which presumably comes
# from --add-context (confirm against the table-linker docs).
!tl canonicalize \
-c "$wikify_column_name" \
--add-context \
{input_file_path} > {canonical}

In [29]:
pd.read_csv(canonical, nrows = 10)

Unnamed: 0,column,row,label,context
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...
1,2,1,testes,Anti-Müllerian hormone (or Müllerian inhibitin...
2,2,2,adipose tissue,Adiponectin|Acrp30|adiponectin receptors
3,2,3,anterior pituitary,Adrenocorticotropic hormone (or corticotropin)...
4,2,4,liver,Angiotensinogen and angiotensin|AGT|angiotensi...
5,2,5,posterior pituitary,"Antidiuretic hormone (or vasopressin, arginine..."
6,2,6,heart,Atrial-natriuretic peptide (or atriopeptin)|AN...
7,2,7,heart,Brain natriuretic peptide|BNP|Cardiac myocytes...
8,2,8,thyroid gland,Calcitonin|CT|parafollicular cell|CT receptor ...
9,2,9,duodenum,Cholecystokinin|CCK|CCK receptor|Release of di...


## Candidate Generation

In [30]:
%%time
# Three `tl` commands chained with " / ":
#   1. clean: reads `label` and writes `label_clean`.
#   2. get-fuzzy-augmented-matches on `label_clean`, using es_url / es_index.
#   3. get-exact-matches on `label_clean`, using the same server and index.
# Both match commands request the aux_field auxiliary field and write it into
# temp_dir; the next cell merges those files. Candidates are written to `candidates`.
!tl clean -c label -o label_clean {canonical} / \
--url $es_url --index $es_index \
get-fuzzy-augmented-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder $temp_dir / \
--url $es_url --index $es_index \
get-exact-matches -c label_clean \
--auxiliary-fields {aux_field} \
--auxiliary-folder {temp_dir} > {candidates}

CPU times: user 1.26 s, sys: 406 ms, total: 1.66 s
Wall time: 1min 2s


In [31]:
# Merge the auxiliary files written during candidate generation into one
# de-duplicated TSV per auxiliary field, with the value column named 'embedding'.
for field in aux_field.split(','):
    # Use the individual `field`, not the whole comma-joined `aux_field`, so the
    # loop also works when several auxiliary fields are requested.
    # The rename happens per file because the combined file written below also
    # matches this glob: on a re-run it already carries the 'embedding' column
    # and must line up with the freshly written raw files.
    aux_list = [
        pd.read_csv(aux_path, sep='\t', dtype=object).rename(columns={field: 'embedding'})
        for aux_path in sorted(glob.glob(f'{temp_dir}/*{field}.tsv'))
    ]
    # Keep the first occurrence of each qnode; sorting the paths above makes
    # that choice deterministic across runs.
    aux_df = pd.concat(aux_list).drop_duplicates(subset=['qnode'])
    aux_df.to_csv(f'{temp_dir}/{field}.tsv', sep='\t', index=False)

In [32]:
pd.read_csv(candidates, nrows = 10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q9618,pancreas,,fuzzy-augmented,glandular organ that plays a role in the diges...,4.444854e-07,22.858023
1,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q518159,pancreas transplantation,,fuzzy-augmented,operation,3.539613e-09,19.148045
2,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q1601921,annular pancreas,"PANCREAS, ANNULAR",fuzzy-augmented,pancreas disease characterized by autosomal do...,3.539613e-09,18.461271
3,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130407,pancreas disease,disease of pancreas|Pancreatic Disorder|pancre...,fuzzy-augmented,endocrine system disease that is located in th...,2.228324e-07,17.857887
4,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130412,pancreatic neoplasm,Pancreatic Neoplasms|pancreatic tumor|pancreat...,fuzzy-augmented,undifferentiated growth detected in the pancreas,8.85713e-08,17.857887
5,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q5334293,Ectopic pancreas,,fuzzy-augmented,,3.539613e-09,17.79446
6,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q585140,Pancreas divisum,,fuzzy-augmented,congenital disorder of digestive system,3.539613e-09,17.79446
7,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q212961,pancreatic cancer,Ca tail of pancreas|Ca body of pancreas|malign...,fuzzy-augmented,endocrine gland cancer located in the pancreas,7.187924e-07,17.570473
8,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q14859942,pancreas development,GO:0031016,fuzzy-augmented,process whose specific outcome is the progress...,3.340817e-08,17.212078
9,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q11939959,Pancreas,,fuzzy-augmented,song performed by \Weird Al\ Yankovic,3.539613e-09,17.212078


## Feature Voting

In [33]:
%%time
# Feature pipeline over the candidates file, chained with " / ":
#   - smallest-qnode-number
#   - three string-similarity passes writing the columns `monge_elkan`,
#     `des_cont_jaccard` (kg_descriptions vs. context) and `jaro_winkler`
#   - feature-voting over pagerank, smallest_qnode_number, monge_elkan and
#     des_cont_jaccard
# `jaro_winkler` is not part of the vote; it is used later as a model feature.
# The result is written to `feature_votes`.
!tl smallest-qnode-number {candidates} \
    / string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
    / string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
    / string-similarity -i --method jaro_winkler -o jaro_winkler \
    / feature-voting -c "pagerank,smallest_qnode_number,monge_elkan,des_cont_jaccard" > {feature_votes}

CPU times: user 128 ms, sys: 50.5 ms, total: 178 ms
Wall time: 7.69 s


In [34]:
pd.read_csv(feature_votes, nrows = 10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q9618,pancreas,,fuzzy-augmented,glandular organ that plays a role in the diges...,4.444854e-07,22.858023,1,1.0,0.12,1.0,3
1,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q518159,pancreas transplantation,,fuzzy-augmented,operation,3.539613e-09,19.148045,0,0.879861,0.0,0.866667,0
2,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q1601921,annular pancreas,"PANCREAS, ANNULAR",fuzzy-augmented,pancreas disease characterized by autosomal do...,3.539613e-09,18.461271,0,0.901786,0.029412,0.5,0
3,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130407,pancreas disease,disease of pancreas|Pancreatic Disorder|pancre...,fuzzy-augmented,endocrine system disease that is located in th...,2.228324e-07,17.857887,0,0.87252,0.045455,0.9,0
4,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130412,pancreatic neoplasm,Pancreatic Neoplasms|pancreatic tumor|pancreat...,fuzzy-augmented,undifferentiated growth detected in the pancreas,8.85713e-08,17.857887,0,0.840417,0.052632,0.848684,0
5,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q5334293,Ectopic pancreas,,fuzzy-augmented,,3.539613e-09,17.79446,0,0.855655,0.0,0.409722,0
6,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q585140,Pancreas divisum,,fuzzy-augmented,congenital disorder of digestive system,3.539613e-09,17.79446,0,0.855655,0.117647,0.9,0
7,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q212961,pancreatic cancer,Ca tail of pancreas|Ca body of pancreas|malign...,fuzzy-augmented,endocrine gland cancer located in the pancreas,7.187924e-07,17.570473,0,0.857778,0.05,0.857353,0
8,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q14859942,pancreas development,GO:0031016,fuzzy-augmented,process whose specific outcome is the progress...,3.340817e-08,17.212078,0,0.851326,0.073171,0.88,0
9,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q11939959,Pancreas,,fuzzy-augmented,song performed by \Weird Al\ Yankovic,3.539613e-09,17.212078,0,1.0,0.0,1.0,1


## Compute Embedding Score using Column Vector Strategy

In [35]:
# Score every candidate with `tl score-using-embedding`, using the
# centroid-of-singletons column-vector strategy and the merged embedding file.
# The score is written to the `graph-embedding-score` column of `score_file`.
!tl score-using-embedding $feature_votes \
--column-vector-strategy centroid-of-singletons \
-o graph-embedding-score --embedding-file $embedding_file \
> $score_file

Qnodes to lookup: 3212
Qnodes from file: 2991
_centroid_of_singletons: Missing 1 of 27


In [36]:
# Load the scored candidates (missing values shown as empty strings) and
# display them ordered by vote count, highest first.
df = pd.read_csv(score_file).fillna("")
df.sort_values(by=['votes'], ascending=False)

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,pagerank,retrieval_score,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q9618,pancreas,,fuzzy-augmented,glandular organ that plays a role in the diges...,4.444854e-07,22.858023,1,1.000000,0.120000,1.000000,3,0.818295
5222,2,43,hypothalamus,Prolactin releasing hormone|PRH|Release prolac...,hypothalamus,Q164386,hypothalamus,,exact-match,part of diencephalon,2.170467e-07,21.138376,1,1.000000,0.000000,1.000000,3,0.865884
2763,2,23,hypothalamus,Growth hormone-releasing hormone|GHRH|GHRH rec...,hypothalamus,Q164386,hypothalamus,,fuzzy-augmented,part of diencephalon,2.170467e-07,38.967945,1,1.000000,0.000000,1.000000,3,0.865884
2762,2,22,hypothalamus,Gonadotropin-releasing hormone|GnRH|GnRH recep...,hypothalamus,Q164386,hypothalamus,,exact-match,part of diencephalon,2.170467e-07,21.138376,1,1.000000,0.090909,1.000000,3,0.865884
2662,2,22,hypothalamus,Gonadotropin-releasing hormone|GnRH|GnRH recep...,hypothalamus,Q164386,hypothalamus,,fuzzy-augmented,part of diencephalon,2.170467e-07,38.967945,1,1.000000,0.090909,1.000000,3,0.865884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172,2,17,mucosa of the duodenum and the jejunum,Gastric inhibitory polypeptide|GIP|K cell|GIPR...,mucosa of the duodenum and the jejunum,Q8033647,Woody Durham,The Voice of the Tar Heels,fuzzy-augmented,former UNC radio sports announcer,3.905260e-09,24.242168,0,0.497775,0.000000,0.441520,0,0.272794
2171,2,17,mucosa of the duodenum and the jejunum,Gastric inhibitory polypeptide|GIP|K cell|GIPR...,mucosa of the duodenum and the jejunum,Q81294593,Alexei Tellerias,The troubadour of the gray speaker,fuzzy-augmented,dominican poet,3.539613e-09,24.242168,0,0.500425,0.000000,0.414693,0,0.284075
2170,2,17,mucosa of the duodenum and the jejunum,Gastric inhibitory polypeptide|GIP|K cell|GIPR...,mucosa of the duodenum and the jejunum,Q313604,Andres Bonifacio,the father of the Philippine Revolution|Andrés...,fuzzy-augmented,Filipino nationalist and revolutionary hero (1...,1.655642e-08,24.242168,0,0.561139,0.000000,0.445332,0,0.199039
2169,2,17,mucosa of the duodenum and the jejunum,Gastric inhibitory polypeptide|GIP|K cell|GIPR...,mucosa of the duodenum and the jejunum,Q31725336,Promila Gupta,Indian independence movement.|The revolutionar...,fuzzy-augmented,Revolutionary women.,3.539613e-09,24.256697,0,0.382171,0.000000,0.428700,0,0.307184



## Generate Additional Features required for Model Prediction

In [40]:
## TODO: Need to add these features as cli commands in Table Linker

def create_singleton_feature(df):
    """Flag candidates that belong to a cell with exactly one exact match.

    A cell is identified by its ``(column, row)`` pair. A cell counts as a
    singleton when exactly one of its rows has ``method == 'exact-match'``
    and a non-null ``kg_id``. Every candidate row of such a cell gets
    ``singleton = 1``, whatever its own method; all other rows get ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate table with at least the columns ``column``, ``row``,
        ``method`` and ``kg_id``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with a new integer
        ``singleton`` column.
    """
    exact_matches = df[df['method'] == 'exact-match']
    exact_counts = exact_matches.groupby(['column', 'row'])['kg_id'].count()
    # A set gives constant-time membership tests; the previous list lookup
    # inside an iterrows loop scaled with rows x singleton cells.
    singleton_cells = set(exact_counts[exact_counts == 1].index)
    df['singleton'] = [
        int(cell in singleton_cells)
        for cell in zip(df['column'], df['row'])
    ]
    return df
        
def generate_reciprocal_rank(df):
    """Add a per-cell ``reciprocal_rank`` column based on the embedding score.

    Rows are grouped by ``(row, column)``. Within each group the
    ``graph-embedding-score`` values are ranked with pandas' default
    ``Series.rank()`` (ascending, ties averaged) and ``reciprocal_rank`` is
    set to ``1 / rank``.

    NOTE(review): because the rank is ascending, the highest score in a cell
    receives the smallest reciprocal rank. This is kept as is because the
    value is fed to the ranking model as a feature and the model was
    presumably trained on the same definition -- confirm before changing it.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate table with the columns ``row``, ``column`` and
        ``graph-embedding-score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. Groups appear in sorted
        ``(row, column)`` order and rows keep their original order within
        each group. All input columns are preserved, followed by
        ``reciprocal_rank``. An empty input yields an empty frame that still
        has all of these columns.
    """
    # assign() builds a new frame per group, so nothing is written into a
    # groupby slice and no dict round-trip is needed.
    ranked_cells = [
        cell_df.assign(reciprocal_rank=1 / cell_df['graph-embedding-score'].rank())
        for _, cell_df in df.groupby(['row', 'column'])
    ]
    if not ranked_cells:
        # No groups: keep the schema so downstream column selection still works.
        return (
            df.iloc[0:0]
            .assign(reciprocal_rank=pd.Series(dtype='float64'))
            .reset_index(drop=True)
        )
    return pd.concat(ranked_cells, ignore_index=True)

In [41]:
# Load the embedding-scored candidates and derive the extra model features:
# the singleton flag, the label length in characters and in whitespace-separated
# tokens (0 for a missing label), and the per-cell reciprocal rank.
features_df = create_singleton_feature(pd.read_csv(score_file))
kg_label_text = features_df['kg_labels'].fillna('')
features_df['num_char'] = kg_label_text.map(len)
features_df['num_tokens'] = kg_label_text.map(lambda text: len(text.split()))
features_df = generate_reciprocal_rank(features_df)
features_df.head().fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,smallest_qnode_number,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score,singleton,num_char,num_tokens,reciprocal_rank
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q9618,pancreas,,fuzzy-augmented,glandular organ that plays a role in the diges...,...,1,1.0,0.12,1.0,3,0.818295,1,8,1,0.008811
1,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q518159,pancreas transplantation,,fuzzy-augmented,operation,...,0,0.879861,0.0,0.866667,0,0.639507,1,24,2,0.010638
2,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q1601921,annular pancreas,"PANCREAS, ANNULAR",fuzzy-augmented,pancreas disease characterized by autosomal do...,...,0,0.901786,0.029412,0.5,0,0.621945,1,16,2,0.011111
3,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130407,pancreas disease,disease of pancreas|Pancreatic Disorder|pancre...,fuzzy-augmented,endocrine system disease that is located in th...,...,0,0.87252,0.045455,0.9,0,0.594813,1,16,2,0.011628
4,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130412,pancreatic neoplasm,Pancreatic Neoplasms|pancreatic tumor|pancreat...,fuzzy-augmented,undifferentiated growth detected in the pancreas,...,0,0.840417,0.052632,0.848684,0,0.598422,1,19,2,0.011494


## Final Ranking Score Predicted by Model

In [42]:
# Feature columns fed to the ranking model, in the order listed here.
features = ['pagerank','retrieval_score','monge_elkan',
            'des_cont_jaccard','jaro_winkler','graph-embedding-score',
            'singleton','num_char','num_tokens','reciprocal_rank']

# NOTE(review): unpickling executes arbitrary code from the file, so only load
# a model file that comes from a trusted source.
# The context manager closes the file handle, which a bare open() call leaked.
with open(model_name, 'rb') as model_file:
    model = pickle.load(model_file)
data = features_df[features]
predicted_score = model.predict(data)
# Store the model score next to the features and persist it for `tl get-kg-links`.
features_df['model_prediction'] = predicted_score
features_df.to_csv(final_score,index=False)



In [43]:
pd.read_csv(final_score, nrows=10).fillna("")

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,monge_elkan,des_cont_jaccard,jaro_winkler,votes,graph-embedding-score,singleton,num_char,num_tokens,reciprocal_rank,model_prediction
0,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q9618,pancreas,,fuzzy-augmented,glandular organ that plays a role in the diges...,...,1.0,0.12,1.0,3,0.818295,1,8,1,0.008811,0.985162
1,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q518159,pancreas transplantation,,fuzzy-augmented,operation,...,0.879861,0.0,0.866667,0,0.639507,1,24,2,0.010638,-0.968667
2,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q1601921,annular pancreas,"PANCREAS, ANNULAR",fuzzy-augmented,pancreas disease characterized by autosomal do...,...,0.901786,0.029412,0.5,0,0.621945,1,16,2,0.011111,-0.918571
3,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130407,pancreas disease,disease of pancreas|Pancreatic Disorder|pancre...,fuzzy-augmented,endocrine system disease that is located in th...,...,0.87252,0.045455,0.9,0,0.594813,1,16,2,0.011628,-0.465357
4,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q7130412,pancreatic neoplasm,Pancreatic Neoplasms|pancreatic tumor|pancreat...,fuzzy-augmented,undifferentiated growth detected in the pancreas,...,0.840417,0.052632,0.848684,0,0.598422,1,19,2,0.011494,-0.518667
5,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q5334293,Ectopic pancreas,,fuzzy-augmented,,...,0.855655,0.0,0.409722,0,0.634705,1,16,2,0.01087,-0.977778
6,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q585140,Pancreas divisum,,fuzzy-augmented,congenital disorder of digestive system,...,0.855655,0.117647,0.9,0,0.675759,1,16,2,0.010101,-0.656076
7,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q212961,pancreatic cancer,Ca tail of pancreas|Ca body of pancreas|malign...,fuzzy-augmented,endocrine gland cancer located in the pancreas,...,0.857778,0.05,0.857353,0,0.401326,1,17,2,0.025641,-0.793532
8,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q14859942,pancreas development,GO:0031016,fuzzy-augmented,process whose specific outcome is the progress...,...,0.851326,0.073171,0.88,0,0.708951,1,20,2,0.009804,-0.860528
9,2,0,pancreas,Amylin (or Islet Amyloid Polypeptide)|IAPP|pan...,pancreas,Q11939959,Pancreas,,fuzzy-augmented,song performed by \Weird Al\ Yankovic,...,1.0,0.0,1.0,1,0.392309,1,8,1,0.027027,0.226376


## Get Top 3 KG Links

In [44]:
# Select links per cell with `tl get-kg-links`, scored by the
# `model_prediction` column, with -k 3. The preview below shows three
# pipe-separated ids, labels and scores per cell.
!tl get-kg-links -c model_prediction -l label -k 3 $final_score > $top_k_file

In [45]:
pd.read_csv(top_k_file, nrows = 10)

Unnamed: 0,column,row,label,kg_id,kg_label,kg_description,ranking_score
0,2,0,pancreas,Q9618|Q15762014|Q11939959,pancreas|Pancreas|Pancreas,glandular organ that plays a role in the diges...,0.99|0.24|0.23
1,2,1,testes,Q27318|Q7247798|Q907375,test|product testing|unit testing,merit assessment intended for measuring a subj...,-0.55|-0.64|-0.64
2,2,10,hypothalamus,Q164386|Q66570795|Q7240592,hypothalamus|right ventromedial nucleus of hyp...,part of diencephalon||Region of the anterior h...,0.96|-0.66|-0.69
3,2,11,cerebral cortex,Q5064089|Q75839|Q66599036,Cerebral Cortex|cerebral cortex|Lamina of cere...,journal|outer layer of the vertebrate cerebrum...,0.83|0.28|-0.62
4,2,12,Kidney,Q96721412|Q9377|Q36943715,Kidney|kidney|Kidney,"|internal organ in most animals, including ver...",0.42|0.37|0.34
5,2,13,Vascular endothelium,Q111140|Q6708239|Q30103326,endothelium|lymphatic endothelium|vascular end...,Inner lining of blood vessels|cellular lining ...,-0.03|-0.14|-0.57
6,2,15,anterior pituitary,Q356002|Q55790533|Q101003254,anterior pituitary|anterior pituitary gland di...,"glandular, anterior lobe that, together with t...",0.95|-0.31|-0.52
7,2,16,central nervous system and gastrointestinal tract,Q47273|Q6151460|Q5062119,central nervous system|human gastrointestinal ...,main information-processing organs of the nerv...,0.18|-0.02|-0.19
8,2,17,mucosa of the duodenum and the jejunum,Q185765|Q5541565|Q66823489,jejunum|George Lavington|Samuel Willard,part of small intestine|British bishop|born 17...,0.03|0.01|-0.04
9,2,18,"stomach , duodenum",Q25104465|Q5315860|Q1029907,black stomach|duodenal cap|stomach,endoscopic finding of extensive black discolou...,-0.3|-0.57|-0.6


## Join to Produce final result

In [23]:
# Join the selected links back onto the original input file with `tl join`
# and write the result as CSV to `final_output`.
!tl join -f $input_file_path --csv -c ranking_score $top_k_file  > $final_output

In [24]:
pd.read_csv(final_output).fillna("")

Unnamed: 0,Name,Abbreviation,Tissue,Cells,Receptor,Target_Tissue,Effect,kg_id,kg_label,score
0,Amylin (or Islet Amyloid Polypeptide),IAPP,pancreas,pancreatic β-cells,amylin receptor,,"slowing down gastric emptying, inhibition of d...",Q9618|Q15762014|Q11939959,pancreas|Pancreas|Pancreas,0.99|0.24|0.23
1,Anti-Müllerian hormone (or Müllerian inhibitin...,AMH,testes,Sertoli cell,AMHR2,,Inhibit release of prolactin and TRH from ante...,Q27318|Q7247798|Q907375,test|product testing|unit testing,-0.55|-0.64|-0.64
2,Adiponectin,Acrp30,adipose tissue,,adiponectin receptors,,,Q193583|Q40397|Q12898553,adipose tissue|tissue|adipose tissue macrophages,0.95|0.16|-0.09
3,Adrenocorticotropic hormone (or corticotropin),ACTH,anterior pituitary,corticotrope,ACTH receptor → cAMP,,synthesis of corticosteroids ( glucocorticoids...,,,
4,Angiotensinogen and angiotensin,AGT,liver,,angiotensin receptor → IP 3,,vasoconstriction release of aldosterone from a...,,,
5,"Antidiuretic hormone (or vasopressin, arginine...",ADH,posterior pituitary,Parvocellular neurosecretory neurons in hypoth...,"AVPRs , VACM-1",,retention of water in kidneys \nmoderate vaso...,,,
6,Atrial-natriuretic peptide (or atriopeptin),ANP,heart,,ANP receptor → cGMP,,,Q1072|Q5223785|Q3129019,heart|Heart|Heart,0.7|0.57|0.57
7,Brain natriuretic peptide,BNP,heart,Cardiac myocytes,NPR,,(To a minor degree than ANP) reduce blood pres...,,,
8,Calcitonin,CT,thyroid gland,parafollicular cell,CT receptor → cAMP,,"Construct bone, reduce blood Ca 2+",Q16399|Q66564199|Q6673122,thyroid gland|parafollicular cell of thyroid g...,0.88|-0.4|-0.5
9,Cholecystokinin,CCK,duodenum,,CCK receptor,,Release of digestive enzymes from pancreas \n...,,,


## Clean up temporary files

In [25]:
# Remove the intermediate files. The existence check lets this cell be re-run
# (or run after a partial pipeline) without raising FileNotFoundError.
if os.path.isdir(temp_dir):
    shutil.rmtree(temp_dir)