In [44]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import shelve
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import requests
from pyquery import PyQuery as pq
from collections import OrderedDict

In [37]:
pd.options.display.width

<pandas.core.config.DictWrapper at 0x7f477bddfa58>

In [77]:
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.width", 120)

In [61]:
edges = pd.read_csv('edges_biolink.csv')
nodes = pd.read_csv("nodes_biolink.csv", index_col=0)

In [62]:
nodes.head()

Unnamed: 0_level_0,label,TYPE,bl_type
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0038195,State Medicine,Activities & Behaviors,activity_and_behavior
C1290952,Taking medication,Activities & Behaviors,activity_and_behavior
C0085092,Parenting behavior,Activities & Behaviors,activity_and_behavior
C1096771,Murderer,Activities & Behaviors,activity_and_behavior
C1299651,Serious reportable event,Activities & Behaviors,activity_and_behavior


In [70]:
node_label = dict(zip(nodes.index, nodes.label))
node_type = dict(zip(nodes.index, nodes.bl_type))

In [64]:
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred,bl_type
0,C0021769,C1182654,1597294,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_entity
1,C0001271,C0007635,6684662;11037792,2,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_entity
2,C0003339,C0014467,11857592,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_entity
3,C0028612,C0085080,1985199,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_entity
4,C0059249,C0431085,12739069,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_entity


In [71]:
edges['start_label'] = edges.START_ID.map(node_label.get)
edges['end_label'] = edges.END_ID.map(node_label.get)
edges['start_type'] = edges.START_ID.map(node_type.get)
edges['end_type'] = edges.END_ID.map(node_type.get)

In [72]:
edges = edges.sort_values("n_pmids", ascending=False)

In [73]:
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred,bl_type,start_label,end_label,start_type,end_type
10686996,C0023884,C0034693,6773194;1698569;5076198;20619621;8033224;4155684;6891338;1257600;7974499;20287910;5495281;242406...,45915,False,PART_OF,anatomical_entity.PART_OF.individual_organism,Liver,Rattus norvegicus,anatomical_entity,individual_organism
10700614,C0006104,C0034693,7142989;1647769;8855842;8836629;21945033;2995978;9252233;11596050;25303736;19785543;16689096;800...,45542,False,PART_OF,anatomical_entity.PART_OF.individual_organism,Brain,Rattus norvegicus,anatomical_entity,individual_organism
10740905,C0229671,C0030705,3479311;19445843;10895099;3219160;9916062;10219592;21893063;2200820;2515743;8270063;11147755;125...,32877,False,PART_OF,anatomical_entity.PART_OF.individual_organism,Serum,Patients,anatomical_entity,individual_organism
10741044,C0229671,C0020114,6185150;1413991;9613468;23595366;13326435;19730118;16107258;2266547;26612625;27554022;23305092;4...,25898,False,PART_OF,anatomical_entity.PART_OF.individual_organism,Serum,Human,anatomical_entity,individual_organism
8819362,C0229671,C0010294,15553306;25813256;21606119;2660616;7694990;18669772;22990506;23541229;9186889;10546024;11805181;...,22475,False,LOCATION_OF,anatomical_entity.LOCATION_OF.chemical_substance,Serum,Creatinine,anatomical_entity,chemical_substance


In [79]:
cols = ['start_label', 'end_label', 'start_type', 'end_type', 'bl_pred']
for node_type in set(nodes.bl_type):
    print("\n*****{}*****".format(node_type))
    print(edges[(edges.start_type == node_type) | (edges.end_type == node_type)][cols].head(10))


*****chemical_substance*****
                          start_label    end_label          start_type             end_type          bl_pred
8819362                         Serum   Creatinine   anatomical_entity   chemical_substance      LOCATION_OF
8817334                         Serum   Antibodies   anatomical_entity   chemical_substance      LOCATION_OF
8522401                        Plasma      Glucose   anatomical_entity   chemical_substance      LOCATION_OF
7612743                      Plasmids          DNA  chemical_substance   chemical_substance      SUBCLASS_OF
7671835             N-Methylaspartate    Aspartate  chemical_substance   chemical_substance      SUBCLASS_OF
11083364                      Calcium   Protoplasm  chemical_substance    anatomical_entity          PART_OF
95249     Pharmaceutical Preparations     Patients  chemical_substance  individual_organism  ADMINISTERED_TO
8817672                         Serum  Cholesterol   anatomical_entity   chemical_substance      L

In [55]:
types = set(edges.bl_pred)
t = list(types)[0]

In [58]:
for t in types:
    s = edges[edges.bl_pred == t].sort_values("n_pmids", ascending=False)[['START_ID', 'END_ID', 'bl_type', 'start_label', 'end_label']]
    print("\n*****{}*****".format(t))
    print("\ntop 5 edges")
    print(s.head())
    print("\ntop 5 start")
    print(list(OrderedDict.fromkeys(s.start_label))[:5])
    print("\ntop 5 end")
    print(list(OrderedDict.fromkeys(s.end_label))[:5])


*****COEXISTS_WITH*****

top 5 edges
         START_ID    END_ID                                          bl_type  \
5107643  C0752046  C0017337      genomic_entity.COEXISTS_WITH.genomic_entity   
4221706  C0012854  C0162326  chemical_substance.COEXISTS_WITH.genomic_entity   
4747477  C0021368  C0004096                    disease.COEXISTS_WITH.disease   
4845109  C0338656  C0002395                    disease.COEXISTS_WITH.disease   
4504882  C0277785  C0036341                    disease.COEXISTS_WITH.disease   

                            start_label            end_label  
5107643  Single Nucleotide Polymorphism                Genes  
4221706                             DNA         DNA Sequence  
4747477                    Inflammation               Asthma  
4845109              Impaired cognition  Alzheimer's Disease  
4504882             Functional disorder        Schizophrenia  

top 5 start
['Single Nucleotide Polymorphism', 'DNA', 'Inflammation', 'Impaired cognition', 'Functiona


*****INTERACTS_WITH*****

top 5 edges
         START_ID    END_ID                                               bl_type  \
7361227  C1366628  C0596902      genomic_entity.INTERACTS_WITH.chemical_substance   
7136191  C0017337  C0040648      genomic_entity.INTERACTS_WITH.chemical_substance   
7379769  C1366832  C0596902      genomic_entity.INTERACTS_WITH.chemical_substance   
6943786  C0003241  C0003320  chemical_substance.INTERACTS_WITH.chemical_substance   
6457818  C0033684  C0012854  chemical_substance.INTERACTS_WITH.chemical_substance   

                                                                    start_label  \
7361227                                                             ATP8A2 gene   
7136191                                                                   Genes   
7379769  ATPase, Aminophospholipid Transporter-Like, Class I, Type 8A, Member 2   
6943786                                                              Antibodies   
6457818                            


*****PREDISPOSES*****

top 5 edges
          START_ID    END_ID                                 bl_type           start_label  \
12019110  C0006560  C0021368  chemical_substance.PREDISPOSES.disease    C-reactive protein   
12074270  C0020538  C0007222             disease.PREDISPOSES.disease  Hypertensive disease   
12105977  C0948265  C0007222             disease.PREDISPOSES.disease    Metabolic syndrome   
12077203  C0004238  C0038454             disease.PREDISPOSES.disease   Atrial Fibrillation   
12069465  C0028754  C0007222             disease.PREDISPOSES.disease               Obesity   

                         end_label  
12019110              Inflammation  
12074270   Cardiovascular Diseases  
12105977   Cardiovascular Diseases  
12077203  Cerebrovascular accident  
12069465   Cardiovascular Diseases  

top 5 start
['C-reactive protein', 'Hypertensive disease', 'Metabolic syndrome', 'Atrial Fibrillation', 'Obesity']

top 5 end
['Inflammation', 'Cardiovascular Diseases', 'Cereb