In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
edges = pd.read_csv('edges_filtered2.csv')
nodes = pd.read_csv("nodes_filtered2.csv")
abv = pd.read_csv("abv.csv")
name_abv = dict(zip(abv.full_name, abv.abv))
abv_name = dict(zip(abv.abv, abv.full_name))

In [3]:
#nodes
# {name_abv.get(x):x for x in set(nodes.TYPE)}

In [None]:
#edges
# {abv_name.get(x):x for x in set(edges.PRED)}

In [5]:
node_mapping = {
 'Anatomy': 'anatomical_entity',
 'Activities & Behaviors': 'activity',  # doesn't exist in biolink
 'Chemicals & Drugs': 'chemical_substance',
 'Disorders': 'disease',
 'Genes & Molecular Sequences': 'genomic_entity',
 'Living Beings': 'organism', # doesn't exist in biolink
 'Objects': 'named_thing',  # mapping doesn't exist in biolink
 'Phenomena': 'biological_process',
 'Procedures': 'procedure', # doesn't exist in biolink
 'Physiology': 'biological_process',
}

In [6]:
nodes['bl_type'] = nodes.TYPE.apply(node_mapping.get)
nodes.head()

Unnamed: 0,ID,label,TYPE,bl_type
0,C0038195,State Medicine,Activities & Behaviors,activity
1,C1290952,Taking medication,Activities & Behaviors,activity
2,C0085092,Parenting behavior,Activities & Behaviors,activity
3,C1096771,Murderer,Activities & Behaviors,activity
4,C1299651,Serious reportable event,Activities & Behaviors,activity


In [7]:
# rename the edge domain and range to use the biolink types
edges['bl_domain'] = edges.DOMAIN.apply(abv_name.get).apply(node_mapping.get)
edges['bl_pred'] = edges.PRED.apply(abv_name.get)
edges['bl_range'] = edges.RANGE.apply(abv_name.get).apply(node_mapping.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0021769,C1182654,CDatA,1597294,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
1,C0001271,C0007635,CDatA,6684662;11037792,2,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
2,C0003339,C0014467,CDatA,11857592,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
3,C0028612,C0085080,CDatA,1985199,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
4,C0059249,C0431085,CDatA,12739069,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...


In [8]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [9]:
pred_type_count['USES'][:15]

bl_type
procedure.USES.chemical_substance             289983
procedure.USES.genomic_entity                  74276
procedure.USES.procedure                       26970
chemical_substance.USES.chemical_substance     18128
procedure.USES.named_thing                      9886
chemical_substance.USES.genomic_entity          5653
activity.USES.chemical_substance                3688
disease.USES.chemical_substance                 1960
named_thing.USES.chemical_substance             1850
disease.USES.procedure                          1463
organism.USES.chemical_substance                1297
named_thing.USES.named_thing                    1228
Name: bl_type, dtype: int64

In [10]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
allowed_domain_range = {
    "ADMINISTERED_TO": ({'chemical_substance', 'procedure', 'genomic_entity'}, 
                        {'organism', 'anatomical_entity'}),
    "AFFECTS": (None, None), # this means anything,
    "ASSOCIATED_WITH": ({'chemical_substance', 'disease', 'genomic_entity', 'biological_process'}, 
                         {'disease'}),
    'AUGMENTS': ({'chemical_substance', 'genomic_entity', 'disease'}, 
                         {'biological_process', 'disease', 'activity'}),
    'CAUSES': ({'chemical_substance', 'genomic_entity', 'disease', 'procedure', 'biological_process', 'activity'}, 
                         {'biological_process', 'disease'}),
    'COEXISTS_WITH': (None, None) ,
    'COMPLICATES': ({'disease'}, 
                    {'disease'}),
    'CONVERTS_TO': ({'chemical_substance', 'genomic_entity'}, 
                    {'chemical_substance', 'genomic_entity'}),
    'DIAGNOSES': ({'procedure'}, 
                  {'disease'}),
    'DISRUPTS': ({'chemical_substance', 'genomic_entity', 'biological_process'}, 
                 {'biological_process', 'disease'}),
    'INHIBITS': ({'chemical_substance', 'genomic_entity'}, 
                 {'genomic_entity', 'biological_process', 'disease'}),
    'INTERACTS_WITH': ({'chemical_substance', 'genomic_entity'}, 
                       {'chemical_substance', 'genomic_entity'}),
    'ISA': (None, None),
    'LOCATION_OF': ({'anatomical_entity'}, 
                    {'chemical_substance', 'disease', 'genomic_entity', 'anatomical_entity', 'procedure', 'biological_process'}),
    'MANIFESTATION_OF': ({'disease'}, 
                         {'disease', 'biological_process'}),
    'METHOD_OF': ({'procedure', 'activity'}, {'procedure'}),
    'OCCURS_IN': ({'disease'}, {'organism'}),
    'PART_OF': ({'chemical_substance', 'genomic_entity', 'anatomical_entity'}, 
                {'anatomical_entity', 'organism', 'chemical_substance', 'genomic_entity'}),
    'PRECEDES': ({'procedure', 'disease', 'biological_process'}, 
                 {'procedure', 'disease', 'biological_process'}),
    'PREDISPOSES': ({None}, 
                    {'disease'}),
    'PREVENTS': ({'chemical_substance', 'procedure', 'genomic_entity', 'activity'}, 
                 {'disease'}),
    'PROCESS_OF': ({'disease', 'biological_process', 'activity'}, 
                   {'organism', 'anatomical_entity', 'disease'}),
    'PRODUCES': ({'chemical_substance', 'anatomical_entity', 'genomic_entity', 'biological_process'}, 
                 {'chemical_substance', 'genomic_entity'}),
    'STIMULATES': ({'chemical_substance', 'procedure', 'genomic_entity'},
                  {'chemical_substance', 'biological_process', 'genomic_entity'}),
    'TREATS': ({'chemical_substance', 'procedure', 'genomic_entity', 'activity'}, 
               {'disease'}),
    'USES': ({'procedure'}, 
             {'chemical_substance', 'procedure', 'genomic_entity'}),
}

In [34]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred]
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [35]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [37]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     16658956
False     2695598
Name: bl_type, dtype: int64


In [38]:
print(len(edges))

16658956


In [43]:
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0021769,C1182654,CDatA,1597294,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
1,C0001271,C0007635,CDatA,6684662;11037792,2,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
2,C0003339,C0014467,CDatA,11857592,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
3,C0028612,C0085080,CDatA,1985199,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
4,C0059249,C0431085,CDatA,12739069,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...


In [44]:
del edges['bl_type']
del edges['DOMAIN']
del edges['PRED']
del edges['RANGE']
del edges['TYPE']

In [49]:
del edges['bl_domain']
del edges['bl_range']

In [50]:
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred
0,C0021769,C1182654,1597294,1,False,ADMINISTERED_TO
1,C0001271,C0007635,6684662;11037792,2,False,ADMINISTERED_TO
2,C0003339,C0014467,11857592,1,False,ADMINISTERED_TO
3,C0028612,C0085080,1985199,1,False,ADMINISTERED_TO
4,C0059249,C0431085,12739069,1,False,ADMINISTERED_TO


In [51]:
edges.to_csv("edges_biolink.csv", index=None)
nodes.to_csv("nodes_biolink.csv", index=None)