In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
edges = pd.read_csv('edges_filtered2.csv')
nodes = pd.read_csv("nodes_filtered2.csv")
abv = pd.read_csv("abv.csv")
name_abv = dict(zip(abv.full_name, abv.abv))
abv_name = dict(zip(abv.abv, abv.full_name))

In [3]:
#nodes
# {name_abv.get(x):x for x in set(nodes.TYPE)}

In [4]:
#edges
# {abv_name.get(x):x for x in set(edges.PRED)}

In [5]:
node_mapping = {
 'Anatomy': 'anatomical_entity',
 'Activities & Behaviors': 'activity_and_behavior',
 'Chemicals & Drugs': 'chemical_substance',
 'Disorders': 'disease',
 'Genes & Molecular Sequences': 'genomic_entity',
 'Living Beings': 'individual_organism',
 'Objects': 'named_thing',
 'Phenomena': 'biological_process',
 'Physiology': 'biological_process',
}

In [6]:
nodes['bl_type'] = nodes.TYPE.apply(node_mapping.get)
nodes.head()

Unnamed: 0,ID,label,TYPE,bl_type
0,C0038195,State Medicine,Activities & Behaviors,activity_and_behavior
1,C1290952,Taking medication,Activities & Behaviors,activity_and_behavior
2,C0085092,Parenting behavior,Activities & Behaviors,activity_and_behavior
3,C1096771,Murderer,Activities & Behaviors,activity_and_behavior
4,C1299651,Serious reportable event,Activities & Behaviors,activity_and_behavior


In [7]:
# rename the edge domain and range to use the biolink types
edges['bl_domain'] = edges.DOMAIN.apply(abv_name.get).apply(node_mapping.get)
edges['bl_pred'] = edges.PRED.apply(abv_name.get)
edges['bl_range'] = edges.RANGE.apply(abv_name.get).apply(node_mapping.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0021769,C1182654,CDatA,1597294,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
1,C0001271,C0007635,CDatA,6684662;11037792,2,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
2,C0003339,C0014467,CDatA,11857592,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
3,C0028612,C0085080,CDatA,1985199,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
4,C0059249,C0431085,CDatA,12739069,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...


In [8]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [9]:
pred_type_count['ADMINISTERED_TO'][:15]

bl_type
chemical_substance.ADMINISTERED_TO.individual_organism    99994
chemical_substance.ADMINISTERED_TO.anatomical_entity      12618
genomic_entity.ADMINISTERED_TO.individual_organism         5801
genomic_entity.ADMINISTERED_TO.anatomical_entity           2473
Name: bl_type, dtype: int64

In [10]:
sorted(edges.bl_pred.value_counts().index)

['ADMINISTERED_TO',
 'AFFECTS',
 'ASSOCIATED_WITH',
 'AUGMENTS',
 'CAUSES',
 'COEXISTS_WITH',
 'COMPLICATES',
 'CONVERTS_TO',
 'DISRUPTS',
 'INHIBITS',
 'INTERACTS_WITH',
 'ISA',
 'LOCATION_OF',
 'MANIFESTATION_OF',
 'PART_OF',
 'PRECEDES',
 'PREDISPOSES',
 'PREVENTS',
 'PRODUCES',
 'STIMULATES',
 'TREATS']

In [11]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
allowed_domain_range = {
    "ADMINISTERED_TO": ({'chemical_substance', 'procedure', 'genomic_entity'}, 
                        {'individual_organism', 'anatomical_entity'}),
    "AFFECTS": (None, None), # this means anything,
    "ASSOCIATED_WITH": ({'chemical_substance', 'disease', 'genomic_entity', 'biological_process'}, 
                         {'disease'}),
    'AUGMENTS': ({'chemical_substance', 'genomic_entity', 'disease'}, 
                         {'biological_process', 'disease', 'activity_and_behavior'}),
    'CAUSES': ({'chemical_substance', 'genomic_entity', 'disease', 'biological_process', 'activity_and_behavior'}, 
                         {'biological_process', 'disease'}),
    'COEXISTS_WITH': (None, None) ,
    'COMPLICATES': ({'disease'}, 
                    {'disease'}),
    'CONVERTS_TO': ({'chemical_substance', 'genomic_entity'}, 
                    {'chemical_substance', 'genomic_entity'}),
    'DISRUPTS': ({'chemical_substance', 'genomic_entity', 'biological_process'}, 
                 {'biological_process', 'disease'}),
    'INHIBITS': ({'chemical_substance', 'genomic_entity'}, 
                 {'genomic_entity', 'biological_process', 'disease'}),
    'INTERACTS_WITH': ({'chemical_substance', 'genomic_entity'}, 
                       {'chemical_substance', 'genomic_entity'}),
    'ISA': (None, None),
    'LOCATION_OF': ({'anatomical_entity'}, 
                    {'chemical_substance', 'disease', 'genomic_entity', 'anatomical_entity', 'procedure', 'biological_process'}),
    'MANIFESTATION_OF': ({'disease'}, 
                         {'disease', 'biological_process'}),
    'OCCURS_IN': ({'disease'}, {'individual_organism'}),
    'PART_OF': ({'chemical_substance', 'genomic_entity', 'anatomical_entity'}, 
                {'anatomical_entity', 'individual_organism', 'chemical_substance', 'genomic_entity'}),
    'PRECEDES': ({'disease', 'biological_process'}, 
                 {'disease', 'biological_process'}),
    'PREDISPOSES': ({None}, 
                    {'disease'}),
    'PREVENTS': ({'chemical_substance', 'genomic_entity', 'activity_and_behavior'}, 
                 {'disease'}),
    'PROCESS_OF': ({'disease', 'biological_process', 'activity_and_behavior'}, 
                   {'individual_organism', 'anatomical_entity', 'disease'}),
    'PRODUCES': ({'chemical_substance', 'anatomical_entity', 'genomic_entity', 'biological_process'}, 
                 {'chemical_substance', 'genomic_entity'}),
    'STIMULATES': ({'chemical_substance', 'genomic_entity'},
                  {'chemical_substance', 'biological_process', 'genomic_entity'}),
    'TREATS': ({'chemical_substance', 'genomic_entity', 'activity_and_behavior'}, 
               {'disease'}),
}

In [12]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred]
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [13]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [14]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     14076598
False     2345261
Name: bl_type, dtype: int64


In [15]:
print(len(edges))

14076598


In [16]:
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0021769,C1182654,CDatA,1597294,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
1,C0001271,C0007635,CDatA,6684662;11037792,2,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
2,C0003339,C0014467,CDatA,11857592,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
3,C0028612,C0085080,CDatA,1985199,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...
4,C0059249,C0431085,CDatA,12739069,1,False,CD,at,A,chemical_substance,ADMINISTERED_TO,anatomical_entity,chemical_substance.ADMINISTERED_TO.anatomical_...


In [17]:
del edges['bl_type']
del edges['DOMAIN']
del edges['PRED']
del edges['RANGE']
del edges['TYPE']

In [18]:
# rename 'isa' edge to 'subclass of'
edges.bl_pred[edges.bl_pred == "ISA"] = "SUBCLASS_OF"

In [19]:
# rename 'disrupts' edge to 'affects'
edges.bl_pred[edges.bl_pred == "DISRUPTS"] = "AFFECTS"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [20]:
# rename 'associated_with' edge to 'related_to'
edges.bl_pred[edges.bl_pred == "ASSOCIATED_WITH"] = "RELATED_TO"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [21]:
# associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition, 
# and leave the rest alone
edges.loc[(edges.bl_pred == "RELATED_TO") & (edges.bl_domain == "genomic_entity") & 
      (edges.bl_range == "disease"), 'bl_pred'] = 'gene_associated_with_condition'.upper()

In [22]:
# inhibits edges with domain: chemical_substance or genomic entity, range: genomic_entity or disease ->
# https://biolink.github.io/biolink-model/docs/negatively_regulates_entity_to_entity.html
edges.loc[(edges.bl_pred == "INHIBITS") & edges.bl_domain.isin({"genomic_entity", "chemical_substance"}) & 
      edges.bl_range.isin({"genomic_entity", "disease"}), 'bl_pred'] = 'negatively_regulates_entity_to_entity'.upper()

In [23]:
# stimulates edges with domain: chemical_substance or genomic entity, range: genomic_entity or disease ->
# https://biolink.github.io/biolink-model/docs/positively_regulates_entity_to_entity.html
edges.loc[(edges.bl_pred == "STIMULATES") & edges.bl_domain.isin({"genomic_entity", "chemical_substance"}) & 
      edges.bl_range.isin({"genomic_entity", "disease"}), 'bl_pred'] = 'positively_regulates_entity_to_entity'.upper()

In [24]:
# inhibits edges with domain: chemical_substance or genomic entity, range: biological_process ->
# https://biolink.github.io/biolink-model/docs/negatively_regulates.html
edges.loc[(edges.bl_pred == "INHIBITS") & edges.bl_domain.isin({"genomic_entity", "chemical_substance"}) & 
      edges.bl_range.isin({"biological_process"}), 'bl_pred'] = 'negatively_regulates'.upper()

In [25]:
# stimulates edges with domain: chemical_substance or genomic entity, range: biological_process ->
# https://biolink.github.io/biolink-model/docs/positively_regulates.html
edges.loc[(edges.bl_pred == "STIMULATES") & edges.bl_domain.isin({"genomic_entity", "chemical_substance"}) & 
      edges.bl_range.isin({"biological_process"}), 'bl_pred'] = 'positively_regulates'.upper()

In [26]:
edges.bl_pred.value_counts()

AFFECTS                                  2063026
LOCATION_OF                              1907245
INTERACTS_WITH                           1702586
COEXISTS_WITH                            1670436
PART_OF                                  1583418
STIMULATES                                868737
CAUSES                                    744245
TREATS                                    618788
RELATED_TO                                509114
AUGMENTS                                  459521
PRODUCES                                  442266
NEGATIVELY_REGULATES_ENTITY_TO_ENTITY     352136
SUBCLASS_OF                               334374
GENE_ASSOCIATED_WITH_CONDITION            265186
POSITIVELY_REGULATES_ENTITY_TO_ENTITY     142512
ADMINISTERED_TO                           120886
PREVENTS                                  120501
CONVERTS_TO                                57223
MANIFESTATION_OF                           38775
COMPLICATES                                30264
PRECEDES            

In [27]:
edges[(edges.bl_domain == "genomic_entity") & (edges.bl_range == "disease")].bl_pred.value_counts()

GENE_ASSOCIATED_WITH_CONDITION    265186
AFFECTS                           100636
TREATS                             87917
CAUSES                             84318
AUGMENTS                           38122
PREVENTS                           14385
Name: bl_pred, dtype: int64

In [28]:
## summary
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(len(set(edges.bl_type)))
print(len(edges))
print(len(nodes))
print(len(set(nodes.bl_type)))
print(len(set(edges.bl_pred)))

117
14076598
229751
8
22


In [29]:
del edges['bl_domain']
del edges['bl_range']

In [30]:
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred,bl_type
0,C0021769,C1182654,1597294,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_...
1,C0001271,C0007635,6684662;11037792,2,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_...
2,C0003339,C0014467,11857592,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_...
3,C0028612,C0085080,1985199,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_...
4,C0059249,C0431085,12739069,1,False,ADMINISTERED_TO,chemical_substance.ADMINISTERED_TO.anatomical_...


In [31]:
edges.to_csv("edges_biolink.csv", index=None)
nodes.to_csv("nodes_biolink.csv", index=None)