In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
edges = pd.read_csv('edges_filtered2.csv')
nodes = pd.read_csv("nodes_filtered2.csv")
abv = pd.read_csv("abv.csv")
name_abv = dict(zip(abv.full_name, abv.abv))
abv_name = dict(zip(abv.abv, abv.full_name))

In [3]:
#nodes
# {name_abv.get(x):x for x in set(nodes.TYPE)}

In [4]:
#edges
# {abv_name.get(x):x for x in set(edges.PRED)}

In [5]:
nodes.TYPE.value_counts()

Chemicals & Drugs              63674
Living Beings                  47099
Disorders                      38858
protein                        29386
Genes & Molecular Sequences    20801
Anatomy                        15289
Physiology                      8023
genomic_entity                  1848
Objects                         1437
Phenomena                       1148
Activities & Behaviors          1030
Name: TYPE, dtype: int64

In [6]:
node_mapping = {
 'Anatomy': 'anatomical_entity',
 'Activities & Behaviors': 'activity_and_behavior',
 'Chemicals & Drugs': 'chemical_substance',
 'Disorders': 'disease',
 'Genes & Molecular Sequences': 'genomic_entity',
 'Living Beings': 'individual_organism',
 'Objects': 'named_thing',
 'Phenomena': 'biological_process',
 'Physiology': 'biological_process',
 'protein': 'protein',
 'genomic_entity': 'genomic_entity'
}

In [7]:
nodes['bl_type'] = nodes.TYPE.apply(node_mapping.get)
nodes.head()

Unnamed: 0,ID,label,TYPE,bl_type
0,C0038195,State Medicine,Activities & Behaviors,activity_and_behavior
1,C1290952,Taking medication,Activities & Behaviors,activity_and_behavior
2,C0085092,Parenting behavior,Activities & Behaviors,activity_and_behavior
3,C1096771,Murderer,Activities & Behaviors,activity_and_behavior
4,C1299651,Serious reportable event,Activities & Behaviors,activity_and_behavior


In [8]:
# rename the edge domain and range to use the biolink types
edges['bl_domain'] = edges.DOMAIN.apply(abv_name.get).apply(node_mapping.get)
edges['bl_pred'] = edges.PRED.apply(abv_name.get)
edges['bl_range'] = edges.RANGE.apply(abv_name.get).apply(node_mapping.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [9]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [10]:
pred_type_count['INHIBITS'][:15]

bl_type
chemical_substance.INHIBITS.chemical_substance    459534
chemical_substance.INHIBITS.genomic_entity        251465
genomic_entity.INHIBITS.genomic_entity             82386
chemical_substance.INHIBITS.biological_process     18774
chemical_substance.INHIBITS.disease                18285
Name: bl_type, dtype: int64

In [11]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
allowed_domain_range = {
    "ADMINISTERED_TO": ({'chemical_substance', 'procedure', 'genomic_entity', 'protein'}, 
                        {'individual_organism', 'anatomical_entity'}),
    "AFFECTS": (None, None), # this means anything,
    "ASSOCIATED_WITH": ({'chemical_substance', 'disease', 'genomic_entity', 'biological_process', 'protein'}, 
                         {'disease'}),
    'AUGMENTS': ({'chemical_substance', 'genomic_entity', 'disease', 'protein'}, 
                         {'biological_process', 'disease', 'activity_and_behavior'}),
    'CAUSES': ({'chemical_substance', 'genomic_entity', 'disease', 'biological_process', 'activity_and_behavior'}, 
                         {'biological_process', 'disease'}),
    'COEXISTS_WITH': (None, None) ,
    'COMPLICATES': ({'disease'}, 
                    {'disease'}),
    'CONVERTS_TO': ({'chemical_substance', 'genomic_entity', 'protein'}, 
                    {'chemical_substance', 'genomic_entity', 'protein'}),
    'DISRUPTS': ({'chemical_substance', 'genomic_entity', 'biological_process', 'protein'}, 
                 {'biological_process', 'disease'}),
    'INHIBITS': ({'chemical_substance', 'genomic_entity', 'protein'}, 
                 {'chemical_substance', 'genomic_entity', 'biological_process', 'disease', 'protein'}),
    'INTERACTS_WITH': ({'chemical_substance', 'genomic_entity', 'protein'}, 
                       {'chemical_substance', 'genomic_entity', 'protein'}),
    'ISA': (None, None),
    'LOCATION_OF': ({'anatomical_entity'}, 
                    None),
    'MANIFESTATION_OF': ({'disease'}, 
                         {'disease', 'biological_process'}),
    'OCCURS_IN': ({'disease'}, {'individual_organism'}),
    'PART_OF': ({'chemical_substance', 'genomic_entity', 'anatomical_entity'}, 
                {'anatomical_entity', 'individual_organism', 'chemical_substance', 'genomic_entity'}),
    'PRECEDES': ({'disease', 'biological_process'}, 
                 {'disease', 'biological_process'}),
    'PREDISPOSES': (None, 
                    {'disease'}),
    'PREVENTS': ({'chemical_substance', 'genomic_entity', 'activity_and_behavior', 'protein'}, 
                 {'disease'}),
    'PROCESS_OF': ({'disease', 'biological_process', 'activity_and_behavior'}, 
                   {'individual_organism', 'anatomical_entity', 'disease'}),
    'PRODUCES': ({'chemical_substance', 'anatomical_entity', 'genomic_entity', 'biological_process', 'protein'}, 
                 {'chemical_substance', 'genomic_entity', 'protein'}),
    'STIMULATES': ({'chemical_substance', 'genomic_entity', 'protein'},
                  {'chemical_substance', 'genomic_entity', 'biological_process', 'disease', 'protein'}),
    'TREATS': ({'chemical_substance', 'genomic_entity', 'activity_and_behavior', 'protein'}, 
               {'disease'}),
}

In [12]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred]
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [13]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [14]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     14201493
False     1373652
Name: bl_type, dtype: int64


In [15]:
print(len(edges))

14201493


In [16]:
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [17]:
del edges['bl_type']
del edges['DOMAIN']
del edges['PRED']
del edges['RANGE']
del edges['TYPE']

In [18]:
# rename 'converts_to' edge to 'derives_into'
edges.bl_pred[edges.bl_pred == "CONVERTS_TO"] = "DERIVES_INTO"

In [19]:
# rename 'isa' edge to 'subclass of'
edges.bl_pred[edges.bl_pred == "ISA"] = "SUBCLASS_OF"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [20]:
# rename 'disrupts' edge to 'affects'
edges.bl_pred[edges.bl_pred == "DISRUPTS"] = "AFFECTS"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [21]:
# rename 'associated_with' edge to 'related_to'
edges.bl_pred[edges.bl_pred == "ASSOCIATED_WITH"] = "RELATED_TO"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [22]:
# rename 'STIMULATES' edge to 'positively_regulates'
edges.bl_pred[edges.bl_pred == "STIMULATES"] = "positively_regulates".upper()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [23]:
# rename 'INHIBITS' edge to 'negatively_regulates'
edges.bl_pred[edges.bl_pred == "INHIBITS"] = "negatively_regulates".upper()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [24]:
# associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition, 
# and leave the rest alone
edges.loc[(edges.bl_pred == "RELATED_TO") & (edges.bl_domain == "genomic_entity") & 
      (edges.bl_range == "disease"), 'bl_pred'] = 'gene_associated_with_condition'.upper()

In [25]:
edges.bl_pred.value_counts()

AFFECTS                           2063026
LOCATION_OF                       1953235
INTERACTS_WITH                    1702586
COEXISTS_WITH                     1670436
PART_OF                           1583418
POSITIVELY_REGULATES              1011249
NEGATIVELY_REGULATES               830444
CAUSES                             744245
TREATS                             618788
RELATED_TO                         509114
PRODUCES                           442266
SUBCLASS_OF                        334374
GENE_ASSOCIATED_WITH_CONDITION     265186
PREDISPOSES                        230042
PREVENTS                           120501
DERIVES_INTO                        57223
MANIFESTATION_OF                    38775
PRECEDES                            26585
Name: bl_pred, dtype: int64

In [26]:
edges[(edges.bl_domain == "genomic_entity") & (edges.bl_range == "disease")].bl_pred.value_counts()

GENE_ASSOCIATED_WITH_CONDITION    265186
AFFECTS                           100636
TREATS                             87917
CAUSES                             84318
PREDISPOSES                        49480
PREVENTS                           14385
Name: bl_pred, dtype: int64

In [27]:
print(len(nodes))
nodes = nodes[nodes.ID.isin(set(list(edges['START_ID']) + list(edges['END_ID'])))]
print(len(nodes))

228593
218484


In [28]:
## summary
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(len(edges))
print(len(nodes))
print(len(set(edges.bl_type)))
print(len(set(nodes.bl_type)))
print(len(set(edges.bl_pred)))

14201493
218484
116
9
18


In [29]:
del edges['bl_domain']
del edges['bl_range']

In [30]:
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred,bl_type
0,C0086931,C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,12584106,1,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,15775958;11173168,2,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,12164677;9373700,2,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,23872405,1,False,AFFECTS,activity_and_behavior.AFFECTS.activity_and_beh...


In [31]:
edges.to_csv("edges_biolink.csv", index=None)
nodes.to_csv("nodes_biolink.csv", index=None)