04-filter_biolink
 - Filter specific domain and ranges for: CAUSES, LOCATION_OF, TREATS, PREDISPOSES, PREVENTS
 - rename 'converts_to' edge to 'derives_into'
 - rename 'isa' edge to 'subclass of'
 - rename 'disrupts' edge to 'affects'
 - rename 'associated_with' edge to 'related_to'
 - rename 'STIMULATES' edge to 'positively_regulates'
 - rename 'INHIBITS' edge to 'negatively_regulates'
 - associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition
 

In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
edges = pd.read_csv('edges_filtered.csv')
nodes = pd.read_csv("nodes_filtered.csv")
node_type = dict(zip(nodes.ID, nodes.blm_type))

In [3]:
nodes.blm_type.value_counts()

chemical_substance                58824
disease_or_phenotypic_feature     36288
gene                              20695
biological_entity                 14907
protein                           12645
gross_anatomical_structure         8472
biological_process_or_activity     6888
anatomical_entity                  2750
cell_component                     1644
cell                               1099
activity_and_behavior               936
phenotypic_feature                  442
genomic_entity                      175
Name: blm_type, dtype: int64

In [4]:
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG
0,C1412045,AFFECTS,C0023946,20801151,False
1,C1412045,AFFECTS,C0028754,19789049,False
2,C1412045,AFFECTS,C0597304,1409557,False
3,C1412045,AFFECTS,C0599816,7617239,False
4,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False


In [5]:
# generate domain and range from the biolink node types
edges['bl_domain'] = edges.SUBJECT_CUI.apply(node_type.get)
edges['bl_pred'] = edges.PREDICATE
edges['bl_range'] = edges.OBJECT_CUI.apply(node_type.get)
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
edges.head()

Unnamed: 0,SUBJECT_CUI,PREDICATE,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0023946,20801151,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
1,C1412045,AFFECTS,C0028754,19789049,False,gene,AFFECTS,disease_or_phenotypic_feature,gene.AFFECTS.disease_or_phenotypic_feature
2,C1412045,AFFECTS,C0597304,1409557,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
3,C1412045,AFFECTS,C0599816,7617239,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
4,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,gene,ASSOCIATED_WITH,activity_and_behavior,gene.ASSOCIATED_WITH.activity_and_behavior


In [7]:
edges.rename(columns={'PREDICATE': 'SEMMED_PRED'}, inplace=True)

In [8]:
edges.bl_pred.value_counts()

LOCATION_OF         1863675
INTERACTS_WITH      1850601
COEXISTS_WITH       1707696
AFFECTS             1443544
PART_OF             1231194
STIMULATES          1159243
INHIBITS             925338
ASSOCIATED_WITH      813132
CAUSES               716141
TREATS               612205
PRODUCES             553511
DISRUPTS             471637
ISA                  247356
PREDISPOSES          230306
PREVENTS             117892
CONVERTS_TO           61644
MANIFESTATION_OF      36462
PRECEDES              24652
Name: bl_pred, dtype: int64

In [9]:
len(set(edges.bl_pred))

18

In [10]:
# get the most common domain and ranges for each pred
pred_type_count = edges.groupby("bl_pred").bl_type.value_counts()

In [11]:
pred_type_count['PREVENTS'][:30]

bl_type
chemical_substance.PREVENTS.disease_or_phenotypic_feature                73378
biological_entity.PREVENTS.disease_or_phenotypic_feature                 20696
gene.PREVENTS.disease_or_phenotypic_feature                              15899
protein.PREVENTS.disease_or_phenotypic_feature                            3499
activity_and_behavior.PREVENTS.disease_or_phenotypic_feature              3289
chemical_substance.PREVENTS.biological_process_or_activity                 286
disease_or_phenotypic_feature.PREVENTS.disease_or_phenotypic_feature       185
chemical_substance.PREVENTS.activity_and_behavior                          163
biological_entity.PREVENTS.biological_process_or_activity                  112
gene.PREVENTS.biological_process_or_activity                               101
biological_process_or_activity.PREVENTS.disease_or_phenotypic_feature       68
gene.PREVENTS.activity_and_behavior                                         49
biological_entity.PREVENTS.activity_and_beha

In [12]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
allowed_domain_range = {
    'CAUSES': (None, {'biological_process_or_activity', 'disease_or_phenotypic_feature'}),
    'LOCATION_OF': ({'gross_anatomical_structure', 'anatomical_entity', 'cell_component', 'cell'}, None),
    'TREATS': (None, {'disease_or_phenotypic_feature'}),
    'PREDISPOSES': (None, {'disease_or_phenotypic_feature'}),
    'PREVENTS': (None, {'disease_or_phenotypic_feature'}),
}

In [13]:
def is_allowed_edge(domain, pred, rnge):
    allowed_domain, allowed_range = allowed_domain_range[pred] if pred in allowed_domain_range else (None, None)
    return ((domain in allowed_domain if allowed_domain else True) and 
              (rnge in allowed_range if allowed_range else True))

In [14]:
d = {x: is_allowed_edge(*x.split(".")) for x in set(edges.bl_type)}
allowed_edges = {k for k,v in d.items() if v}

In [15]:
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
edges = edges[idx]

True     14033126
False       33103
Name: bl_type, dtype: int64


In [16]:
print(len(edges))

14033126


In [17]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C1412045,AFFECTS,C0023946,20801151,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
1,C1412045,AFFECTS,C0028754,19789049,False,gene,AFFECTS,disease_or_phenotypic_feature,gene.AFFECTS.disease_or_phenotypic_feature
2,C1412045,AFFECTS,C0597304,1409557,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
3,C1412045,AFFECTS,C0599816,7617239,False,gene,AFFECTS,biological_process_or_activity,gene.AFFECTS.biological_process_or_activity
4,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,gene,ASSOCIATED_WITH,activity_and_behavior,gene.ASSOCIATED_WITH.activity_and_behavior


In [18]:
# rename 'converts_to' edge to 'derives_into'
edges.loc[edges.bl_pred == "CONVERTS_TO", "bl_pred"] = "DERIVES_INTO"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [19]:
# rename 'isa' edge to 'subclass of'
edges.loc[edges.bl_pred == "ISA", "bl_pred"] = "SUBCLASS_OF"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [20]:
# rename 'disrupts' edge to 'affects'
edges.loc[edges.bl_pred == "DISRUPTS", "bl_pred"] = "AFFECTS"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [21]:
# rename 'associated_with' edge to 'related_to'
edges.loc[edges.bl_pred == "ASSOCIATED_WITH", "bl_pred"] = "RELATED_TO"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [22]:
# rename 'STIMULATES' edge to 'positively_regulates'
edges.loc[edges.bl_pred == "STIMULATES", "bl_pred"] = "positively_regulates".upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [23]:
# rename 'INHIBITS' edge to 'negatively_regulates'
edges.loc[edges.bl_pred == "INHIBITS", "bl_pred"] = "negatively_regulates".upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [24]:
# associated_with/related_to edges with domain: gene, range: disease; rename to gene_associated_with_condition, 
# and leave the rest alone
edges.loc[(edges.bl_pred == "RELATED_TO") & (edges.bl_domain == "gene") & 
      (edges.bl_range == "disease_or_phenotypic_feature"), 'bl_pred'] = 'gene_associated_with_condition'.upper()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [25]:
edges.bl_pred.value_counts()

AFFECTS                           1915181
INTERACTS_WITH                    1850601
LOCATION_OF                       1843764
COEXISTS_WITH                     1707696
PART_OF                           1231194
POSITIVELY_REGULATES              1159243
NEGATIVELY_REGULATES               925338
CAUSES                             708849
TREATS                             609398
PRODUCES                           553511
RELATED_TO                         511024
GENE_ASSOCIATED_WITH_CONDITION     302108
SUBCLASS_OF                        247356
PREDISPOSES                        228045
PREVENTS                           117060
DERIVES_INTO                        61644
MANIFESTATION_OF                    36462
PRECEDES                            24652
Name: bl_pred, dtype: int64

In [26]:
edges[(edges.bl_domain == "gene") & (edges.bl_range == "disease_or_phenotypic_feature")].bl_pred.value_counts()

GENE_ASSOCIATED_WITH_CONDITION    302108
AFFECTS                           120203
TREATS                            101678
CAUSES                             96478
PREDISPOSES                        60088
PART_OF                            55699
PREVENTS                           15899
NEGATIVELY_REGULATES                 357
PRODUCES                              42
COEXISTS_WITH                         28
INTERACTS_WITH                        26
SUBCLASS_OF                           16
PRECEDES                              13
POSITIVELY_REGULATES                   9
DERIVES_INTO                           4
Name: bl_pred, dtype: int64

In [28]:
print(len(nodes))
nodes = nodes[nodes.ID.isin(set(list(edges['SUBJECT_CUI']) + list(edges['OBJECT_CUI'])))]
print(len(nodes))

165765
165658


In [29]:
## summary
edges['bl_type'] = edges['bl_domain'] + "." + edges['bl_pred'] + "." + edges['bl_range']
print(len(edges))
print(len(nodes))
print(len(set(edges.bl_type)))
print(len(set(edges.bl_pred)))
print(len(set(nodes.blm_type)))

14033126
165658
1179
18
13


In [30]:
del edges['bl_type']
del edges['bl_domain']
del edges['bl_range']

In [31]:
edges.head()

Unnamed: 0,SUBJECT_CUI,SEMMED_PRED,OBJECT_CUI,PMID,NEG,bl_pred
0,C1412045,AFFECTS,C0023946,20801151,False,AFFECTS
1,C1412045,AFFECTS,C0028754,19789049,False,AFFECTS
2,C1412045,AFFECTS,C0597304,1409557,False,AFFECTS
3,C1412045,AFFECTS,C0599816,7617239,False,AFFECTS
4,C1412045,ASSOCIATED_WITH,C0001807,8503828;8240219,False,RELATED_TO


In [32]:
edges.to_csv("edges_biolink.csv", index=None)
nodes.to_csv("nodes_biolink.csv", index=None)