In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
# Load the filtered edge/node tables and the abbreviation table.
# NOTE(review): the edges preview further down shows an "Unnamed: 0" column,
# which suggests the CSV carries a saved index -- confirm whether it should be
# read with index_col=0 before changing anything downstream.
edges = pd.read_csv("edges_filtered.csv")
nodes = pd.read_csv("nodes_filtered.csv")
abv = pd.read_csv("abv.csv")

# Two-way lookups between full names and their abbreviations.
name_abv = dict(zip(abv["full_name"], abv["abv"]))
abv_name = dict(zip(abv["abv"], abv["full_name"]))

# Node ID -> blm_type label.
node_type = dict(zip(nodes["ID"], nodes["blm_type"]))

In [3]:
#nodes
# {name_abv.get(x):x for x in set(nodes.TYPE)}

In [4]:
#edges
# {abv_name.get(x):x for x in set(edges.PRED)}

In [5]:
# How many nodes carry each blm_type label.
nodes["blm_type"].value_counts()

chemical_substance                59088
disease_or_phenotypic_feature     36967
gene                              18548
biological_entity                 15105
protein                           12883
gross_anatomical_structure         8948
biological_process_or_activity     6887
anatomical_entity                  2831
cell_component                     1636
cell                               1173
activity_and_behavior               916
phenotypic_feature                  416
genomic_entity                      179
Name: blm_type, dtype: int64

In [6]:
# Preview the first five rows of the edge table as loaded.
edges.head(5)

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB


In [7]:
# Re-derive domain / predicate / range labels from the lookup tables:
# node IDs go through node_type, the abbreviated predicate through abv_name.
edges["bl_domain"] = edges["START_ID"].map(node_type.get)
edges["bl_pred"] = edges["PRED"].map(abv_name.get)
edges["bl_range"] = edges["END_ID"].map(node_type.get)

# Composite "domain.pred.range" label for each edge.
edges["bl_type"] = (
    edges["bl_domain"] + "." + edges["bl_pred"] + "." + edges["bl_range"]
)
edges.head(5)

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [8]:
# The original typing columns are superseded by the bl_* columns; remove them
# from the frame in place.
edges.drop(columns=["DOMAIN", "PRED", "RANGE", "TYPE"], inplace=True)

In [9]:
# For every predicate, count how often each domain.pred.range label occurs
# (value_counts orders each group from most to least frequent).
pred_type_count = edges.groupby("bl_pred")["bl_type"].value_counts()

In [10]:
# The 15 most frequent edge types for the INHIBITS predicate.
pred_type_count["INHIBITS"].head(15)

bl_type
chemical_substance.INHIBITS.chemical_substance                134550
chemical_substance.INHIBITS.gene                              114917
chemical_substance.INHIBITS.biological_entity                  89829
biological_entity.INHIBITS.gene                                83472
gene.INHIBITS.gene                                             76298
chemical_substance.INHIBITS.protein                            76266
biological_entity.INHIBITS.biological_entity                   49191
protein.INHIBITS.gene                                          38363
biological_entity.INHIBITS.chemical_substance                  33755
biological_entity.INHIBITS.protein                             23778
protein.INHIBITS.chemical_substance                            16009
chemical_substance.INHIBITS.disease_or_phenotypic_feature      15297
protein.INHIBITS.biological_entity                             14677
chemical_substance.INHIBITS.biological_process_or_activity     14111
protein.INHIBITS.protein  

In [11]:
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
#
# Maps each predicate to a pair (allowed domain types, allowed range types).
# A side given as None is unrestricted (any type is accepted).
#
# NOTE(review): several labels used here ("disease", "biological_process",
# "individual_organism", "procedure") do not occur among the blm_type values
# counted earlier in this notebook (which has e.g.
# "disease_or_phenotypic_feature" and "biological_process_or_activity"), and
# "gene" is not listed in any set -- confirm the vocabulary before relying on
# this table for filtering.
allowed_domain_range = {
    "ADMINISTERED_TO": (
        {"chemical_substance", "procedure", "genomic_entity", "protein"},
        {"individual_organism", "anatomical_entity"},
    ),
    "AFFECTS": (None, None),
    "ASSOCIATED_WITH": (
        {"chemical_substance", "disease", "genomic_entity", "biological_process", "protein"},
        {"disease"},
    ),
    "AUGMENTS": (
        {"chemical_substance", "genomic_entity", "disease", "protein"},
        {"biological_process", "disease", "activity_and_behavior"},
    ),
    "CAUSES": (
        {"chemical_substance", "genomic_entity", "disease", "biological_process", "activity_and_behavior"},
        {"biological_process", "disease"},
    ),
    "COEXISTS_WITH": (None, None),
    "COMPLICATES": ({"disease"}, {"disease"}),
    "CONVERTS_TO": (
        {"chemical_substance", "genomic_entity", "protein"},
        {"chemical_substance", "genomic_entity", "protein"},
    ),
    "DISRUPTS": (
        {"chemical_substance", "genomic_entity", "biological_process", "protein"},
        {"biological_process", "disease"},
    ),
    "INHIBITS": (
        {"chemical_substance", "genomic_entity", "protein"},
        {"chemical_substance", "genomic_entity", "biological_process", "disease", "protein"},
    ),
    "INTERACTS_WITH": (
        {"chemical_substance", "genomic_entity", "protein"},
        {"chemical_substance", "genomic_entity", "protein"},
    ),
    "ISA": (None, None),
    "LOCATION_OF": ({"anatomical_entity"}, None),
    "MANIFESTATION_OF": ({"disease"}, {"disease", "biological_process"}),
    "OCCURS_IN": ({"disease"}, {"individual_organism"}),
    "PART_OF": (
        {"chemical_substance", "genomic_entity", "anatomical_entity"},
        {"anatomical_entity", "individual_organism", "chemical_substance", "genomic_entity"},
    ),
    "PRECEDES": (
        {"disease", "biological_process"},
        {"disease", "biological_process"},
    ),
    "PREDISPOSES": (None, {"disease"}),
    "PREVENTS": (
        {"chemical_substance", "genomic_entity", "activity_and_behavior", "protein"},
        {"disease"},
    ),
    "PROCESS_OF": (
        {"disease", "biological_process", "activity_and_behavior"},
        {"individual_organism", "anatomical_entity", "disease"},
    ),
    "PRODUCES": (
        {"chemical_substance", "anatomical_entity", "genomic_entity", "biological_process", "protein"},
        {"chemical_substance", "genomic_entity", "protein"},
    ),
    "STIMULATES": (
        {"chemical_substance", "genomic_entity", "protein"},
        {"chemical_substance", "genomic_entity", "biological_process", "disease", "protein"},
    ),
    "TREATS": (
        {"chemical_substance", "genomic_entity", "activity_and_behavior", "protein"},
        {"disease"},
    ),
}

In [12]:
def is_allowed_edge(domain, pred, rnge, rules=None):
    """Check a (domain, predicate, range) triple against a rule table.

    Parameters
    ----------
    domain : str
        Type label of the edge's start node.
    pred : str
        Predicate name; must be a key of the rule table.
    rnge : str
        Type label of the edge's end node.
    rules : dict, optional
        Mapping ``pred -> (allowed_domains, allowed_ranges)``. Each side is
        either a collection of permitted type labels or ``None`` meaning
        "unrestricted". Defaults to the notebook-level ``allowed_domain_range``.

    Returns
    -------
    bool
        True when both the domain and the range are permitted for ``pred``.

    Raises
    ------
    KeyError
        If ``pred`` has no entry in the rule table.
    """
    if rules is None:
        # Looked up lazily so the function can be defined (and tested)
        # without the notebook-level table being present.
        rules = allowed_domain_range
    allowed_domain, allowed_range = rules[pred]
    # Only None means "anything goes"; an empty collection permits nothing.
    domain_ok = allowed_domain is None or domain in allowed_domain
    range_ok = allowed_range is None or rnge in allowed_range
    return domain_ok and range_ok

In [13]:
# Evaluate every distinct "domain.pred.range" label once against the rule
# table, then keep the labels that pass.
d = {
    label: is_allowed_edge(*label.split("."))
    for label in set(edges["bl_type"])
}
allowed_edges = {label for label, permitted in d.items() if permitted}

### Note: the domain and range restrictions are NOT applied to the edges!

In [14]:
# Total number of edges (rows) at this point.
print(edges.shape[0])

12880620


In [15]:
# Preview the first five rows after the legacy typing columns were removed.
edges.head(5)

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,12584106,1,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,15775958;11173168,2,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,12164677;9373700,2,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,23872405,1,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [16]:
# Relabel every CONVERTS_TO predicate as DERIVES_INTO.
edges.loc[edges["bl_pred"].eq("CONVERTS_TO"), "bl_pred"] = "DERIVES_INTO"

In [17]:
# Relabel every ISA predicate as SUBCLASS_OF.
edges.loc[edges["bl_pred"].eq("ISA"), "bl_pred"] = "SUBCLASS_OF"

In [18]:
# Fold DISRUPTS edges into the existing AFFECTS predicate.
edges.loc[edges["bl_pred"].eq("DISRUPTS"), "bl_pred"] = "AFFECTS"

In [19]:
# Relabel every ASSOCIATED_WITH predicate as RELATED_TO.
edges.loc[edges["bl_pred"].eq("ASSOCIATED_WITH"), "bl_pred"] = "RELATED_TO"

In [20]:
# Relabel every STIMULATES predicate as POSITIVELY_REGULATES.
edges.loc[edges["bl_pred"].eq("STIMULATES"), "bl_pred"] = "positively_regulates".upper()

In [21]:
# Relabel every INHIBITS predicate as NEGATIVELY_REGULATES.
edges.loc[edges["bl_pred"].eq("INHIBITS"), "bl_pred"] = "negatively_regulates".upper()

In [22]:
# RELATED_TO edges that run from a gene to a disease_or_phenotypic_feature get
# the more specific GENE_ASSOCIATED_WITH_CONDITION predicate; all other
# RELATED_TO edges are left untouched.
edges.loc[
    edges["bl_pred"].eq("RELATED_TO")
    & edges["bl_domain"].eq("gene")
    & edges["bl_range"].eq("disease_or_phenotypic_feature"),
    "bl_pred",
] = "gene_associated_with_condition".upper()

In [23]:
# Edge counts per predicate after the renames.
edges["bl_pred"].value_counts()

LOCATION_OF                       1828906
AFFECTS                           1772826
INTERACTS_WITH                    1610497
COEXISTS_WITH                     1560224
PART_OF                           1168114
POSITIVELY_REGULATES               975601
NEGATIVELY_REGULATES               800502
CAUSES                             689445
TREATS                             591775
PRODUCES                           491415
RELATED_TO                         479758
SUBCLASS_OF                        239894
GENE_ASSOCIATED_WITH_CONDITION     239328
PREDISPOSES                        206403
PREVENTS                           112275
DERIVES_INTO                        55158
MANIFESTATION_OF                    35088
PRECEDES                            23411
Name: bl_pred, dtype: int64

In [25]:
# Predicate distribution restricted to gene -> disease_or_phenotypic_feature edges.
edges.loc[
    edges["bl_domain"].eq("gene")
    & edges["bl_range"].eq("disease_or_phenotypic_feature"),
    "bl_pred",
].value_counts()

GENE_ASSOCIATED_WITH_CONDITION    239328
AFFECTS                            91155
TREATS                             80697
CAUSES                             77058
PREDISPOSES                        44171
PART_OF                            41268
PREVENTS                           12656
LOCATION_OF                         1219
Name: bl_pred, dtype: int64

In [26]:
# Keep only nodes that appear as an endpoint of at least one edge, printing the
# node count before and after.
print(len(nodes))
nodes = nodes[nodes["ID"].isin(set(edges["START_ID"]).union(edges["END_ID"]))]
print(len(nodes))

165577
165577


In [31]:
## summary
# Rebuild the composite label so it reflects the renamed predicates.
# This needs bl_domain / bl_range, which a later cell deletes -- run the
# notebook top to bottom, otherwise this cell raises KeyError: 'bl_domain'.
edges["bl_type"] = (
    edges["bl_domain"] + "." + edges["bl_pred"] + "." + edges["bl_range"]
)
print(len(edges))                  # edges
print(len(nodes))                  # nodes
print(len(set(edges["bl_type"])))  # distinct domain.pred.range labels
print(len(set(edges["bl_pred"])))  # distinct predicates
print(len(set(nodes["blm_type"]))) # distinct node types

KeyError: 'bl_domain'

In [29]:
# Remove the helper typing columns in place; the preview that follows shows
# only bl_pred remaining among the bl_* columns.
edges.drop(columns=["bl_type", "bl_domain", "bl_range"], inplace=True)

In [30]:
# Preview the first five rows of the edge table before it is written out.
edges.head(5)

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred
0,C0086931,C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,AFFECTS
1,C0950156,C0009671,12584106,1,False,AFFECTS
2,C0024810,C0035651,15775958;11173168,2,False,AFFECTS
3,C0556297,C0037369,12164677;9373700,2,False,AFFECTS
4,C1154333,C0015745,23872405,1,False,AFFECTS


In [32]:
# Write the final edge and node tables without the pandas row index.
# `index` is documented as a bool; pass False explicitly instead of relying on
# None being falsy.
edges.to_csv("edges_biolink.csv", index=False)
nodes.to_csv("nodes_biolink.csv", index=False)