In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
import re
from collections import defaultdict, Counter
from tqdm import tqdm

In [2]:
# Load the filtered edge/node tables and the predicate abbreviation table.
edges = pd.read_csv('edges_filtered.csv')
nodes = pd.read_csv("nodes_filtered.csv")
abv = pd.read_csv("abv.csv")

# Two-way lookup between full predicate names and their abbreviations.
name_abv = {full: short for full, short in zip(abv["full_name"], abv["abv"])}
abv_name = {short: full for short, full in zip(abv["abv"], abv["full_name"])}

# Node ID -> node type (the `blm_type` column of the node table).
node_type = {node_id: blm for node_id, blm in zip(nodes["ID"], nodes["blm_type"])}

In [5]:
# Display how many nodes there are of each `blm_type`.
nodes.blm_type.value_counts()

chemical_substance                59088
disease_or_phenotypic_feature     36967
gene                              18548
biological_entity                 15105
protein                           12883
gross_anatomical_structure         8948
biological_process_or_activity     6887
anatomical_entity                  2831
cell_component                     1636
cell                               1173
activity_and_behavior               916
phenotypic_feature                  416
genomic_entity                      179
Name: blm_type, dtype: int64

In [6]:
# Preview the first rows of the edge table as loaded.
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB


In [7]:
# Re-derive each edge's domain / predicate / range from the current lookups:
# node types come from `node_type`, predicate names from `abv_name`.
edges['bl_domain'] = edges['START_ID'].map(node_type.get)
edges['bl_pred'] = edges['PRED'].map(abv_name.get)
edges['bl_range'] = edges['END_ID'].map(node_type.get)

# "<domain>.<predicate>.<range>" signature for every edge.
domain_and_pred = edges['bl_domain'] + "." + edges['bl_pred']
edges['bl_type'] = domain_and_pred + "." + edges['bl_range']
edges.head()

Unnamed: 0,START_ID,END_ID,TYPE,pmids,n_pmids,NEG,DOMAIN,PRED,RANGE,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,ABafAB,26656404;8816000;19745413;17710714;21412223;19...,6,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,ABafAB,12584106,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,ABafAB,15775958;11173168,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,ABafAB,12164677;9373700,2,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,ABafAB,23872405,1,False,AB,af,AB,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [8]:
# Remove the original type columns now that the bl_* columns exist.
for legacy_column in ('DOMAIN', 'PRED', 'RANGE', 'TYPE'):
    del edges[legacy_column]

In [35]:
# Display the number of edges per predicate name.
edges.bl_pred.value_counts()

LOCATION_OF         1828906
INTERACTS_WITH      1610497
COEXISTS_WITH       1560224
AFFECTS             1330905
PART_OF             1168114
STIMULATES           975601
INHIBITS             800502
ASSOCIATED_WITH      719086
CAUSES               689445
TREATS               591775
PRODUCES             491415
DISRUPTS             441921
ISA                  239894
PREDISPOSES          206403
PREVENTS             112275
CONVERTS_TO           55158
MANIFESTATION_OF      35088
PRECEDES              23411
Name: bl_pred, dtype: int64

In [17]:
# Number of distinct values in the `bl_pred` column.
len(set(edges.bl_pred))

18

In [9]:
# For every predicate, count how often each domain.pred.range signature occurs
# (value_counts sorts each group's counts in descending order).
edges_by_pred = edges.groupby("bl_pred")
pred_type_count = edges_by_pred["bl_type"].value_counts()

In [43]:
# Show (at most) the first 30 signature counts for the PREVENTS predicate.
pred_type_count['PREVENTS'][:30]

bl_type
chemical_substance.PREVENTS.disease_or_phenotypic_feature       72220
biological_entity.PREVENTS.disease_or_phenotypic_feature        20638
gene.PREVENTS.disease_or_phenotypic_feature                     12656
protein.PREVENTS.disease_or_phenotypic_feature                   3512
activity_and_behavior.PREVENTS.disease_or_phenotypic_feature     3240
genomic_entity.PREVENTS.disease_or_phenotypic_feature               9
Name: bl_type, dtype: int64

In [44]:
# Whitelist of node types per predicate, as (allowed_domains, allowed_ranges).
# A `None` side means that side is unrestricted; predicates not listed here
# are not restricted at all.
# working off https://docs.google.com/spreadsheets/d/1zXitcR1QjHyh6WocukgshSR7IoAVg7MJQG-HNh96Jec/edit#gid=579577728
_UNRESTRICTED = None
_DISEASE = 'disease_or_phenotypic_feature'

allowed_domain_range = {
    'CAUSES': (_UNRESTRICTED, {'biological_process_or_activity', _DISEASE}),
    'LOCATION_OF': (
        {'gross_anatomical_structure', 'anatomical_entity', 'cell_component', 'cell'},
        _UNRESTRICTED,
    ),
    'TREATS': (_UNRESTRICTED, {_DISEASE}),
    'PREDISPOSES': (_UNRESTRICTED, {_DISEASE}),
    'PREVENTS': (_UNRESTRICTED, {_DISEASE}),
}

In [48]:
def is_allowed_edge(domain, pred, rnge, rules=None):
    """Tell whether a (domain, predicate, range) triple passes the whitelist.

    Parameters
    ----------
    domain : node type of the edge's start node.
    pred : predicate name; used as the key into the rule table.
    rnge : node type of the edge's end node.
    rules : optional mapping ``pred -> (allowed_domains, allowed_ranges)``.
        When omitted, the notebook-level ``allowed_domain_range`` is used,
        so existing three-argument calls behave as before. Passing the
        table explicitly lets the function run without that global.

    Returns
    -------
    bool
        ``True`` when both sides satisfy the rule for ``pred``. A side whose
        allowed collection is ``None`` (or empty) is unrestricted, and a
        predicate missing from the table is unrestricted on both sides.
    """
    if rules is None:
        rules = allowed_domain_range
    allowed_domain, allowed_range = rules.get(pred, (None, None))

    # Truthiness (not `is None`) is intentional: an empty collection is
    # treated the same as "no restriction", matching the original behaviour.
    if allowed_domain and domain not in allowed_domain:
        return False
    if allowed_range and rnge not in allowed_range:
        return False
    return True

In [49]:
# Evaluate the whitelist once per distinct signature rather than per row.
# Each signature is "<domain>.<pred>.<range>", so splitting on "." yields
# the three arguments of is_allowed_edge.
d = {}
for edge_signature in set(edges.bl_type):
    d[edge_signature] = is_allowed_edge(*edge_signature.split("."))

# Signatures that passed the check.
allowed_edges = {signature for signature, passed in d.items() if passed}

In [51]:
# Keep only the rows whose domain.pred.range signature is in `allowed_edges`,
# printing how many rows pass (True) and fail (False) the filter first.
idx = edges.bl_type.isin(allowed_edges)
print(idx.value_counts())
# Take an explicit copy: later cells assign into this frame (`.loc[...] = ...`
# and new-column assignment), which on a filtered selection would otherwise
# raise pandas' SettingWithCopyWarning.
edges = edges[idx].copy()

True     12853517
False       27103
Name: bl_type, dtype: int64


In [52]:
# Print the current number of rows in `edges`.
print(len(edges))

12853517


In [53]:
# Preview the first rows of the current edge table.
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_domain,bl_pred,bl_range,bl_type
0,C0086931,C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
1,C0950156,C0009671,12584106,1,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
2,C0024810,C0035651,15775958;11173168,2,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
3,C0556297,C0037369,12164677;9373700,2,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...
4,C1154333,C0015745,23872405,1,False,activity_and_behavior,AFFECTS,activity_and_behavior,activity_and_behavior.AFFECTS.activity_and_beh...


In [54]:
# Relabel every CONVERTS_TO edge as DERIVES_INTO.
converts_to_rows = edges["bl_pred"] == "CONVERTS_TO"
edges.loc[converts_to_rows, "bl_pred"] = "DERIVES_INTO"

In [55]:
# Relabel every ISA edge as SUBCLASS_OF.
isa_rows = edges["bl_pred"] == "ISA"
edges.loc[isa_rows, "bl_pred"] = "SUBCLASS_OF"

In [56]:
# Fold DISRUPTS edges into the existing AFFECTS predicate.
disrupts_rows = edges["bl_pred"] == "DISRUPTS"
edges.loc[disrupts_rows, "bl_pred"] = "AFFECTS"

In [57]:
# Relabel every ASSOCIATED_WITH edge as RELATED_TO.
associated_rows = edges["bl_pred"] == "ASSOCIATED_WITH"
edges.loc[associated_rows, "bl_pred"] = "RELATED_TO"

In [58]:
# Relabel every STIMULATES edge as POSITIVELY_REGULATES.
stimulates_rows = edges["bl_pred"] == "STIMULATES"
edges.loc[stimulates_rows, "bl_pred"] = "positively_regulates".upper()

In [59]:
# Relabel every INHIBITS edge as NEGATIVELY_REGULATES.
inhibits_rows = edges["bl_pred"] == "INHIBITS"
edges.loc[inhibits_rows, "bl_pred"] = "negatively_regulates".upper()

In [60]:
# RELATED_TO edges that run from a gene to a disease/phenotypic feature get
# the more specific GENE_ASSOCIATED_WITH_CONDITION predicate; every other
# RELATED_TO edge is left as it is.
is_related_to = edges["bl_pred"] == "RELATED_TO"
from_gene = edges["bl_domain"] == "gene"
to_condition = edges["bl_range"] == "disease_or_phenotypic_feature"
edges.loc[is_related_to & from_gene & to_condition, 'bl_pred'] = 'gene_associated_with_condition'.upper()

In [61]:
# Display the number of edges per predicate after the renames above.
edges.bl_pred.value_counts()

LOCATION_OF                       1807353
AFFECTS                           1772826
INTERACTS_WITH                    1610497
COEXISTS_WITH                     1560224
PART_OF                           1168114
POSITIVELY_REGULATES               975601
NEGATIVELY_REGULATES               800502
CAUSES                             683895
TREATS                             591775
PRODUCES                           491415
RELATED_TO                         479758
SUBCLASS_OF                        239894
GENE_ASSOCIATED_WITH_CONDITION     239328
PREDISPOSES                        206403
PREVENTS                           112275
DERIVES_INTO                        55158
MANIFESTATION_OF                    35088
PRECEDES                            23411
Name: bl_pred, dtype: int64

In [62]:
# Predicate breakdown for edges going from a gene to a disease/phenotypic feature.
gene_to_condition = (edges.bl_domain == "gene") & (edges.bl_range == "disease_or_phenotypic_feature")
edges[gene_to_condition].bl_pred.value_counts()

GENE_ASSOCIATED_WITH_CONDITION    239328
AFFECTS                            91155
TREATS                             80697
CAUSES                             77058
PREDISPOSES                        44171
PART_OF                            41268
PREVENTS                           12656
Name: bl_pred, dtype: int64

In [63]:
# Drop nodes that are no longer referenced by any edge, printing the node
# count before and after.
print(len(nodes))
referenced_ids = set(edges['START_ID']).union(edges['END_ID'])
nodes = nodes[nodes.ID.isin(referenced_ids)]
print(len(nodes))

165577
165450


In [64]:
## summary
# Rebuild the signature column so it reflects the renamed predicates.
domain_and_pred = edges['bl_domain'] + "." + edges['bl_pred']
edges['bl_type'] = domain_and_pred + "." + edges['bl_range']

# Printed in order: edge rows, node rows, distinct signatures,
# distinct predicates, distinct node types.
summary_counts = (
    len(edges),
    len(nodes),
    len(set(edges.bl_type)),
    len(set(edges.bl_pred)),
    len(set(nodes.blm_type)),
)
for summary_count in summary_counts:
    print(summary_count)

12853517
165450
498
18
13


In [65]:
# Remove the helper columns; only `bl_pred` is kept from the bl_* set.
for helper_column in ('bl_type', 'bl_domain', 'bl_range'):
    del edges[helper_column]

In [66]:
# Preview the first rows of the edge table before writing it out.
edges.head()

Unnamed: 0,START_ID,END_ID,pmids,n_pmids,NEG,bl_pred
0,C0086931,C0037369,26656404;8816000;19745413;17710714;21412223;19...,6,False,AFFECTS
1,C0950156,C0009671,12584106,1,False,AFFECTS
2,C0024810,C0035651,15775958;11173168,2,False,AFFECTS
3,C0556297,C0037369,12164677;9373700,2,False,AFFECTS
4,C1154333,C0015745,23872405,1,False,AFFECTS


In [67]:
# Write the final edge and node tables. `index` is documented as a boolean,
# so pass False explicitly (None only worked because it is falsy) to leave
# the DataFrame index out of the files.
edges.to_csv("edges_biolink.csv", index=False)
nodes.to_csv("nodes_biolink.csv", index=False)