This notebook loads all edges and nodes of the selected knowledge graph. It creates new dataframes for the nodes and edges, adds an integer index value for each node, and saves the results as CSV files.

# Import Libraries

In [19]:
import pandas as pd

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [20]:
# Dataset selector: 1 = original knowledge graph, 2 = restructured knowledge graph.
dataset_nr = 2
# Use a membership test: `dataset_nr == 1 or 2` is always truthy and never fails.
assert dataset_nr in (1, 2), f'dataset_nr must be 1 or 2, got {dataset_nr!r}'

# Prefix of the disease-specific files and output sub-directory.
disease_prefix = 'oi'
# Same pitfall as above: `x == 'dmd' or 'hd' or 'oi'` is always truthy.
assert disease_prefix in ('dmd', 'hd', 'oi'), (
    f"disease_prefix must be 'dmd', 'hd' or 'oi', got {disease_prefix!r}"
)

# File-name prefix that distinguishes the two knowledge graph variants.
if dataset_nr == 1:
    prefix = 'prev'
else:
    prefix = 'restr'

# Input file names; they are read from ../output/<disease_prefix>/ further below.
FILE_EDGES = f'{prefix}_{disease_prefix}_kg_edges.csv'
FILE_NODES = f'{prefix}_{disease_prefix}_kg_nodes.csv'

# Load Edges

Load all edges of the knowledge graph.

In [21]:
# Read the edge table of the selected knowledge graph; the first row holds the column names.
edges_path = f'../output/{disease_prefix}/{FILE_EDGES}'
edges = pd.read_csv(edges_path, header=0)
edges.head(10)

Unnamed: 0,id,subject,object,relation_id,relation_label,relation_iri
0,1eb042d3ad2f1cda0a602fc5be9a0793,MGI:95489,HP:0100625,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
1,e19ec199da3a008272d887dd92608f6d,ZFIN:ZDB-FISH-180503-6,ZFIN:ZDB-GENE-030131-9102,CustomRO:expressesgene,expresses gene,
2,08dfe632061ba01924e0a5449e0e1635,MGI:88462,GO:0062023,RO:0002327,enables,http://purl.obolibrary.org/obo/RO_0002327
3,d7ec998ef2bc79d0f8ec1bd54549766c,ENSEMBL:ENSCAFG00000005814,ENSEMBL:ENSMMUG00000062243,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
4,e34e770be18a79dd2c2efacdff419d42,MGI:96610,MP:0001711,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
5,fc1ab0f2d8a83dc1e3c534a748e692ed,MGI:2155345,MGI:1315208,RO:0002434,interacts with,
6,6c0b0f32cede7310763d6ee3fb0b3dab,ENSEMBL:ENSMMUG00000011948,RGD:1309313,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
7,fd653ebfa50e980211bb34aa209daf29,ZFIN:ZDB-GENE-100119-1,HGNC:20821,RO:HOM0000017,in orthology relationship with,
8,94a6b72e81a9880efb2599853dbb5df0,HGNC:1476,NCBITaxon:9606,CustomRO:foundin,found in,
9,49b21d02405b4623e72b6c1da69c6592,HGNC:2214,HGNC:2212,RO:0002434,interacts with,


Extract the relevant columns and rename them

In [22]:
# Keep only the triple columns and rename them to head / relation / tail.
# A chained, non-inplace rename returns a fresh dataframe, so later column
# assignments on `edges` do not operate on a slice of the loaded table
# (avoids pandas' SettingWithCopyWarning).
edges = (
    edges[['subject', 'relation_label', 'object']]
    .rename(columns={'subject': 'head', 'relation_label': 'relation', 'object': 'tail'})
)
edges.head(10)

Unnamed: 0,head,relation,tail
0,MGI:95489,causes condition,HP:0100625
1,ZFIN:ZDB-FISH-180503-6,expresses gene,ZFIN:ZDB-GENE-030131-9102
2,MGI:88462,enables,GO:0062023
3,ENSEMBL:ENSCAFG00000005814,in orthology relationship with,ENSEMBL:ENSMMUG00000062243
4,MGI:96610,causes condition,MP:0001711
5,MGI:2155345,interacts with,MGI:1315208
6,ENSEMBL:ENSMMUG00000011948,in orthology relationship with,RGD:1309313
7,ZFIN:ZDB-GENE-100119-1,in orthology relationship with,HGNC:20821
8,HGNC:1476,found in,NCBITaxon:9606
9,HGNC:2214,interacts with,HGNC:2212


In [23]:
def convert_to_int_str(value):
    """Normalise numeric-looking identifiers to a plain integer string.

    Values that parse as a finite number (e.g. ``4942``, ``4942.0`` or
    ``'4942.0'``) are truncated to an integer and returned as ``str``.
    Every other value (identifiers such as ``'MGI:95489'``, ``None``, NaN,
    infinities) is returned unchanged.

    Parameters
    ----------
    value : object
        A single identifier taken from an edge's head or tail column.

    Returns
    -------
    object
        The integer string for numeric input, otherwise ``value`` itself.
    """
    try:
        return str(int(float(value)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric strings and NaN
        # TypeError: None or other non-numeric objects
        # OverflowError: positive/negative infinity
        return value

# Normalise numeric-looking identifiers on both ends of every edge.
for endpoint in ('head', 'tail'):
    edges[endpoint] = edges[endpoint].apply(convert_to_int_str)

In [24]:
# Report the number of rows (edges) in the edge table.
num_edges = len(edges)
print(f'There are {num_edges} edges')

There are 97068 edges


# Load Nodes

Load all nodes of the knowledge graph and store each node's row position as `index_id`.

In [25]:
# Read the node table and expose each row's position as an explicit `index_id` column.
nodes_path = f'../output/{disease_prefix}/{FILE_NODES}'
nodes = pd.read_csv(nodes_path, header=0).assign(index_id=lambda frame: frame.index)
nodes.head(10)

Unnamed: 0,id,label,iri,semantic,index_id
0,ZP:0000122,"notochord undulate, abnormal",,phenotype,0
1,MP:0011523,thin placenta labyrinth,,phenotype,1
2,ENSEMBL:ENSPTRG00000018033,ENSEMBL:ENSPTRG00000018033,,gene,2
3,MP:0008813,decreased common myeloid progenitor cell number,,phenotype,3
4,ENSEMBL:ENSACAG00000017768,ENSEMBL:ENSACAG00000017768,,gene,4
5,ENSEMBL:ENSBTAG00000007678,ENSEMBL:ENSBTAG00000007678,,gene,5
6,4942,lenvatinib,,drug,6
7,MP:0009141,increased prepulse inhibition,,phenotype,7
8,Xenbase:XB-GENE-5848106,camkmt,,gene,8
9,ZP:0003162,"peristalsis disrupted, abnormal",,phenotype,9


Extract relevant columns

In [26]:
# Keep only the columns needed downstream. The explicit .copy() makes `nodes`
# an independent dataframe, so adding columns to it afterwards does not
# trigger pandas' SettingWithCopyWarning.
nodes = nodes[['index_id', 'id', 'semantic', 'label']].copy()
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label
0,0,ZP:0000122,phenotype,"notochord undulate, abnormal"
1,1,MP:0011523,phenotype,thin placenta labyrinth
2,2,ENSEMBL:ENSPTRG00000018033,gene,ENSEMBL:ENSPTRG00000018033
3,3,MP:0008813,phenotype,decreased common myeloid progenitor cell number
4,4,ENSEMBL:ENSACAG00000017768,gene,ENSEMBL:ENSACAG00000017768
5,5,ENSEMBL:ENSBTAG00000007678,gene,ENSEMBL:ENSBTAG00000007678
6,6,4942,drug,lenvatinib
7,7,MP:0009141,phenotype,increased prepulse inhibition
8,8,Xenbase:XB-GENE-5848106,gene,camkmt
9,9,ZP:0003162,phenotype,"peristalsis disrupted, abnormal"


In [27]:
# Report the number of rows (nodes) in the node table.
num_nodes = len(nodes)
print(f'There are {num_nodes} nodes')

There are 9897 nodes


Convert semantic groups from categorical to numerical values (`semantic_id`)

In [28]:
# Turn the semantic group into a pandas categorical and store its integer codes.
semantic_as_category = nodes['semantic'].astype('category')
nodes['semantic'] = semantic_as_category
nodes['semantic_id'] = semantic_as_category.cat.codes
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,ZP:0000122,phenotype,"notochord undulate, abnormal",8
1,1,MP:0011523,phenotype,thin placenta labyrinth,8
2,2,ENSEMBL:ENSPTRG00000018033,gene,ENSEMBL:ENSPTRG00000018033,4
3,3,MP:0008813,phenotype,decreased common myeloid progenitor cell number,8
4,4,ENSEMBL:ENSACAG00000017768,gene,ENSEMBL:ENSACAG00000017768,4
5,5,ENSEMBL:ENSBTAG00000007678,gene,ENSEMBL:ENSBTAG00000007678,4
6,6,4942,drug,lenvatinib,3
7,7,MP:0009141,phenotype,increased prepulse inhibition,8
8,8,Xenbase:XB-GENE-5848106,gene,camkmt,4
9,9,ZP:0003162,phenotype,"peristalsis disrupted, abnormal",8


Create a dictionary that maps each `semantic_id` to its `semantic` group name.

In [29]:
# Map every category code (its position in the category list) to the category name.
semantic_categories = nodes['semantic'].cat.categories
node_semantic_dict = {code: name for code, name in enumerate(semantic_categories)}
node_semantic_dict

{0: 'biological artifact',
 1: 'biological process',
 2: 'disease',
 3: 'drug',
 4: 'gene',
 5: 'gene product',
 6: 'genotype',
 7: 'molecular function',
 8: 'phenotype',
 9: 'taxon',
 10: 'variant'}

Save the new dataframe containing all nodes to a CSV file.

In [30]:
# Write the indexed node table next to the input files, without the dataframe index.
indexed_nodes_path = f'../output/{disease_prefix}/{prefix}_{disease_prefix}_indexed_nodes.csv'
nodes.to_csv(indexed_nodes_path, index=False)

# Joining Dataframes

Merge dataframes with nodes and edges in order to join semantic classes and indices of nodes with heads and tails of the edges.

In [31]:
# Attach label, semantic class and node index of every edge's head node.
# how='inner' keeps only edges whose head id occurs in `nodes`.
# The chained, non-inplace rename yields a fresh dataframe instead of
# mutating a column-subset slice (avoids pandas' SettingWithCopyWarning).
kg_df = (
    pd.merge(edges, nodes,
             left_on='head', right_on='id',
             how='inner')
    [['head', 'label', 'semantic_id', 'index_id', 'relation', 'tail']]
    .rename(columns={'label': 'label_head', 'semantic_id': 'class_head', 'index_id': 'index_head'})
)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail
0,MGI:95489,Fbn1,4,9118,causes condition,HP:0100625
1,ZFIN:ZDB-FISH-180503-6,col1a1a<dmh13>/col1a1a<+> [unspecified backgro...,6,7602,expresses gene,ZFIN:ZDB-GENE-030131-9102
2,MGI:88462,Col7a1,4,9869,enables,GO:0062023
3,ENSEMBL:ENSCAFG00000005814,ENSEMBL:ENSCAFG00000005814,4,8764,in orthology relationship with,ENSEMBL:ENSMMUG00000062243
4,MGI:96610,Itgb1,4,9426,causes condition,MP:0001711
5,MGI:2155345,Col26a1,4,5110,interacts with,MGI:1315208
6,ENSEMBL:ENSMMUG00000011948,ENSEMBL:ENSMMUG00000011948,4,9811,in orthology relationship with,RGD:1309313
7,ZFIN:ZDB-GENE-100119-1,col27a1a,4,3955,in orthology relationship with,HGNC:20821
8,HGNC:1476,CAPN1,4,4091,found in,NCBITaxon:9606
9,HGNC:2214,COL7A1,4,3013,interacts with,HGNC:2212


In [32]:
# Attach label, semantic class and node index of every edge's tail node.
# how='left' keeps every edge; a tail id without a match in `nodes` gets NaN
# in the tail columns.
# NOTE(review): the head merge uses how='inner' while this one uses 'left' -
# confirm that the different handling of unmatched ids is intended.
# The chained, non-inplace rename yields a fresh dataframe, so later column
# assignments on `kg_df` do not trigger pandas' SettingWithCopyWarning.
kg_df = (
    pd.merge(kg_df, nodes,
             left_on='tail', right_on='id',
             how='left')
    [['head', 'label_head', 'class_head', 'index_head', 'relation', 'tail', 'label', 'semantic_id', 'index_id']]
    .rename(columns={'label': 'label_tail', 'semantic_id': 'class_tail', 'index_id': 'index_tail'})
)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail
0,MGI:95489,Fbn1,4,9118,causes condition,HP:0100625,Enlarged thorax,8,7613
1,ZFIN:ZDB-FISH-180503-6,col1a1a<dmh13>/col1a1a<+> [unspecified backgro...,6,7602,expresses gene,ZFIN:ZDB-GENE-030131-9102,col1a1a,4,2303
2,MGI:88462,Col7a1,4,9869,enables,GO:0062023,collagen-containing extracellular matrix,7,5027
3,ENSEMBL:ENSCAFG00000005814,ENSEMBL:ENSCAFG00000005814,4,8764,in orthology relationship with,ENSEMBL:ENSMMUG00000062243,ENSEMBL:ENSMMUG00000062243,4,8347
4,MGI:96610,Itgb1,4,9426,causes condition,MP:0001711,abnormal placenta morphology,8,1890
5,MGI:2155345,Col26a1,4,5110,interacts with,MGI:1315208,P3h3,4,5108
6,ENSEMBL:ENSMMUG00000011948,ENSEMBL:ENSMMUG00000011948,4,9811,in orthology relationship with,RGD:1309313,Mfsd13a,4,3684
7,ZFIN:ZDB-GENE-100119-1,col27a1a,4,3955,in orthology relationship with,HGNC:20821,COL24A1,4,9777
8,HGNC:1476,CAPN1,4,4091,found in,NCBITaxon:9606,Homo sapiens,9,2582
9,HGNC:2214,COL7A1,4,3013,interacts with,HGNC:2212,COL6A2,4,617


Encode string values of relation labels as numeric values.

In [33]:
# Encode each relation label as an integer code; `relation_labels` holds the
# unique labels, positioned by their code.
relation_codes, relation_labels = pd.factorize(kg_df['relation'])
kg_df['type'] = relation_codes
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,MGI:95489,Fbn1,4,9118,causes condition,HP:0100625,Enlarged thorax,8,7613,0
1,ZFIN:ZDB-FISH-180503-6,col1a1a<dmh13>/col1a1a<+> [unspecified backgro...,6,7602,expresses gene,ZFIN:ZDB-GENE-030131-9102,col1a1a,4,2303,1
2,MGI:88462,Col7a1,4,9869,enables,GO:0062023,collagen-containing extracellular matrix,7,5027,2
3,ENSEMBL:ENSCAFG00000005814,ENSEMBL:ENSCAFG00000005814,4,8764,in orthology relationship with,ENSEMBL:ENSMMUG00000062243,ENSEMBL:ENSMMUG00000062243,4,8347,3
4,MGI:96610,Itgb1,4,9426,causes condition,MP:0001711,abnormal placenta morphology,8,1890,0
5,MGI:2155345,Col26a1,4,5110,interacts with,MGI:1315208,P3h3,4,5108,4
6,ENSEMBL:ENSMMUG00000011948,ENSEMBL:ENSMMUG00000011948,4,9811,in orthology relationship with,RGD:1309313,Mfsd13a,4,3684,3
7,ZFIN:ZDB-GENE-100119-1,col27a1a,4,3955,in orthology relationship with,HGNC:20821,COL24A1,4,9777,3
8,HGNC:1476,CAPN1,4,4091,found in,NCBITaxon:9606,Homo sapiens,9,2582,5
9,HGNC:2214,COL7A1,4,3013,interacts with,HGNC:2212,COL6A2,4,617,4


In [34]:
# Show the unique relation labels returned by pd.factorize; the position of a
# label in this index is its integer code in kg_df['type'].
relation_labels

Index(['causes condition', 'expresses gene', 'enables',
       'in orthology relationship with', 'interacts with', 'found in',
       'contributes to condition', 'is substance that treats', 'targets',
       'associated with phenotype', 'has affected feature', 'involved in',
       'likely causes condition', 'colocalizes with', 'is allele of',
       'has genotype', 'has role in modeling', 'is variant in', 'is of',
       'is product of'],
      dtype='object')

Save the new dataframe containing all edges to a CSV file.

In [35]:
# Write the indexed edge table next to the input files, without the dataframe index.
indexed_edges_path = f'../output/{disease_prefix}/{prefix}_{disease_prefix}_indexed_edges.csv'
kg_df.to_csv(indexed_edges_path, index=False)