This notebook loads all edges and nodes of the relevant knowledge graph. It builds new dataframes for the nodes and the edges in which every node is assigned an integer index, and saves both dataframes as CSV files.

# Import Libraries

In [3]:
import pandas as pd

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [2]:
dataset_nr = 1
# `dataset_nr == 1 or 2` parses as `(dataset_nr == 1) or 2`, which is always
# truthy, so that check could never fail. Test membership instead.
assert dataset_nr in (1, 2), f'dataset_nr must be 1 or 2, got {dataset_nr!r}'

# Dataset 1 uses the 'prev' file prefix, dataset 2 uses 'new'.
if dataset_nr == 1:
    prefix = 'prev'
else:
    prefix = 'new'

# File names of the raw edge and node tables for the selected dataset.
FILE_EDGES = f'{prefix}_kg_edges.csv'
FILE_NODES = f'{prefix}_kg_nodes.csv'

# Load Edges

Load all edges of the knowledge graph.

In [4]:
# Read the raw edge table; the first row of the CSV supplies the column names.
edges = pd.read_csv(
    f'output/{FILE_EDGES}',
    header=0,
)
# Preview the first ten edges.
edges.head(10)

Unnamed: 0,id,subject,object,relation_id,relation_label,relation_iri
0,MONARCH75386,ZFIN:ZDB-GENE-050626-112,FlyBase:FBgn0085464,RO:HOM0000017,in orthology relationship with,
1,MONARCH48847,ENSEMBL:ENSCAFG00000007343,FlyBase:FBgn0026598,RO:HOM0000017,in orthology relationship with,
2,MONARCH41069,ENSEMBL:ENSECAG00000018600,MGI:87906,RO:HOM0000020,in 1 to 1 orthology relationship with,
3,MONARCH9916,ENSEMBL:ENSXETG00000039919,ENSEMBL:ENSECAG00000016277,RO:HOM0000020,in 1 to 1 orthology relationship with,
4,MONARCH51257,MGI:94909,MP:0004819,RO:0002200,has phenotype,
5,MONARCH59350,ENSEMBL:ENSFCAG00000014942,Xenbase:XB-GENE-6041092,RO:HOM0000017,in orthology relationship with,
6,MONARCH1688,FlyBase:FBgn0283681,FBcv:0002002,RO:0002200,has phenotype,
7,MONARCH53712,ZFIN:ZDB-GENE-070412-4,ZP:0005599,RO:0002200,has phenotype,
8,MONARCH40288,WormBase:WBGene00003930,GO:0005515,RO:0002327,enables,
9,MONARCH69749,ENSEMBL:ENSSSCG00000006344,ZFIN:ZDB-GENE-081024-1,RO:HOM0000020,in 1 to 1 orthology relationship with,


Extract the relevant columns and rename them

In [5]:
# Keep only the triple columns and rename them to head / relation / tail.
# The rename is chained instead of using `inplace=True`: renaming a
# column-subset frame in place can raise pandas' SettingWithCopyWarning.
edges = (
    edges[['subject', 'relation_label', 'object']]
    .rename(columns={'subject': 'head', 'relation_label': 'relation', 'object': 'tail'})
)
edges.head(10)

Unnamed: 0,head,relation,tail
0,ZFIN:ZDB-GENE-050626-112,in orthology relationship with,FlyBase:FBgn0085464
1,ENSEMBL:ENSCAFG00000007343,in orthology relationship with,FlyBase:FBgn0026598
2,ENSEMBL:ENSECAG00000018600,in 1 to 1 orthology relationship with,MGI:87906
3,ENSEMBL:ENSXETG00000039919,in 1 to 1 orthology relationship with,ENSEMBL:ENSECAG00000016277
4,MGI:94909,has phenotype,MP:0004819
5,ENSEMBL:ENSFCAG00000014942,in orthology relationship with,Xenbase:XB-GENE-6041092
6,FlyBase:FBgn0283681,has phenotype,FBcv:0002002
7,ZFIN:ZDB-GENE-070412-4,has phenotype,ZP:0005599
8,WormBase:WBGene00003930,enables,GO:0005515
9,ENSEMBL:ENSSSCG00000006344,in 1 to 1 orthology relationship with,ZFIN:ZDB-GENE-081024-1


In [6]:
# Report the number of rows in the edge table.
print(f'There are {edges.shape[0]} edges')

There are 82944 edges


# Load Nodes

Load all nodes of the knowledge graph.

In [7]:
# Read the node table and store each node's row index in a new `index_id` column.
nodes = (
    pd.read_csv(f'output/{FILE_NODES}', header=0)
    .assign(index_id=lambda frame: frame.index)
)
nodes.head(10)

Unnamed: 0,id,label,iri,semantic,taxon_id,taxon_label,index_id
0,WormBase:WBGene00000389,cdc-25.4,,ORTH,,,0
1,ZP:0018675,right side lateral plate mesoderm mislocalised...,,DISO,,,1
2,ZFIN:ZDB-GENE-040426-1197,tbc1d5,,ORTH,,,2
3,5,(S)-nicardipine,,DRUG,,,3
4,RGD:3443,Ptk2,,ORTH,,,4
5,dbSNP:rs73460075,rs73460075-C,,GENO,,,5
6,6,(S)-nitrendipine,,DRUG,,,6
7,HP:0000997,Axillary freckling,,DISO,,,7
8,ZP:0105707,"Reissner's fiber malformed, abnormal",,DISO,,,8
9,MESH:C580853,cannabidivarin,,DRUG,,,9


Extract relevant columns

In [9]:
# Keep only the columns needed downstream, in a fixed order.
# `.copy()` makes the result an independent frame, so the column assignments
# made to `nodes` in later cells do not write into a subset of the loaded
# table (which raises pandas' SettingWithCopyWarning).
nodes = nodes[['index_id', 'id', 'semantic', 'label']].copy()
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label
0,0,WormBase:WBGene00000389,ORTH,cdc-25.4
1,1,ZP:0018675,DISO,right side lateral plate mesoderm mislocalised...
2,2,ZFIN:ZDB-GENE-040426-1197,ORTH,tbc1d5
3,3,5,DRUG,(S)-nicardipine
4,4,RGD:3443,ORTH,Ptk2
5,5,dbSNP:rs73460075,GENO,rs73460075-C
6,6,6,DRUG,(S)-nitrendipine
7,7,HP:0000997,DISO,Axillary freckling
8,8,ZP:0105707,DISO,"Reissner's fiber malformed, abnormal"
9,9,MESH:C580853,DRUG,cannabidivarin


In [10]:
# Report the number of rows in the node table.
print(f'There are {nodes.shape[0]} nodes')

There are 10034 nodes


Convert semantic groups from categorical to numerical values (`semantic_id`)

In [12]:
# Cast the semantic group to a categorical once and reuse it for both columns.
semantic_as_category = nodes['semantic'].astype('category')
nodes['semantic'] = semantic_as_category
# The integer code of each category becomes the numeric `semantic_id`.
nodes['semantic_id'] = semantic_as_category.cat.codes
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,WormBase:WBGene00000389,ORTH,cdc-25.4,5
1,1,ZP:0018675,DISO,right side lateral plate mesoderm mislocalised...,1
2,2,ZFIN:ZDB-GENE-040426-1197,ORTH,tbc1d5,5
3,3,5,DRUG,(S)-nicardipine,2
4,4,RGD:3443,ORTH,Ptk2,5
5,5,dbSNP:rs73460075,GENO,rs73460075-C,4
6,6,6,DRUG,(S)-nitrendipine,2
7,7,HP:0000997,DISO,Axillary freckling,1
8,8,ZP:0105707,DISO,"Reissner's fiber malformed, abnormal",1
9,9,MESH:C580853,DRUG,cannabidivarin,2


Create a dictionary that can be used as mapping between `semantic_id` and `semantic`

In [13]:
# Map each numeric code to its semantic group name, following category order.
node_semantic_dict = {
    code: group
    for code, group in enumerate(nodes['semantic'].cat.categories)
}
node_semantic_dict

{0: 'ANAT',
 1: 'DISO',
 2: 'DRUG',
 3: 'GENE',
 4: 'GENO',
 5: 'ORTH',
 6: 'PHYS',
 7: 'VARI'}

Save the new dataframe containing all nodes to a CSV file.

In [14]:
# Write the indexed node table to the output folder, one file per dataset
# number; the DataFrame index is not written.
nodes.to_csv(f'output/indexed_nodes_{dataset_nr}.csv', index=False)

# Joining Dataframes

Merge dataframes with nodes and edges in order to join semantic classes and indices of nodes with heads and tails of the edges.

In [15]:
# Attach node attributes to the head of every edge. The inner join keeps only
# edges whose head id occurs in `nodes`.
# The rename is chained rather than done with `inplace=True`, because renaming
# a column-subset frame in place can raise pandas' SettingWithCopyWarning.
kg_df = (
    pd.merge(edges, nodes, left_on='head', right_on='id', how='inner')
    [['head', 'label', 'semantic_id', 'index_id', 'relation', 'tail']]
    .rename(columns={'label': 'label_head', 'semantic_id': 'class_head', 'index_id': 'index_head'})
)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail
0,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0085464
1,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,HGNC:7585
2,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0002772
3,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,NCBIGene:396472
4,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSECAG00000020967
5,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSACAG00000017407
6,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,MGI:97267
7,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,RGD:1591197
8,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSFCAG00000003878
9,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSMODG00000010887


In [16]:
# Attach node attributes to the tail of every edge.
# NOTE(review): this is a left join while the head join is an inner join, so
# an edge whose tail id is missing from `nodes` is kept with NaN tail
# attributes — confirm this asymmetry is intended.
# The rename is chained rather than done with `inplace=True`, because renaming
# a column-subset frame in place can raise pandas' SettingWithCopyWarning.
kg_df = (
    pd.merge(kg_df, nodes, left_on='tail', right_on='id', how='left')
    [['head', 'label_head', 'class_head', 'index_head', 'relation', 'tail', 'label', 'semantic_id', 'index_id']]
    .rename(columns={'label': 'label_tail', 'semantic_id': 'class_tail', 'index_id': 'index_tail'})
)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail
0,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0085464,CG34435,5,6825
1,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,HGNC:7585,MYL4,3,27
2,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0002772,Mlc1,5,8901
3,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,NCBIGene:396472,MYL4,3,9508
4,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSECAG00000020967,ENSEMBL:ENSECAG00000020967,5,8807
5,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSACAG00000017407,ENSEMBL:ENSACAG00000017407,5,6449
6,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,MGI:97267,Myl4,5,904
7,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,RGD:1591197,Myl4,5,9266
8,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSFCAG00000003878,ENSEMBL:ENSFCAG00000003878,5,660
9,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSMODG00000010887,ENSEMBL:ENSMODG00000010887,5,8400


Encode string values of relation labels as numeric values.

In [17]:
# factorize yields one integer code per row plus the distinct relation labels.
relation_codes, relation_labels = pd.factorize(kg_df['relation'])
kg_df['type'] = relation_codes
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0085464,CG34435,5,6825,0
1,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,HGNC:7585,MYL4,3,27,0
2,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,FlyBase:FBgn0002772,Mlc1,5,8901,0
3,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in orthology relationship with,NCBIGene:396472,MYL4,3,9508,0
4,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSECAG00000020967,ENSEMBL:ENSECAG00000020967,5,8807,1
5,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSACAG00000017407,ENSEMBL:ENSACAG00000017407,5,6449,1
6,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,MGI:97267,Myl4,5,904,1
7,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,RGD:1591197,Myl4,5,9266,1
8,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSFCAG00000003878,ENSEMBL:ENSFCAG00000003878,5,660,1
9,ZFIN:ZDB-GENE-050626-112,myl4,5,5279,in 1 to 1 orthology relationship with,ENSEMBL:ENSMODG00000010887,ENSEMBL:ENSMODG00000010887,5,8400,1


In [18]:
# Show the distinct relation labels returned by pd.factorize for the `type` codes.
relation_labels

Index(['in orthology relationship with',
       'in 1 to 1 orthology relationship with', 'expressed in', 'is part of',
       'has phenotype', 'enables', 'interacts with', 'involved in',
       'colocalizes with', 'is causal germline mutation in',
       'contributes to condition', 'has affected feature',
       'pathogenic for condition', 'contributes to', 'targets',
       'has role in modeling', 'is allele of',
       'likely pathogenic for condition', 'causes condition', 'source',
       'has genotype', 'is causal germline mutation partially giving rise to',
       'is marker for'],
      dtype='object')

In [19]:
# Report the number of edge rows in the joined table.
print(f'There are {kg_df.shape[0]} edges')

There are 82913 edges


Save the new dataframe containing all edges to a CSV file.

In [20]:
# Write the indexed edge table to the output folder, one file per dataset
# number; the DataFrame index is not written.
kg_df.to_csv(f'output/indexed_edges_{dataset_nr}.csv', index=False)