This notebook loads all edges and nodes of the relevant knowledge graph. New dataframes are created for the nodes and edges including index values for each node.

# Import Libraries

In [83]:
import pandas as pd

# Set Parameters

Specify which dataset is used.
*   1 (Original knowledge graph)
*   2 (Restructured knowledge graph)

In [84]:
dataset_nr = 2
assert dataset_nr == 1 or 2

NO_GENE_PRODUCTS = 'no gene products'
SINGLE_RELATION_TYPE = 'only one relation type' # TODO: implement this as well
concept_changes = NO_GENE_PRODUCTS

if dataset_nr == 1:
    prefix = 'prev'
else:
    prefix = 'new'
    
if concept_changes == NO_GENE_PRODUCTS:
    suffix = '_nogeneprods'
elif concept_changes == SINGLE_RELATION_TYPE:
    suffix = '_singlerel'
else:
    suffix = ''
    
FILE_EDGES = f'{prefix}_kg_edges.csv'
FILE_NODES = f'{prefix}_kg_nodes.csv'

# Load Edges

Load all edges of knowledge graph

In [85]:
edges = pd.read_csv(f'output/{FILE_EDGES}', header=0)
edges.head(10)

Unnamed: 0,id,subject,object,relation_id,relation_label,relation_iri
0,15a20d1b-1d4c-4ef5-8960-0bbcee1b4261,WormBase:WBGene00006787,WormBase:WBGene00003929,RO:0002434,interacts with,http://purl.obolibrary.org/obo/RO_0002434
1,78c75c6d-806d-47b4-930f-4544eaa2c5a5,ZFIN:ZDB-GENE-020129-1,ZP:0004595,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
2,34b96ade-f51a-4680-8c53-c00fbf079968,ENSEMBL:ENSCAFG00000001848,RGD:1310277,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
3,d32fd8ea-e219-45d9-be6d-7adcba8289c0,MGI:101864,MP:0002961,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
4,58c47a12-8477-40a3-9009-a200854f84fe,ENSEMBL:ENSMODG00000018459,WormBase:WBGene00003911,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
5,2ec9e759-e9ea-4e9d-ac75-57a643cfd845,MGI:1277151,HP:0012757,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
6,18ad99b9-19c6-419a-bb5e-d49926575675,FlyBase:FBgn0261563,ENSEMBL:ENSMMUG00000004321,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
7,5990c621-27da-4f75-bc39-55ad604f191f,ENSEMBL:ENSFCAG00000012569,HGNC:11873,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017
8,6bdb9894-3967-441e-aa68-f5a37e900c56,WormBase:WBGene00006062,WBPhenotype:0001316,RO:0003303,causes condition,http://purl.obolibrary.org/obo/RO_0003303
9,ed0f0944-a99c-4508-ba34-59675d615d4c,HGNC:8591,MGI:1339984,RO:HOM0000017,in orthology relationship with,http://purl.obolibrary.org/obo/RO_HOM0000017


Extract the relevant columns and rename them

In [86]:
edges = edges[['subject', 'relation_label', 'object']]
edges.rename(columns={'subject': 'head', 'relation_label': 'relation', 'object': 'tail'}, inplace=True)
edges.head(10)

Unnamed: 0,head,relation,tail
0,WormBase:WBGene00006787,interacts with,WormBase:WBGene00003929
1,ZFIN:ZDB-GENE-020129-1,causes condition,ZP:0004595
2,ENSEMBL:ENSCAFG00000001848,in orthology relationship with,RGD:1310277
3,MGI:101864,causes condition,MP:0002961
4,ENSEMBL:ENSMODG00000018459,in orthology relationship with,WormBase:WBGene00003911
5,MGI:1277151,causes condition,HP:0012757
6,FlyBase:FBgn0261563,in orthology relationship with,ENSEMBL:ENSMMUG00000004321
7,ENSEMBL:ENSFCAG00000012569,in orthology relationship with,HGNC:11873
8,WormBase:WBGene00006062,causes condition,WBPhenotype:0001316
9,HGNC:8591,in orthology relationship with,MGI:1339984


In [87]:
def convert_to_int_str(value):
    try:
        return str(int(float(value)))
    except ValueError:
        return value

edges['head'] = edges['head'].apply(convert_to_int_str)
edges['tail'] = edges['tail'].apply(convert_to_int_str)

In [88]:
print(f'There are {edges.shape[0]} edges')

There are 85992 edges


# Load Nodes

Load all nodes of knowledge graph

In [156]:
nodes = pd.read_csv(f'output/{FILE_NODES}', header=0)
nodes['index_id'] = nodes.index
nodes.head(10)

Unnamed: 0,id,label,iri,semantic,index_id
0,MP:0004187,cardia bifida,http://purl.obolibrary.org/obo/MP_0004187,phenotype,0
1,ZP:0100138,muscle tendon junction myotome increased amoun...,http://purl.obolibrary.org/obo/ZP_0100138,phenotype,1
2,MGI:1346525,Sgcd,http://www.informatics.jax.org/accession/MGI:1...,gene,2
3,OMIM:300377.0044,"DMD, LYS770TER",http://omim.org/entry/300377.0044,variant,3
4,ZP:0002210,posterior lateral line neuromast primordium mi...,http://purl.obolibrary.org/obo/ZP_0002210,phenotype,4
5,MP:0020335,abnormal dentate gyrus neuron dendrite morphology,http://purl.obolibrary.org/obo/MP_0020335,phenotype,5
6,HP:0000316,Hypertelorism,http://purl.obolibrary.org/obo/HP_0000316,phenotype,6
7,ZP:0004032,"margin constricted, abnormal",http://purl.obolibrary.org/obo/ZP_0004032,phenotype,7
8,ZP:0003164,"atrium decreased size, abnormal",http://purl.obolibrary.org/obo/ZP_0003164,phenotype,8
9,MP:0003331,increased hepatocellular carcinoma incidence,http://purl.obolibrary.org/obo/MP_0003331,phenotype,9


Extract relevant columns

In [157]:
nodes = nodes[['index_id', 'id', 'semantic', 'label']]
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label
0,0,MP:0004187,phenotype,cardia bifida
1,1,ZP:0100138,phenotype,muscle tendon junction myotome increased amoun...
2,2,MGI:1346525,gene,Sgcd
3,3,OMIM:300377.0044,variant,"DMD, LYS770TER"
4,4,ZP:0002210,phenotype,posterior lateral line neuromast primordium mi...
5,5,MP:0020335,phenotype,abnormal dentate gyrus neuron dendrite morphology
6,6,HP:0000316,phenotype,Hypertelorism
7,7,ZP:0004032,phenotype,"margin constricted, abnormal"
8,8,ZP:0003164,phenotype,"atrium decreased size, abnormal"
9,9,MP:0003331,phenotype,increased hepatocellular carcinoma incidence


In [158]:
print(f'There are {nodes.shape[0]} nodes')

There are 10275 nodes


Convert semantic groups from categorical to numerical values (`semantic_id`)

In [159]:
nodes['semantic'] = nodes['semantic'].astype('category')
nodes['semantic_id'] = nodes['semantic'].cat.codes
nodes.head(10)

Unnamed: 0,index_id,id,semantic,label,semantic_id
0,0,MP:0004187,phenotype,cardia bifida,9
1,1,ZP:0100138,phenotype,muscle tendon junction myotome increased amoun...,9
2,2,MGI:1346525,gene,Sgcd,5
3,3,OMIM:300377.0044,variant,"DMD, LYS770TER",11
4,4,ZP:0002210,phenotype,posterior lateral line neuromast primordium mi...,9
5,5,MP:0020335,phenotype,abnormal dentate gyrus neuron dendrite morphology,9
6,6,HP:0000316,phenotype,Hypertelorism,9
7,7,ZP:0004032,phenotype,"margin constricted, abnormal",9
8,8,ZP:0003164,phenotype,"atrium decreased size, abnormal",9
9,9,MP:0003331,phenotype,increased hepatocellular carcinoma incidence,9


Create a dictionary that can be used as mapping between `semantic_id` and `semantic`

In [160]:
node_semantic_dict = dict(enumerate(nodes['semantic'].cat.categories))
node_semantic_dict

{0: 'biological artifact',
 1: 'biological process',
 2: 'cellular component',
 3: 'disease',
 4: 'drug',
 5: 'gene',
 6: 'gene product',
 7: 'genotype',
 8: 'molecular function',
 9: 'phenotype',
 10: 'taxon',
 11: 'variant'}

Save new dataframe containing all nodes into csv file

In [94]:
nodes.to_csv(f'output/indexed_nodes_{dataset_nr}{suffix}.csv', index=False)

# Joining Dataframes

Merge dataframes with nodes and edges in order to join semantic classes and indices of nodes with heads and tails of the edges.

In [95]:
kg_df = pd.merge(edges, nodes,
                 left_on='head', right_on='id',
                 how='inner')
kg_df = kg_df[['head', 'label', 'semantic_id', 'index_id', 'relation', 'tail']]
kg_df.rename(columns={'label': 'label_head', 'semantic_id': 'class_head', 'index_id': 'index_head'}, inplace=True)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail
0,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00003929
1,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00006789
2,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015555
3,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ZFIN:ZDB-GENE-021226-3
4,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSOANG00000001050
5,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001171
6,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001425
7,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0000781
8,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSPTRG00000021480
9,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015556


In [96]:
kg_df = pd.merge(kg_df, nodes,
                 left_on='tail', right_on='id',
                 how='left')
kg_df = kg_df[['head', 'label_head', 'class_head', 'index_head', 'relation', 'tail', 'label', 'semantic_id', 'index_id']]
kg_df.rename(columns={'label': 'label_tail', 'semantic_id': 'class_tail', 'index_id': 'index_tail'}, inplace=True)
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail
0,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00003929,pat-2,5,1542
1,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00006789,unc-54,5,6544
2,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015555,LAMC1,5,9268
3,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ZFIN:ZDB-GENE-021226-3,lamc1,5,5387
4,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSOANG00000001050,ENSEMBL:ENSOANG00000001050,5,2204
5,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001171,shortened life span,9,5475
6,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001425,receptor mediated endocytosis defective,9,3057
7,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0000781,body wall muscle thin filament variant,9,7821
8,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSPTRG00000021480,ENSEMBL:ENSPTRG00000021480,5,6347
9,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015556,LAMC2,5,769


Encode string values of relation labels as numeric values.

In [141]:
kg_df['type'], relation_labels = pd.factorize(kg_df['relation'])
kg_df.head(10)

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00003929,pat-2,5,1542,0
1,WormBase:WBGene00006787,unc-52,5,304,interacts with,WormBase:WBGene00006789,unc-54,5,6544,0
2,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015555,LAMC1,5,9268,1
3,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ZFIN:ZDB-GENE-021226-3,lamc1,5,5387,1
4,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSOANG00000001050,ENSEMBL:ENSOANG00000001050,5,2204,1
5,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001171,shortened life span,9,5475,2
6,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0001425,receptor mediated endocytosis defective,9,3057,2
7,WormBase:WBGene00006787,unc-52,5,304,causes condition,WBPhenotype:0000781,body wall muscle thin filament variant,9,7821,2
8,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSPTRG00000021480,ENSEMBL:ENSPTRG00000021480,5,6347,1
9,WormBase:WBGene00006787,unc-52,5,304,in orthology relationship with,ENSEMBL:ENSSSCG00000015556,LAMC2,5,769,1


In [98]:
relation_labels

Index(['interacts with', 'in orthology relationship with', 'causes condition',
       'involved in', 'found in', 'is part of', 'enables', 'colocalizes with',
       'has role in modeling', 'contributes to condition', 'expresses gene',
       'has affected feature', 'is of', 'likely causes condition',
       'is variant in', 'is allele of', 'is substance that treats', 'targets',
       'associated with phenotype', 'is product of', 'has genotype'],
      dtype='object')

Save new dataframe containing all edges into csv file

In [175]:
if concept_changes == NO_GENE_PRODUCTS and dataset_nr == 2:
    all_edges = kg_df.copy()
    
    all_gene_products = all_edges.loc[all_edges['type'] == list(relation_labels).index('is product of')]
    all_targeting_drugs = all_edges.loc[all_edges['type'] == list(relation_labels).index('targets')]
    
    merged_df = pd.merge(all_targeting_drugs, all_gene_products, how='left', left_on='tail', right_on='head', indicator=True)
    result_df = merged_df[merged_df['_merge'] == 'both']
    
    new_edges = result_df[['head_x', 'label_head_x', 'class_head_x', 'index_head_x', 'relation_x', 'tail_y', 'label_tail_y', 'class_tail_y', 'index_tail_y', 'type_y']]
    drug_gene_edges = new_edges.rename(columns={'head_x': 'head', 'label_head_x': 'label_head', 'class_head_x': 'class_head', 'index_head_x': 'index_head', 
                                                'relation_x': 'relation', 'tail_y': 'tail', 'label_tail_y': 'label_tail', 'class_tail_y': 'class_tail', 'index_tail_y': 'index_tail', 'type_y': 'type'})
    
    
    all_non_gene_products = all_edges.loc[(all_edges['type'] != list(relation_labels).index('is product of')) & (all_edges['type'] != list(relation_labels).index('targets'))]
    all_new_edges = pd.concat([all_non_gene_products, drug_gene_edges], ignore_index=True)

    gene_products_nodes = nodes.loc[nodes['semantic'] == 'gene product']
    no_gene_products_nodes = nodes.loc[nodes['semantic'] != 'gene product']
    no_gene_products_nodes.reset_index(inplace=True)
    no_gene_products_nodes['index_id'] = no_gene_products_nodes.index
    
    # Remap indices of head and tail of edges with node indices
    reindexed_left = pd.merge(all_new_edges, no_gene_products_nodes, how='inner', left_on='index_head', right_on='index')[['head', 'label_head', 'class_head', 'index_id', 
                                                                                                                           'relation', 
                                                                                                                           'tail', 'label_tail', 'class_tail', 'index_tail', 'type']]
    reindexed_left.rename(columns={'index_id': 'index_head'}, inplace=True)
    reindexed_left_right = pd.merge(reindexed_left, no_gene_products_nodes, how='inner', left_on='index_tail', right_on='index')[['head', 'label_head', 'class_head', 'index_head', 
                                                                                                                                  'relation', 
                                                                                                                                  'tail', 'label_tail', 'class_tail', 'index_id', 'type']]
    reindexed_left_right.rename(columns={'index_id': 'index_tail'}, inplace=True)
    
    no_gene_products_nodes.drop(['index'], axis=1, inplace=True)
    
    # Remap relation type indices
    reindexed_left_right['type'], relation_labels_new = pd.factorize(reindexed_left_right['relation'])
    
    # Remap node type indices
    no_gene_products_nodes['semantic_id'] = no_gene_products_nodes['semantic'].cat.codes
    
    print(f'There are {nodes.shape[0]} nodes in total before change')
    print(f'There are {no_gene_products_nodes.shape[0]} nodes after change')
    print(f'There are {gene_products_nodes.shape[0]} gene product nodes removed after change')

    print(f'There are {kg_df.shape[0]} edges in total before change')
    print(f'There are {reindexed_left_right.shape[0]} edges after change')
    print(f'There are {all_gene_products.shape[0]} gene product related rows removed after change')

There are 10275 nodes in total before change
There are 10237 nodes after change
There are 38 gene product nodes removed after change
There are 85992 edges in total before change
There are 85954 edges after change
There are 38 gene product related rows removed after change


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_gene_products_nodes['index_id'] = no_gene_products_nodes.index
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_gene_products_nodes.drop(['index'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  no_gene_products_nodes['semantic_id'] = no_gene_products_nodes['semantic'].cat.codes


In [176]:
reindexed_left_right

Unnamed: 0,head,label_head,class_head,index_head,relation,tail,label_tail,class_tail,index_tail,type
0,WormBase:WBGene00006787,unc-52,5,301,interacts with,WormBase:WBGene00003929,pat-2,5,1537,0
1,WormBase:WBGene00000051,acr-12,5,8381,interacts with,WormBase:WBGene00003929,pat-2,5,1537,0
2,WormBase:WBGene00001328,epi-1,5,3940,interacts with,WormBase:WBGene00003929,pat-2,5,1537,0
3,WormBase:WBGene00000042,acr-2,5,657,interacts with,WormBase:WBGene00003929,pat-2,5,1537,0
4,FlyBase:FBgn0001250,if (fruit fly),5,458,in orthology relationship with,WormBase:WBGene00003929,pat-2,5,1537,1
...,...,...,...,...,...,...,...,...,...,...
85949,dbSNP:rs56979833,rs56979833-?,11,5590,has affected feature,ENSEMBL:ENSG00000259647,ENSEMBL:ENSG00000259647,5,911,14
85950,dbSNP:rs35396788,rs35396788-?,11,4731,has affected feature,HGNC:26832,TAPT1-AS1,5,3519,14
85951,dbSNP:rs9975725,rs9975725-?,11,7460,has affected feature,ENSEMBL:ENSG00000231236,ENSEMBL:ENSG00000231236,5,1462,14
85952,dbSNP:rs35693284,rs35693284-?,11,9142,has affected feature,ENSEMBL:ENSG00000260569,ENSEMBL:ENSG00000260569,5,4077,14


In [177]:
no_gene_products_nodes.loc[no_gene_products_nodes['index_id'] == 9142]

Unnamed: 0,index_id,id,semantic,label,semantic_id
9142,9142,dbSNP:rs35693284,variant,rs35693284-?,11


In [184]:
print(f'Changed from {len(relation_labels)} types of relations to {len(relation_labels_new)} types of relations.')
print(f"Changed from {len(node_semantic_dict)} types of nodes to {no_gene_products_nodes['semantic_id'].nunique()} nodes.")

Changed from 21 types of relations to 20 types of relations.
Changed from 12 types of nodes to 11 nodes.


In [181]:
if concept_changes == NO_GENE_PRODUCTS and dataset_nr == 2:
    reindexed_left_right.to_csv(f'output/indexed_edges_{dataset_nr}{suffix}.csv', index=False)
    no_gene_products_nodes.to_csv(f'output/indexed_nodes_{dataset_nr}{suffix}.csv', index=False)
    
else:
    kg_df.to_csv(f'output/indexed_edges_{dataset_nr}{suffix}.csv', index=False)