# Pangenome dataset export

## Setup

In [1]:
## Notebook config

import sys
if '../' not in sys.path:
    sys.path.append("../")
%load_ext dotenv
%reload_ext dotenv
%dotenv

In [2]:
# Notebook imports

import ast
import os
import pickle
import collections 


from Bio import Entrez, SeqIO
from graphdatascience import GraphDataScience
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import umap

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATASET_DIR = './dataset/'
PROJECTION_NAME_PREFIX = 'dataset_projection_'
RANDOM_SEED = 42

NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USER = os.environ.get('NEO4J_USER')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')


## Create GDS projection

In [28]:
# GDS utils
def get_gds_client():
    return GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def get_projection_name(version=1):
    return PROJECTION_NAME_PREFIX + str(version)

def get_projection(gds, projection_name):
    if gds.graph.exists(projection_name)['exists']:
        return gds.graph.get(projection_name)
    projection = gds.graph.project(
        graph_name=projection_name,
        node_spec=[
            'Fasta',
            'Hit',
            'HitFamily',
            'HitRegion',
        ],
        # 'properties': {'weight': {'defaultValue': 1}
        relationship_spec={
            'hasHit': {'orientation': 'UNDIRECTED'},
            'hasRegion': {'orientation': 'UNDIRECTED', 'properties': {'regid': {'defaultValue': 1}}},
            'hasAffiliate': {'orientation': 'UNDIRECTED', 'properties': {'coverage': {'defaultValue': 1}}},
            'hasMember': {'orientation': 'UNDIRECTED', 'properties': {'probab': {'defaultValue': 1}}},
            'hasDownstream': {'orientation': 'UNDIRECTED'},
        },
    )
    return projection

def delete_projection(gds, projection_name):
    if gds.graph.exists(projection_name)['exists']:
        gds.graph.drop(gds.graph.get(projection_name))


In [29]:
# Create GDS projection

version = 1
projection_name = get_projection_name(version)
gds = get_gds_client()
delete_projection(gds, projection_name)
projection = get_projection(gds, projection_name)
print(projection)

GraphCreateResult(graph=Graph({'graphName': 'dataset_projection_1', 'nodeCount': 148782, 'relationshipCount': 787238, 'database': 'neo4j', 'configuration': {'relationshipProjection': {'hasHit': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'hasHit'}, 'hasMember': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {'probab': {'aggregation': 'DEFAULT', 'property': 'probab', 'defaultValue': 1}}, 'type': 'hasMember'}, 'hasRegion': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {'regid': {'aggregation': 'DEFAULT', 'property': 'regid', 'defaultValue': 1}}, 'type': 'hasRegion'}, 'hasDownstream': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {}, 'type': 'hasDownstream'}, 'hasAffiliate': {'aggregation': 'DEFAULT', 'orientation': 'UNDIRECTED', 'indexInverse': False, 'properties': {'coverage': {'aggregatio

In [13]:
# Add node features (at least one common node property is needed to use the export function)

df_degree = gds.degree.mutate(
    G=gds.graph.get(projection_name),
    mutateProperty='degree',
    nodeLabels=[
        'Fasta',
        'Hit',
        'HitFamily',
        'HitRegion',
    ],
    relationshipTypes=[
        'hasHit',
        'hasRegion',
        'hasAffiliate',
        'hasDownstream',
        'hasMember',
    ],
)

## Export Datasets

In [15]:
# Export nodes

def export_nodes_from_projection(property):
    exported_projection = gds.graph.nodeProperties.stream(
        get_projection(gds, get_projection_name()),
        node_properties=[property],
        separate_property_columns=True,
        db_node_properties=['accession', 'name', 'annotation', 'source', 'taxonomy'],
        listNodeLabels=True,
    )
    exported_projection['nodeLabels'] = exported_projection['nodeLabels'].apply(lambda x: x[0])
    return exported_projection


exported_projection = export_nodes_from_projection('degree')

assert len(exported_projection.loc[
    exported_projection['nodeLabels'] == 'Fasta'
])  == 6702

In [16]:
# Enrich Genomes with taxonomic labels from ICTV string

def get_taxonomy_by_idx(tax_str, idx):
    try:
        return tax_str.split(';')[idx].strip()
    except:
        return None

def enrich_data_with_tax_labels(df):
    rank_index = {
        'clade': 0,
        'kingdom': 2,
        'phylum': 4,
        'class': 6,
        'order': 8,
        'family': 10,
        'genus': 12,
        'species': 14,
    }
    for rank in ['species', 'genus', 'family', 'order', 'class', 'phylum']:
        df[rank] = None
        df.loc[df['nodeLabels'] == 'Genome', rank] = df.loc[
            df['nodeLabels'] == 'Genome', 'taxonomy'].apply(get_taxonomy_by_idx, args=[rank_index[rank]])
    return df

exported_projection = enrich_data_with_tax_labels(exported_projection)

In [17]:
# Enrich nodes

# nodeId is not stable in neo4j, instead add a reference to a stable node attribute
def set_app_id(df):
    config = {
        'Fasta': 'accession',
        'Hit': 'name',
        'HitFamily': 'accession',
        'HitRegion': 'name',
    }
    df['appId'] = None
    for node_label, id_column in config.items():
        df.loc[df['nodeLabels'] == node_label, 'appId'] = df.loc[
            df['nodeLabels'] == node_label, id_column]

set_app_id(exported_projection)

nodes = exported_projection[[
    'nodeId',
    'appId',
    'nodeLabels',
    'degree',
    'accession',
    'name',
    'annotation',
    'species',
    'genus',
    'family',
    'order',
    'phylum',
    'class',
]]

nodes.to_csv(DATASET_DIR + 'pangenome_nodes.csv', index=False)

In [33]:
# Export dataset edges

edges = gds.graph.relationshipProperties.stream(
    get_projection(gds, get_projection_name()),
    relationship_properties=['probab'],
)

def set_edge_weights(df):
    config = {
        'hasHit': 'identities',
        'hasRegion': 'identities',
        'hasAffiliate': 'identities',
        'hasDownstream': 'identities',
        'hasMember': 'identities',
    }
    df['weight'] = None
    edges.dropna(subset=['weight'], inplace=True)

# set_edge_weights(edges)

print(edges.head())

# edges.to_csv(DATASET_DIR + 'edges.csv', index=False)

   sourceNodeId  targetNodeId relationshipType relationshipProperty  \
0         44736           191        hasMember               probab   
1         44737           192        hasMember               probab   
2         44738           193        hasMember               probab   
3         44739           195        hasMember               probab   
4         44740           196        hasMember               probab   

   propertyValue  
0         0.9935  
1         0.9887  
2         0.9914  
3         0.9712  
4         0.9339  


['hasMember']


In [51]:
# drop projection
gds.graph.drop(get_projection_name())

graphName                                         exploration_projection_1
database                                                             neo4j
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                            33453
relationshipCount                                                  1345820
configuration            {'relationshipProjection': {'HasReg': {'aggreg...
density                                                           0.001203
creationTime                           2024-04-22T15:16:18.864758401+00:00
modificationTime                       2024-04-22T15:19:25.329273718+00:00
schema                   {'graphProperties': {}, 'nodes': {'FuncDomainS...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'FuncDomainS...
Name: 0, dtype: object

## Graphistry

In [186]:
import graphistry
graphistry.register(
    api=3,
    username=os.getenv('GRAPHISTRY_USERNAME'),
    password=os.getenv('GRAPHISTRY_PASSWORD'),
)

In [190]:
g = graphistry.bind()

nodes['type'] = nodes['nodeLabels']

g = g.bind(
    source='sourceNodeId',
    destination='targetNodeId',
    edge_weight='weight',
).edges(edges)

g = g.bind(
    node='nodeId',
    point_label='appId',
    point_size=None,
).nodes(nodes)



params = {
        'play': 2000,
        'menu': True, 
        'info': True,
        'showArrows': True,
        'pointSize': 2.0, 
        'edgeOpacity': 0.25, 
        'pointOpacity': 1.0,
        'linLog': True, 
        'compactLayout': True,
        'strongGravity': True,
        'dissuadeHubs': False,
        'edgeInfluence': 5,
        'showLabelPropertiesOnHover': True,
        'pointsOfInterestMax': 10,
      }


g = g.addStyle(
    bg={
        'color': 'black',
})

g.plot()