In [1]:
# Imports and config for the notebook

## Notebook config
import sys
if '../' not in sys.path:
    sys.path.append("../")
%load_ext dotenv
%reload_ext dotenv
%dotenv

import collections
import os
import csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import graphistry

from datasources.neo4j import gds
from queries import utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configs

# Graphistry registration is currently disabled. If re-enabled, the credentials
# are read from the GRAPHISTRY_USERNAME / GRAPHISTRY_PASSWORD environment variables.
# graphistry.register(
#     api=3,
#     username=os.getenv('GRAPHISTRY_USERNAME'),
#     password=os.getenv('GRAPHISTRY_PASSWORD'),
# )

# Seed passed as `randomSeed` to the embedding algorithm calls in later cells.
RANDOM_SEED = 42
# Presumably the base directory for exported embedding files (note the trailing slash).
EMBEDDINGS_DIR = '/mnt/embeddings/'

# Show the GDS client object imported from datasources.neo4j.
print(gds)

<graphdatascience.graph_data_science.GraphDataScience object at 0x7f4a12cb81d0>


TODO:


- Create Tissue projection (randomly generate initial embeddings)
- Create Taxon projection (use rank as initial feature?)

- Create heterogenous projection from dataset?
- Run memory estimates for HashGNN and FastRP using the hetero and homogenous projections


- HashGNN_homogenous, HashGNN_heterogenous
- FastRP_homogenous, FastRP_heterogenous

In [13]:
# Names under which each projection is registered; they are passed as
# `graph_name` to gds.graph.project by the get_*_projection helpers.
TAXON_PROJECTION_NAME = 'taxon-graph'
TISSUE_PROJECTION_NAME = 'tissue-graph'
HOMOGENOUS_PROJECTION_NAME = 'homogenous-graph'
HETERO_PROJECTION_NAME = 'hetero-graph'


In [14]:

def get_homogenous_projection():
    """Create the graph projection named by ``HOMOGENOUS_PROJECTION_NAME``.

    The projection covers the ``Taxon``, ``Tissue`` and ``SOTU`` node labels
    and the ``HAS_PARENT`` and ``SEQUENCE_ALIGNMENT`` relationship types, both
    with ``UNDIRECTED`` orientation.

    Returns:
        The value returned by ``gds.graph.project``, passed through unchanged.
    """
    node_labels = ['Taxon', 'Tissue', 'SOTU']
    relationship_types = ('HAS_PARENT', 'SEQUENCE_ALIGNMENT')
    # Each relationship type gets its own config dict with the same orientation.
    undirected_relationships = {
        rel_type: {'orientation': 'UNDIRECTED'}
        for rel_type in relationship_types
    }
    return gds.graph.project(
        graph_name=HOMOGENOUS_PROJECTION_NAME,
        node_spec=node_labels,
        relationship_spec=undirected_relationships,
    )


# TODO: Use dataset projection
def get_heterogenous_projection():
    """Create the graph projection named by ``HETERO_PROJECTION_NAME``.

    The projection covers the ``Taxon``, ``Tissue`` and ``SOTU`` node labels
    and the ``HAS_PARENT`` and ``SEQUENCE_ALIGNMENT`` relationship types, both
    with ``UNDIRECTED`` orientation.

    Returns:
        The value returned by ``gds.graph.project``, passed through unchanged.
    """
    labels = ['Taxon', 'Tissue', 'SOTU']
    rel_spec = {}
    # Register every relationship type as undirected, in the original key order.
    for rel_type in ('HAS_PARENT', 'SEQUENCE_ALIGNMENT'):
        rel_spec[rel_type] = {'orientation': 'UNDIRECTED'}
    return gds.graph.project(
        graph_name=HETERO_PROJECTION_NAME,
        node_spec=labels,
        relationship_spec=rel_spec,
    )


def get_taxon_projection():
    """Create the graph projection named by ``TAXON_PROJECTION_NAME``.

    Only ``Taxon`` nodes are included, connected by ``HAS_PARENT``
    relationships with ``UNDIRECTED`` orientation.

    Returns:
        The value returned by ``gds.graph.project``, passed through unchanged.
    """
    taxon_labels = ['Taxon']
    parent_relationships = {'HAS_PARENT': {'orientation': 'UNDIRECTED'}}
    return gds.graph.project(
        graph_name=TAXON_PROJECTION_NAME,
        node_spec=taxon_labels,
        relationship_spec=parent_relationships,
    )

def get_tissue_projection():
    """Create the graph projection named by ``TISSUE_PROJECTION_NAME``.

    Only ``Tissue`` nodes are included, connected by ``HAS_PARENT``
    relationships with ``UNDIRECTED`` orientation.

    Returns:
        The value returned by ``gds.graph.project``, passed through unchanged.
    """
    tissue_labels = ['Tissue']
    parent_relationships = dict(HAS_PARENT={'orientation': 'UNDIRECTED'})
    return gds.graph.project(
        graph_name=TISSUE_PROJECTION_NAME,
        node_spec=tissue_labels,
        relationship_spec=parent_relationships,
    )


In [15]:
# Create the homogenous projection; later cells look it up again by name
# via gds.graph.get(HOMOGENOUS_PROJECTION_NAME).
projection = get_homogenous_projection()

In [18]:
# Memory estimate for streaming HashGNN embeddings over the homogenous projection.
# The arguments mirror the actual gds.hashgnn.stream call in the next cell.
#
# References:
# https://neo4j.com/docs/graph-data-science/current/machine-learning/node-embeddings/hashgnn/#algorithms-embeddings-hashgnn-syntax
# https://github.com/neo4j/graph-data-science-client/blob/main/examples/heterogeneous-node-classification-with-hashgnn.ipynb#L18

# one may try to set embeddingDensity to 128, 256, 512, or roughly 25%-50% of the embedding dimension, i.e. the number of binary features.

gds.hashgnn.stream.estimate(
    G=gds.graph.get(HOMOGENOUS_PROJECTION_NAME),
    nodeLabels=['Taxon', 'Tissue', 'SOTU'],
    relationshipTypes=['HAS_PARENT', 'SEQUENCE_ALIGNMENT'],
    randomSeed=RANDOM_SEED,
    generateFeatures={
        'dimension': 512, # dimension of the embedding vector
        'densityLevel': 2, # number of initial values equalling 1
    },
    iterations=10, # maximum number of hops
    embeddingDensity=256,
    neighborInfluence=1.0,
)

requiredMemory                                [1338 MiB ... 12865 MiB]
treeView             Memory Estimation: [1338 MiB ... 12865 MiB]\n|...
mapView              {'memoryUsage': '[1338 MiB ... 12865 MiB]', 'n...
bytesMin                                                    1403659432
bytesMax                                                   13490131432
nodeCount                                                      3021618
relationshipCount                                             57702540
heapPercentageMin                                                  0.1
heapPercentageMax                                                  0.6
Name: 0, dtype: object

In [20]:
# Stream HashGNN embeddings for the homogenous projection and export them as CSV.
filename = 'HashGNN_homogenous.csv'
# Resolve the output path BEFORE the long-running stream call so that a
# misconfigured directory name fails immediately instead of after the
# computation has finished. The config cell defines EMBEDDINGS_DIR (the
# previously used name EMBEDDING_DIR was undefined and raised NameError).
output_path = os.path.join(EMBEDDINGS_DIR, filename)

df = gds.hashgnn.stream(
    G=gds.graph.get(HOMOGENOUS_PROJECTION_NAME),
    nodeLabels=['Taxon', 'Tissue', 'SOTU'],
    relationshipTypes=['HAS_PARENT', 'SEQUENCE_ALIGNMENT'],
    randomSeed=RANDOM_SEED,
    generateFeatures={
        'dimension': 512, # dimension of the embedding vector
        'densityLevel': 2, # number of initial values equalling 1
    },
    iterations=10, # maximum number of hops
    embeddingDensity=256,
    neighborInfluence=1.0,
)

df.to_csv(output_path, index=False)

HashGNN:  48%|████▊     | 47.58/100 [1:53:11<2:18:41, 158.75s/%]

In [None]:
# Memory estimate for streaming FastRP embeddings over the homogenous projection.
# NOTE(review): relationshipWeightProperty="weight" is requested here, but
# get_homogenous_projection() as shown in this notebook does not declare any
# relationship properties — confirm the projection actually carries a `weight`
# property, otherwise drop this argument or add the property to the projection.
gds.fastrp.stream.estimate(
    G=gds.graph.get(HOMOGENOUS_PROJECTION_NAME),
    nodeLabels=['Taxon', 'Tissue', 'SOTU'],
    relationshipTypes=['HAS_PARENT', 'SEQUENCE_ALIGNMENT'],
    randomSeed=RANDOM_SEED,
    embeddingDimension=256,
    relationshipWeightProperty="weight",
)

In [None]:
# Stream FastRP embeddings for the homogenous projection and export them as CSV.
filename = 'FastRP_homogenous.csv'
# Resolve the output path before running the algorithm so a bad directory name
# fails fast. The config cell defines EMBEDDINGS_DIR (the previously used name
# EMBEDDING_DIR was undefined and raised NameError).
output_path = os.path.join(EMBEDDINGS_DIR, filename)

# NOTE(review): relationshipWeightProperty='weight' is requested, but
# get_homogenous_projection() as shown in this notebook does not declare any
# relationship properties — confirm the projection actually carries a `weight`
# property, otherwise drop this argument or add the property to the projection.
df = gds.fastrp.stream(
    G=gds.graph.get(HOMOGENOUS_PROJECTION_NAME),
    nodeLabels=['Taxon', 'Tissue', 'SOTU'],
    relationshipTypes=['HAS_PARENT', 'SEQUENCE_ALIGNMENT'],
    randomSeed=RANDOM_SEED,
    embeddingDimension=256,
    relationshipWeightProperty='weight',
)
df.to_csv(output_path, index=False)