In [2]:
# Notebook config
import sys
if '../' not in sys.path:
    sys.path.append("../")
%load_ext dotenv
%dotenv

In [3]:
from queries import gds_queries
from datasources.neo4j import gds
from config.base import DIR_CFG

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
G_sotu = gds.graph.project(
    graph_name='SOTU-sequence-aligments',
    node_spec=['SOTU'],
    relationship_spec={'SEQUENCE_ALIGNMENT': {'properties': ['percentIdentity']}},
)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: A graph with name 'SOTU-sequence-aligments' already exists.}

In [None]:
# G_sotu_taxon = gds.graph.project(
#     graph_name='SOTU-and-Taxon',
#     node_spec=['SOTU', 'Taxon'],
#     relationship_spec={'SEQUENCE_ALIGNMENT': {'properties': ['percentIdentity']}, 'HAS_PARENT': {}},
# )

Examine distribution of host associations per SOTU

In [4]:
gds.run_cypher('''
    MATCH (s:SOTU)<-[:HAS_SOTU]-(:Palmprint)
            <-[:HAS_PALMPRINT]-(:SRA)-[:HAS_HOST_METADATA]->(t:Taxon),
    (s:SOTU)<-[:HAS_PALMPRINT]-(:SRA)-[:HAS_HOST_METADATA]->(t:Taxon)
    WHERE NOT (t)-[:HAS_PARENT*]->(:Taxon {taxId: '12908'})
    WITH s, t, count(*) as count 
    RETURN stDev(count) as stDev, avg(count) as avg, min(count) as min, max(count) as max
''')

Unnamed: 0,stDev,avg,min,max
0,163447.10141,2275.405198,1,20314354


### Weakly Connected Components

[Docs](https://neo4j.com/docs/graph-data-science/current/algorithms/wcc/)

The Weakly Connected Components (WCC) algorithm finds sets of connected nodes in directed and undirected graphs. Two nodes are connected if there exists an undirected path between them.

In [22]:
output = gds.run_cypher('''
    MATCH (n:SOTU) RETURN COUNT(n) as `Number of SOTUs`
''')
print(output)

output = gds.run_cypher('''
    MATCH ()-[r:SEQUENCE_ALIGNMENT]->() RETURN COUNT(r) as `Number of sequence alignment edges`
''')
print(output)

output = gds.run_cypher('''
    MATCH (:SOTU)-[r:SEQUENCE_ALIGNMENT]->(:SOTU) RETURN COUNT(r) as `Number of sequence alignment edges between SOTUs`
''')
print(output)

   Number of SOTUs
0           513176
   Number of sequence alginment edges
0                            26341751
   Number of sequence alginment edges between SOTUs
0                                          26341751


In [16]:
output = gds.wcc.stats(
  gds.graph.get('SOTU-sequence-aligments'),
  relationshipWeightProperty='percentIdentity',
)
print(output['componentCount'])
print(output['componentDistribution'])
print(output['configuration'])


367519
{'p99': 1, 'min': 1, 'max': 138584, 'mean': 1.3963250879546363, 'p90': 1, 'p50': 1, 'p999': 2, 'p95': 1, 'p75': 1}
{'jobId': '2752b2f6-dd6a-488a-bd58-f5ea0e456b85', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'relationshipWeightProperty': 'percentIdentity', 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'concurrency': 4}


Note: clustering was run with command:
```
usearch -calc_distmx otu_centroids.fa -tabbedout palmdb.40id_edge.txt \
        -maxdist 0.6 -termdist 0.7
```

With these thresholds, a large WCC of size 138,584 (of the 513,176 total SOTUs) are connected by `SEQUENCE_ALIGNMENT` edges. The remaining SOTUs make up the 367518 WCCs. It appears the majority of the remaining WCC have size 1, with p999 having size 2. To get fine-grained details on the distribution, we can stream the output then the number of nodes per componentID (TODO later)

In [33]:
G_sotu_undirected = gds.graph.project(
    graph_name='SOTU-sequence-aligments-undirected',
    node_spec=['SOTU'],
    relationship_spec={'SEQUENCE_ALIGNMENT': {'properties': ['percentIdentity'], 'orientation': 'UNDIRECTED'}},
)

Loading: 100%|██████████| 100.0/100 [00:05<00:00, 19.06%/s]


In [34]:
output = gds.localClusteringCoefficient.stats(
  G=gds.graph.get('SOTU-sequence-aligments-undirected'),
)
print(output)

LocalClusteringCoefficient: 100%|██████████| 100.0/100 [02:15<00:00,  1.36s/%]

averageClusteringCoefficient                                             0.139741
nodeCount                                                                  513176
postProcessingMillis                                                            0
preProcessingMillis                                                             0
computeMillis                                                              136305
configuration                   {'jobId': 'c2193e36-39a2-4447-9415-7a8e51732ac...
Name: 0, dtype: object





In [11]:
G_sotu_taxon_undirected = gds.graph.project(
    graph_name='SOTU-taxons-undirected',
    node_spec=['SOTU', 'Taxon'],
    relationship_spec={
        'SEQUENCE_ALIGNMENT': {'properties': ['percentIdentity'], 'orientation': 'UNDIRECTED'},
        'HAS_PARENT': {'orientation': 'UNDIRECTED'}
    },
)

Loading: 100%|██████████| 100.0/100 [00:09<00:00, 10.20%/s]


In [12]:
output = gds.localClusteringCoefficient.stats(
  G=gds.graph.get('SOTU-taxons-undirected'),
)
print(output)

LocalClusteringCoefficient:   0%|          | 0.14/100 [00:22<6:35:56, 237.90s/%]

In [9]:
output = gds.alpha.scc.write(
  gds.graph.get('SOTU-sequence-aligments'),
)
print(output)
print(output['communityCount'])

Scc: 100%|██████████| 100.0/100 [00:03<00:00, 30.20%/s]

preProcessingMillis               0
computeMillis                   610
writeMillis                      41
postProcessingMillis           3159
nodes                        513176
communityCount               513176
setCount                     513176
minSetSize                        1
maxSetSize                        1
p1                                1
p5                                1
p10                               1
p25                               1
p50                               1
p75                               1
p90                               1
p95                               1
p99                               1
p100                              1
writeProperty           componentId
Name: 0, dtype: object
513176



