## Open Virome

In [2]:
# Notebook config
# Add the parent directory (relative to the working directory) to the import
# search path; the membership check keeps re-runs of this cell from appending
# the same entry more than once.
import sys
if '../' not in sys.path:
    sys.path.append("../")
# Load the dotenv IPython extension and invoke its %dotenv magic.
# NOTE(review): presumably this reads a .env file that supplies the NEO4J_*
# environment variables read later in the notebook - confirm.
%load_ext dotenv
%dotenv

# imports
import os
from collections import defaultdict

import psycopg2
from graphdatascience import GraphDataScience
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Neo4j connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.
NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD = (
    os.getenv(name) for name in ('NEO4J_URI', 'NEO4J_USER', 'NEO4J_PASSWORD')
)

# Directory holding the CSV inputs read by this notebook.
DATA_DIR = './data'

# Seed used wherever the notebook draws random samples.
RANDOM_SEED = 42

In [5]:
def get_gds_client():
    """Return a new GraphDataScience client for the configured Neo4j instance.

    The URI and credentials come from the module-level NEO4J_URI, NEO4J_USER
    and NEO4J_PASSWORD settings. Every call constructs a fresh client.
    """
    credentials = (NEO4J_USER, NEO4J_PASSWORD)
    return GraphDataScience(NEO4J_URI, auth=credentials)

def query_gds(query, params=None):
    """Run a Cypher query through a short-lived GraphDataScience client.

    Args:
        query: Cypher query text.
        params: Optional dict of query parameters passed through to
            ``run_cypher``. Defaults to None (no parameters), which matches
            the previous single-argument behaviour.

    Returns:
        The value returned by ``client.run_cypher`` for the query.
    """
    client = get_gds_client()
    try:
        return client.run_cypher(query, params=params)
    finally:
        # A new client is created on every call, so release it here;
        # otherwise each query leaves its connection resources open.
        client.close()


def get_sql_client():
    """Open and return a new psycopg2 connection to the Serratus SQL database."""
    # These credentials are public and are safe to commit to the repository
    # https://github.com/ababaian/serratus/wiki/SQL-Schema
    connection_settings = {
        'database': "summary",
        'host': "serratus-aurora-20210406.cluster-ro-ccz9y6yshbls.us-east-1.rds.amazonaws.com",
        'user': "public_reader",
        'password': "serratus",
        'port': "5432",
    }
    return psycopg2.connect(**connection_settings)

def query_sql(query, params=None):
    """Execute a SQL query on a fresh connection and return all rows.

    Args:
        query: SQL text to execute.
        params: Optional parameters passed to ``cursor.execute`` for
            server-side-safe value substitution. Defaults to None, which
            executes ``query`` exactly as given.

    Returns:
        The list of row tuples produced by ``cursor.fetchall()``.

    Raises:
        Whatever psycopg2 raises while connecting or executing. The cursor
        and connection are closed before the exception propagates.
    """
    conn = get_sql_client()
    try:
        # The cursor context manager closes the cursor on exit, including
        # when execute() or fetchall() raises.
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()
    finally:
        # `with conn:` would only end the transaction, not close the
        # connection, so close it explicitly.
        conn.close()



In [6]:
# Load the apicomplexa run labels and the matching STAT table.
apicomplexa_label = pd.read_csv(f'{DATA_DIR}/00_apicomplexa_runs.csv', index_col=False)
apicomplexa_stat = pd.read_csv(f'{DATA_DIR}/00_apicomplexa_STAT.csv', index_col=False)

# For each table, print its row count followed by a preview of the first rows.
print(len(apicomplexa_label), apicomplexa_label.head(), sep='\n')
print(len(apicomplexa_stat), apicomplexa_stat.head(), sep='\n')

56831
         acc
0  DRR000427
1  DRR000429
2  DRR001762
3  DRR001763
4  DRR001764
1823925
           acc sample_acc biosample sra_study bioproject     assay_type  \
0  ERR11444757        NaN       NaN       NaN        NaN            WGS   
1  ERR12177221        NaN       NaN       NaN        NaN            WGS   
2  ERR11276730        NaN       NaN       NaN        NaN            WGS   
3  ERR10386778        NaN       NaN       NaN        NaN  Bisulfite-Seq   
4  ERR11526551        NaN       NaN       NaN        NaN            WGS   

  librarysource organism  tax_id  total_count  self_count  
0   METAGENOMIC      NaN    5794          330           0  
1   METAGENOMIC      NaN    5794         1834          47  
2       GENOMIC      NaN    5794            8           0  
3       GENOMIC      NaN    5794        19430          21  
4   METAGENOMIC      NaN    5794          774           0  


In [7]:
# Boolean mask: True where a labelled run's accession also occurs in the STAT table.
overlap = apicomplexa_label['acc'].isin(apicomplexa_stat['acc'])
# Labelled runs whose accession does not occur in the STAT table.
missing = apicomplexa_label.loc[~overlap]

# Number of overlapping runs, then number of missing runs.
print(overlap.sum(), len(missing), sep='\n')

56469
362


In [8]:
# Get all sOTUs associated with runs labeled as apicomplexa
labelled_runs = apicomplexa_label['acc'].unique().tolist()

# randomly sample 100 runs for testing
np.random.seed(RANDOM_SEED)
labelled_runs = np.random.choice(labelled_runs, 100)

run_string = ','.join([f"\'{run}\'" for run in labelled_runs])
query = f"""
    SELECT DISTINCT sotu
    FROM palm_virome
    WHERE run IN ({run_string})
"""
sotus = query_sql(query)
print(len(sotus))
sotus = [sotu[0] for sotu in sotus]

1920


In [9]:
# get all sOTU node ids in the knowledge graph
# Build the quoted, comma-separated sOTU list that is spliced into the Cypher query.
sotu_string = ','.join(f"'{sotu}'" for sotu in sotus)
query = f"""
    MATCH (s:SOTU)
    WHERE s.sotu IN [{sotu_string}]
    RETURN id(s) as nodeId, s.sotu as sotu
"""
# One row per matched SOTU node: its graph node id and its sOTU identifier.
sotu_to_node_id = query_gds(query)
print(sotu_to_node_id.shape[0])
print(sotu_to_node_id.head())

1920
    nodeId   sotu
0  7688210   u374
1  7692104   u531
2  7693716   u944
3  7696820  u1364
4  7702556  u2781


In [10]:
# Create projection of all SOTU nodes and their sequence alignments

# Client kept in `gds` for the graph-catalog calls in this cell and in the
# sampling cell that follows.
gds = get_gds_client()

# NOTE(review): "aligments" is misspelled, but this string is the catalog name
# of the projection, so changing it changes which graph is created/dropped.
sotu_projection_name = 'SOTU-sequence-aligments'
# Drop any existing projection with this name before projecting again, so the
# cell can be re-run.
if gds.graph.exists(sotu_projection_name)['exists']:
    gds.graph.drop(sotu_projection_name)

# Project SOTU nodes together with SEQUENCE_ALIGNMENT relationships, carrying
# percentIdentity along as a relationship property.
# NOTE(review): the printed value is a GraphCreateResult(graph=..., result=...),
# so G_sotu holds the whole result pair rather than only the Graph object.
G_sotu = gds.graph.project(
    graph_name=sotu_projection_name,
    node_spec=['SOTU'],
    relationship_spec={'SEQUENCE_ALIGNMENT': {'properties': ['percentIdentity']}},
)

print(G_sotu)

GraphCreateResult(graph=Graph({'graphName': 'SOTU-sequence-aligments', 'nodeCount': 513176, 'relationshipCount': 26341751, 'database': 'neo4j', 'configuration': {'relationshipProjection': {'SEQUENCE_ALIGNMENT': {'aggregation': 'DEFAULT', 'orientation': 'NATURAL', 'indexInverse': False, 'properties': {'percentIdentity': {'aggregation': 'DEFAULT', 'property': 'percentIdentity', 'defaultValue': None}}, 'type': 'SEQUENCE_ALIGNMENT'}}, 'readConcurrency': 4, 'relationshipProperties': {}, 'nodeProperties': {}, 'jobId': '58c1d979-3a7b-47de-87b8-da1056120ced', 'nodeProjection': {'SOTU': {'label': 'SOTU', 'properties': {}}}, 'logProgress': True, 'creationTime': neo4j.time.DateTime(2024, 7, 26, 18, 15, 0, 672700438, tzinfo=<UTC>), 'validateRelationships': False, 'sudo': False}, 'schema': {'graphProperties': {}, 'nodes': {'SOTU': {}}, 'relationships': {'SEQUENCE_ALIGNMENT': {'percentIdentity': 'Float (DefaultValue(NaN), PERSISTENT, Aggregation.NONE)'}}}, 'memoryUsage': '270 MiB'}), result=nodeProj

In [34]:
# Create subgraph of sOTUs associated with runs labeled as apicomplexa
sotu_subgraph_projection_name = 'SOTU-apicomplexa-subgraph'

# Sampling into a graph name that already exists in the catalog fails, so drop
# any previous sample first. This is the same guard the full projection uses
# and it keeps the cell re-runnable.
if gds.graph.exists(sotu_subgraph_projection_name)['exists']:
    gds.graph.drop(sotu_subgraph_projection_name)

# Sample the full SOTU projection, starting from the node ids of the sOTUs
# found in the labelled runs. concurrency=1 together with randomSeed is used
# so repeated runs draw the same sample.
# The call returns a pair; only the first element (the sampled graph) is kept.
resampled_edges_major, _ = gds.alpha.graph.sample.rwr(
    graph_name=sotu_subgraph_projection_name,
    from_G=gds.graph.get(sotu_projection_name),
    concurrency=1,
    randomSeed=RANDOM_SEED,
    samplingRatio=0.5,
    nodeLabelStratification=True,
    relationshipWeightProperty='percentIdentity',
    relationshipTypes=['SEQUENCE_ALIGNMENT'],
    startNodes=sotu_to_node_id['nodeId'].tolist(),
)

In [69]:
# Get all nearest neighbor sOTUs

source_sotus = set()                # sOTUs with at least one aligned neighbour
nn_sotus = set()                    # union of every neighbour found
sotu_neighbors = defaultdict(set)   # source sOTU -> set of its neighbours

# Fetch the neighbours of every source sOTU in ONE query instead of one
# round trip (and one new client) per sOTU.
# `sotus` holds plain identifier strings, so each value is interpolated whole;
# indexing it with [0] would match on its first character only.
source_sotu_string = ','.join(f"'{sotu}'" for sotu in sotus)
query = f"""
    MATCH (n:SOTU)-[r:SEQUENCE_ALIGNMENT*..1]-(m:SOTU)
    WHERE n.sotu IN [{source_sotu_string}]
    // AND reduce(s = 0, x IN r | s + x.percentIdentity) > 2
    RETURN n.sotu AS sotu, collect(DISTINCT m.sotu) AS nn
"""
neighbor_rows = query_gds(query)

# One result row per source sOTU that matched; sOTUs without any alignment
# produce no row. itertuples() also copes with an empty result.
for row in neighbor_rows.itertuples(index=False):
    neighbors = set(row.nn)
    if not neighbors:
        continue
    source_sotus.add(row.sotu)
    sotu_neighbors[row.sotu] |= neighbors
    nn_sotus |= neighbors

print(len(source_sotus))
print(len(nn_sotus))

KeyboardInterrupt: 

In [1]:
# Inspect the neighbour collection built by the nearest-neighbour cell.
# That cell must have completed in the current kernel first; otherwise
# `nn_sotus` is undefined and this line raises NameError.
print(nn_sotus)

NameError: name 'nn_sotus' is not defined

171775
