In [1]:

import pandas as pd
from pathlib import Path
import gzip
from plotnine import *
from socialgene.base.socialgene import SocialGene
from socialgene.neo4j.neo4j import GraphDriver 
from socialgene.config import env_vars
# Override the socialgene setting for the Neo4j connection URI: Bolt protocol on
# localhost, port 7687. Presumably consumed by GraphDriver() in the cells below
# when it opens a connection — confirm against socialgene.config.
env_vars["NEO4J_URI"] = "bolt://localhost:7687"


In [2]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH (p1:protein)
        RETURN count(p1) as count
        """
    ).value()

print(f"{x[0]} proteins in the database")

304330794 proteins in the database
CPU times: user 9.67 ms, sys: 95 µs, total: 9.76 ms
Wall time: 145 ms


In [3]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH (p1:hmm)
        RETURN count(p1) as count
        """
    ).value()

print(f"{x[0]} HMM non-redundant models in the database")

25566 HMM non-redundant models in the database
CPU times: user 2.63 ms, sys: 0 ns, total: 2.63 ms
Wall time: 53.1 ms


In [4]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH ()-[r:ANNOTATES]->()
        RETURN count(r) as count
        """
    ).value()

print(f"{x[0]} HMM annotations in the database")

847850795 HMM annotations in the database
CPU times: user 2.05 ms, sys: 30 µs, total: 2.08 ms
Wall time: 51.4 ms


In [5]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH (p1:protein)
        WHERE NOT (p1)<-[:ANNOTATES]-(:hmm)
        RETURN count(p1) as count
        """
    ).value()

print(f"{x[0]} proteins with no annotation")

57711961 proteins with no annotation
CPU times: user 12.1 ms, sys: 610 µs, total: 12.8 ms
Wall time: 12min 18s


In [6]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH (p1:protein)
        RETURN count(p1) as count
        """
    ).value()

print(f"{x[0]} proteins with no annotation")

304330794 proteins with no annotation
CPU times: user 2.01 ms, sys: 108 µs, total: 2.12 ms
Wall time: 12.2 ms


In [11]:
# Counts copied from the printed outputs of the earlier count queries
# (proteins lacking any hmm ANNOTATES relationship, and all protein nodes).
proteins_without_annotation = 57_711_961
total_proteins = 304_330_794

# Percentage of proteins with at least one HMM annotation, i.e. the complement
# of the un-annotated share, rounded to two decimals.
percent_annotated = round(100 - proteins_without_annotation / total_proteins * 100, 2)
percent_annotated

81.04

In [2]:
%%time
with GraphDriver() as db:
    x= db.run(
        """
        MATCH (p1:protein)<-[:ANNOTATES]-(:hmm)-[:SOURCE_DB]->(:pfam)
        RETURN count(distinct p1) as count
        """
    ).value()

print(f"{x[0]} proteins with Pfam annotation")

241632003 proteins with Pfam annotation
CPU times: user 17.7 ms, sys: 420 µs, total: 18.1 ms
Wall time: 8min 34s


In [4]:
# Counts copied from the printed outputs of the earlier count queries
# (distinct proteins with a Pfam-sourced hmm annotation, and all protein nodes).
proteins_with_pfam_annotation = 241_632_003
total_proteins = 304_330_794

# Percentage of all proteins that carry a Pfam annotation, rounded to two decimals.
percent_pfam_annotated = round(proteins_with_pfam_annotation / total_proteins * 100, 2)
percent_pfam_annotated

79.4