In [1]:
from pathlib import Path

import pandas as pd

from socialgene.base.socialgene import SocialGene
from socialgene.config import env_vars
from socialgene.neo4j.neo4j import GraphDriver  # grab the neo4j connection

# Let pandas render up to 100 rows so the result tables below are not truncated.
pd.set_option("display.max_rows", 100)

# Point SocialGene at the Neo4j instance exposed over bolt on local port 7688.
env_vars["NEO4J_URI"] = "bolt://localhost:7688"

> Note: While timings here are fast, they also include latency between my computer in Chicago and the server in Madison, WI.

In [2]:
%%time
# Cypher: start at the hmm_source node with accession PF00227.29, follow it to
# its hmm node, then to the proteins that hmm annotates and the nucleotide
# records encoding them. Only nucleotide records whose external_id starts with
# "BGC" are kept (the notebook later refers to these as the MIBiG entries).
mibig_hits_query = """
                MATCH 
                    (h1:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(hmm:hmm),
                    (hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)
                where n1.external_id starts with "BGC"
                RETURN  
                        h1.acc as hmm_acc,
                        h1.description as description,
                        n1.external_id as nucelotide_id,
                        e1.locus_tag as locus_tag,
                        e1.external_id as protein_id,
                        e1.description
        """

with GraphDriver() as session:
    # Materialise the result as a DataFrame while the connection is still open.
    df = session.run(mibig_hits_query).to_df()

df


CPU times: user 11.2 ms, sys: 261 µs, total: 11.5 ms
Wall time: 591 ms


Unnamed: 0,hmm_acc,description,nucelotide_id,locus_tag,protein_id,e1.description
0,PF00227.29,Proteasome subunit,BGC0000145,,ABP73653.1,SalI
1,PF00227.29,Proteasome subunit,BGC0000345,,AHB38505.1,proteasome beta-subunit
2,PF00227.29,Proteasome subunit,BGC0000971,,CBW54670.1,20S proteasome beta-subunit
3,PF00227.29,Proteasome subunit,BGC0001041,Strop_1015,ABP53490.1,"20S proteasome, A and B subunits"
4,PF00227.29,Proteasome subunit,BGC0001202,,AKA59433.1,proteasome subunit beta
5,PF00227.29,Proteasome subunit,BGC0001203,,AKA59451.1,proteasome subunit beta
6,PF00227.29,Proteasome subunit,BGC0001399,ANIA_03493,CBF76042.1,conserved hypothetical protein
7,PF00227.29,Proteasome subunit,BGC0001829,,CUX96950.1,TmcC


Count the total number of non-MIBiG BGCs (antiSMASH regions) that contain a PF00227.29 hit

In [3]:
%%time
# Cypher: for every (assembly, nucleotide) pair outside the "BGC"-prefixed
# records, count the distinct antiSMASH regions holding a PF00227.29-annotated
# protein, then sum those per-nucleotide counts into one total.
region_count_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        WITH a1, n1, COUNT(DISTINCT(e1.antismash_region)) as ar_in_nuc
        RETURN sum(ar_in_nuc) as count
        
        """

with GraphDriver() as session:
    # .value() collects the returned column into a sequence; the query returns
    # a single aggregate, which is read back as element 0 below.
    df = session.run(region_count_query).value()

df[0]

CPU times: user 2.12 ms, sys: 0 ns, total: 2.12 ms
Wall time: 1.54 s


1595

In [4]:
%%time
# Cypher: same traversal and filters as the region count, but reports how many
# distinct values of each taxonomic rank property appear on the matched
# assembly nodes.
taxonomy_counts_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        RETURN 
            count(DISTINCT a1.superkingdom) as superkingdom_count,
            count(DISTINCT a1.phylum) as phylum_count,
            count(DISTINCT a1.class) as class_count,
            count(DISTINCT a1.order) as order_count,
            count(DISTINCT a1.family) as family_count,
            count(DISTINCT a1.genus) as genus_count           
        """

with GraphDriver() as session:
    # One row, one column per taxonomic rank.
    df = session.run(taxonomy_counts_query).to_df()
df

CPU times: user 204 µs, sys: 3.39 ms, total: 3.6 ms
Wall time: 1.5 s


Unnamed: 0,superkingdom_count,phylum_count,class_count,order_count,family_count,genus_count
0,3,25,44,93,167,280


All BGC types

In [5]:
%%time
# Cypher: reduce the matches to distinct (assembly, nucleotide, region) rows,
# each carrying its sorted antiSMASH product list. The lists are sorted so the
# same combination of products always tallies together. They are then
# collected, tallied with apoc.coll.frequenciesAsMap, and unwound into one row
# per product combination, most frequent first.
all_bgc_types_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        WITH distinct a1, n1, e1.antismash_region as ar, apoc.coll.sort(e1.antismash_products) as ap
        WITH collect(ap) as zz
        with apoc.coll.frequenciesAsMap(zz) as w
        unwind keys(w) as ww
        return ww as bgc_type, w[ww] as count order by count DESC
        """

with GraphDriver() as session:
    # Columns: bgc_type (product combination) and count.
    df = session.run(all_bgc_types_query).to_df()

df


CPU times: user 7.28 ms, sys: 93 µs, total: 7.37 ms
Wall time: 1.52 s


Unnamed: 0,bgc_type,count
0,[ectoine],257
1,[NRPS-like],225
2,[T3PKS],123
3,[terpene],107
4,[T1PKS],81
5,"[NRPS, T1PKS]",74
6,[NRPS],72
7,"[arylpolyene, resorcinol]",69
8,"[NRPS-like, T1PKS]",65
9,[arylpolyene],57


Bacterial BGC types

In [6]:
%%time
# Cypher: tally of antiSMASH product combinations per distinct
# (assembly, nucleotide, region), restricted to assemblies whose superkingdom
# property equals "Bacteria". Product lists are sorted before counting so the
# same combination always tallies together.
bacterial_bgc_types_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        WITH distinct a1, n1, e1.antismash_region as ar, apoc.coll.sort(e1.antismash_products) as ap
        where a1.superkingdom="Bacteria"
        WITH collect(ap) as zz
        with apoc.coll.frequenciesAsMap(zz) as w
        unwind keys(w) as ww
        return ww as bgc_type, w[ww] as count ORDER BY count DESC 
        """

with GraphDriver() as session:
    # Columns: bgc_type (product combination) and count, most frequent first.
    df = session.run(bacterial_bgc_types_query).to_df()

df


CPU times: user 6.65 ms, sys: 187 µs, total: 6.83 ms
Wall time: 1.5 s


Unnamed: 0,bgc_type,count
0,[ectoine],257
1,[NRPS-like],180
2,[T3PKS],120
3,[terpene],78
4,"[NRPS, T1PKS]",70
5,"[arylpolyene, resorcinol]",69
6,"[NRPS-like, T1PKS]",57
7,[arylpolyene],57
8,[T1PKS],48
9,[NRPS],42


Eukaryotic BGC types

In [7]:
%%time
# Cypher: tally of antiSMASH product combinations per distinct
# (assembly, nucleotide, region), restricted to assemblies whose superkingdom
# property equals "Eukaryota". Product lists are sorted before counting so the
# same combination always tallies together.
eukaryota_bgc_types_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        WITH distinct a1, n1, e1.antismash_region as ar, apoc.coll.sort(e1.antismash_products) as ap
        where a1.superkingdom="Eukaryota"
        WITH collect(ap) as zz
        with apoc.coll.frequenciesAsMap(zz) as w
        unwind keys(w) as ww
        return ww as bgc_type, w[ww] as count ORDER BY count DESC 
        """

with GraphDriver() as session:
    # Columns: bgc_type (product combination) and count, most frequent first.
    df = session.run(eukaryota_bgc_types_query).to_df()

df


CPU times: user 3.25 ms, sys: 0 ns, total: 3.25 ms
Wall time: 1.4 s


Unnamed: 0,bgc_type,count
0,[NRPS-like],45
1,[fungal-RiPP-like],36
2,"[NRP-metallophore, NRPS]",35
3,[T1PKS],32
4,[NRPS],25
5,[terpene],13
6,"[NRPS-like, T1PKS]",7
7,"[NRPS, T1PKS]",4
8,[T3PKS],3
9,"[T1PKS, terpene]",2


Archaeal BGC types

In [8]:
%%time
# Cypher: tally of antiSMASH product combinations per distinct
# (assembly, nucleotide, region), restricted to assemblies whose superkingdom
# property equals "Archaea". Product lists are sorted before counting so the
# same combination always tallies together.
archaea_bgc_types_query = """
        MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
        where e1.antismash_region is not null and not n1.external_id starts with "BGC" 
        WITH distinct a1, n1, e1.antismash_region as ar, apoc.coll.sort(e1.antismash_products) as ap
        where a1.superkingdom = "Archaea" 
        WITH collect(ap) as zz
        with apoc.coll.frequenciesAsMap(zz) as w
        unwind keys(w) as ww
        return ww as bgc_type, w[ww] as count ORDER BY count DESC 
        """

with GraphDriver() as session:
    # Columns: bgc_type (product combination) and count, most frequent first.
    df = session.run(archaea_bgc_types_query).to_df()

df


CPU times: user 2.9 ms, sys: 0 ns, total: 2.9 ms
Wall time: 1.54 s


Unnamed: 0,bgc_type,count
0,[terpene],14
1,[betalactone],12
2,[RRE-containing],9
3,[thiopeptide],6
4,[NRPS],5
5,[RiPP-like],4
6,[NI-siderophore],2
7,[linaridin],1
8,[thioamitides],1
9,"[thioamitides, thiopeptide]",1


Grab the BGCs from the SocialGene database and write a GenBank file for each one

In [9]:
%%time
# Cypher, in two stages:
#   1. find every (nucleotide, antiSMASH region) pair that holds a
#      PF00227.29-annotated protein, excluding "BGC"-prefixed records;
#   2. re-match all ENCODES relationships of that nucleotide in the same region
#      and take min(start) / max(end) as the region's coordinate span.
# The "nucelotide_id" key (sic) is kept as spelled: the loop that fills `sg`
# reads that exact key.
region_bounds_query = """
    MATCH (:hmm_source {acc:"PF00227.29"})-[:SOURCE_DB]-(:hmm)-[:ANNOTATES]->(:protein)<-[e1:ENCODES]-(n1:nucleotide)-[:ASSEMBLES_TO]-(a1:assembly)
    where e1.antismash_region is not null and not n1.external_id starts with "BGC"
    WITH DISTINCT n1, e1.antismash_region as ar
    MATCH (n1)-[e1:ENCODES]-(p1:protein)
    WHERE e1.antismash_region = ar
    WITH n1, e1.antismash_region as ar,  e1.antismash_products as type, min(e1.start) as start, max(e1.end) as end 
    RETURN n1.uid as nucelotide_id, n1.external_id as external_id, ar as region, type, start, end 
    """

with GraphDriver() as session:
    # .data() turns each result row into a plain dict keyed by the RETURN aliases.
    res = session.run(region_bounds_query).data()



CPU times: user 77.8 ms, sys: 8.19 ms, total: 86 ms
Wall time: 3.42 s


In [10]:
# Fresh SocialGene object; the cells below fill it with locus ranges, register
# BGCs on those loci, and finally export them.
sg = SocialGene()

In [11]:
# For each region row returned by the query: load that coordinate span of the
# nucleotide record into `sg`, then register the span as a BGC on the locus
# that was filled.
for region_row in res:
    span_start = region_row["start"]
    span_end = region_row["end"]
    # The returned mapping names the assembly and locus keys the span landed in.
    filled = sg.fill_given_locus_range(
        locus_uid=region_row["nucelotide_id"],
        start=span_start,
        end=span_end,
    )
    locus = sg.assemblies[filled['assembly']].loci[filled['locus']]
    # BGC uid: "<region>_<product1>-<product2>-...".
    bgc_uid = f'{region_row["region"]}_{"-".join(region_row["type"])}'
    locus.add_bgcs_by_start_end(start=span_start, end=span_end, uid=bgc_uid)

In [12]:
# NOTE(review): presumably pulls sequences from Neo4j for the proteins already
# loaded into `sg`, so the exported GenBank files carry them. Confirm against
# SocialGene.add_sequences_from_neo4j.
sg.add_sequences_from_neo4j()

In [13]:
# Write one GenBank file per gene cluster held in `sg`, named
# "<assembly key>_<locus external id>_<cluster uid>.gbk" under bgc_gbk/.
output_dir = Path("bgc_gbk")
# Create the target directory up front so the export does not depend on a
# folder that was made by hand outside the notebook.
output_dir.mkdir(parents=True, exist_ok=True)

for assembly_key, assembly in sg.assemblies.items():
    for locus in assembly.loci.values():
        for gene_cluster in locus.gene_clusters:
            gbk_name = f"{assembly_key}_{locus.external_id}_{gene_cluster.uid}.gbk"
            # Pass a plain string path, as the original call did.
            gene_cluster.write_genbank(str(output_dir / gbk_name))
        
    