# Quick example

pip install socialgene

In [None]:
import logging
import os

from rich import inspect

from socialgene.neo4j.neo4j import GraphDriver
from socialgene.base.socialgene import SocialGene
from socialgene.config import env_vars


In [2]:
# Connection settings for the Neo4j database used by this example.
# Values already present in the process environment take precedence, so real
# credentials never have to be written into the notebook; the literals are
# only fallbacks for the local example database.
env_vars["NEO4J_URI"] = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
env_vars["NEO4J_PASSWORD"] = os.environ.get("NEO4J_PASSWORD", "test12345")

# Raise the "neo4j" logger to WARNING while the root logger stays at INFO,
# so driver chatter does not flood the cell outputs.
logging.getLogger("neo4j").setLevel(logging.WARNING)
logging.getLogger().setLevel(logging.INFO)

Example of a parameterized query.
"Find up to 10 examples of genomes that contain a protein annotated by HMM accession x and whose isolation source is y, and return selected properties."

In [3]:
%%time
# Cypher text is kept apart from the call so the bound parameters stand out.
# `$hmm_accession` and `$isolation_source` are placeholders filled in from
# `query_parameters` below; no values are spliced into the query string.
hmm_isolation_query = """
    MATCH z=(a1:assembly)<-[:ASSEMBLES_TO]-(n1:nucleotide)-[e1:ENCODES]->(p1:protein)<-[an1:ANNOTATES]-(h1:hmm)-[:SOURCE_DB]-(:hmm_source {acc: $hmm_accession})
    WHERE a1.isolation_source = $isolation_source
    RETURN a1.uid as assembly,
            n1.external_id as sequence,
            e1.locus_tag as locus_tag, 
            e1.protein_id as protein_id, 
            e1.description as protein_description, 
            e1.start as gene_start,
            e1.end as gene_end,
            an1.i_evalue as log_adjusted_hmmsearch_i_evalue,
            p1.uid as protein_hash
    LIMIT 10
    """
query_parameters = {
    "hmm_accession": "PF04820.17",
    "isolation_source": "beach sand",
}

with GraphDriver() as db:
    # Parameters are passed as keyword arguments, exactly one per placeholder.
    results = db.run(hmm_isolation_query, **query_parameters).to_df()

# Last expression of the cell: show the resulting table.
results

CPU times: user 10.7 ms, sys: 0 ns, total: 10.7 ms
Wall time: 1.26 s


Unnamed: 0,assembly,sequence,locus_tag,protein_id,protein_description,gene_start,gene_end,log_adjusted_hmmsearch_i_evalue,protein_hash
0,GCF_004323735.1,NZ_CP036455.1,EKD16_RS20930,WP_131100656.1,geranylgeranyl reductase family protein,4955705,4957006,-2,4kvtZxe11D9P4z3kOYUmi_oLdXH4IT-N
1,GCF_004323735.1,NZ_CP036455.1,EKD16_RS14000,WP_131098792.1,tryptophan 7-halogenase,3281238,3282746,-48,59gk5hDSro-EYM2vdjKMOAtHRELP0ith
2,GCF_004323735.1,NZ_CP036455.1,EKD16_RS14000,WP_131098792.1,tryptophan 7-halogenase,3281238,3282746,-19,59gk5hDSro-EYM2vdjKMOAtHRELP0ith
3,GCF_004153685.1,NZ_SDQD01000001.1,ERD84_RS05130,WP_130038523.1,tryptophan 7-halogenase,1086657,1087835,-3,1Al_kCHd_BPbr8PkALQuRkRUh5NNP5IX
4,GCF_009687845.1,NZ_BLAD01000056.1,Acor_RS20940,WP_155338399.1,geranylgeranyl reductase family protein,26646,27914,-1,0BdeYz5UVvxOLz1iSbpRbX-5iPMN8Jiu
5,GCF_009687845.1,NZ_BLAD01000043.1,Acor_RS10630,WP_155336424.1,tryptophan 7-halogenase,1042,2583,-140,KoBq99m9B-ePpQGbNaKRerz19p2pnuZG
6,GCF_002847285.1,NZ_CP025407.1,CXR04_RS17345,WP_101423313.1,FAD-dependent oxidoreductase,3996930,3998132,-2,1QrIKvXS0hMXykmT8okKGv6_BTcqR1by
7,GCF_002847285.1,NZ_CP025407.1,CXR04_RS14205,WP_101422445.1,geranylgeranyl reductase family protein,3317579,3318865,-2,Pu2JUzpV5lq3LWA4vWfkKcEU73yw-_ap


Pull in a genome

In [4]:
# Create an empty SocialGene object, then parse a GenBank file into it.
sg = SocialGene()
# NOTE(review): this absolute path is specific to the original author's
# machine; point it at a local copy of the file before re-running.
gbk_file = "/home/chase/Documents/data/mibig/3_1/mibig_gbk_3.1/BGC0000001.gbk"
sg.parse(gbk_file)

Pull all domain info from the database for all proteins in the `sg` object

In [28]:
# Number of proteins currently held by the SocialGene object.
protein_count = len(sg.proteins)
protein_count

28

In [31]:
%%time
# Annotate the proteins held in `sg`. `use_neo4j_precalc=True` presumably pulls
# annotations already stored in the Neo4j database instead of recomputing
# them — TODO confirm against the socialgene API.
# The return value is bound to the throwaway name `_`, so it is not echoed as
# cell output.
_ = sg.annotate(use_neo4j_precalc=True)

CPU times: user 50.2 ms, sys: 0 ns, total: 50.2 ms
Wall time: 128 ms


Compare the domains between proteins (all vs all)

In [26]:
# Run the protein comparison on `sg`. `append=True` presumably adds to any
# comparison results already stored on the object — TODO confirm.
sg.compare_proteins(append=True)
# Called after the comparison; judging by its name, this converts the stored
# comparison results to a DataFrame — verify against the socialgene API.
sg.protein_comparison_to_df()

`mod_score` below is:

```
if jaccard_score == 0:
    mod_score_value = 0
else:
    mod_score_value = (jaccard_score * 0.5) + mod_levenshtein_score
```

Where `mod_levenshtein_score` is a Levenshtein distance adjusted by the maximum number of edits, so that it is scaled from 0 to 1. With the formula above, `mod_score` therefore ranges from 0 to 1.5.

In [32]:
# Display the comparison table stored on `sg` (one row per query/target pair).
sg.protein_comparison

Unnamed: 0,query,target,query_n_domains,target_n_domains,levenshtein,jaccard,mod_score
0,y16g4mrip1rIn89xeDFTX6FLUhaY2rlF,y16g4mrip1rIn89xeDFTX6FLUhaY2rlF,7,7,1.0,1.0,1.5
55,MoLkddRrWI7LVxHGv7ypKfUNz7Laj2wL,MoLkddRrWI7LVxHGv7ypKfUNz7Laj2wL,3,3,1.0,1.0,1.5
106,KIR7ngehgJ-dPS9UAmQKYTbeU-eNzVmQ,KIR7ngehgJ-dPS9UAmQKYTbeU-eNzVmQ,7,7,1.0,1.0,1.5
130,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,5,5,1.0,1.0,1.5
153,XQ8mbp2HCNhOTXzOc8kpkNwC09FUlJcy,XQ8mbp2HCNhOTXzOc8kpkNwC09FUlJcy,7,7,1.0,1.0,1.5
...,...,...,...,...,...,...,...
152,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,cp7rGmIB7AZOYPd-2dPXQ_-MtnzSnKXD,5,4,0.0,0.0,0.0
151,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,ufAusqX795oGMqZjvJz97i80uvDqux97,5,12,0.0,0.0,0.0
150,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,Na-Fe4IJ27iRUmWcQri4xMBvaciXdu9A,5,8,0.0,0.0,0.0
149,_QAobw4i1j-wEsBoKzgWvQY4TIp7k_ro,kBBW8NgXlhgoHI6zCf0ILCnleFPQsoGk,5,2,0.0,0.0,0.0
