In [32]:
import pickle
import pandas as pd
from pathlib import Path
import gzip
from plotnine import *
from socialgene.clustermap.serialize import SerializeToClustermap
from socialgene.neo4j.neo4j import GraphDriver # grab the the neo4j connection
from socialgene.config import env_vars
# Point socialgene at the local Neo4j instance (bolt protocol, port 7687).
# NOTE(review): presumably read when a GraphDriver connection is opened - confirm.
env_vars["NEO4J_URI"] = "bolt://localhost:7687"

# NOTE: machine-specific location of the pickled search results; change
# PICKLE_DIR when running this notebook on another machine.
PICKLE_DIR = Path('/media/chase/80a5af81-07f9-454f-9801-3d305555e821/search_against_refseq_pickles')
# ID of the query BGC whose pickled search result is analysed in this notebook.
QUERY_BGC_ID = 'BGC0001979'

# Kept as a plain string so existing uses of `pickle_path` behave as before.
pickle_path = str(PICKLE_DIR / f'{QUERY_BGC_ID}.pickle')
# NOTE(review): `json_path` is not referenced by any cell visible here - confirm it is still needed.
json_path = 'clinker.json'

In [33]:
# The pickle file is a serialized SocialGene search object. It holds the results of
# searching all of RefSeq for genetic regions similar to the query cluster named in
# `pickle_path` (BGC0001979).
#
# SECURITY: unpickling can execute arbitrary code. Only load pickle files that were
# produced by a trusted SocialGene run.
with open(pickle_path, 'rb') as pickle_file:
    search_object = pickle.load(pickle_file)

In [34]:
# Summarise `pident` for every target gene cluster: mean, median and number of links.
pident_stats = search_object.link_df.groupby('target_gene_cluster').agg(
    {'pident': ['mean', 'median', 'count']}
)
# Flatten the two-level column labels into pident_mean / pident_median / pident_count.
pident_stats.columns = ['_'.join(label).strip() for label in pident_stats.columns.values]
df = pident_stats
# Optional filter (disabled): keep only target clusters whose link count equals the
# number of features in the input BGC.
# df=df[df.pident_count == len(search_object.input_bgc.features)]


In [35]:
# Combine the two per-cluster comparison tables produced by the search object.
# NOTE(review): these are private (underscore-prefixed) methods; their exact column
# schema is not visible from this notebook - confirm against socialgene.
jaccard_levenshtein_scores = search_object._compare_bgcs_by_jaccard_and_levenshtein()
median_bitscore_scores = search_object._compare_bgcs_by_median_bitscore()

# Inner join: `query_gene_cluster` of the first table against `target_gene_cluster`
# of the second. Column names shared by both tables receive pandas' default
# `_x` / `_y` suffixes.
temp = pd.merge(
    jaccard_levenshtein_scores,
    median_bitscore_scores,
    how="inner",
    left_on="query_gene_cluster",
    right_on="target_gene_cluster",
)

In [36]:
# Build the final per-hit results table.
#
# NOTE: this cell rebinds `df`. It expects `df` to be the per-cluster pident summary
# (indexed by `target_gene_cluster`) and `temp` to be the merged score table, so
# re-run the cells that create them before re-running this one.
report_columns = [
    'query_bgc',
    'target_gene_cluster_y',
    'pident_mean',
    'pident_median',
    'pident_count',
    'levenshtein_include_internal_nonortholog',
    'levenshtein_only_orthologs',
    'percent_of_query',
    'jaccard',
    'modscore',
    'score',
]

df = (
    # Attach the similarity scores to the pident summary of each target cluster.
    pd.merge(df, temp, left_on='target_gene_cluster', right_on='query_gene_cluster_x', how='inner')
    # Drop the redundant join-key columns; `target_gene_cluster_y` is the one kept.
    .drop(columns=['query_gene_cluster_x', 'target_gene_cluster_x', 'query_gene_cluster_y'])
    # Best hits first.
    .sort_values(by=["modscore", "score"], ascending=False)
    .assign(query_bgc=search_object.input_bgc_id)
    .loc[:, report_columns]
    # `.assign` returns a new frame, so the derived columns are not written into a
    # column-subset selection (which pandas flags with SettingWithCopyWarning).
    .assign(
        target_nucleotide_sequence=lambda frame: frame['target_gene_cluster_y'].apply(
            lambda gene_cluster: gene_cluster.parent.external_id
        ),
        target_assembly=lambda frame: frame['target_gene_cluster_y'].apply(
            lambda gene_cluster: gene_cluster.parent.parent.uid
        ),
    )
)

In [37]:
# Display only the hits whose target assembly uid begins with 'BGC'.
is_bgc_assembly = df['target_assembly'].str.startswith('BGC')
df.loc[is_bgc_assembly]

Unnamed: 0,query_bgc,target_gene_cluster_y,pident_mean,pident_median,pident_count,levenshtein_include_internal_nonortholog,levenshtein_only_orthologs,percent_of_query,jaccard,modscore,score,target_nucleotide_sequence,target_assembly
14,BGC0001979,<socialgene.base.molbio.GeneCluster object at ...,100.0,100.0,53,1.0,1.0,100,1.0,3.0,696.0,BGC0001979,BGC0001979
7,BGC0001979,<socialgene.base.molbio.GeneCluster object at ...,79.360417,82.9,48,0.62069,0.754717,91,0.827586,2.275862,532.0,BGC0002141,BGC0002141
18,BGC0001979,<socialgene.base.molbio.GeneCluster object at ...,44.409524,39.0,21,0.426471,0.490566,40,0.308824,0.723363,194.0,BGC0001409,BGC0001409
44,BGC0001979,<socialgene.base.molbio.GeneCluster object at ...,45.552632,42.7,19,0.416667,0.528302,36,0.263889,0.661426,197.0,BGC0001061,BGC0001061
20,BGC0001979,<socialgene.base.molbio.GeneCluster object at ...,50.255556,47.4,18,0.394366,0.603774,34,0.253521,0.580654,265.5,BGC0001568,BGC0001568


In [38]:

# Order the hits for plotting: highest median `pident` first.
# NOTE: this rebinds `temp`, which an earlier cell used for the merged score table.
temp = df.sort_values(by=["pident_median"], ascending=False)

# The input assembly goes first, followed by the grandparent object of every target
# gene cluster (the same `parent.parent` whose `uid` fills `target_assembly`).
assemblies = [search_object.input_assembly] + [
    gene_cluster.parent.parent for gene_cluster in temp["target_gene_cluster_y"]
]

zz = SerializeToClustermap(
    sg_object=search_object.sg_object,
    sorted_bgcs=assemblies,
    link_df=search_object.link_df,
    group_df=search_object.group_df,
)

# Make sure the output directory exists so the write does not depend on a
# pre-existing ./plot folder.
clustermap_json_path = Path('./plot/data.json')
clustermap_json_path.parent.mkdir(parents=True, exist_ok=True)
zz.write(clustermap_json_path)