In [1]:
import networkx as nx
import pandas as pd
from pyvis.network import Network
import numpy
import os

In [2]:
def build_network(edge_file, mapping_file, family_file, groupby="Genus"):
    """
    Build an annotated networkx graph from BiG-SCAPE output tables.

    Parameters
    ----------
    edge_file : str or path
        Tab-separated edge table. Must contain "Clustername 1",
        "Clustername 2" and the edge attribute columns listed in
        ``edge_attributes`` below.
    mapping_file : str or path
        Comma-separated metadata table with a "BGC" column (used as index)
        and a "Taxonomy" column holding comma-separated ranks.
    family_file : str or path
        Tab-separated clustering table with a "#BGC Name" column (used as
        index).
    groupby : str, default "Genus"
        Column of the merged attribute table that is copied into the node
        attribute "group". The original "group" value is preserved in
        "group label".

    Returns
    -------
    networkx.Graph
        One node per cluster name found in the edge table. Nodes that have a
        row in the merged attribute table carry every column of that table as
        a node attribute.

    Raises
    ------
    KeyError
        If a row of the merged table has a "group" value other than
        'NCBI', 'MIBIG' or 'Aalborg', or if a required column is missing.

    Notes
    -----
    The merged table must provide the columns "genome_id", "group",
    "Product Prediction" and "Organism"; which input file supplies them is
    not checked here.
    """
    import warnings

    df_edge = pd.read_csv(edge_file, sep="\t")
    df_mapping = pd.read_csv(mapping_file).set_index("BGC")
    df_family = pd.read_csv(family_file, sep="\t").set_index("#BGC Name")

    # Formatting taxonomy: one column per rank, padded with NaN when the
    # lineage is shorter than seven ranks or missing altogether. Rows are
    # collected first and the frame is built once instead of growing it
    # cell by cell.
    tax_levels = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
    tax_rows = []
    for taxonomy in df_mapping["Taxonomy"]:
        ranks = taxonomy.split(",") if isinstance(taxonomy, str) else []
        ranks = ranks[:len(tax_levels)]
        tax_rows.append(ranks + [numpy.nan] * (len(tax_levels) - len(ranks)))
    df_tax = pd.DataFrame(tax_rows, index=df_mapping.index, columns=tax_levels)

    # merging attributes
    df_attributes = pd.concat([df_mapping, df_family, df_tax], axis=1)

    # layouting attributes
    df_attributes["label"] = df_attributes["genome_id"]

    shape_lib = {'NCBI': "dot",
                 'MIBIG': "square",
                 'Aalborg': "triangle"}

    size_lib = {'NCBI': 50,
                'MIBIG': 200,
                'Aalborg': 200}

    # Fail with a readable message instead of a bare ``KeyError: nan`` when a
    # BGC has no (or an unexpected) source group.
    unknown = ~df_attributes["group"].isin(list(shape_lib))
    if unknown.any():
        examples = list(df_attributes.index[unknown][:5])
        raise KeyError(
            f"'group' must be one of {sorted(shape_lib)}; "
            f"{int(unknown.sum())} BGC(s) have another value, e.g. {examples}"
        )

    df_attributes["title"] = [
        f"{product} - {genome}.{bgc.split('.')[-1]} : {organism}"
        for bgc, product, genome, organism in zip(
            df_attributes.index,
            df_attributes["Product Prediction"],
            df_attributes["genome_id"],
            df_attributes["Organism"],
        )
    ]
    df_attributes["shape"] = df_attributes["group"].map(shape_lib)
    # Kept as float: the previous cell-by-cell assignment produced a float
    # column, and downstream consumers received float sizes.
    df_attributes["size"] = df_attributes["group"].map(size_lib).astype(float)
    df_attributes["group label"] = df_attributes["group"]
    df_attributes["group"] = df_attributes[groupby]

    # build network
    edge_attributes = ['Raw distance', 'Squared similarity',
                       'Jaccard index', 'DSS index', 'Adjacency index',
                       'raw DSS non-anchor', 'raw DSS anchor',
                       'Non-anchor domains', 'Anchor domains',
                       'Combined group', 'Shared group']
    G = nx.from_pandas_edgelist(df_edge, "Clustername 1", "Clustername 2", edge_attributes)

    # enrich with attributes
    # A node without metadata is skipped on its own; previously the first
    # such node silently aborted the annotation of every remaining node.
    missing = []
    for node in G.nodes:
        if node not in df_attributes.index:
            missing.append(node)
            continue
        for column in df_attributes.columns:
            G.nodes[node][column] = df_attributes.loc[node, column]
    if missing:
        warnings.warn(
            f"{len(missing)} network node(s) have no entry in the attribute "
            f"table and were left unannotated, e.g. {missing[:5]}"
        )
    return G

In [3]:
def filter_network(G):
    """
    Prune the graph down to the components that contain a query BGC.

    Query ("target") nodes are those whose "group label" attribute equals
    "Aalborg". Nodes without a "group label" attribute are treated as
    non-target. The graph is modified in place and also returned.

    Steps, in order:
      1. remove self-loop edges on non-target nodes,
      2. remove isolated non-target nodes,
      3. remove every connected component that contains no target node.

    Parameters
    ----------
    G : networkx.Graph
        Undirected graph whose nodes may carry a "group label" attribute.

    Returns
    -------
    networkx.Graph
        The same object as ``G``, after pruning.
    """
    # A set gives O(1) membership tests; .get() tolerates unannotated nodes.
    target = {
        node for node, attrs in G.nodes(data=True)
        if attrs.get('group label') == "Aalborg"
    }

    # Filter self-looped singleton
    # Both endpoints of a self-loop are the same node, so one check suffices.
    # The list is materialised before removal so the graph is not mutated
    # while being iterated.
    G.remove_edges_from(
        [(u, v) for u, v in nx.selfloop_edges(G) if u not in target]
    )

    # Filter singleton
    # Compare the node itself against the targets (the previous code compared
    # only the first character of the node id, so targets were never spared).
    G.remove_nodes_from(
        [node for node in nx.isolates(G) if node not in target]
    )

    # filter group without target
    orphaned = [
        node
        for component in nx.connected_components(G)
        if target.isdisjoint(component)
        for node in component
    ]
    G.remove_nodes_from(orphaned)
    return G

In [4]:
def Visualize_network(G):
    """
    Create a pyvis network from ``G`` with three extra legend nodes.

    One legend node is added for each of 'NCBI', 'MIBIG' and 'Aalborg',
    drawn in grey with the shape used for that source. The legend is added to
    a copy of ``G``, so the caller's graph is left untouched and repeated
    calls do not accumulate legend nodes.

    Parameters
    ----------
    G : networkx.Graph
        Graph to render. Not modified.

    Returns
    -------
    pyvis.network.Network
        Network built with ``notebook=True`` and Barnes-Hut physics.
    """
    # Work on a copy: legend nodes are a display artefact and must not end up
    # in tables the caller exports from the graph afterwards.
    graph = G.copy()

    # Add Legend Nodes
    step = 1000
    x = 1000
    y = 1000

    shape_lib = {'NCBI': "dot",
                 'MIBIG': "square",
                 'Aalborg': "triangle"}

    # Legend ids are integers starting at the current node count.
    # NOTE(review): this assumes real node ids are not integers in that
    # range - confirm against the edge table.
    first_legend_id = graph.number_of_nodes()
    legend_nodes = []
    for offset, legend_node in enumerate(['NCBI', 'MIBIG', 'Aalborg']):
        attributes = {
            'group': legend_node,
            'label': str(legend_node),
            'size': 200,
            # 'fixed': True, # So that we can move the legend nodes around to arrange them better
            'physics': False,
            'y': y,
            # NOTE(review): x is passed as a "<n>px" string while y is a
            # number; kept as-is, confirm this is what the renderer expects.
            'x': f'{x + offset*step}px',
            'shape': shape_lib[legend_node],
            'color': 'grey',
            'font': {'size': 200}
        }
        legend_nodes.append((first_legend_id + offset, attributes))
    graph.add_nodes_from(legend_nodes)

    pyvis_graph = Network(height='500px', width='800px', notebook=True)
    pyvis_graph.from_nx(graph, default_node_size=50)
    pyvis_graph.barnes_hut(central_gravity=0.3)
    return pyvis_graph

In [5]:
def visualize_bigscape(bigscape_path, as_type, mapping_file, outdir=".", groupby="Genus", cutoff='0.30'):
    """
    Build, filter, export and render the network of one BGC class at one cutoff.

    Reads ``<bigscape_path>/<as_type>/<as_type>_c<cutoff>.network`` and
    ``<bigscape_path>/<as_type>/<as_type>_clustering_c<cutoff>.tsv``, prints
    the number of remaining nodes per source, and writes three files to
    ``outdir``: ``<as_type>_c<cutoff>.csv`` (edge list),
    ``<as_type>_c<cutoff>_mapping.csv`` (node attributes) and
    ``<as_type>_c<cutoff>.html`` (interactive network).

    Parameters
    ----------
    bigscape_path : str
        Directory containing one sub-directory per BGC class.
    as_type : str
        BGC class; used as sub-directory name and file-name prefix.
    mapping_file : str
        Metadata table passed on to ``build_network``.
    outdir : str, default "."
        Output directory; created if it does not exist.
    groupby : str, default "Genus"
        Passed on to ``build_network``.
    cutoff : str or number, default '0.30'
        Distance cutoff as it appears in the file names. Strings are used
        verbatim; ints and floats are formatted with two decimals.

    Returns
    -------
    Whatever ``pyvis_graph.show`` returns for the written HTML file.
    """
    # str(0.3) is "0.3", which would not match the two-decimal file names
    # used here (e.g. "c0.30"); format numeric cutoffs explicitly.
    if isinstance(cutoff, (int, float)):
        cutoff = f"{cutoff:.2f}"
    else:
        cutoff = str(cutoff)
    prefix = f"{as_type}_c{cutoff}"

    edge_file = os.path.join(bigscape_path, as_type, f"{prefix}.network")
    family_file = os.path.join(bigscape_path, as_type, f"{as_type}_clustering_c{cutoff}.tsv")
    G = build_network(edge_file, mapping_file, family_file, groupby=groupby)
    G = filter_network(G)

    #STATS
    # .get() so that a node without metadata is simply not counted.
    labels = [attrs.get('group label') for _, attrs in G.nodes(data=True)]
    print(f"{prefix} - Number of BGCs from query: {labels.count('Aalborg')}")
    print(f"{prefix} - Number of MIBIG hits: {labels.count('MIBIG')}")
    print(f"{prefix} - Number of NCBI/GTDB hits: {labels.count('NCBI')}")

    # Export the tables before rendering, so that legend nodes added for the
    # visualisation cannot end up in the exported node table.
    os.makedirs(outdir, exist_ok=True)
    nx.to_pandas_edgelist(G).to_csv(os.path.join(outdir, f"{prefix}.csv"), index=False)
    pd.DataFrame.from_dict(dict(G.nodes(data=True)), orient='index').to_csv(os.path.join(outdir, f"{prefix}_mapping.csv"))

    pyvis_graph = Visualize_network(G)
    return pyvis_graph.show(os.path.join(outdir, f"{prefix}.html"))

In [6]:
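# One-time setup: uncomment to unpack the archives whose contents the cells below read from ../data.
#! (cd ../data && unzip p__Nitrospirota_all_antismash_6.0.1.zip)
#! (cd ../data && unzip p__Myxococcota_all_antismash_6.0.1.zip)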

In [12]:
# Input locations for the Nitrospirota run.
bigscape_path = "../data/p__Nitrospirota_all_antismash_6.0.1/network_files/2022-02-01_16-12-56_glocal_p__Nitrospirota_all_antismash_6.0.1/"
mapping_file = '../tables/bigscape_mapping_nitro.csv'

# BGC classes and cutoffs to render; `groupby` names the column that is
# copied into each node's "group" attribute.
as_type = [
    'Others',
    'PKS-NRP_Hybrids',
    'RiPPs',
    'NRPS',
    'PKSI',
    'PKSother',
    'Terpene',
]
cutoffs = ["0.30", "0.40", "0.50"]
groupby = "Genus"

# Cutoff varies slowest, so the printed statistics are grouped per cutoff.
for cutoff, as_class in ((c, a) for c in cutoffs for a in as_type):
    visualize_bigscape(
        bigscape_path,
        as_class,
        mapping_file,
        outdir="../networks/p__Nitrospirota",
        groupby=groupby,
        cutoff=cutoff,
    )

Others_c0.30 - Number of BGCs from query: 18
Others_c0.30 - Number of MIBIG hits: 0
Others_c0.30 - Number of NCBI/GTDB hits: 109
PKS-NRP_Hybrids_c0.30 - Number of BGCs from query: 5
PKS-NRP_Hybrids_c0.30 - Number of MIBIG hits: 0
PKS-NRP_Hybrids_c0.30 - Number of NCBI/GTDB hits: 9
RiPPs_c0.30 - Number of BGCs from query: 23
RiPPs_c0.30 - Number of MIBIG hits: 0
RiPPs_c0.30 - Number of NCBI/GTDB hits: 192
NRPS_c0.30 - Number of BGCs from query: 6
NRPS_c0.30 - Number of MIBIG hits: 0
NRPS_c0.30 - Number of NCBI/GTDB hits: 33
PKSI_c0.30 - Number of BGCs from query: 2
PKSI_c0.30 - Number of MIBIG hits: 0
PKSI_c0.30 - Number of NCBI/GTDB hits: 8
PKSother_c0.30 - Number of BGCs from query: 0
PKSother_c0.30 - Number of MIBIG hits: 0
PKSother_c0.30 - Number of NCBI/GTDB hits: 0
Terpene_c0.30 - Number of BGCs from query: 24
Terpene_c0.30 - Number of MIBIG hits: 0
Terpene_c0.30 - Number of NCBI/GTDB hits: 334
Others_c0.40 - Number of BGCs from query: 18
Others_c0.40 - Number of MIBIG hits: 0
Oth

In [11]:
# Input locations for the Myxococcota run.
bigscape_path = "../data/p__Myxococcota_all_antismash_6.0.1/network_files/2022-02-01_09-33-46_glocal_p__Myxococcota_all_antismash_6.0.1/"
mapping_file = '../tables/bigscape_mapping_myxo.csv'

# BGC classes and cutoffs to render; `groupby` names the column that is
# copied into each node's "group" attribute.
as_type = [
    'Others',
    'PKS-NRP_Hybrids',
    'RiPPs',
    'NRPS',
    'PKSI',
    'PKSother',
    'Terpene',
]
cutoffs = ["0.30", "0.40", "0.50"]
groupby = "Genus"

# Cutoff varies slowest, so the printed statistics are grouped per cutoff.
for cutoff, as_class in ((c, a) for c in cutoffs for a in as_type):
    visualize_bigscape(
        bigscape_path,
        as_class,
        mapping_file,
        outdir="../networks/p__Myxococcota",
        groupby=groupby,
        cutoff=cutoff,
    )

Others_c0.30 - Number of BGCs from query: 111
Others_c0.30 - Number of MIBIG hits: 0
Others_c0.30 - Number of NCBI/GTDB hits: 6
PKS-NRP_Hybrids_c0.30 - Number of BGCs from query: 35
PKS-NRP_Hybrids_c0.30 - Number of MIBIG hits: 0
PKS-NRP_Hybrids_c0.30 - Number of NCBI/GTDB hits: 0
RiPPs_c0.30 - Number of BGCs from query: 207
RiPPs_c0.30 - Number of MIBIG hits: 0
RiPPs_c0.30 - Number of NCBI/GTDB hits: 13
NRPS_c0.30 - Number of BGCs from query: 62
NRPS_c0.30 - Number of MIBIG hits: 0
NRPS_c0.30 - Number of NCBI/GTDB hits: 6
PKSI_c0.30 - Number of BGCs from query: 11
PKSI_c0.30 - Number of MIBIG hits: 0
PKSI_c0.30 - Number of NCBI/GTDB hits: 0
PKSother_c0.30 - Number of BGCs from query: 60
PKSother_c0.30 - Number of MIBIG hits: 0
PKSother_c0.30 - Number of NCBI/GTDB hits: 1
Terpene_c0.30 - Number of BGCs from query: 93
Terpene_c0.30 - Number of MIBIG hits: 0
Terpene_c0.30 - Number of NCBI/GTDB hits: 8
Others_c0.40 - Number of BGCs from query: 111
Others_c0.40 - Number of MIBIG hits: 0
Ot