# Network Analysis

This is a set of helper functions that are needed every time we analyze a graph. Keeping them in one place makes them easy to reuse and keeps the analysis code clean.

In [2]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [3]:
def compute_network_stats(G, network_name="G", seed=None):
    """Summarise the global structure of a graph as a one-row DataFrame.

    Parameters
    ----------
    G : networkx graph, str or os.PathLike
        Graph to analyse. A string or path-like value is treated as the
        location of a GML file and loaded with ``nx.read_gml``.
        NOTE: ``nx.connected_components`` is used below, which networkx only
        implements for undirected graphs.
    network_name : str, default "G"
        Label stored in the "Name" index of the returned row.
    seed : int, random.Random or None, default None
        Forwarded to ``louvain_communities``. Louvain is a randomised
        algorithm, so "|communities|" and "Q" may differ between runs unless
        a seed is supplied.

    Returns
    -------
    pandas.DataFrame
        One row, indexed by ``network_name``, with the columns
        "|V|" (node count), "|E|" (edge count), "density",
        "k" (mean degree), "k weighted" (mean degree using the "weight"
        edge attribute), "|components|" (number of connected components),
        "cc" (average clustering), "s_path" (average shortest path length of
        the largest connected component), "d" (diameter of the largest
        connected component), "|communities|" (number of Louvain communities)
        and "Q" (modularity of that Louvain partition).

    Raises
    ------
    ValueError
        If the graph has no nodes.
    """
    if isinstance(G, (str, os.PathLike)):
        # A path was given instead of a graph object: load the GML file.
        G = nx.read_gml(G)

    if G.number_of_nodes() == 0:
        raise ValueError("cannot compute network statistics for a graph with no nodes")

    # Materialise the components once; they are needed both for the count and
    # for selecting the largest one.
    components = list(nx.connected_components(G))

    # Path-based statistics are only defined on a connected graph, so they are
    # computed on the largest connected component.
    largest_cc_subgraph = G.subgraph(max(components, key=len))

    communities = nx.community.louvain_communities(G, seed=seed)

    stats = {
        "Name": network_name,
        "|V|": G.number_of_nodes(),
        "|E|": G.number_of_edges(),
        "density": nx.density(G),
        "k": np.mean([degree for _, degree in G.degree()]),
        "k weighted": np.mean([degree for _, degree in G.degree(weight="weight")]),
        "|components|": len(components),
        "cc": nx.average_clustering(G),
        "s_path": nx.average_shortest_path_length(largest_cc_subgraph),
        "d": nx.diameter(largest_cc_subgraph),
        "|communities|": len(communities),
        "Q": nx.community.modularity(G, communities),
    }

    stats_df = pd.DataFrame(stats, index=[0]).set_index("Name")

    return stats_df

def plot_log_log(G):
    """Plot the degree distribution of ``G`` on log-log axes with a fitted line.

    For every distinct non-zero degree k, the fraction p(k) of nodes having
    that degree is computed. log(p(k)) is scattered against log(k), a
    least-squares straight line is drawn in red, and the coefficient of
    determination of that fit is shown in the title.

    Parameters
    ----------
    G : graph
        Object exposing ``nodes()`` and ``degree(node)`` (e.g. a networkx graph).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.

    Raises
    ------
    ValueError
        If the graph has fewer than two distinct non-zero degrees, in which
        case a straight-line fit is not meaningful.
    """
    degrees = np.array([G.degree(n) for n in G.nodes()])
    k_values, counts = np.unique(degrees, return_counts=True)

    # log(0) is -inf and would poison the regression, so isolated nodes
    # (k == 0) are left out of the fit; they still count towards the total
    # number of nodes used to normalise p(k).
    has_edges = k_values > 0
    k_values, counts = k_values[has_edges], counts[has_edges]

    if k_values.size < 2:
        raise ValueError(
            "need at least two distinct non-zero degrees to fit a log-log regression line"
        )

    log_k = np.log(k_values)
    # Normalise counts to fractions so the y axis really is log(p(k)).
    log_pk = np.log(counts / degrees.size)

    ### regression line
    slope, intercept = np.polyfit(log_k, log_pk, 1)
    fitted = slope * log_k + intercept

    # R^2 = 1 - SS_res / SS_tot, with SS_tot taken around the mean of y.
    ss_res = np.sum((log_pk - fitted) ** 2)
    ss_tot = np.sum((log_pk - log_pk.mean()) ** 2)
    # With a constant y the ratio is undefined; report NaN instead of dividing by zero.
    r_squared = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    plt.title(f"log-log degree distribution         r squared: {round(r_squared,5)}")
    plt.xlabel("log(k)")
    plt.ylabel("log(p(k))")

    plt.scatter(log_k, log_pk)
    plt.plot(log_k, fitted, color="red")
    plt.show()

In [38]:
def set_node_attributes(G:nx.Graph, clstr_attribute_df):
    '''
    Set one node attribute on a networkx graph from a dataframe.

    The first column of the dataframe supplies the node identifiers and the
    second column supplies the values. The attribute is stored under the
    name of that second column. Any further columns are ignored.

    The dataframe passed in is left untouched, so the same frame can be
    reused for repeated calls.

    param:
    ------
    - G: nx.Graph, networkx graph (modified in place)
    - clstr_attribute_df: pd.DataFrame, first column = node identifier,
      second column = attribute value

    return:
    -------
    - G: nx.Graph, the same graph object with the attribute set

    raises:
    -------
    - IndexError: if the dataframe has fewer than two columns
    '''
    node_column = clstr_attribute_df.columns[0]
    attribute = clstr_attribute_df.columns[1]

    # set_index returns a new frame here, so the caller's dataframe keeps its
    # original columns and index.
    values_by_node = clstr_attribute_df.set_index(node_column)[attribute].to_dict()

    nx.set_node_attributes(G, values_by_node, name=attribute)
    return G

In [3]:
# Load the graph, attach four node attributes read from CSV files, and write
# the annotated graph back to the path it was loaded from (the file is overwritten).
GRAPH_PATH = '../data/graphs-4/signed_corr0.6_200SVM_Campylobacter_coli_ciprofloxacin.graphml'

# attribute name -> CSV file; in each file the first column is used as the
# node identifier and the second column as the value stored on the node.
NODE_ATTRIBUTE_SOURCES = {
    "gene_class": "../data/cluster_descriptions/cluster_pan_gene_class.csv",
    "log_odds": '../data/log_odds_nodes/Campylobacter_coli_ciprofloxacin_log_odds.csv',
    "product_name": '../data/cluster_descriptions/cluster_product.csv',
    "patric_id": '../data/cluster_descriptions/cluster_representatives.csv',
}

G = nx.read_graphml(GRAPH_PATH)

for attribute_name, csv_path in NODE_ATTRIBUTE_SOURCES.items():
    attribute_table = pd.read_csv(csv_path)
    attribute_table = attribute_table.set_index(attribute_table.columns[0])
    # After indexing by the identifier column, column 0 is the value column.
    values_by_node = attribute_table[attribute_table.columns[0]].to_dict()
    nx.set_node_attributes(G, values_by_node, name=attribute_name)

nx.write_graphml(G, GRAPH_PATH)

In [17]:
# Pangenome annotation: store the CSV's second column on the matching nodes as "gene_class".
# NOTE(review): the combined cell earlier in this notebook already sets "gene_class" from this same CSV.

df = pd.read_csv("../data/cluster_descriptions/cluster_pan_gene_class.csv")
df = df.set_index(df.columns[0])
value_column = df.columns[0]
my_dict = df[value_column].to_dict()
nx.set_node_attributes(G, my_dict, name="gene_class")

In [18]:
# Log odds: store the CSV's second column on the matching nodes as "log_odds".
# NOTE(review): the combined cell earlier in this notebook already sets "log_odds" from this same CSV.

df = pd.read_csv('../data/log_odds_nodes/Campylobacter_coli_ciprofloxacin_log_odds.csv')
df = df.set_index(df.columns[0])
value_column = df.columns[0]
my_dict = df[value_column].to_dict()
nx.set_node_attributes(G, my_dict, name="log_odds")

In [19]:
# Product name: store the CSV's second column on the matching nodes as "product_name".
# NOTE(review): the combined cell earlier in this notebook already sets "product_name" from this same CSV.

df = pd.read_csv('../data/cluster_descriptions/cluster_product.csv')
df = df.set_index(df.columns[0])
value_column = df.columns[0]
my_dict = df[value_column].to_dict()
nx.set_node_attributes(G, my_dict, name="product_name")

In [20]:
# PATRIC id: store the CSV's second column on the matching nodes as "patric_id".
# NOTE(review): the combined cell earlier in this notebook already sets "patric_id" from this same CSV.

df = pd.read_csv('../data/cluster_descriptions/cluster_representatives.csv')
df = df.set_index(df.columns[0])
value_column = df.columns[0]
my_dict = df[value_column].to_dict()
nx.set_node_attributes(G, my_dict, name="patric_id")

In [21]:
# Display every node together with its attribute dictionary to check the values set above.
G.nodes(data=True)

NodeDataView({'Cluster 0': {'patric_id': '195.2029.peg.1780', 'gene_class': 'accessory', 'log_odds': 1.2703651831434948, 'product_name': 'helicase, Snf2 family'}, 'Cluster 80': {'patric_id': '195.2029.peg.1806', 'gene_class': 'accessory', 'log_odds': 1.141298669899524, 'product_name': 'Inner membrane protein forms channel for type IV secretion of T-DNA complex, VirB3 / ATPase required for both assembly of type IV secretion complex and secretion of T-DNA complex, VirB4'}, 'Cluster 242': {'patric_id': '195.2287.peg.1584', 'gene_class': 'unique', 'log_odds': -0.3112442530302943, 'product_name': 'Inner membrane protein forms channel for type IV secretion of T-DNA complex, VirB3 / ATPase required for both assembly of type IV secretion complex and secretion of T-DNA complex, VirB4'}, 'Cluster 289': {'patric_id': '195.2085.peg.77', 'gene_class': 'unique', 'log_odds': 1.4074965687704937, 'product_name': 'CMP-N-acetylneuraminate-beta-galactosamide-alpha-2,3-sialyltransferase (EC 2.4.99.-)'}, 'C

In [22]:
# Write the annotated graph as GraphML to the same path it was read from (this overwrites that file).

nx.write_graphml(G, '../data/graphs-4/signed_corr0.6_200SVM_Campylobacter_coli_ciprofloxacin.graphml')