In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os
from collections import defaultdict
import ndex2

In [2]:
# Root directory for all input/reference files used below.
# Set the NETCOLOC_DATADIR environment variable to run this notebook on another
# machine; the original location is kept as the default.
datadir = os.environ.get('NETCOLOC_DATADIR', '/cellar/users/snwright/Data/NetColocTest/')

In [3]:
def load_node_sets(node_set_file, delimiter='\t', verbose=False, id_type="Entrez"):
    """ Load node sets from a text file into a dictionary

    Each non-blank line is split on ``delimiter``: the first field is the set
    identifier and the remaining fields are the members of that set. If an
    identifier occurs on several lines, the last occurrence wins.

    Args:
        node_set_file (str): path to node set file
        delimiter (str): delimiter for node set file
        verbose (bool): print the path of the node set file after loading
        id_type (str): type of node ID to use for graph. With "Entrez",
            members are converted to int and members that are not plain
            decimal numbers are dropped; with any other value members are
            kept as strings.

    Returns:
        dict: dictionary mapping set identifier (str) to a set of members
    """
    # `with` guarantees the handle is closed even if reading fails.
    with open(node_set_file) as f:
        # Blank lines carry no set identifier, so they are skipped rather than
        # being turned into a spurious '' entry.
        rows = [line.split(delimiter) for line in f.read().splitlines() if line.strip()]
    node_sets = {row[0]: set(row[1:]) for row in rows}
    if id_type == "Entrez":
        # isdecimal() only accepts characters that int() can parse, whereas
        # isnumeric() also accepts e.g. superscripts and fractions, which
        # would make int() raise ValueError.
        node_sets = {
            set_id: {int(node) for node in members if node.isdecimal()}
            for set_id, members in node_sets.items()
        }
    if verbose:
        print('Node cohorts loaded:', node_set_file)
    return node_sets

In [8]:
# Load the gene sets from <datadir>/Reference/go.test. With the default
# id_type ("Entrez") the members of each set are converted to ints.
go_genesets = load_node_sets(
    os.path.join(datadir, 'Reference', 'go.test'),
)

In [16]:
# Small hand-built undirected graph used to try out the shortest-path call below.
# Edges are listed in the same order as before so node insertion order
# (1, 2, 3, 50, 4) is unchanged.
G = nx.Graph()
G.add_edges_from([
    (1, 2),
    (1, 3),
    (1, 50),
    (2, 3),
    (3, 4),
    (4, 50),
])

In [17]:
# Table of shortest-path lengths between every pair of nodes in G
# (one row and one column per node).
x = pd.DataFrame.from_dict(
    dict(nx.all_pairs_shortest_path_length(G))
)

In [18]:
# Display the pairwise shortest-path-length table built from the toy graph G.
x

Unnamed: 0,1,2,3,50,4
1,0,1,1,1,2
2,1,0,1,2,2
3,1,1,0,2,1
50,1,2,2,0,1
4,2,2,1,1,0


In [None]:
# Fetch a network from NDEx by UUID and convert it to a networkx graph.
# Username and password are left as None.
ndex_server = 'public.ndexbio.org'
ndex_user = None
ndex_password = None

G_overlap_cx = ndex2.create_nice_cx_from_server(
    ndex_server,
    uuid='d73d6357-e87b-11ee-9621-005056ae23aa',
    username=ndex_user,
    password=ndex_password,
)
G_overlap = G_overlap_cx.to_networkx()

# Report the size of the downloaded network.
print('number of nodes:')
print(G_overlap.number_of_nodes())
print('\nnumber of edges:')
print(G_overlap.number_of_edges())

In [None]:
# One row per node of G_overlap; the 'node' column holds the networkx node key.
node_map = pd.DataFrame(
    {'node': G_overlap.nodes},
)

In [None]:
# Look up each node's 'GeneID' attribute in G_overlap and store it as 'Entrez'.
node_map['Entrez'] = node_map['node'].apply(
    lambda node_key: G_overlap.nodes[node_key]['GeneID']
)

## Average degree

In [16]:
# Read the tab-separated, header-less degree file: the first column becomes
# the index and the value column is named 'Degree'.
# NOTE(review): presumably the index holds the same gene IDs as go_genesets - confirm.
degrees = pd.read_csv(
    os.path.join(datadir, 'inputs', 'GO', 'pcnet2_0_degrees.txt'),
    sep='\t',
    names=['Degree'],
    header=None,
    index_col=0,
)

In [25]:
# Mean and median degree per gene set, using only the genes that appear in
# the index of `degrees`.
avg_degree = defaultdict(float)
med_degree = defaultdict(float)
# Pull the column out once instead of building a row object for every gene.
degree_by_gene = degrees['Degree']
for gs, genes in go_genesets.items():
    gs_degree = [degree_by_gene.loc[gene] for gene in genes if gene in degree_by_gene.index]
    if not gs_degree:
        # No gene of this set has a degree entry: record NaN explicitly rather
        # than letting numpy emit "Mean of empty slice" RuntimeWarnings.
        avg_degree[gs] = np.nan
        med_degree[gs] = np.nan
        continue
    avg_degree[gs] = np.mean(gs_degree)
    med_degree[gs] = np.median(gs_degree)

In [26]:
# Display the median degree computed for each gene set.
med_degree

defaultdict(float,
            {'GO:0000082': 636.0,
             'GO:0097529': 318.0,
             'GO:0090068': 627.0,
             'GO:0001935': 367.0,
             'GO:0098742': 132.0,
             'GO:0001704': 402.5,
             'GO:0045216': 277.0,
             'GO:0042445': 226.0,
             'GO:2001234': 620.5,
             'GO:0002699': 366.0})

## Clustering/Average Shortest Path

## Assortative Clustering

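**TODO:** This analysis will probably be more relevant with real gene sets than with the test gene sets loaded above — confirm before implementing.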