In [8]:
import sys
from collections import defaultdict

class MCODE(object):
    """Detect densely connected complexes in an undirected graph (MCODE-style).

    The graph is read from a whitespace-delimited edge list in which the first
    and third fields of each line name the two endpoints of an edge (the
    ``source  interaction  target`` layout of SIF files).  Clustering has two
    stages: every vertex is weighted by the highest k-core of its closed
    neighbourhood times that core's density, then complexes are grown outwards
    from the heaviest unvisited vertices and trimmed to their 2-core.

    Parameters
    ----------
    filename : str
        Path of the edge-list file.  It is only opened by :meth:`cluster`.
    weight_threshold : float, optional
        Stored as ``1 - weight_threshold``.  While growing a complex, a vertex
        is accepted when its weight is strictly greater than
        ``seed_weight * (1 - weight_threshold)``.

    Attributes
    ----------
    clusters : list of set
        Complexes found by the most recent :meth:`cluster` call, in the order
        they were discovered.  Empty until :meth:`cluster` has run.
    """

    def __init__(self, filename, weight_threshold=0.2):
        # Keep the complement: the expansion step multiplies the seed weight
        # by this factor to obtain the acceptance cut-off.
        self.weight_threshold = 1 - weight_threshold
        self.filename = filename
        self.clusters = []

    def cluster(self):
        """Load the graph, weight its vertices and predict complexes.

        Progress messages and every predicted complex are printed to stdout.
        The complexes are stored in ``self.clusters``; nothing is returned.

        Raises
        ------
        IOError / OSError
            If ``self.filename`` cannot be opened.
        IndexError
            If a line has exactly two fields (an interaction without a target).
        """
        edges = self._read_edges()
        print('## Input graph loaded; %i nodes' % (len(edges),))

        # Stage 1: Vertex Weighting
        print('## Weighting vertices...')
        weights = self._weight_vertices(edges)

        # Stage 2: Molecular Complex Prediction
        print('## Molecular complex prediction...')
        self.clusters = self._predict_complexes(edges, weights)

    def save_clusters(self, filehandle):
        """Write one complex per line, members sorted and space-separated.

        Parameters
        ----------
        filehandle : str or writable file-like object
            A path (opened in ``'w'`` mode and closed afterwards) or an
            already open object exposing ``write``; the latter is left open
            for the caller to close.
        """
        if hasattr(filehandle, 'write'):
            self._write_clusters(filehandle)
        else:
            with open(filehandle, 'w') as fh:
                self._write_clusters(fh)

    def _write_clusters(self, fh):
        """Emit ``self.clusters`` to the open text stream ``fh``."""
        for c in self.clusters:
            # Sets iterate in hash order, which differs between interpreter
            # runs for strings; sorting keeps the file reproducible.
            fh.write(' '.join(sorted(c)) + "\n")

    def _read_edges(self):
        """Parse ``self.filename`` into a ``node -> set(neighbours)`` map."""
        edges = defaultdict(set)
        with open(self.filename, 'r') as f:
            for line_no, line in enumerate(f, 1):
                fields = line.split()
                if not fields:
                    # blank line (commonly the trailing newline of the file)
                    continue
                if len(fields) == 1:
                    # A lone identifier declares a node that has no edges.
                    edges.setdefault(fields[0], set())
                    continue
                if len(fields) == 2:
                    # Same exception type the unguarded index used to raise,
                    # but pointing at the offending line.
                    raise IndexError(
                        '%s:%i: expected "source interaction target", got %r'
                        % (self.filename, line_no, line.rstrip('\n')))
                # Only the first target is read; further fields are ignored.
                a, b = fields[0], fields[2]
                edges[a].add(b)
                edges[b].add(a)
        return edges

    @staticmethod
    def _weight_vertices(edges):
        """Return ``node -> weight`` (highest k-core number * core density)."""
        weights = dict((v, 1.) for v in edges)
        for v in edges:
            neighborhood = set((v,)) | edges[v]
            # if node has only one neighbor, we know everything we need to know
            if len(neighborhood) <= 2:
                continue

            # see if larger k-cores exist
            k = 1  # highest valid k-core
            while neighborhood:
                # Remember the current core before trying to shrink it to the
                # next one; the last snapshot is the highest non-empty core.
                k_core = neighborhood.copy()
                invalid_nodes = True
                while invalid_nodes and neighborhood:
                    invalid_nodes = set(
                        n for n in neighborhood
                        if len(edges[n] & neighborhood) <= k)
                    neighborhood -= invalid_nodes
                k += 1  # on exit, k will be one greater than we want

            # vertex weight = k-core number * density of k-core, where the
            # density is (sum of in-core degrees) / (2 * |core|^2)
            in_core_degree = sum(len(edges[n] & k_core) for n in k_core)
            weights[v] = (k - 1) * (in_core_degree / (2. * len(k_core) ** 2))
        return weights

    def _predict_complexes(self, edges, weights):
        """Grow complexes from seeds in descending weight order.

        Returns the list of non-empty complexes (sets of node names) that
        survive the 2-core haircut, printing each one as it is found.
        """
        clusters = []
        unvisited = set(edges)
        num_clusters = 0

        for seed in sorted(weights, key=weights.get, reverse=True):
            if seed not in unvisited:
                continue

            cluster, frontier = set((seed,)), set((seed,))
            w = weights[seed] * self.weight_threshold
            while frontier:
                cluster.update(frontier)
                unvisited -= frontier
                # Next layer: unvisited neighbours of the current layer whose
                # weight clears the cut-off derived from the seed.
                reachable = set.union(*(edges[n] for n in frontier))
                frontier = set(
                    n for n in reachable & unvisited if weights[n] > w)

            # Haircut: only keep 2-core complexes
            invalid_nodes = True
            while invalid_nodes and cluster:
                invalid_nodes = set(
                    n for n in cluster if len(edges[n] & cluster) < 2)
                cluster -= invalid_nodes

            if cluster:
                # Sorted so the printed line is identical on every run.
                print(' '.join(sorted(cluster)))
                num_clusters += 1
                print(num_clusters, len(cluster), seed)
                clusters.append(cluster)

        return clusters
                


In [9]:
# Configure the clusterer for the SIF network with the default weight_threshold
# of 0.2; the file itself is not opened until cluster() is called.
mcode = MCODE("files/original_network.sif")

In [10]:
# Read the edge list, weight every vertex, then grow and trim complexes.
# Each predicted complex is printed and the full list is kept in mcode.clusters.
mcode.cluster()

## Input graph loaded; 6008 nodes
## Weighting vertices...
## Molecular complex prediction...
RL21_HUMAN RS11_HUMAN RS3A_HUMAN RS14_HUMAN HEXI1_HUMAN RL27A_HUMAN RS19_HUMAN KIF14_HUMAN RLA2_HUMAN MEPCE_HUMAN RL15_HUMAN FZR1_HUMAN PIHD1_HUMAN RL38_HUMAN TRI25_HUMAN RS27_HUMAN NR2C2_HUMAN RL5_HUMAN RL12_HUMAN RS21_HUMAN PAN2_HUMAN NOP56_HUMAN RL3_HUMAN RL11_HUMAN FACD2_HUMAN RS12_HUMAN RL18_HUMAN RS7_HUMAN RL23_HUMAN RL30_HUMAN KIF23_HUMAN RL10_HUMAN RS15A_HUMAN TIP_HUMAN RS5_HUMAN CUL1_HUMAN RL27_HUMAN CTRO_HUMAN RL32_HUMAN ECT2_HUMAN RS20_HUMAN RS15_HUMAN PRC1_HUMAN RL13_HUMAN RS16_HUMAN RL19_HUMAN RS28_HUMAN RL14_HUMAN UBL4A_HUMAN RL4_HUMAN AAR2_HUMAN RL18A_HUMAN RS24_HUMAN RLA0_HUMAN RS3_HUMAN RL8_HUMAN RL7A_HUMAN RSSA_HUMAN RL35A_HUMAN RL17_HUMAN RL13A_HUMAN UFL1_HUMAN CUL3_HUMAN RS6_HUMAN RS26_HUMAN RS23_HUMAN BTF3_HUMAN RS25_HUMAN RL10A_HUMAN RL31_HUMAN BIRC3_HUMAN FBXW7_HUMAN RL7_HUMAN RS18_HUMAN CSN5_HUMAN RS2_HUMAN RL37A_HUMAN RL23A_HUMAN RC3H2_HUMAN RECQ4_HUMAN RC3H1_HUMAN RLA