In [2]:
import sys
from collections import defaultdict

class MCODE():
    def __init__(self, filename, weight_threshold=0.2):
        self.weight_threshold = 1 - weight_threshold
        self.filename = filename
        self.clusters = []

    def cluster(self):
        edges = defaultdict(set)

        # Read edgelist
        with open(self.filename, 'r') as f:
            for line in f:
                pair = line.split()
                a = pair[0]
                b = pair[2]
                edges[a].add(b)
                edges[b].add(a)
        print ('## Input graph loaded; %i nodes' % (len(edges),))

        # Clusters list
        clusters = []

        # Stage 1: Vertex Weighting
        print ('## Weighting vertices...')
        weights = dict((v, 1.) for v in edges)
        for i, v in enumerate(edges):
            neighborhood = set((v,)) | edges[v]
            # if node has only one neighbor, we know everything we need to know
            if len(neighborhood) <= 2:
                continue

            # see if larger k-cores exist
            k = 1  # highest valid k-core
            while neighborhood:
                k_core = neighborhood.copy()
                invalid_nodes = True
                while invalid_nodes and neighborhood:
                    invalid_nodes = set(n for n in neighborhood if len(
                        edges[n] & neighborhood) <= k)
                    neighborhood -= invalid_nodes
                k += 1  # on exit, k will be one greater than we want

            # vertex weight = k-core number * density of k-core
            weights[v] = (k - 1) * (sum(len(edges[n] & k_core)
                                        for n in k_core) / (2. * len(k_core)**2))

        # Stage 2: Molecular Complex Prediction
        print('## Molecular complex prediction...')
        unvisited = set(edges)
        num_clusters = 0

        for seed in sorted(weights, key=weights.get, reverse=True):
            if seed not in unvisited:
                continue

            cluster, frontier = set((seed,)), set((seed,))
            w = weights[seed] * self.weight_threshold
            while frontier:
                cluster.update(frontier)
                unvisited -= frontier
                frontier = set(n for n in set.union(
                    *(edges[n] for n in frontier)) & unvisited if weights[n] > w)

            # Haircut: only keep 2-core complexes
            invalid_nodes = True
            while invalid_nodes and cluster:
                invalid_nodes = set(
                    n for n in cluster if len(edges[n] & cluster) < 2)
                cluster -= invalid_nodes

            if cluster:
                print (' '.join(cluster))
                num_clusters += 1
                print (num_clusters, len(cluster), seed)
                clusters.append(cluster)

        self.clusters = clusters

    def save_clusters(self, filehandle):
        with open(filehandle, 'w') as fh:
            for c in self.clusters:
                fh.write(' '.join(c) + "\n")
                


In [3]:
mcode = MCODE("files/original_network.sif")

In [4]:
mcode.cluster()

## Input graph loaded; 6008 nodes
## Weighting vertices...


KeyboardInterrupt: 

In [5]:
network = set()
with open("files/original_network.sif", 'r') as file:
        for line in file:
            nodes = line.strip().split("\t-\t")
            if len(nodes) == 2:
                network.add(tuple(nodes))

In [12]:
def save_to_sif(path, cluster):
    cluster = set(cluster.split(' '))
    lines = []
    
    for edge in network:
        if edge[0] in cluster and edge[1] in cluster:
            lines.append(edge[0] + "\t-\t" + edge[1])
    
    with open(path, 'w') as file:
        for i, line in enumerate(lines):
            if i < len(lines) - 1:
                file.write(f"{line}\n")
            else:
                file.write(line)

In [69]:
genes = ["AIP","LEP","CMGA","CDN2A","CAV1","CTNB1",
                "MDR1","CASR","ALBU","G6PD","FLNA","ESR1","KLOT",
                "ACTB","MEN1","P53","VGFR2","EGFR","CDN2B",
                "EDNRB","VDR","LEG3","LRP5","SYT1","AMPE","MK01",
                "CCND1","TNF10","PTHY","KAP0","S12A3","POTEF"] 
for i in range(32):
    genes[i] += "_HUMAN"

In [77]:
clusters = []
cluster_contain_gene = {}
i = 0 
with open("files/modules_from_original_network.txt","r") as file:
    for line in file:
        i += 1
        if i == 16:
            break 
        
        file_name = "module_" + str(i) + ".sif" 
        cluster_contain_gene[file_name] = False
        
        for gene in genes:
            if line.find(gene) != -1:
                cluster_contain_gene[file_name] = True
                break
        
        save_to_sif(""files/mcode_results/sif_files_for_clusters/" + file_name, line)

In [81]:
import os

for file in os.listdir("files/mcode_results/sif_files_for_clusters/"):
        file_path = os.path.join("files/mcode_results/sif_files_for_clusters/", file)
        
        if cluster_contain_gene[file]:
            mcode = MCODE(file_path)
            mcode.cluster()
            mcode.save_clusters("files/mcode_results/auxilary_files/" + file)

## Input graph loaded; 329 nodes
## Weighting vertices...
## Molecular complex prediction...
TYY1_HUMAN VDAC3_HUMAN ATPO_HUMAN KAT8_HUMAN CPSF7_HUMAN ATN1_HUMAN MCRS1_HUMAN MFAP1_HUMAN CCNT1_HUMAN PHF20_HUMAN KANL1_HUMAN SRC8_HUMAN SET1B_HUMAN ATX2L_HUMAN ACTB_HUMAN KANL3_HUMAN CPSF6_HUMAN A4_HUMAN NUFP2_HUMAN TERT_HUMAN ZN608_HUMAN P20L1_HUMAN DDX42_HUMAN RERE_HUMAN ZZZ3_HUMAN KANL2_HUMAN RBM25_HUMAN
1 27 ZZZ3_HUMAN
MMTA2_HUMAN RASF1_HUMAN CDK6_HUMAN STK11_HUMAN TEAD2_HUMAN BCL7C_HUMAN FGFR4_HUMAN IF5A1_HUMAN FOS_HUMAN CD44_HUMAN BECN1_HUMAN HNF4A_HUMAN VDAC2_HUMAN CDN2B_HUMAN CCND2_HUMAN AMRP_HUMAN CDN2C_HUMAN GLIS2_HUMAN ESR2_HUMAN BCL7A_HUMAN SP100_HUMAN MP2K5_HUMAN ARID2_HUMAN SMRD3_HUMAN LATS2_HUMAN ATPG_HUMAN SMRC1_HUMAN ETS1_HUMAN ML12B_HUMAN CREST_HUMAN SMRC2_HUMAN PGFRA_HUMAN BRD9_HUMAN VDAC1_HUMAN
2 34 IF5A1_HUMAN
MBTD1_HUMAN HS71B_HUMAN ING3_HUMAN SOAT1_HUMAN CLCB_HUMAN HS71A_HUMAN IMA1_HUMAN NDUA7_HUMAN EPC1_HUMAN SDHA_HUMAN TFR1_HUMAN NDUV2_HUMAN VAPB_HUMAN DNJC7_HUMAN RB

In [89]:
cluster_contain_gene

{'module_1.sif': False,
 'module_2.sif': True,
 'module_3.sif': False,
 'module_4.sif': True,
 'module_5.sif': False,
 'module_6.sif': False,
 'module_7.sif': True,
 'module_8.sif': True,
 'module_9.sif': True,
 'module_10.sif': True,
 'module_11.sif': True,
 'module_12.sif': True,
 'module_13.sif': True,
 'module_14.sif': False,
 'module_15.sif': True}

In [90]:
for f in os.listdir("files/mcode_results/auxilary_files/"):
    file_path = os.path.join("files/mcode_results/auxilary_files/", f)
    
    with open(file_path,"r") as file:
        for line in file:

            file_name = f[:-4] + "." + str(i) + ".sif"
            cluster_contain_gene[file_name] = False

            for gene in genes:
                if line.find(gene) != -1:
                    cluster_contain_gene[file_name] = True
                    i += 1
                    break
                    
            
            if cluster_contain_gene[file_name]:
                save_to_sif("files/mcode_results/sif_files_for_clusters/" + file_name, line)