In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..\..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [3]:
from funcs import *

In [4]:
disease = 'C0003873'

## Read Data + Save

In [5]:
human_data(file='data/BIOGRID-ORGANISM-Homo_sapiens-4.4.204.tab3.txt',
           save_file='data/processed/homo_preprocess.tsv', correct=False)

Nodes:  19618
Edges:  665061


In [6]:
disease_data(file='data/curated_gene_disease_associations.tsv',
             disease_id=disease)

In [7]:
generate_PPI(in_file='data/processed/homo_preprocess.tsv', out_file='data/processed/PPI.txt')

## Data Overview

In [8]:
data_overview(human_file='data/processed/homo_preprocess.tsv',
              disease_file=f'data/processed/disease{disease}.tsv')

Number of genes associated with the disease: 174
Classes of the disease: ['C17;C05;C20']
Number of genes present in the interactome: 173
Largest connected component: 83
Number of genes in the interactome: 19618
Missing gene: {6556}


In [28]:
split_files_diffusion_heat(seeds='data/seed.txt', disease=disease, k=10)

## Make sure the interactome follows the ground truth:
Nodes = 19618

Edges = 665061

In [54]:
interactome_df = pd.read_csv('data/homo_preprocess.tsv', sep='\t')
interactome_g = nx.from_pandas_edgelist(interactome_df, source='A',
                                        target='B')

In [55]:
interactome_g.number_of_nodes()

19620

In [56]:
interactome_g.number_of_edges()

665061

## Algorithms

In [None]:
MCL_hyper('data/homo_preprocess.tsv', start=18, end=23)

In [9]:
DIAMOND(network_file='data/processed/PPI.txt', seed_file=f'data/processed/seeds_{disease}.txt', n=200, alpha=1, out_file=f'data/results/diamond_{disease}.txt')

DIAMOnD(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/diamond_C0003873.txt' 



In [10]:
DIABLE(network_file='data/processed/PPI.txt', seed_file=f'data/processed/seeds_{disease}.txt', n=200, alpha=1, out_file=f'data/results/diable_{disease}.txt')

DiaBLE(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/diable_C0003873.txt' 



In [11]:
RANDOM_WALK_WITH_RESTART(network_file='data/processed/PPI.txt', seed_file=f'data/processed/seeds_{disease}.txt', r=0.7, score_thr=0, tol=1e-6, out_file=f'data/results/r_walk_{disease}.txt')

RANDOM_WALK_WITH_RESTART(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/r_walk_C0003873.txt' 



## 5-fold cross validation

In [13]:
from imported_code.diamond import DIAMOnD

In [16]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_diamond_{disease}.json',
    'extended_disease_file':None,
    'disease':disease,
    'func_args':{
        'max_number_of_added_nodes':200,
        'alpha':1,
        'outfile':'./data/results/kfold_tmp.txt'
    }
    
}

metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=False, **kwargs)

In [17]:
metrics

defaultdict(list,
            {'precision_at_50': (0.008, 0.0098),
             'recall_at_50': (0.01143, 0.014),
             'ndcg_at_k_50': (0.00999, 0.01259),
             'F1-score_at_50': (0.00941, 0.01153),
             'precision_at_17': (0.01176, 0.02353),
             'recall_at_17': (0.00571, 0.01143),
             'ndcg_at_k_17': (0.00949, 0.01898),
             'F1-score_at_17': (0.00769, 0.01538),
             'precision_at_43': (0.0093, 0.01139),
             'recall_at_43': (0.01143, 0.014),
             'ndcg_at_k_43': (0.00999, 0.01259),
             'F1-score_at_43': (0.01026, 0.01256),
             'precision_at_86': (0.01163, 0.0),
             'recall_at_86': (0.02891, 0.00041),
             'ndcg_at_k_86': (0.01988, 0.00515),
             'F1-score_at_86': (0.01658, 7e-05),
             'precision_at_173': (0.00809, 0.00283),
             'recall_at_173': (0.0405, 0.01428),
             'ndcg_at_k_173': (0.0254, 0.00533),
             'F1-score_at_173': (0.01349,

In [18]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_diable_{disease}.json',
    'extended_disease_file':None,
    'disease':disease,
    'func_args':{
        'max_number_of_added_nodes':200,
        'alpha':1,
        'outfile':'./data/results/kfold_tmp.txt'
    }
    
}

metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=False, **kwargs)

In [19]:
metrics

defaultdict(list,
            {'precision_at_50': (0.012, 0.016),
             'recall_at_50': (0.01714, 0.02286),
             'ndcg_at_k_50': (0.01757, 0.02223),
             'F1-score_at_50': (0.01412, 0.01882),
             'precision_at_17': (0.02353, 0.02882),
             'recall_at_17': (0.01143, 0.014),
             'ndcg_at_k_17': (0.02173, 0.02664),
             'F1-score_at_17': (0.01538, 0.01884),
             'precision_at_43': (0.01395, 0.0186),
             'recall_at_43': (0.01714, 0.02286),
             'ndcg_at_k_43': (0.01757, 0.02223),
             'F1-score_at_43': (0.01538, 0.02051),
             'precision_at_86': (0.0093, 0.01139),
             'recall_at_86': (0.02286, 0.02799),
             'ndcg_at_k_86': (0.02084, 0.02552),
             'F1-score_at_86': (0.01322, 0.01619),
             'precision_at_173': (0.00925, 0.00784),
             'recall_at_173': (0.04605, 0.03886),
             'ndcg_at_k_173': (0.03246, 0.02966),
             'F1-score_at_173': (

In [26]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_r_walk_{disease}.json',
    'extended_disease_file':None,
    'disease':disease,
    'func_args':{
        'r':0.7, 
        'score_thr':0, 
        'tol':1e-6, 
        'sorted_nodes_only':True
    }
    
}

metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=False, **kwargs)

In [27]:
metrics

defaultdict(list,
            {'precision_at_50': (0.0, 0.0),
             'recall_at_50': (0.0, 0.0),
             'ndcg_at_k_50': (0.0, 0.0),
             'F1-score_at_50': (0.0, 0.0),
             'precision_at_17': (0.0, 0.0),
             'recall_at_17': (0.0, 0.0),
             'ndcg_at_k_17': (0.0, 0.0),
             'F1-score_at_17': (0.0, 0.0),
             'precision_at_43': (0.0, 0.0),
             'recall_at_43': (0.0, 0.0),
             'ndcg_at_k_43': (0.0, 0.0),
             'F1-score_at_43': (0.0, 0.0),
             'precision_at_86': (0.0, 0.0),
             'recall_at_86': (0.0, 0.0),
             'ndcg_at_k_86': (0.0, 0.0),
             'F1-score_at_86': (0.0, 0.0),
             'precision_at_173': (0.0, 0.0),
             'recall_at_173': (0.0, 0.0),
             'ndcg_at_k_173': (0.0, 0.0),
             'F1-score_at_173': (0.0, 0.0)})

In [43]:
MCL_metric = k_fold_MCL(human_file='data/homo_preprocess.tsv',
                        seed_file='data/seed.txt', metric_func=compute_metrics)

Number of clusters: 2


  return metrics


Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2


In [None]:
res = diffusion_heat(path='diffusion_heat/C0003873_old/')
diffusion_heat_metric = k_fold_diffusion_heat(res, metric_func=compute_metrics, 
                                              n=174, disease='C0003873_old')

## Extended validation

In [28]:
from imported_code.diamond import DIAMOnD

In [32]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_ext_diamond_{disease}.json',
    'extended_disease_file':'data/all_gene_disease_associations.tsv',
    'disease':disease,
    'func_args':{
        'max_number_of_added_nodes':200,
        'alpha':1,
        'outfile':'./data/results/kfold_tmp.txt'
    }
    
}

metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=True, **kwargs)

In [33]:
metrics

defaultdict(list,
            {'precision_at_50': (0.008, 0.0098),
             'recall_at_50': (0.01143, 0.014),
             'ndcg_at_k_50': (0.01252, 0.01552),
             'F1-score_at_50': (0.00941, 0.01153),
             'precision_at_17': (0.02353, 0.02882),
             'recall_at_17': (0.01143, 0.014),
             'ndcg_at_k_17': (0.02002, 0.02482),
             'F1-score_at_17': (0.01538, 0.01884),
             'precision_at_43': (0.0093, 0.01139),
             'recall_at_43': (0.01143, 0.014),
             'ndcg_at_k_43': (0.01252, 0.01552),
             'F1-score_at_43': (0.01026, 0.01256),
             'precision_at_86': (0.0093, 0.01356),
             'recall_at_86': (0.02286, 0.03332),
             'ndcg_at_k_86': (0.01877, 0.02417),
             'F1-score_at_86': (0.01322, 0.01928),
             'precision_at_173': (0.00925, 0.00865),
             'recall_at_173': (0.04571, 0.04276),
             'ndcg_at_k_173': (0.03009, 0.02809),
             'F1-score_at_173': (0.0

In [34]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_ext_diable_{disease}.json',
    'extended_disease_file':'data/all_gene_disease_associations.tsv',
    'disease':disease,
    'func_args':{
        'max_number_of_added_nodes':200,
        'alpha':1,
        'outfile':'./data/results/kfold_tmp.txt'
    }
    
}

metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=True, **kwargs)

In [35]:
metrics

defaultdict(list,
            {'precision_at_50': (0.004, 0.008),
             'recall_at_50': (0.00588, 0.01176),
             'ndcg_at_k_50': (0.00543, 0.01086),
             'F1-score_at_50': (0.00476, 0.00952),
             'precision_at_17': (0.01176, 0.02353),
             'recall_at_17': (0.00588, 0.01176),
             'ndcg_at_k_17': (0.00852, 0.01703),
             'F1-score_at_17': (0.00784, 0.01569),
             'precision_at_43': (0.00465, 0.0093),
             'recall_at_43': (0.00588, 0.01176),
             'ndcg_at_k_43': (0.00543, 0.01086),
             'F1-score_at_43': (0.00519, 0.01039),
             'precision_at_86': (0.00465, 0.0057),
             'recall_at_86': (0.0116, 0.01421),
             'ndcg_at_k_86': (0.00884, 0.01129),
             'F1-score_at_86': (0.00664, 0.00813),
             'precision_at_173': (0.00694, 0.00566),
             'recall_at_173': (0.03462, 0.02793),
             'ndcg_at_k_173': (0.02001, 0.01403),
             'F1-score_at_173': 

In [36]:
kwargs = {
    'network_file':'data/processed/PPI.txt', 
    'seed_file':f'data/processed/seeds_{disease}.txt',
    'metrics_file':f'data/results/metrics_ext_r_walk_{disease}.json',
    'extended_disease_file':'data/all_gene_disease_associations.tsv',
    'disease':disease,
    'func_args':{
        'r':0.7, 
        'score_thr':0, 
        'tol':1e-6, 
        'sorted_nodes_only':True
    }
    
}

metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=True, **kwargs)

In [37]:
metrics

defaultdict(list,
            {'precision_at_50': (0.0, 0.0),
             'recall_at_50': (0.0, 0.0),
             'ndcg_at_k_50': (0.0, 0.0),
             'F1-score_at_50': (0.0, 0.0),
             'precision_at_17': (0.0, 0.0),
             'recall_at_17': (0.0, 0.0),
             'ndcg_at_k_17': (0.0, 0.0),
             'F1-score_at_17': (0.0, 0.0),
             'precision_at_43': (0.0, 0.0),
             'recall_at_43': (0.0, 0.0),
             'ndcg_at_k_43': (0.0, 0.0),
             'F1-score_at_43': (0.0, 0.0),
             'precision_at_86': (0.0, 0.0),
             'recall_at_86': (0.0, 0.0),
             'ndcg_at_k_86': (0.0, 0.0),
             'F1-score_at_86': (0.0, 0.0),
             'precision_at_173': (0.0, 0.0),
             'recall_at_173': (0.0, 0.0),
             'ndcg_at_k_173': (0.0, 0.0),
             'F1-score_at_173': (0.0, 0.0),
             'extended_val': []})