In [2]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os
import sys
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..\..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [83]:
from funcs import *

In [81]:
check_issue(file='data/BIOGRID-ORGANISM-Homo_sapiens-4.4.204.tab3.txt')

Official Symbol Interactor B
MEMO1    2
RNR1     2
Name: Entrez Gene Interactor B, dtype: int64


In [5]:
diseases = ['C0003873', 'C0019193', 
            'C0033578', 'C0919267', 
            'C0917816']

In [13]:
generate_PPI('data/homo_preprocess.tsv', 'data/PPI.txt')


## Read Data + Save

In [82]:
human_data(file='data/BIOGRID-ORGANISM-Homo_sapiens-4.4.204.tab3.txt',
           save_file='data/homo_preprocess.tsv')

Nodes:  19618
Edges:  665061


In [20]:
for i in range(len(diseases)):
    disease_data(file='data/curated_gene_disease_associations.tsv',
                disease_id=diseases[i])

## Data Overview

In [21]:
for i in diseases:
    print(i)
    data_overview(human_file='data/homo_preprocess.tsv',
                disease_file='data/disease'+i+'.tsv')
    print('===================')

C0003873
Number of genes associated with the disease: 174
Classes of the desease: ['C17;C05;C20']
Number of genes present in the interactome: 173
Largest connected component: 83
C0019193
Number of genes associated with the disease: 404
Classes of the desease: ['C06;C25']
Number of genes present in the interactome: 321
Largest connected component: 230
C0033578
Number of genes associated with the disease: 616
Classes of the desease: ['C04;C12']
Number of genes present in the interactome: 613
Largest connected component: 550
C0919267
Number of genes associated with the disease: 134
Classes of the desease: ['C04;C13;C19']
Number of genes present in the interactome: 133
Largest connected component: 111
C0917816
Number of genes associated with the disease: 139
Classes of the desease: ['C23;C10;F03;F01']
Number of genes present in the interactome: 139
Largest connected component: 54


In [24]:
for i in diseases:
    split_files_diffusion_heat(seeds='data/seeds_'+i+'.txt', disease=i, k=5)

In [90]:
tsv_to_txt(tsv_file='data/homo_preprocess.tsv', txt_file='data/homo_preprocess_new.txt')

## Algorithms

In [6]:
MCL_hyper('data/homo_preprocess.tsv', start=18, end=23)

In [15]:
DIAMOND(network_file='data/PPI.txt', seed_file='data/seed_genes.txt', n=200, alpha=1, out_file='data/diamond_results.txt')

DIAMOnD(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/diamond_results.txt' 



In [61]:
DIABLE(network_file='data/PPI.txt', seed_file='data/seed_genes.txt', n=200, alpha=1, out_file='data/diable_results.txt')

DiaBLE(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/diable_results.txt' 



In [91]:
RANDOM_WALK_WITH_RESTART(network_file='data/PPI.txt', seed_file='data/seed_genes.txt', r=0.1, score_thr=0, tol=1e-6)

RANDOM_WALK_WITH_RESTART(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/random_walk_wr_results.txt' 



## 10-fold cross validation

In [33]:
from imported_code.diamond import DIAMOnD

In [43]:
gt = np.array([34, 14, 12, 41, 23, 2])
pred = np.array(np.arange(1, 100))
n = 27

kwargs = {
    'network_file':'data/PPI.txt', 
    'seed_file':'data/seed_genes.txt',
    'func_args':{
        'max_number_of_added_nodes':180,
        'alpha':1,
        'outfile':'./data/test_kfold_diamond.txt'
    }
    
}

metrics = k_fold(DIAMOnD, compute_metrics, 3, **kwargs)

  return metrics


In [44]:
metrics

defaultdict(list,
            {'precision_at_50': (0.02, 0.016329931618554522),
             'recall_at_50': (0.017241379310344827, 0.014077527257374586),
             'ndcg_at_k_50': (0.020747046136719956, 0.015646240006390748),
             'F1-score_at_50': (0.01851851851851852, 0.015120307054217151),
             'precision_at_17': (0.0392156862745098, 0.0277296776935901),
             'recall_at_17': (0.011494252873563218, 0.008127664151569512),
             'ndcg_at_k_17': (0.03175479080952405, 0.022518169531146874),
             'F1-score_at_17': (0.017777777777777778, 0.01257078722109418),
             'precision_at_43': (0.023255813953488372, 0.018988292579714558),
             'recall_at_43': (0.017241379310344827, 0.014077527257374586),
             'ndcg_at_k_43': (0.022981040787214534, 0.01733099147627673),
             'F1-score_at_43': (0.019801980198019802, 0.016168249127281705),
             'precision_at_86': (0.023089332526719093, 0.00805731519419522),
             '

In [None]:
for i in diseases:
    MCL_metric = k_fold_MCL(human_file='data/homo_preprocess.tsv',
                            network_file='data/PPI.txt',
                            metric_func=compute_metrics_MCL,
                            extended_disease_file='data/all_gene_disease_associations.tsv',
                            metrics_file='data/'+i+'.json',
                            k=5, disease=i, extended_validation=False)

In [84]:
for i in diseases:
    res = diffusion_heat(path='diffusion_heat/'+i+'/')
    diffusion_heat_metric = k_fold_diffusion_heat(network_file='data/PPI.txt', dict_res=res, 
                                                  metric_func=compute_metrics, 
                                                  extended_disease_file='data/all_gene_disease_associations.tsv',
                                                  disease=i, extended_validation=False, metrics_file='data/'+i+'_heat.json')

35
17
35
35
35
35
17
35
35
35
34
17
34
34
34
35
17
35
35
35
35
17
35
35
35
50
32
80
81
81
50
32
80
81
81
50
32
80
80
80
50
32
80
81
81
50
32
80
81
81


## Extended validation

In [82]:
res

{'1': [],
 '2': [1029,
  1021,
  11186,
  1019,
  5894,
  5515,
  1026,
  891,
  4683,
  6772,
  5111,
  1642,
  5606,
  5371,
  3717,
  369,
  2289,
  6790,
  7161,
  2956,
  595,
  1387,
  5524,
  1111,
  3939,
  55014,
  4361,
  10111,
  405,
  11073,
  5608,
  1869,
  4088,
  472,
  580,
  84662,
  1030,
  25,
  983,
  4087,
  2810,
  5156,
  5970,
  4086,
  11180,
  226,
  890,
  1969,
  1432,
  5230,
  6714,
  11014,
  3308,
  10273,
  1018,
  6196,
  51366,
  10399,
  1017,
  3654,
  11200,
  1031,
  8975,
  5925,
  5605,
  3480,
  5933,
  3615,
  4771,
  10445,
  7507,
  10762,
  26524,
  2064,
  2264,
  7534,
  7529,
  4292,
  7132,
  23624,
  8900,
  1488,
  5702,
  9261,
  23411,
  2194,
  4176,
  367,
  5781,
  100499483,
  8463,
  836,
  3091,
  8607,
  8678,
  9156,
  6667,
  79184,
  5591,
  641,
  5700,
  2735,
  6117,
  1024,
  2547,
  55210,
  4233,
  84172,
  221937,
  3014,
  4137,
  54386,
  5716,
  3837,
  10856,
  841,
  5339,
  22869,
  5706,
  1051,
  7520,
  3