In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import matplotlib.pyplot as plt

# Put the directory two levels above the working directory at the front of the
# import path (presumably so project-level packages can be imported - confirm).
# Built from os.pardir components instead of the literal '..\..': that literal
# contains the invalid escape sequence '\.' and only means "two levels up" on
# Windows, while this form resolves to the same directory on every platform.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [3]:
from funcs import *

In [4]:
# Disease identifiers analysed throughout this notebook. Each one is passed as
# `disease_id` to `disease_data` and is used to build the per-disease paths
# `data/{disease}/...` and `data/results/{disease}/...` in the cells below.
diseases = ('C0003873', 
            'C0019193',
            'C0033578',
            'C0919267',
            'C0917816')

## Read Data + Save

In [5]:
# Preprocess the BioGRID Homo sapiens table; judging by the argument names the
# result is written to `save_file`, which later cells read back.
# The recorded output of this cell reports 19618 nodes and 665061 edges.
# NOTE(review): the effect of `correct=False` is decided inside `human_data`
# (star-imported from `funcs`) and is not visible here - confirm.
human_data(file='data/BIOGRID-ORGANISM-Homo_sapiens-4.4.204.tab3.txt',
           save_file='data/processed/homo_preprocess.tsv', correct=False)

Nodes:  19618
Edges:  665061


In [6]:
# Run `disease_data` once per disease id against the curated gene-disease
# association table.
# NOTE(review): the output location is chosen inside `disease_data`; later cells
# read `data/{disease}/disease{disease}.tsv` and `data/{disease}/seeds_{disease}.txt`,
# which are presumably produced here - confirm.
# The loop variable `disease` stays bound to the last id after the loop ends.
for disease in diseases:
    print(f'Saving file for {disease}')
    disease_data(file='data/curated_gene_disease_associations.tsv',
                 disease_id=disease)
    print('===================')

Saving file for C0003873
Saving file for C0019193
Saving file for C0033578
Saving file for C0919267
Saving file for C0917816


In [7]:
# Derive `data/processed/PPI.txt` from the preprocessed interactome; that file is
# the `network_file` given to DIAMOND, DIABLE, the random walk and the k-fold runs.
generate_PPI(in_file='data/processed/homo_preprocess.tsv', out_file='data/processed/PPI.txt')

## Data Overview

In [8]:
# Print the overview produced by `data_overview` for every disease. In the
# recorded output this covers the number of associated genes, disease classes,
# genes present in the interactome, largest connected component and missing genes.
for disease in diseases:
    print(disease)
    data_overview(human_file='data/processed/homo_preprocess.tsv',
                  disease_file=f'data/{disease}/disease{disease}.tsv')
    print('===================')

C0003873
Number of genes associated with the disease: 174
Classes of the disease: ['C17;C05;C20']
Number of genes present in the interactome: 173
Largest connected component: 83
Number of genes in the interactome: 19618
Missing gene: {6556}
C0019193
Number of genes associated with the disease: 404
Classes of the disease: ['C06;C25']
Number of genes present in the interactome: 321
Largest connected component: 230
Number of genes in the interactome: 19618
Missing gene: {407055, 442901, 442904, 442905, 442906, 442909, 442910, 442912, 619552, 619556, 574501, 574506, 574508, 100616247, 724030, 100616258, 100616259, 574030, 100126313, 100126334, 100616357, 100616376, 102466247, 3274, 100422867, 768218, 102466270, 100313838, 494323, 100616437, 494335, 102465802, 100302115, 102465833, 100302142, 100302145, 102465858, 723779, 100302149, 100422991, 100302174, 100500847, 100302197, 406902, 693120, 693122, 693124, 406919, 406920, 100302218, 100302221, 406940, 406941, 406942, 100302236, 100302240, 

In [28]:
# NOTE(review): this cell carries execution count 28, i.e. it was run out of order.
# It relies on `disease` still being bound by an earlier `for disease in diseases`
# loop (so it only processes whichever id that loop ended on), and it reads
# 'data/seed.txt' while every other cell uses `data/{disease}/seeds_{disease}.txt`.
# Confirm the intended disease and seed file before re-running from a fresh kernel.
split_files_diffusion_heat(seeds='data/seed.txt', disease=disease, k=10)

## Make sure the interactome matches the ground truth:
Nodes = 19618

Edges = 665061

In [9]:
# Reload the preprocessed interactome and build a NetworkX graph from its edge
# list, using columns 'A' and 'B' as the two endpoints of every interaction.
# NOTE(review): `pd` and `nx` are not imported explicitly before this cell; they
# presumably arrive through `from funcs import *` - confirm.
interactome_df = pd.read_csv('data/processed/homo_preprocess.tsv', sep='\t')
interactome_g = nx.from_pandas_edgelist(interactome_df, source='A',
                                        target='B')

In [10]:
# Node count of the rebuilt graph; the heading above states the expected value 19618.
interactome_g.number_of_nodes()

19618

In [11]:
# Edge count of the rebuilt graph; the heading above states the expected value 665061.
interactome_g.number_of_edges()

665061

## Algorithms

In [12]:
# Make sure every disease has its own results directory before the algorithms
# write into it. `os.makedirs(..., exist_ok=True)` also creates a missing parent
# `data/results` directory and is a no-op when the directory is already there,
# so the cell can be re-run safely from a fresh checkout.
for disease in diseases:
    os.makedirs(f'data/results/{disease}', exist_ok=True)

In [None]:
# NOTE(review): this cell was never executed in the saved run (no execution count).
# It reads 'data/homo_preprocess.tsv', whereas the preprocessing cell writes
# 'data/processed/homo_preprocess.tsv' - confirm which file is intended.
# The meaning of `start`/`end` is defined inside `MCL_hyper` and is not visible here.
MCL_hyper('data/homo_preprocess.tsv', start=18, end=23)

In [13]:
# Per-disease iteration budget: passed as `n` to DIAMOND / DIABLE and as
# `max_number_of_added_nodes` in the k-fold cells below.
n_iter_diamond_diable = {'C0003873':180, 
                         'C0019193':330,
                         'C0033578':620,
                         'C0919267':140,
                         'C0917816':140}

In [14]:
# Run DIAMOND for every disease on the shared PPI network, using the disease's
# seed file and its iteration budget from `n_iter_diamond_diable`; results go to
# `data/results/{disease}/diamond_{disease}.txt`.
for disease in diseases:
    print(disease)
    DIAMOND(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', n=n_iter_diamond_diable[disease], 
            alpha=1, out_file=f'data/results/{disease}/diamond_{disease}.txt')
    print('===================')

C0003873
DIAMOnD(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/diamond_C0003873.txt' 

C0019193
DIAMOnD(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/diamond_C0019193.txt' 

C0033578
DIAMOnD(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/diamond_C0033578.txt' 

C0919267
DIAMOnD(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/diamond_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/diamond_C0917816.txt' 



In [15]:
# Run DIABLE for every disease with the same network, seeds and iteration budget
# as the DIAMOND cell; results go to `data/results/{disease}/diable_{disease}.txt`.
for disease in diseases:
    print(disease)
    DIABLE(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', n=n_iter_diamond_diable[disease], 
           alpha=1, out_file=f'data/results/{disease}/diable_{disease}.txt')
    print('===================')

C0003873
DiaBLE(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/diable_C0003873.txt' 

C0019193
DiaBLE(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/diable_C0019193.txt' 

C0033578
DiaBLE(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/diable_C0033578.txt' 

C0919267
DiaBLE(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/diable_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/diable_C0917816.txt' 



In [16]:
# Run the random walk with restart for every disease (r=0.7, score_thr=0,
# tol=1e-6); results go to `data/results/{disease}/r_walk_{disease}.txt`.
for disease in diseases:
    print(disease)
    RANDOM_WALK_WITH_RESTART(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', r=0.7, 
                             score_thr=0, tol=1e-6, out_file=f'data/results/{disease}/r_walk_{disease}.txt')
    print('===================')

C0003873
RANDOM_WALK_WITH_RESTART(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/r_walk_C0003873.txt' 

C0019193
RANDOM_WALK_WITH_RESTART(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/r_walk_C0019193.txt' 

C0033578
RANDOM_WALK_WITH_RESTART(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/r_walk_C0033578.txt' 

C0919267
RANDOM_WALK_WITH_RESTART(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/r_walk_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/r_walk_C0917816.txt' 



## 5-fold cross validation

In [17]:
from imported_code.diamond import DIAMOnD

In [18]:
# 5-fold cross validation of DIAMOnD, one run per disease. The metrics of each
# run are stored in `data/results/{disease}/metrics_diamond_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (DIAMOnD).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_diamond_{disease}.json',
        extended_disease_file=None,
        disease=disease,
        func_args=dict(
            max_number_of_added_nodes=n_iter_diamond_diable[disease],
            alpha=1,
            outfile='./data/results/kfold_tmp.txt',
        ),
    )

    metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [19]:
# 5-fold cross validation of DiaBLE, one run per disease. The metrics of each
# run are stored in `data/results/{disease}/metrics_diable_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (DiaBLE).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_diable_{disease}.json',
        extended_disease_file=None,
        disease=disease,
        func_args=dict(
            max_number_of_added_nodes=n_iter_diamond_diable[disease],
            alpha=1,
            outfile='./data/results/kfold_tmp.txt',
        ),
    )

    metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [20]:
# 5-fold cross validation of the random walk with restart, one run per disease.
# The metrics of each run are stored in
# `data/results/{disease}/metrics_r_walk_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (random_walk_wr).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_r_walk_{disease}.json',
        extended_disease_file=None,
        disease=disease,
        func_args=dict(
            r=0.7,
            score_thr=0,
            tol=1e-6,
            sorted_nodes_only=True,
        ),
    )

    metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [43]:
# NOTE(review): this cell carries execution count 43, i.e. it was run out of order.
# It reads 'data/homo_preprocess.tsv' and 'data/seed.txt', while the rest of the
# notebook uses 'data/processed/homo_preprocess.tsv' and the per-disease files
# `data/{disease}/seeds_{disease}.txt` - confirm the intended inputs before re-running.
MCL_metric = k_fold_MCL(human_file='data/homo_preprocess.tsv',
                        seed_file='data/seed.txt', metric_func=compute_metrics)

Number of clusters: 2


  return metrics


Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2


In [None]:
# NOTE(review): this cell was never executed in the saved run (no execution count).
# It is hard-wired to a single disease: it reads 'diffusion_heat/C0003873_old/'
# and passes `n=174`, which equals the number of genes associated with C0003873
# in the data overview output above. Confirm whether the other diseases in
# `diseases` are meant to be covered as well.
res = diffusion_heat(path='diffusion_heat/C0003873_old/')
diffusion_heat_metric = k_fold_diffusion_heat(res, metric_func=compute_metrics, 
                                              n=174, disease='C0003873_old')

## Extended validation

In [21]:
from imported_code.diamond import DIAMOnD

In [24]:
# Extended validation of DIAMOnD: same setup as the 5-fold run, but with
# `extended_val=True` and the full gene-disease association table supplied as
# `extended_disease_file`. Metrics are stored in
# `data/results/{disease}/metrics_ext_diamond_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (DIAMOnD).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_ext_diamond_{disease}.json',
        extended_disease_file='data/all_gene_disease_associations.tsv',
        disease=disease,
        func_args=dict(
            max_number_of_added_nodes=n_iter_diamond_diable[disease],
            alpha=1,
            outfile='./data/results/kfold_tmp.txt',
        ),
    )

    metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [25]:
# Extended validation of DiaBLE: same setup as the 5-fold run, but with
# `extended_val=True` and the full gene-disease association table supplied as
# `extended_disease_file`. Metrics are stored in
# `data/results/{disease}/metrics_ext_diable_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (DiaBLE).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_ext_diable_{disease}.json',
        extended_disease_file='data/all_gene_disease_associations.tsv',
        disease=disease,
        func_args=dict(
            max_number_of_added_nodes=n_iter_diamond_diable[disease],
            alpha=1,
            outfile='./data/results/kfold_tmp.txt',
        ),
    )

    metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [26]:
# Extended validation of the random walk with restart: same setup as the 5-fold
# run, but with `extended_val=True` and the full gene-disease association table
# supplied as `extended_disease_file`. Metrics are stored in
# `data/results/{disease}/metrics_ext_r_walk_{disease}.json`.
for disease in diseases:
    print(disease)

    # Keyword arguments forwarded to `k_fold`; `func_args` presumably goes on to
    # the algorithm itself (random_walk_wr).
    kwargs = dict(
        network_file='data/processed/PPI.txt',
        seed_file=f'data/{disease}/seeds_{disease}.txt',
        metrics_file=f'data/results/{disease}/metrics_ext_r_walk_{disease}.json',
        extended_disease_file='data/all_gene_disease_associations.tsv',
        disease=disease,
        func_args=dict(
            r=0.7,
            score_thr=0,
            tol=1e-6,
            sorted_nodes_only=True,
        ),
    )

    metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


## Results to $\LaTeX$

In [26]:
import pandas as pd
import re

In [183]:
def prepare_results_for_latex(results_file):
    """
    Prepare the results of an algorithm for a disease for the LaTeX visualization.

    Parameters
    ----------
    results_file : str
        Path of a metrics JSON file named ``metrics_<algorithm>_<suffix>.json``.
        The JSON maps each metric key (e.g. ``precision_at_k_50`` or
        ``ndcg_at_25``) to a two-element list ``[average, std]``.

    Returns
    -------
    pandas.DataFrame
        A single column, named after the upper-cased algorithm, holding
        ``"<avg*100> ± <std*100>"`` strings rounded to two decimals. The index
        is a MultiIndex ``('@', 'Metric')`` sorted by cut-off and metric name.

    Raises
    ------
    ValueError
        If the file name does not follow the ``metrics_<algorithm>_<suffix>``
        pattern.
    """
    # Only the file name carries the algorithm; matching on the whole path would
    # pick up directories that happen to contain 'metrics_' or underscores.
    file_name = os.path.basename(results_file)
    alg_match = re.search(r'(?<=metrics_)(.+)(?=_)', file_name)
    if alg_match is None:
        raise ValueError(
            f"Cannot extract the algorithm name from '{results_file}': "
            "expected a file named 'metrics_<algorithm>_<suffix>'"
        )
    alg_name = alg_match.group(0).upper()

    raw = pd.read_json(results_file, orient='index').set_axis(['avg', 'std'], axis=1)
    avg_pct = (raw['avg'] * 100).round(decimals=2).astype('str')
    std_pct = (raw['std'] * 100).round(decimals=2).astype('str')
    results = (avg_pct + ' ± ' + std_pct).to_frame(name=alg_name)

    # `regex=True` is required: since pandas 2.0 `str.replace` defaults to a
    # literal match and rejects regular expressions otherwise.
    metric_keys = results.index.str.replace(r'(_at(_k)?_)', '@', regex=True).str.capitalize()
    # '<Metric>@<k>' -> two index levels, then order rows by numeric cut-off.
    results.index = metric_keys.str.split('@', expand=True).rename(['Metric', '@'])
    results = results.reset_index()
    results['@'] = results['@'].astype(int)
    return results.sort_values(['@', 'Metric']).set_index(['@', 'Metric'])

In [184]:
def join_results(results_files):
    """
    Combine several metrics files into one table with a column per file.

    Every path in `results_files` is converted with `prepare_results_for_latex`;
    the resulting tables are concatenated column-wise, keeping only the index
    entries shared by all of them (inner join).
    """
    per_file_tables = [prepare_results_for_latex(path) for path in results_files]
    return pd.concat(per_file_tables, axis=1, join='inner')


In [220]:
def print_latex(results_files):
    """
    Print a LaTeX table comparing the algorithms whose metrics files are given.

    Parameters
    ----------
    results_files : iterable of str
        Paths of metrics JSON files, one per algorithm, in the column order
        wanted in the table.

    Notes
    -----
    Column headers are derived from the algorithm name found in each file name
    (the column names produced by `join_results`) rather than assigned by
    position, so reordering the files or passing fewer/more than four of them
    can no longer mislabel a column. An ``EXT_`` prefix (extended-validation
    metrics) is ignored when looking up the display name. Unknown algorithm
    names are used as-is with underscores escaped for LaTeX.
    """
    display_names = {
        'HEAT': 'Diffusion Heat',
        'R_WALK': 'Random Walk WR',
        'DIAMOND': 'DIAMOND',
        'DIABLE': 'DIABLE',
    }

    def _column_header(alg_name):
        # Map an upper-cased algorithm name to its bold LaTeX column header.
        alg_name = str(alg_name)
        base_name = alg_name[len('EXT_'):] if alg_name.startswith('EXT_') else alg_name
        label = display_names.get(base_name, alg_name.replace('_', r'\_'))
        return r'\textbf{' + label + '}'

    results = join_results(results_files)
    results.columns = [_column_header(name) for name in results.columns]

    # Level 1 of the index holds the metric names; fix their capitalisation.
    metric_labels = (results.index.levels[1]
                     .str.replace('Ndcg', 'NDCG', regex=False)
                     .str.replace('score', 'Score', regex=False))
    results.index = results.index.set_levels(metric_labels, level=1)

    # Two right-aligned index columns, then one centred column per algorithm.
    column_format = 'rr|' + 'c' * len(results.columns)
    latex = results.to_latex(bold_rows=True, escape=False, multicolumn=True,
                             multirow=True, column_format=column_format)
    print(latex)

In [227]:
# Disease whose results are exported to LaTeX; set `disease` to one of:
# C0003873 C0019193 C0033578 C0919267 C0917816

# One metrics file per algorithm, in the column order of the final table.
disease = 'C0917816'
algorithms = ['heat', 'r_walk', 'diamond', 'diable']
results_files = [f'data/results/{disease}/metrics_{algorithm}_{disease}.json' for algorithm in algorithms]

In [None]:
# Print the LaTeX comparison table for the metrics files selected above.
print_latex(results_files)