In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import os
import sys
import matplotlib.pyplot as plt

# Make the directory two levels above the current working directory importable.
# The path is assembled from os.pardir components rather than the literal
# '..\..': that literal contains the invalid escape sequence '\.' and is only
# interpreted as "two levels up" on Windows.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

In [15]:
from funcs import *

In [4]:
# Identifiers of the diseases processed in this notebook. Each one is handed to
# the helper functions as `disease_id` / `disease` and is interpolated into the
# per-disease file paths.
diseases = (
    'C0003873',
    'C0019193',
    'C0033578',
    'C0919267',
    'C0917816',
)

## Read Data + Save

In [5]:
# Run `human_data` on the BioGRID Homo sapiens tab3 export; the captured output
# reports the node and edge counts.
# NOTE(review): `save_file` is presumably where the processed table is written
# and `correct=False` presumably skips an optional correction step — confirm in funcs.
human_data(file='data/BIOGRID-ORGANISM-Homo_sapiens-4.4.204.tab3.txt',
           save_file='data/processed/homo_preprocess.tsv', correct=False)

Nodes:  19618
Edges:  665061


In [6]:
# Call `disease_data` once per disease identifier on the curated
# gene-disease association table.
# NOTE(review): no output path is passed; later cells read
# data/<id>/disease<id>.tsv and data/<id>/seeds_<id>.txt, so this helper
# presumably writes them — confirm in funcs.
for disease in diseases:
    print(f'Saving file for {disease}')
    disease_data(file='data/curated_gene_disease_associations.tsv',
                 disease_id=disease)
    print('===================')

Saving file for C0003873
Saving file for C0019193
Saving file for C0033578
Saving file for C0919267
Saving file for C0917816


In [7]:
# Derive data/processed/PPI.txt from the preprocessed interactome table; the
# algorithm cells below pass this file as `network_file`.
generate_PPI(in_file='data/processed/homo_preprocess.tsv', out_file='data/processed/PPI.txt')

## Data Overview per Disease

In [8]:
# Print the overview statistics produced by `data_overview` for every disease,
# using the preprocessed interactome and that disease's own gene file.
for disease in diseases:
    print(disease)
    data_overview(human_file='data/processed/homo_preprocess.tsv',
                  disease_file=f'data/{disease}/disease{disease}.tsv')
    print('===================')

C0003873
Number of genes associated with the disease: 174
Classes of the disease: ['C17;C05;C20']
Number of genes present in the interactome: 173
Largest connected component: 83
Number of genes in the interactome: 19618
Missing gene: {6556}
C0019193
Number of genes associated with the disease: 404
Classes of the disease: ['C06;C25']
Number of genes present in the interactome: 321
Largest connected component: 230
Number of genes in the interactome: 19618
Missing gene: {407055, 442901, 442904, 442905, 442906, 442909, 442910, 442912, 619552, 619556, 574501, 574506, 574508, 100616247, 724030, 100616258, 100616259, 574030, 100126313, 100126334, 100616357, 100616376, 102466247, 3274, 100422867, 768218, 102466270, 100313838, 494323, 100616437, 494335, 102465802, 100302115, 102465833, 100302142, 100302145, 102465858, 723779, 100302149, 100422991, 100302174, 100500847, 100302197, 406902, 693120, 693122, 693124, 406919, 406920, 100302218, 100302221, 406940, 406941, 406942, 100302236, 100302240, 

In [20]:
# NOTE(review): `disease` is not defined in this cell — it is whatever value an
# earlier `for disease in diseases` loop (or a later assignment) left in the
# kernel, so this call handles a single disease only and depends on execution order.
# NOTE(review): k=10 here, while the per-disease loop further down uses k=5 — confirm which is intended.
split_files_diffusion_heat(seeds='data/seed.txt', disease=disease, k=10)

## Data Overview

In [9]:
# Load the preprocessed interactome as a tab-separated table and build a
# NetworkX graph whose edges join the values of columns 'A' and 'B'.
# NOTE(review): no earlier visible cell imports `pd` or `nx` explicitly;
# presumably they arrive through `from funcs import *` — confirm.
interactome_df = pd.read_csv('data/processed/homo_preprocess.tsv', sep='\t')
interactome_g = nx.from_pandas_edgelist(interactome_df, source='A',
                                        target='B')

C0003873
Number of genes associated with the disease: 174
Classes of the desease: ['C17;C05;C20']
Number of genes present in the interactome: 173
Largest connected component: 83
C0019193
Number of genes associated with the disease: 404
Classes of the desease: ['C06;C25']
Number of genes present in the interactome: 321
Largest connected component: 230
C0033578
Number of genes associated with the disease: 616
Classes of the desease: ['C04;C12']
Number of genes present in the interactome: 613
Largest connected component: 550
C0919267
Number of genes associated with the disease: 134
Classes of the desease: ['C04;C13;C19']
Number of genes present in the interactome: 133
Largest connected component: 111
C0917816
Number of genes associated with the disease: 139
Classes of the desease: ['C23;C10;F03;F01']
Number of genes present in the interactome: 139
Largest connected component: 54


In [10]:
# Call `split_files_diffusion_heat` with k=5 for every disease, reading the
# seeds from data/seed<id>.txt.
# NOTE(review): other cells read seeds from data/<id>/seeds_<id>.txt — confirm
# that data/seed<id>.txt is a distinct, existing file.
for i in diseases:
    split_files_diffusion_heat(seeds='data/seed'+i+'.txt', disease=i, k=5)

19618

In [11]:
# Convert the interactome TSV to a .txt file with `tsv_to_txt`.
# NOTE(review): this reads data/homo_preprocess.tsv, whereas `human_data` above
# saves to data/processed/homo_preprocess.tsv — confirm the intended location.
tsv_to_txt(tsv_file='data/homo_preprocess.tsv', txt_file='data/homo_preprocess_new.txt')

665061

## Algorithms

In [5]:
# Ensure each disease has its own results directory before the algorithms write
# into it. os.makedirs(..., exist_ok=True) also creates a missing
# 'data/results' parent and avoids the check-then-create race of
# os.path.exists + os.mkdir.
for disease in diseases:
    os.makedirs(f'data/results/{disease}', exist_ok=True)

In [None]:
# Run `MCL_hyper` over the range given by start=18 and end=23.
# NOTE(review): what the range parameterizes is defined in funcs, not visible here.
# NOTE(review): input path is data/homo_preprocess.tsv, not the
# data/processed/homo_preprocess.tsv written by `human_data` — confirm.
MCL_hyper('data/homo_preprocess.tsv', start=18, end=23)

In [6]:
# Per-disease iteration budget, looked up by disease identifier and passed as
# `n` / `max_number_of_added_nodes` to the DIAMOnD and DiaBLE calls.
n_iter_diamond_diable = {
    'C0003873': 180,
    'C0019193': 330,
    'C0033578': 620,
    'C0919267': 140,
    'C0917816': 140,
}

In [14]:
# Run DIAMOND for every disease on the PPI network, seeded with that disease's
# seed file; the number of iterations `n` comes from n_iter_diamond_diable and
# results go to data/results/<id>/diamond_<id>.txt (see the captured output).
for disease in diseases:
    print(disease)
    DIAMOND(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', n=n_iter_diamond_diable[disease], 
            alpha=1, out_file=f'data/results/{disease}/diamond_{disease}.txt')
    print('===================')

C0003873
DIAMOnD(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/diamond_C0003873.txt' 

C0019193
DIAMOnD(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/diamond_C0019193.txt' 

C0033578
DIAMOnD(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/diamond_C0033578.txt' 

C0919267
DIAMOnD(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/diamond_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/diamond_C0917816.txt' 



In [15]:
# Run DIABLE for every disease with the same network, seeds and iteration
# budget as the DIAMOND cell; results go to data/results/<id>/diable_<id>.txt.
for disease in diseases:
    print(disease)
    DIABLE(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', n=n_iter_diamond_diable[disease], 
           alpha=1, out_file=f'data/results/{disease}/diable_{disease}.txt')
    print('===================')

C0003873
DiaBLE(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/diable_C0003873.txt' 

C0019193
DiaBLE(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/diable_C0019193.txt' 

C0033578
DiaBLE(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/diable_C0033578.txt' 

C0919267
DiaBLE(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/diable_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/diable_C0917816.txt' 



In [16]:
# Run the random walk with restart for every disease (r=0.7, score_thr=0,
# tol=1e-6); results go to data/results/<id>/r_walk_<id>.txt.
# NOTE(review): r is presumably the restart probability and tol the
# convergence tolerance — confirm in funcs.
for disease in diseases:
    print(disease)
    RANDOM_WALK_WITH_RESTART(network_file='data/processed/PPI.txt', seed_file=f'data/{disease}/seeds_{disease}.txt', r=0.7, 
                             score_thr=0, tol=1e-6, out_file=f'data/results/{disease}/r_walk_{disease}.txt')
    print('===================')

C0003873
RANDOM_WALK_WITH_RESTART(): ignoring 1 of 174 seed genes that are not in the network

 results have been saved to 'data/results/C0003873/r_walk_C0003873.txt' 

C0019193
RANDOM_WALK_WITH_RESTART(): ignoring 83 of 404 seed genes that are not in the network

 results have been saved to 'data/results/C0019193/r_walk_C0019193.txt' 

C0033578
RANDOM_WALK_WITH_RESTART(): ignoring 3 of 616 seed genes that are not in the network

 results have been saved to 'data/results/C0033578/r_walk_C0033578.txt' 

C0919267
RANDOM_WALK_WITH_RESTART(): ignoring 1 of 134 seed genes that are not in the network

 results have been saved to 'data/results/C0919267/r_walk_C0919267.txt' 

C0917816

 results have been saved to 'data/results/C0917816/r_walk_C0917816.txt' 



## 5-fold cross validation

In [17]:
from imported_code.diamond import DIAMOnD

In [18]:
# 5-fold cross-validation of DIAMOnD for every disease, without extended
# validation (extended_val=False, no extended disease file).
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
# NOTE(review): all runs share the same 'outfile' path ./data/results/kfold_tmp.txt.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_diamond_{disease}.json',
        'extended_disease_file':None,
        'disease':disease,
        'func_args':{
            'max_number_of_added_nodes':n_iter_diamond_diable[disease],
            'alpha':1,
            'outfile':'./data/results/kfold_tmp.txt'
        }
    }

    metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [19]:
# 5-fold cross-validation of DiaBLE for every disease, without extended
# validation (extended_val=False, no extended disease file).
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
# NOTE(review): all runs share the same 'outfile' path ./data/results/kfold_tmp.txt.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_diable_{disease}.json',
        'extended_disease_file':None,
        'disease':disease,
        'func_args':{
            'max_number_of_added_nodes':n_iter_diamond_diable[disease],
            'alpha':1,
            'outfile':'./data/results/kfold_tmp.txt'
        }
    }

    metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [20]:
# 5-fold cross-validation of the random walk with restart for every disease,
# without extended validation; the walk parameters match the full run above
# (r=0.7, score_thr=0, tol=1e-6).
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_r_walk_{disease}.json',
        'extended_disease_file':None,
        'disease':disease,
        'func_args':{
            'r':0.7, 
            'score_thr':0, 
            'tol':1e-6, 
            'sorted_nodes_only':True
        }   
    }

    metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=False, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [43]:
# 5-fold evaluation of MCL for every disease, without extended validation.
# NOTE(review): the inputs are data/homo_preprocess.tsv and data/PPI.txt, while
# the preprocessing cells write to data/processed/ — confirm the intended paths.
# NOTE(review): metrics_file is data/<id>.json, the same path the
# diffusion-heat cell uses, so one run may overwrite the other.
for i in diseases:
    MCL_metric = k_fold_MCL(human_file='data/homo_preprocess.tsv',
                            network_file='data/PPI.txt',
                            metric_func=compute_metrics_MCL,
                            extended_disease_file='data/all_gene_disease_associations.tsv',
                            metrics_file='data/'+i+'.json',
                            k=5, disease=i, extended_validation=False)

Number of clusters: 2


  return metrics


Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2
Number of clusters: 2


In [None]:
# 5-fold evaluation of the diffusion-heat results for every disease, without
# extended validation. `disease` and `extended_validation` are passed exactly
# once: repeating a keyword argument in a call is a SyntaxError.
# NOTE(review): metrics_file is data/<id>.json, the same path the MCL cell
# uses, while the LaTeX section reads data/results/<id>/metrics_heat_<id>.json
# — confirm the intended output location.
for i in diseases:
    res = diffusion_heat(path='diffusion_heat/'+i+'/')
    diffusion_heat_metric = k_fold_diffusion_heat(network_file='data/PPI.txt', dict_res=res,
                                                  metric_func=compute_metrics,
                                                  extended_disease_file='data/all_gene_disease_associations.tsv',
                                                  metrics_file='data/'+i+'.json',
                                                  k=5, disease=i, extended_validation=False)

## Extended validation

In [7]:
from imported_code.diamond import DIAMOnD

In [9]:
# 5-fold cross-validation of DIAMOnD with extended validation
# (extended_val=True) against data/all_gene_disease_associations.tsv.
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
# NOTE(review): all runs share the same 'outfile' path ./data/results/kfold_tmp.txt.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_ext_diamond_{disease}.json',
        'extended_disease_file':'data/all_gene_disease_associations.tsv',
        'disease':disease,
        'func_args':{
            'max_number_of_added_nodes':n_iter_diamond_diable[disease],
            'alpha':1,
            'outfile':'./data/results/kfold_tmp.txt'
        }
    }

    metrics = k_fold(DIAMOnD, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [12]:
# 5-fold cross-validation of DiaBLE with extended validation
# (extended_val=True) against data/all_gene_disease_associations.tsv.
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
# NOTE(review): all runs share the same 'outfile' path ./data/results/kfold_tmp.txt.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_ext_diable_{disease}.json',
        'extended_disease_file':'data/all_gene_disease_associations.tsv',
        'disease':disease,
        'func_args':{
            'max_number_of_added_nodes':n_iter_diamond_diable[disease],
            'alpha':1,
            'outfile':'./data/results/kfold_tmp.txt'
        }
    }

    metrics = k_fold(DiaBLE, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


In [11]:
# 5-fold cross-validation of the random walk with restart with extended
# validation (extended_val=True) against data/all_gene_disease_associations.tsv.
# NOTE(review): `metrics` is overwritten on every iteration and not used in this
# cell; presumably k_fold persists the scores to `metrics_file` — confirm.
for disease in diseases:
    print(disease)

    kwargs = {
        'network_file':'data/processed/PPI.txt', 
        'seed_file':f'data/{disease}/seeds_{disease}.txt',
        'metrics_file':f'data/results/{disease}/metrics_ext_r_walk_{disease}.json',
        'extended_disease_file':'data/all_gene_disease_associations.tsv',
        'disease':disease,
        'func_args':{
            'r':0.7, 
            'score_thr':0, 
            'tol':1e-6, 
            'sorted_nodes_only':True
        }
    }

    metrics = k_fold(random_walk_wr, compute_metrics, k=5, extended_val=True, **kwargs)
    print('===================')

C0003873
C0019193
C0033578
C0919267
C0917816


## Results to $\LaTeX$

In [26]:
import pandas as pd
import re

In [254]:
def prepare_results_for_latex(results_file):
    """
    Prepare the results of an algorithm for a disease for the LaTeX visualization.

    Parameters
    ----------
    results_file : str
        Path to a metrics JSON file whose name follows
        ``metrics_<algorithm>_<disease>.json`` or
        ``metrics_ext_<algorithm>_<disease>.json``. The file is read with
        ``orient='index'`` and must yield two value columns (average and
        standard deviation); row labels look like ``<metric>_at_<k>`` or
        ``<metric>_at_k_<k>``. An ``extended_val`` row, if present, is dropped.

    Returns
    -------
    pandas.DataFrame
        A single column named after the upper-cased algorithm, holding
        ``"<avg*100>±<std*100>"`` strings rounded to two decimals, indexed by
        the MultiIndex ``('@', 'Metric')`` and sorted by it.

    Raises
    ------
    ValueError
        If no algorithm name can be extracted from ``results_file``.
    """
    # The algorithm name is everything between 'metrics_' (plus an optional
    # 'ext_') and the last underscore of the file name. An optional group is
    # used instead of the character class [^_ext], which wrongly rejected any
    # name starting with 'e', 'x', 't' or '_' as well as one-character names.
    alg_match = re.search(r'metrics_(?:ext_)?([^/\\]+)_[^_/\\]*$', results_file)
    if alg_match is None:
        raise ValueError(f'Cannot extract the algorithm name from {results_file!r}')
    alg_name = alg_match.group(1).upper()

    results = pd.read_json(results_file, orient='index').set_axis(['avg', 'std'], axis=1)
    results = results.drop(index='extended_val', errors='ignore')
    results[alg_name] = (results['avg'] * 100).round(decimals=2).astype('str') + '±' + (results['std'] * 100).round(decimals=2).astype('str')
    results = results.drop(columns=['avg', 'std'])

    # regex=True is required explicitly: pandas 2.x defaults to literal
    # replacement and rejects compiled patterns in that mode.
    results.index = results.index.str.replace(r'_at(?:_k)?_', '@', regex=True).str.capitalize()
    results.index = results.index.str.split('@', expand=True)
    results.index = results.index.rename(['Metric', '@'])

    results = results.reset_index()
    # Cast the cutoff to int so rows sort numerically rather than lexically.
    results['@'] = results['@'].astype(int)
    return results.sort_values(['@', 'Metric']).set_index(['@', 'Metric'])

In [255]:
def join_results(results_files):
    """
    Combine several metrics files into one table.

    Each path in `results_files` is converted with `prepare_results_for_latex`;
    the resulting frames are concatenated side by side (axis=1), keeping only
    the index entries common to all of them (inner join).
    """
    prepared = [prepare_results_for_latex(path) for path in results_files]
    return pd.concat(prepared, axis=1, join='inner')


In [259]:
def print_latex(results_files):
    """
    Print a LaTeX longtable comparing the algorithms' metrics.

    Parameters
    ----------
    results_files : list of str
        Metrics files, given in the same order as the fixed header below:
        diffusion heat, random walk, DIAMOnD, DiaBLE, followed by their four
        extended-validation counterparts (eight files in total).

    Raises
    ------
    ValueError
        Raised by pandas when the number of joined columns does not match the
        eight header labels.
    """
    results = join_results(results_files)
    header = [r'\textbf{Diff. Heat}', r'\textbf{RW WR}',
              r'\textbf{Diamond}', r'\textbf{Diable}', r'\textbf{E.Diff. Heat}', r'\textbf{E.RW WR}',
              r'\textbf{E.Diamond}', r'\textbf{E.Diable}']
    results.columns = header

    # Shorten the metric labels (second index level) for the table body.
    metric_labels = (results.index.levels[1]
                     .str.replace('Ndcg', 'NDCG', regex=False)
                     .str.replace('-score', '', regex=False)
                     .str.replace('Precision', 'P', regex=False)
                     .str.replace('Recall', 'R', regex=False))
    results.index = results.index.set_levels(metric_labels, level=1)

    # One 'c' per value column: the former hard-coded 'rr|cccc' declared only
    # four columns while eight are emitted, which breaks LaTeX compilation.
    column_format = 'rr|' + 'c' * len(results.columns)
    latex = results.to_latex(bold_rows=True, escape=False, multicolumn=True, multirow=True,
                             column_format=column_format, longtable=True)
    print(latex)

In [None]:
diseases = ['C0003873', 'C0019193', 'C0033578', 'C0919267', 'C0917816']

# Disease whose results table is printed; change the index to pick another one.
disease = diseases[0]
algorithms = ['heat', 'r_walk', 'diamond', 'diable']

# Standard-validation metrics first, extended-validation metrics after, in the
# column order expected by print_latex.
# NOTE(review): the extended k-fold cells write their metrics to
# data/results/<id>/metrics_ext_<alg>_<id>.json, whereas these paths point to
# data/results/extended/ — confirm the files are moved there.
ext_files = []
results_files = []
for algorithm in algorithms:
    results_files.append(f'data/results/{disease}/metrics_{algorithm}_{disease}.json')
    ext_files.append(f'data/results/extended/metrics_ext_{algorithm}_{disease}.json')
results_files = results_files + ext_files

In [269]:
# Print the LaTeX table for the metrics files collected in `results_files`.
print_latex(results_files)

\begin{longtable}{rr|cccc}
\toprule
    &   & \textbf{Diff. Heat} & \textbf{RW WR} & \textbf{Diamond} & \textbf{Diable} \\
\textbf{@} & \textbf{Metric} &                     &                &                  &                 \\
\midrule
\endfirsthead

\toprule
    &   & \textbf{Diff. Heat} & \textbf{RW WR} & \textbf{Diamond} & \textbf{Diable} \\
\textbf{@} & \textbf{Metric} &                     &                &                  &                 \\
\midrule
\endhead
\midrule
\multicolumn{6}{r}{{Continued on next page}} \\
\midrule
\endfoot

\bottomrule
\endlastfoot
\multirow{4}{*}{\textbf{13 }} & \textbf{F1} &            3.75±6.5 &      4.05±3.82 &        4.05±2.03 &       2.03±2.48 \\
    & \textbf{NDCG} &          7.16±12.39 &      5.11±5.39 &        8.08±6.49 &        2.12±2.6 \\
    & \textbf{P} &           5.77±9.99 &      6.15±5.76 &        6.15±3.08 &       3.08±3.77 \\
    & \textbf{R} &           2.78±4.81 &      3.02±2.86 &        3.02±1.51 &       1.51±1.85 \\
\cline{1