<h1>Experiment with std1pos -- 2 files</h1>

In [1]:
# Reload edited project modules automatically before each cell is executed,
# so changes to the imported .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import os
# Insert "<sys.path[0]>/.." at position 1 of the module search path.
# NOTE(review): presumably this is what makes the project packages imported
# in the following cells resolvable -- confirm against the repository layout.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's default plot styling; color_codes=True remaps matplotlib's
# single-letter colour codes (e.g. 'b', 'g') to the seaborn palette.
sns.set(color_codes=True)
from IPython.display import display, HTML

import random
import copy
import glob

In [4]:
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from discretisation.preprocessing import FileLoader
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

<h2>Experiment Parameters</h2>

In [5]:
# Every input location is derived from a single base directory so that the
# experiment can be pointed at another checkout by editing one line, and so
# the ground-truth file can never drift away from the input directory it
# belongs to. The resulting path strings are unchanged.
alignment_dir = '/home/joewandy/git/metabolomics_tools/alignment'

# Directory holding the per-file feature lists that are loaded below.
input_dir = alignment_dir + '/input/std1_csv_full_old'
# No identification database is used in this experiment.
database_file = None
# Transformation (adduct) definitions handed to the aligner.
transformation_file = alignment_dir + '/pos_transformations_full.yml'
# Ground-truth alignment used to score the results; lives under input_dir.
gt_file = input_dir + '/ground_truth/ground_truth.txt'

In [6]:
# Start from the default hyperparameters and override the ones this
# experiment tunes. The overrides are applied in the order listed.
hp = AlignmentHyperPars()
_hp_overrides = [
    ('within_file_mass_tol', 5),
    ('within_file_rt_tol', 30),
    ('across_file_mass_tol', 10),
    ('across_file_rt_tol', 120),
    ('alpha_mass', 1.0),
    ('dp_alpha', 1000.0),
    ('t', 0.0),
    ('mass_clustering_n_iterations', 200),
    ('rt_clustering_nsamps', 400),
    ('rt_clustering_burnin', 200),
]
for _attr, _val in _hp_overrides:
    setattr(hp, _attr, _val)

# Echo the effective settings into the notebook output.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
# Forwarded as `method=` to Aligner.evaluate_performance() by train() and test().
# NOTE(review): what the value 2 selects is defined by that method -- confirm
# against its implementation before changing it.
evaluation_method = 2

Load all the std1pos files that we have

In [8]:
# Read every feature file found in input_dir; the captured output below lists
# the number of features read per file.
# NOTE(review): the effect of synthetic=True is defined by FileLoader -- confirm.
loader = FileLoader()        
data_list = loader.load_model_input(input_dir, synthetic=True)    

4999 features read from std1-file1.txt
4986 features read from std1-file2.txt
6836 features read from std1-file3.txt
9752 features read from std1-file4.txt
7076 features read from std1-file5.txt
4146 features read from std1-file6.txt
6319 features read from std1-file7.txt
4101 features read from std1-file8.txt
5485 features read from std1-file9.txt
5034 features read from std1-file10.txt
5317 features read from std1-file11.txt


In [9]:
# Grid of (mass_tol, rt_tol) candidates explored during training:
# mass tolerances 3, 6, 9 crossed with RT tolerances 10, 20, ..., 100,
# ordered with the mass tolerance varying slowest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(3, 10, 3)
    for rt_tol in range(10, 101, 10)
]

In [10]:
# Number of files drawn with random.sample for each training set and for each
# testing set in the experiment loop.
n_files = 2
# Number of random train/test repetitions performed by the experiment loop.
n_iter = 30

<h2>Create the first-stage clustering -- Gibbs, mh_biggest=True</h2>

In [18]:
# Build a single aligner over all loaded files and run only its first-stage
# clustering. The results are kept so that later alignment runs can reuse
# them (they are handed to Aligner.run via first_stage_clustering_results)
# instead of repeating this step, which took several minutes in the run
# captured below.
# NOTE(review): _first_stage_clustering is a private method of the aligner --
# confirm that calling it directly is supported.
aligner1 = Aligner(data_list, database_file, transformation_file, 
                       hp, verbose=False, seed=1234567890, parallel=True, mh_biggest=True, use_vb=False)
clustering_results = aligner1._first_stage_clustering()

Hyperparameters across_file_mass_tol=3.0, across_file_rt_tol=60.0, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
First stage clustering -- within_file_mass_tol=5.00, within_file_rt_tol=30.00, alpha=1.00


[Parallel(n_jobs=4)]: Done   1 out of  11 | elapsed:  2.3min remaining: 22.7min
[Parallel(n_jobs=4)]: Done   3 out of  11 | elapsed:  4.3min remaining: 11.4min
[Parallel(n_jobs=4)]: Done   5 out of  11 | elapsed:  6.8min remaining:  8.2min
[Parallel(n_jobs=4)]: Done   7 out of  11 | elapsed:  8.7min remaining:  4.9min
[Parallel(n_jobs=4)]: Done   9 out of  11 | elapsed:  8.7min remaining:  1.9min
[Parallel(n_jobs=4)]: Done  11 out of  11 | elapsed:  9.6min finished


[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Created 4999 clustersCreated 4986 clustersCreated 6836 clustersCreated 9752 clusters



Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/4999
Assigning possible transformations 0/4986
Assigning possible transformations 0/6836
Assigning possible transformations 0/9752
Assigning possible transformations 500/4999
Assigning possible transformations 500/4986
Assigning possible transformations 500/6836
Assigning possib

In [28]:
# Pair every loaded file with its first-stage clustering result so both can be
# sampled together. The pairs are materialised into a list because
# random.sample() needs a sequence and the list is sampled from repeatedly;
# on Python 3 a bare zip() is a one-shot iterator.
# NOTE(review): assumes clustering_results is in the same order as data_list.
combined_list = list(zip(data_list, clustering_results))

<h2>Run the Experiment</h2>

In [58]:
def train(selected_data, param_list, hp, evaluation_method):
    """Grid-search the across-file tolerances on the selected training files.

    For each (mass_tol, rt_tol) pair an Aligner is built over the selected
    files, run with match_mode 0 and scored against ``gt_file``. The
    precomputed first-stage clusterings carried in ``selected_data`` are
    passed to the aligner, exactly as ``test`` does, so that step is not
    repeated for every grid point.

    Parameters
    ----------
    selected_data : sequence of (file, first_stage_clustering) pairs
        Element 0 of each pair is the loaded file, element 1 its clustering.
    param_list : sequence of (mass_tol, rt_tol) tuples
        Candidate across-file tolerances to evaluate.
    hp : hyperparameter object
        ``across_file_mass_tol`` and ``across_file_rt_tol`` are overwritten
        in place for every candidate and are left at the last one tried.
    evaluation_method
        Forwarded as ``method=`` to ``Aligner.evaluate_performance``.

    Returns
    -------
    (df, best_row)
        ``df`` holds one row per candidate; ``best_row`` is the row with the
        highest F1, ties broken by the smallest mass_tol and then rt_tol.

    Raises
    ------
    ValueError
        If ``param_list`` is empty, since there would be no best row.
    """
    if not param_list:
        raise ValueError("param_list must contain at least one (mass_tol, rt_tol) pair")

    # Loop-invariant: the same files and clusterings are used for every candidate.
    selected_files = [x[0] for x in selected_data]
    selected_clusterings = [x[1] for x in selected_data]

    performances = []
    for param in param_list:

        print("Parameter mass_tol=%f rt_tol=%f" % param)
        hp.across_file_mass_tol = param[0]
        hp.across_file_rt_tol = param[1]
        aligner = Aligner(selected_files, database_file, transformation_file,
                          hp, verbose=False, seed=1234567890)
        match_mode = 0
        aligner.run(match_mode, first_stage_clustering_results=selected_clusterings)

        res = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True, method=evaluation_method)
        # res[0] is a tuple of metrics; prefix it with the candidate tolerances.
        output = param + res[0]
        performances.append(output)

    df = pd.DataFrame(performances, columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])

    # Best F1 first; among equal F1 prefer the tightest tolerances.
    sorted_df = df.sort_values(['F1', 'mass_tol', 'rt_tol'], ascending=[False, True, True])
    best_row = sorted_df.iloc[0]
    return df, best_row

In [59]:
def test(selected_data, best_row, hp, match_mode, evaluation_method):
    """Evaluate the tolerances chosen during training on the testing files.

    The across-file tolerances stored in ``best_row`` are written into ``hp``
    (in place), an Aligner is built over the files in ``selected_data`` and
    run in the requested ``match_mode`` using the precomputed first-stage
    clusterings, and the result is scored against ``gt_file``.

    ``selected_data`` is a sequence of (file, first_stage_clustering) pairs.
    ``evaluation_method`` is forwarded as ``method=`` to
    ``Aligner.evaluate_performance``.

    Returns the tuple (mass_tol, rt_tol) followed by the first entry of the
    evaluation result.
    """
    mass_tol = best_row['mass_tol']
    rt_tol = best_row['rt_tol']
    hp.across_file_mass_tol = mass_tol
    hp.across_file_rt_tol = rt_tol

    # Split the (file, clustering) pairs into two parallel lists.
    files = [pair[0] for pair in selected_data]
    clusterings = [pair[1] for pair in selected_data]

    aligner = Aligner(
        files, database_file, transformation_file,
        hp, verbose=False, seed=1234567890,
    )
    aligner.run(match_mode, first_stage_clustering_results=clusterings)

    evaluation = aligner.evaluate_performance(
        gt_file, verbose=False, print_TP=True, method=evaluation_method,
    )
    return (mass_tol, rt_tol) + evaluation[0]

In [61]:
# Repeated random train/test experiment. n_iter, n_files, param_list, hp,
# evaluation_method and combined_list are defined in the earlier cells.
#
# Seed the file sampler so the random training/testing selections, and hence
# the whole experiment, are reproducible when the cell is re-run; the
# aligners themselves are already created with a fixed seed.
random.seed(1234567890)

result_format = "match_mode=%d, mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f"

iteration_results = []
for i in range(n_iter):

    training_data = random.sample(combined_list, n_files)
    print("Iteration %d" % i)
    print("Training on %s" % [x[0].filename for x in training_data])
    training_df, best_training_row = train(training_data, param_list, hp, evaluation_method)

    # NOTE(review): the testing files are drawn independently of the training
    # files, so the two sets can overlap -- confirm this is intended.
    testing_data = random.sample(combined_list, n_files)
    print("Testing on %s" % [x[0].filename for x in testing_data])

    # Evaluate the best training parameters with match_mode 0 and then 1.
    test_results = []
    for match_mode in (0, 1):
        match_res = test(testing_data, best_training_row, hp, match_mode, evaluation_method)
        output = (match_mode,) + match_res
        print(result_format % output)
        test_results.append(match_res)
    match_feature_res, match_cluster_res = test_results

    item = (training_data, training_df, best_training_row, match_feature_res, match_cluster_res)
    iteration_results.append(item)
    # Blank line between iterations in the output.
    print("")

Iteration 0
Training on ['std1-file1.txt', 'std1-file2.txt']
Parameter mass_tol=3.000000 rt_tol=10.000000
Parameter mass_tol=3.000000 rt_tol=20.000000
Parameter mass_tol=3.000000 rt_tol=30.000000
Parameter mass_tol=3.000000 rt_tol=40.000000
Parameter mass_tol=3.000000 rt_tol=50.000000
Parameter mass_tol=3.000000 rt_tol=60.000000
Parameter mass_tol=3.000000 rt_tol=70.000000
Parameter mass_tol=3.000000 rt_tol=80.000000
Parameter mass_tol=3.000000 rt_tol=90.000000
Parameter mass_tol=3.000000 rt_tol=100.000000
Parameter mass_tol=6.000000 rt_tol=10.000000
Parameter mass_tol=6.000000 rt_tol=20.000000
Parameter mass_tol=6.000000 rt_tol=30.000000
Parameter mass_tol=6.000000 rt_tol=40.000000
Parameter mass_tol=6.000000 rt_tol=50.000000
Parameter mass_tol=6.000000 rt_tol=60.000000
Parameter mass_tol=6.000000 rt_tol=70.000000
Parameter mass_tol=6.000000 rt_tol=80.000000
Parameter mass_tol=6.000000 rt_tol=90.000000
Parameter mass_tol=6.000000 rt_tol=100.000000
Parameter mass_tol=9.000000 rt_tol=10

KeyboardInterrupt: 