<h1>Experiment with std1pos -- 10 files</h1>

In [13]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import sys
import os
# Insert the parent of sys.path[0] at index 1 of the module search path.
# NOTE(review): presumably sys.path[0] is the notebook's own directory and this
# is what makes the project-local imports in the cells below resolvable -- confirm.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [15]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import copy
import glob

In [16]:
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

In [17]:
# Locations of the experiment inputs.
# The repository root defaults to the original checkout location but can be
# overridden with the METABOLOMICS_TOOLS_DIR environment variable, so the
# notebook can be re-run on another machine without editing this cell.
repo_dir = os.environ.get('METABOLOMICS_TOOLS_DIR', '/home/joewandy/git/metabolomics_tools')
alignment_dir = os.path.join(repo_dir, 'alignment')

input_dir = os.path.join(alignment_dir, 'input', 'std1_csv_10_old')
database_file = None  # no database file is supplied in this experiment
transformation_file = os.path.join(alignment_dir, 'pos_transformations_full.yml')
gt_file = os.path.join(input_dir, 'ground_truth', 'ground_truth.txt')

In [18]:
# Base hyperparameters shared by every experiment in this notebook: start from
# the defaults and override the values listed below, in this order. Anything
# not listed (e.g. beta, which shows up in the printed summary) keeps the value
# set by AlignmentHyperPars() itself.
hp = AlignmentHyperPars()
hp_overrides = (
    ('within_file_mass_tol', 5),
    ('within_file_rt_tol', 30),
    ('across_file_mass_tol', 10),
    ('across_file_rt_tol', 120),
    ('alpha_mass', 1.0),
    ('dp_alpha', 1000.0),
    ('t', 0.0),
    ('mass_clustering_n_iterations', 200),
    ('rt_clustering_nsamps', 400),
    ('rt_clustering_burnin', 200),
)
for hp_name, hp_value in hp_overrides:
    setattr(hp, hp_name, hp_value)

# Show the resulting settings so they are recorded next to the results.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [19]:
# Value passed as method=... to evaluate_performance() in the experiments below.
# NOTE(review): what method 2 stands for is not visible in this notebook -- confirm
# against the evaluation code.
evaluation_method = 2

Load all the std1pos files that we have. The files themselves are read from `input_dir` by the aligner in the sections below.

<h2>Simple matching</h2>

In [20]:
# Mode handed to Aligner.run(); the run log reports mode 0 as "Matching peak features".
match_mode = 0

In [21]:
# Grid of (mass_tol, rt_tol) pairs to evaluate: mass tolerances 3, 6, 9 combined
# with retention-time tolerances 10, 20, ..., 100 (mass tolerance varies slowest).
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(3, 10, 3)
    for rt_tol in range(10, 101, 10)
]

In [22]:
# Grid search for the simple matching experiment: run one alignment per
# (mass_tol, rt_tol) pair and collect the evaluation outcome of each run.
report_format = "mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n"
performances_0 = []
for param in param_list:
    print(param)
    across_mass_tol, across_rt_tol = param

    # Work on a copy so the base hyperparameters in hp are left untouched.
    copy_hp = copy.copy(hp)
    copy_hp.across_file_mass_tol = across_mass_tol
    copy_hp.across_file_rt_tol = across_rt_tol

    aligner = Aligner(input_dir, database_file, transformation_file, copy_hp,
                      verbose=False,
                      seed=1234567890)
    aligner.run(match_mode)
    res = aligner.evaluate_performance(gt_file,
                                       verbose=False,
                                       print_TP=True,
                                       method=evaluation_method)

    # Prefix the first evaluation entry with the tolerances that produced it.
    output = param + res[0]
    print(report_format % output)
    performances_0.append(output)

(3, 10)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.txt
4986 features read from std1-file2.txt
6836 features read from std1-file3.txt
9752 features read from std1-file4.txt
7076 features read from std1-file5.txt
4146 features read from std1-file6.txt
6319 features read from std1-file7.txt
4101 features read from std1-file8.txt
5485 features read from std1-file9.txt
5034 features read from std1-file10.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 0
Matching peak features
Processing file 0
Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9

--- TOTAL TIME 136 se

In [23]:
# Tabulate the simple-matching results, one row per (mass_tol, rt_tol) pair.
performance_columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
df = pd.DataFrame(performances_0, columns=performance_columns)
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,3,10,3152,388,2458,0.890395,0.561854,0.688962,1
1,3,20,4101,525,1509,0.886511,0.731016,0.80129,1
2,3,30,4589,605,1021,0.883519,0.818004,0.8495,1
3,3,40,4705,631,905,0.881747,0.838681,0.859675,1
4,3,50,4794,629,816,0.884013,0.854545,0.869029,1
5,3,60,4821,647,789,0.881675,0.859358,0.870374,1
6,3,70,4854,677,756,0.877599,0.865241,0.871376,1
7,3,80,4870,725,740,0.87042,0.868093,0.869255,1
8,3,90,4877,862,733,0.8498,0.86934,0.859459,1
9,3,100,4878,993,732,0.830864,0.869519,0.849752,1


<h2>Matching the precursor clusters -- Gibbs, mh_biggest=True</h2>

In [24]:
# Mode handed to Aligner.run(); the run log reports mode 1 as "Matching precursor bins".
match_mode = 1

In [25]:
# Run the first-stage clustering once with the base hyperparameters and keep
# its result in clustering_results, so it can be handed to later runs via
# first_stage_clustering_results instead of being recomputed.
aligner2 = Aligner(input_dir, database_file, transformation_file, hp,
                   verbose=False,
                   seed=1234567890,
                   parallel=True,
                   mh_biggest=True,
                   use_vb=False)
clustering_results = aligner2._first_stage_clustering()

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.txt
4986 features read from std1-file2.txt
6836 features read from std1-file3.txt
9752 features read from std1-file4.txt
7076 features read from std1-file5.txt
4146 features read from std1-file6.txt
6319 features read from std1-file7.txt
4101 features read from std1-file8.txt
5485 features read from std1-file9.txt
5034 features read from std1-file10.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
First stage clustering -- within_file_mass_tol=5.00, within_file_rt_tol=30.00, alpha=1.00


[Parallel(n_jobs=4)]: Done   2 out of  10 | elapsed:  1.3min remaining:  5.3min
[Parallel(n_jobs=4)]: Done   1 out of  10 | elapsed:  1.4min remaining: 12.3min
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  2.3min remaining:  1.5min
[Parallel(n_jobs=4)]: Done   8 out of  10 | elapsed:  3.2min remaining:   48.4s
[Parallel(n_jobs=4)]: Done   4 out of  10 | elapsed:  4.6min remaining:  6.9min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  5.1min finished


[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Created 4999 clustersCreated 4986 clustersCreated 6836 clustersCreated 9752 clusters



Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/4999
Assigning possible transformations 0/4986
Assigning possible transformations 0/6836
Assigning possible transformations 0/9752
Assigning possible transformations 500/4999
Assigning possible transformations 500/4986
Assigning possible transformations 500/6836
Assigning possib

In [27]:
# Grid search for the precursor-cluster matching experiment: every run reuses
# the precomputed clustering_results and only varies the across-file tolerances.
summary_template = "mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n"
performances_2 = []
for param in param_list:
    print(param)
    grid_mass_tol, grid_rt_tol = param

    # Copy the base hyperparameters so hp itself is never modified.
    copy_hp = copy.copy(hp)
    copy_hp.across_file_mass_tol = grid_mass_tol
    copy_hp.across_file_rt_tol = grid_rt_tol

    aligner2 = Aligner(input_dir, database_file, transformation_file, copy_hp,
                       verbose=False,
                       seed=1234567890,
                       parallel=True,
                       mh_biggest=True,
                       use_vb=False)
    aligner2.run(match_mode, first_stage_clustering_results=clustering_results)
    res = aligner2.evaluate_performance(gt_file,
                                        verbose=False,
                                        method=evaluation_method)

    # Prefix the first evaluation entry with the tolerances that produced it.
    output = param + res[0]
    print(summary_template % output)
    performances_2.append(output)

(3, 10)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.txt
4986 features read from std1-file2.txt
6836 features read from std1-file3.txt
9752 features read from std1-file4.txt
7076 features read from std1-file5.txt
4146 features read from std1-file6.txt
6319 features read from std1-file7.txt
4101 features read from std1-file8.txt
5485 features read from std1-file9.txt
5034 features read from std1-file10.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 1
Matching precursor bins
Processing file 0
Processing file 1
Processing file 2
Processing file 3
Processing file 4
Processing file 5
Processing file 6
Processing file 7
Processing file 8
Processing file 9

--- TOTAL TIME 124 s

In [28]:
# Tabulate the precursor-cluster matching results, one row per parameter pair.
result_columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
df = pd.DataFrame(performances_2, columns=result_columns)
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,3,10,3182,213,2428,0.937261,0.567201,0.706718,1
1,3,20,4314,363,1296,0.922386,0.768984,0.838728,1
2,3,30,4858,405,752,0.923048,0.865954,0.89359,1
3,3,40,4974,429,636,0.9206,0.886631,0.903296,1
4,3,50,5067,439,543,0.920269,0.903209,0.911659,1
5,3,60,5079,449,531,0.918777,0.905348,0.912013,1
6,3,70,5112,457,498,0.917939,0.91123,0.914572,1
7,3,80,5120,501,490,0.91087,0.912656,0.911762,1
8,3,90,5140,567,470,0.900648,0.916221,0.908368,1
9,3,100,5144,718,466,0.877516,0.916934,0.896792,1
