<h1>Testing the results with standard data</h1>

In [1]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits
# to the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import sys
import os
# Put the parent of the first sys.path entry on the import path, directly
# after that first entry (presumably where the project modules imported in
# the following cells live -- confirm against the repository layout).
sys.path[1:1] = [os.path.join(sys.path[0], os.pardir)]

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
# Apply seaborn's plot styling for the rest of the notebook.
# NOTE(review): color_codes=True presumably remaps matplotlib's single-letter
# colour codes to the seaborn palette -- confirm against the installed version.
sns.set(color_codes=True)
from IPython.display import display, HTML
import copy

In [4]:
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

In [5]:
# Input locations for the std1 alignment experiment.
# Every path is derived from one base directory so that moving the checkout
# or switching dataset is a single edit, and the ground truth file always
# follows the dataset directory instead of being hard-coded separately.
alignment_dir = '/home/joewandy/git/metabolomics_tools/alignment'

# Dataset directory handed to the aligner as its input.
input_dir = os.path.join(alignment_dir, 'input', 'std1_csv_4')
# No database file is supplied to the aligner in this notebook.
database_file = None
# Transformation definitions handed to the aligner.
transformation_file = os.path.join(alignment_dir, 'pos_transformations_full.yml')
# Ground truth used by evaluate_performance(); stored inside the dataset directory.
gt_file = os.path.join(input_dir, 'ground_truth', 'ground_truth.txt')

In [6]:
# Baseline hyperparameters shared by every experiment in this notebook.
# The two across-file tolerances set here are replaced on a per-run copy by
# the parameter sweeps further down; the baseline object itself is reused
# unchanged for the first-stage clustering.
hp = AlignmentHyperPars()

# Mass / retention-time tolerances: within a file, then across files.
hp.within_file_mass_tol, hp.within_file_rt_tol = 5, 30
hp.across_file_mass_tol, hp.across_file_rt_tol = 10, 120

# Remaining model settings.
hp.alpha_mass, hp.dp_alpha, hp.t = 1.0, 1000.0, 0.0

# Iteration / sampling budget for the clustering steps.
hp.mass_clustering_n_iterations = 200
hp.rt_clustering_nsamps, hp.rt_clustering_burnin = 400, 200

# Echo the final settings so they are recorded in the notebook output.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
# Evaluation variant forwarded as `method=` to evaluate_performance() by
# every experiment in this notebook.
# NOTE(review): what the value 1 selects is defined in the aligner code, not
# here -- confirm before changing it.
evaluation_method = 1

<h2>Simple matching</h2>

In [8]:
# Mode passed to Aligner.run() for this section; the run log reports
# mode 0 as "Matching peak features".
match_mode = 0

In [9]:
# Grid of (mass_tol, rt_tol) pairs to evaluate: mass tolerance 2..10 in steps
# of 2, RT tolerance 10..120 in steps of 10, with the RT tolerance varying
# fastest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 11, 2)
    for rt_tol in range(10, 121, 10)
]

In [12]:
# Sweep the across-file tolerances for the current match_mode: one full
# alignment plus evaluation per (mass_tol, rt_tol) pair in param_list.
# Each result row is the tolerance pair followed by the metrics in res[0].
performances_0 = []
for param in param_list:
    print(param)

    # Work on a shallow copy so the baseline `hp` keeps its own tolerances.
    copy_hp = copy.copy(hp)
    copy_hp.across_file_mass_tol, copy_hp.across_file_rt_tol = param

    # The seed is fixed, so every grid point is constructed with the same seed.
    aligner = Aligner(input_dir, database_file, transformation_file, copy_hp,
                      verbose=False, seed=1234567890)
    aligner.run(match_mode)
    res = aligner.evaluate_performance(gt_file,
                                       verbose=False,
                                       print_TP=True,
                                       method=evaluation_method)

    # res[0] supplies the seven metric fields that follow the two tolerances.
    output = param + res[0]
    print("mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n" % output)
    performances_0.append(output)

(2, 10)
Hyperparameters across_file_mass_tol=2, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.csv
4986 features read from std1-file2.csv
6836 features read from std1-file3.csv
9752 features read from std1-file4.csv
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 0
Matching peak features
Processing file 0
Processing file 1
Processing file 2
Processing file 3

--- TOTAL TIME 22 seconds ---

Loaded 229 ground truth entries
{2: 63, 3: 33, 4: 133}

mass_tol=2, rt_tol=10, tp=81, fp=148, fn=0, prec=0.354, rec=1.000, f1=0.523, th_prob=1.000

(2, 20)
Hyperparameters across_file_mass_tol=2, across_file_rt_tol=20, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clusteri

In [22]:
# Tabulate the sweep results: one row per (mass_tol, rt_tol) pair, with the
# metric columns in the same order as the tuples stored in performances_0.
df = pd.DataFrame(
    data=performances_0,
    columns=['mass_tol', 'rt_tol',
             'TP', 'FP', 'FN',
             'Prec', 'Rec', 'F1', 'Threshold'])
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,2,10,81,148,0,0.353712,1,0.522581,1
1,2,20,124,105,0,0.541485,1,0.70255,1
2,2,30,148,81,0,0.646288,1,0.785146,1
3,2,40,153,76,0,0.668122,1,0.801047,1
4,2,50,156,73,0,0.681223,1,0.81039,1
5,2,60,156,73,0,0.681223,1,0.81039,1
6,2,70,156,73,0,0.681223,1,0.81039,1
7,2,80,157,72,0,0.68559,1,0.813472,1
8,2,90,156,73,0,0.681223,1,0.81039,1
9,2,100,156,73,0,0.681223,1,0.81039,1


<h2>Matching the precursor clusters -- Gibbs, mh_biggest=True</h2>

In [None]:
# Mode passed to Aligner.run() for this section; the run log reports
# mode 1 as "Matching precursor bins".
match_mode = 1

In [27]:
# Run the first-stage clustering once with the baseline hyperparameters `hp`.
# The result is handed to every run of the sweep in the next cell through
# `first_stage_clustering_results=`, presumably so it is not recomputed for
# each tolerance pair.
# NOTE(review): this calls a private method (leading underscore) of the
# aligner -- confirm it is safe to use directly.
aligner2 = Aligner(input_dir, database_file, transformation_file, 
                       hp, verbose=False, seed=1234567890, parallel=True, mh_biggest=True, use_vb=False)
clustering_results = aligner2._first_stage_clustering()

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.csv
4986 features read from std1-file2.csv
6836 features read from std1-file3.csv
9752 features read from std1-file4.csv
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]


In [32]:
# Sweep the across-file tolerances for the current match_mode using the
# Gibbs / mh_biggest aligner configuration: one alignment plus evaluation per
# (mass_tol, rt_tol) pair in param_list. Each result row is the tolerance
# pair followed by the metrics in res[0].
performances_2 = []
for param in param_list:
    print(param)

    # Work on a shallow copy so the baseline `hp` keeps its own tolerances.
    copy_hp = copy.copy(hp)
    copy_hp.across_file_mass_tol, copy_hp.across_file_rt_tol = param

    # The seed is fixed, so every grid point is constructed with the same seed.
    aligner2 = Aligner(input_dir, database_file, transformation_file, copy_hp,
                       verbose=False, seed=1234567890,
                       parallel=True, mh_biggest=True, use_vb=False)

    # Hand in the previously computed first-stage clustering instead of
    # letting run() start without it.
    aligner2.run(match_mode, first_stage_clustering_results=clustering_results)
    res = aligner2.evaluate_performance(gt_file,
                                        verbose=False,
                                        method=evaluation_method)

    # res[0] supplies the seven metric fields that follow the two tolerances.
    output = param + res[0]
    print("mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n" % output)
    performances_2.append(output)

(2, 10)
Hyperparameters across_file_mass_tol=2, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4999 features read from std1-file1.csv
4986 features read from std1-file2.csv
6836 features read from std1-file3.csv
9752 features read from std1-file4.csv
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 1
Matching precursor bins
Processing file 0
Processing file 1
Processing file 2
Processing file 3

--- TOTAL TIME 25 seconds ---

Loaded 229 ground truth entries
{2: 63, 3: 33, 4: 133}

mass_tol=2, rt_tol=10, tp=84, fp=145, fn=0, prec=0.367, rec=1.000, f1=0.537, th_prob=1.000

(2, 20)
Hyperparameters across_file_mass_tol=2, across_file_rt_tol=20, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_cluster

In [33]:
# Tabulate the sweep results: one row per (mass_tol, rt_tol) pair, with the
# metric columns in the same order as the tuples stored in performances_2.
df = pd.DataFrame(
    data=performances_2,
    columns=['mass_tol', 'rt_tol',
             'TP', 'FP', 'FN',
             'Prec', 'Rec', 'F1', 'Threshold'])
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,2,10,84,145,0,0.366812,1,0.536741,1
1,2,20,129,100,0,0.563319,1,0.72067,1
2,2,30,163,66,0,0.71179,1,0.831633,1
3,2,40,166,63,0,0.724891,1,0.840506,1
4,2,50,167,62,0,0.729258,1,0.843434,1
5,2,60,167,62,0,0.729258,1,0.843434,1
6,2,70,167,62,0,0.729258,1,0.843434,1
7,2,80,168,61,0,0.733624,1,0.846348,1
8,2,90,168,61,0,0.733624,1,0.846348,1
9,2,100,168,61,0,0.733624,1,0.846348,1
