<h1>Experiment with std1pos -- 10 files</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
import os
# Insert the parent of sys.path[0] at position 1 of the module search path.
# Presumably this is what lets the project-local imports in the following
# cells (models, discretisation, ...) resolve -- confirm against the repo layout.
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import copy
import glob

In [4]:
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

In [5]:
# Root of the alignment checkout. Defaults to the location used for the
# recorded run; set METABOLOMICS_ALIGNMENT_DIR to run this notebook against
# another checkout without editing the cell.
alignment_dir = os.environ.get('METABOLOMICS_ALIGNMENT_DIR',
                               '/home/joewandy/git/metabolomics_tools/alignment')

# Directory holding the per-file feature lists that the aligner reads.
input_dir = alignment_dir + '/input/std1_csv_10_old'
# No database file is supplied for this experiment.
database_file = None
# Transformation list handed to the aligner.
transformation_file = alignment_dir + '/pos_transformations_full.yml'
# Ground-truth file handed to evaluate_performance; it lives under input_dir.
gt_file = input_dir + '/ground_truth/ground_truth.txt'

In [6]:
# Baseline hyperparameters. The tolerance sweeps further down work on copies
# of this object and override only the two across-file tolerances.
hp = AlignmentHyperPars()
for hp_name, hp_value in [
        ('within_file_mass_tol', 5),
        ('within_file_rt_tol', 30),
        ('across_file_mass_tol', 10),
        ('across_file_rt_tol', 120),
        ('alpha_mass', 1.0),
        ('dp_alpha', 1000.0),
        ('t', 0.0),
        ('mass_clustering_n_iterations', 200),
        ('rt_clustering_nsamps', 400),
        ('rt_clustering_burnin', 200),
]:
    setattr(hp, hp_name, hp_value)

# Show the full setting, including any defaults that were not overridden here.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
# Passed as `method=` to the evaluate_performance calls below.
# NOTE(review): what method 2 selects is not visible here -- confirm in the aligner.
evaluation_method = 2

Load all the std1pos files that we have

<h2>Simple matching</h2>

In [8]:
# Mode 0: the aligner's run log reports this mode as "Matching peak features".
match_mode = 0

In [9]:
# Grid of (across-file mass tolerance, across-file RT tolerance) pairs to try:
# mass tolerances 3, 6, 9 crossed with RT tolerances 10, 20, ..., 100.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(3, 10, 3)
    for rt_tol in range(10, 101, 10)
]

In [10]:
# Sweep the across-file tolerances using the match mode selected above and
# collect one result row per (mass_tol, rt_tol) pair.
performances_0 = []
report_format = "mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n"
for param in param_list:
    print(param)
    # Work on a shallow copy so the baseline `hp` keeps its own tolerances.
    trial_hp = copy.copy(hp)
    trial_hp.across_file_mass_tol, trial_hp.across_file_rt_tol = param
    aligner = Aligner(input_dir, database_file, transformation_file,
                      trial_hp, verbose=False, seed=1234567890)
    aligner.run(match_mode)
    evaluation = aligner.evaluate_performance(
        gt_file, verbose=False, print_TP=True, method=evaluation_method)
    # Prefix the first evaluation entry with the tolerances that produced it.
    row = param + evaluation[0]
    print(report_format % row)
    performances_0.append(row)

(3, 10)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4986 features read from std1-file2.txt
6319 features read from std1-file7.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 0
Matching peak features
Processing file 0
Processing file 1

--- TOTAL TIME 2 seconds ---

Loaded 29 ground truth entries
{2: 29}

mass_tol=3, rt_tol=10, tp=2, fp=1, fn=27, prec=0.667, rec=0.069, f1=0.125, th_prob=1.000

(3, 20)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=20, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4986 features read from std1-file2.txt
6319 features read from st

In [11]:
# Tabulate the sweep results: the two tolerances followed by the evaluation
# figures, one row per parameter pair.
performance_columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
df = pd.DataFrame(performances_0, columns=performance_columns)
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,3,10,2,1,27,0.666667,0.068966,0.125,1
1,3,20,6,2,23,0.75,0.206897,0.324324,1
2,3,30,7,2,22,0.777778,0.241379,0.368421,1
3,3,40,15,3,14,0.833333,0.517241,0.638298,1
4,3,50,18,3,11,0.857143,0.62069,0.72,1
5,3,60,22,4,7,0.846154,0.758621,0.8,1
6,3,70,25,4,4,0.862069,0.862069,0.862069,1
7,3,80,25,4,4,0.862069,0.862069,0.862069,1
8,3,90,25,4,4,0.862069,0.862069,0.862069,1
9,3,100,25,4,4,0.862069,0.862069,0.862069,1


<h2>Matching the precursor clusters -- Gibbs, mh_biggest=True</h2>

In [12]:
# Mode 1: the aligner's run log reports this mode as "Matching precursor bins".
match_mode = 1

In [13]:
# Run the first-stage (within-file) clustering once with the baseline `hp`.
# The recorded run took about 2.3 minutes for this step, so the result is kept
# in `clustering_results` and handed to the tolerance sweep that follows.
first_stage_options = dict(verbose=False, seed=1234567890, parallel=True,
                           mh_biggest=True, use_vb=False)
aligner2 = Aligner(input_dir, database_file, transformation_file, hp,
                   **first_stage_options)
# NOTE(review): this calls an underscore-prefixed method of the aligner directly.
clustering_results = aligner2._first_stage_clustering()

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4986 features read from std1-file2.txt
6319 features read from std1-file7.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
First stage clustering -- within_file_mass_tol=5.00, within_file_rt_tol=30.00, alpha=1.00


[Parallel(n_jobs=4)]: Done   1 out of   2 | elapsed:  1.4min remaining:  1.4min
[Parallel(n_jobs=4)]: Done   2 out of   2 | elapsed:  2.3min finished


[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Created 4986 clustersCreated 6319 clusters

Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/4986
Assigning possible transformations 0/6319
Assigning possible transformations 500/4986
Assigning possible transformations 500/6319
Assigning possible transformations 1000/4986
Assigning possible transformations 1000/6319
Assigning possible transformations 1500/4986
Assigning possible transformations 1500/6319
Assigning possible transformations 2000/4986
Assigning possible transformations 2000/6319
Assigning possible transformations 2500/4986
Assigning possible transformations 2500/6319
Assigning possible transformations 3000/4986
Assigning possible transformations 3000/6319
Assigning possible transformations 3

In [14]:
# Sweep the across-file tolerances for precursor-cluster matching, reusing the
# first-stage clustering computed earlier instead of recomputing it per pair.
performances_2 = []
summary_format = "mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f\n"
for param in param_list:
    print(param)
    # Work on a shallow copy so the baseline `hp` keeps its own tolerances.
    sweep_hp = copy.copy(hp)
    sweep_hp.across_file_mass_tol, sweep_hp.across_file_rt_tol = param
    aligner2 = Aligner(input_dir, database_file, transformation_file,
                       sweep_hp, verbose=False, seed=1234567890,
                       parallel=True, mh_biggest=True, use_vb=False)
    aligner2.run(match_mode, first_stage_clustering_results=clustering_results)
    scores = aligner2.evaluate_performance(
        gt_file, verbose=False, method=evaluation_method)
    # Prefix the first evaluation entry with the tolerances that produced it.
    record = param + scores[0]
    print(summary_format % record)
    performances_2.append(record)

(3, 10)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=10, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4986 features read from std1-file2.txt
6319 features read from std1-file7.txt
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 1
Matching precursor bins
Processing file 0
Processing file 1

--- TOTAL TIME 2 seconds ---

Loaded 29 ground truth entries
{2: 29}

mass_tol=3, rt_tol=10, tp=2, fp=0, fn=27, prec=1.000, rec=0.069, f1=0.129, th_prob=1.000

(3, 20)
Hyperparameters across_file_mass_tol=3, across_file_rt_tol=20, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30
4986 features read from std1-file2.txt
6319 features read from s

In [15]:
# Tabulate the precursor-cluster sweep: the two tolerances followed by the
# evaluation figures, one row per parameter pair.
result_columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
df = pd.DataFrame(performances_2, columns=result_columns)
display(df)

Unnamed: 0,mass_tol,rt_tol,TP,FP,FN,Prec,Rec,F1,Threshold
0,3,10,2,0,27,1.0,0.068966,0.129032,1
1,3,20,6,1,23,0.857143,0.206897,0.333333,1
2,3,30,7,1,22,0.875,0.241379,0.378378,1
3,3,40,16,1,13,0.941176,0.551724,0.695652,1
4,3,50,22,1,7,0.956522,0.758621,0.846154,1
5,3,60,25,1,4,0.961538,0.862069,0.909091,1
6,3,70,28,1,1,0.965517,0.965517,0.965517,1
7,3,80,28,1,1,0.965517,0.965517,0.965517,1
8,3,90,28,1,1,0.965517,0.965517,0.965517,1
9,3,100,28,1,1,0.965517,0.965517,0.965517,1
