<h1>Test discretisation</h1>

In [1]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import sys

# Put the parent of the first sys.path entry on the module search path
# (at index 1) so the imports in the following cells can be resolved from there.
sys.path.insert(1, os.path.join(sys.path[0], os.pardir))

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

In [4]:
from discretisation.preprocessing import FileLoader
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

Define input parameters

In [5]:
# Locations of the input data and supporting files for this run.
# Every path is derived from a single root, so relocating the checkout needs
# one edit only and the ground truth path cannot drift away from input_dir.
alignment_dir = '/home/joewandy/git/metabolomics_tools/alignment'

# Directory holding the feature files to be aligned.
input_dir = os.path.join(alignment_dir, 'input', 'M1_4')
# No database file is supplied for this run.
database_file = None
# Transformation definitions passed to the aligner.
transformation_file = os.path.join(alignment_dir, 'pos_transformations_full.yml')
# Ground truth used by the evaluation cells; lives beneath input_dir.
gt_file = os.path.join(input_dir, 'ground_truth', 'ground_truth.txt')

In [6]:
# Start from the default hyperparameters and override the ones tuned for this
# experiment. Overrides are applied in the listed order; any attribute not
# listed here keeps the default supplied by AlignmentHyperPars.
hp = AlignmentHyperPars()
hp_overrides = (
    # within-file tolerances
    ('within_file_mass_tol', 60),
    ('within_file_rt_tol', 30),
    # across-file tolerances
    ('across_file_mass_tol', 100),
    ('across_file_rt_tol', 100),
    # remaining model settings
    ('alpha_mass', 1.0),
    ('dp_alpha', 100.0),
    ('t', 0),
    # sampler lengths
    ('mass_clustering_n_iterations', 100),
    ('rt_clustering_nsamps', 200),
    ('rt_clustering_burnin', 100),
)
for hp_name, hp_value in hp_overrides:
    setattr(hp, hp_name, hp_value)

# Echo the effective settings so they are recorded alongside the results.
print(hp)

Hyperparameters across_file_mass_tol=100, across_file_rt_tol=100, alpha_mass=1.0, beta=0.1, dp_alpha=100.0, mass_clustering_n_iterations=100, rt_clustering_burnin=100, rt_clustering_nsamps=200, t=0, within_file_mass_tol=60, within_file_rt_tol=30


In [8]:
# Read the feature files found under input_dir into data_list, which is the
# input handed to both aligners below.
# NOTE(review): the meaning of synthetic=True is defined by FileLoader and is
# not visible from this notebook -- confirm before changing it.
loader = FileLoader()
data_list = loader.load_model_input(
    input_dir,
    synthetic=True,
)

5843 features read from M1_1.txt
7516 features read from M1_2.txt
9133 features read from M1_3.txt
5877 features read from M1_4.txt


In [10]:
# Baseline run: match mode 0 (reported as "Matching peak features" in the log).
# The seed is fixed so the run can be repeated.
match_mode = 0
aligner0 = Aligner(
    data_list,
    database_file,
    transformation_file,
    hp,
    verbose=True,
    seed=1234567890,
)
aligner0.run(match_mode)

Hyperparameters across_file_mass_tol=100, across_file_rt_tol=100, alpha_mass=1.0, beta=0.1, dp_alpha=100.0, mass_clustering_n_iterations=100, rt_clustering_burnin=100, rt_clustering_nsamps=200, t=0, within_file_mass_tol=60, within_file_rt_tol=30
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 0
Matching peak features
Processing file 0
Processing file 1
Computing score matrix
Running matching

Processing file 2
Computing score matrix
Running matching

Processing file 3
Computing score matrix
Running matching


REPORT
1. avg m/z=510.2474055 avg RT=2220.9 prob=1.0
	feature_id  4949 file_id 0 mz 510.23781 RT 2217.90 intensity 2.7900e+02	None
	feature_id  4936 file_id 3 mz 510.25700 RT 2223.90 intensity 4.4900e+02	None
2. avg m/z=168.077996333 avg RT=2335.24 prob=1.0
	feature_id  6570 file_id 1 mz 168.07665 RT 2337.90 intensity 5.5000e+02	None
	feature_id  7994 file_id 2 mz 168.08662 RT 2335.92 intensity 9.1300

In [12]:
# Score the mode-0 alignment against the ground truth file using evaluation
# method 1. Each entry is a 7-tuple consumed positionally by the format string:
# (tp, fp, fn, precision, recall, f1, probability threshold).
match0_performance = aligner0.evaluate_performance(
    gt_file,
    verbose=False,
    print_TP=True,
    method=1,
)
for results in match0_performance:
    print("tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % results)

Loaded 906 ground truth entries
{2: 174, 3: 142, 4: 585, 5: 5}

tp=658, fp=248, fn=0, prec=0.726, rec=1.000, f1=0.841, th_prob=1.000


In [13]:
# Score the mode-0 alignment again, this time with evaluation method 2.
# NOTE: this rebinds match0_performance, replacing the method=1 results
# produced by the previous cell.
# Each entry is a 7-tuple consumed positionally by the format string:
# (tp, fp, fn, precision, recall, f1, probability threshold).
match0_performance = aligner0.evaluate_performance(
    gt_file,
    verbose=False,
    print_TP=True,
    method=2,
)
for results in match0_performance:
    print("tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % results)

Loaded 906 ground truth entries
{2: 174, 3: 142, 4: 585, 5: 5}

tp=4089, fp=826, fn=42, prec=0.832, rec=0.990, f1=0.904, th_prob=1.000


In [14]:
# Clustering-based run: match mode 1, which begins with a per-file
# "first stage clustering" step according to the log. Same seed as the
# baseline run. The recorded parallel stage took about 7 minutes, so expect
# this cell to be slow.
match_mode = 1
aligner2 = Aligner(
    data_list,
    database_file,
    transformation_file,
    hp,
    verbose=True,
    seed=1234567890,
    parallel=True,
    mh_biggest=True,
    use_vb=False,
)
aligner2.run(match_mode)

Hyperparameters across_file_mass_tol=100, across_file_rt_tol=100, alpha_mass=1.0, beta=0.1, dp_alpha=100.0, mass_clustering_n_iterations=100, rt_clustering_burnin=100, rt_clustering_nsamps=200, t=0, within_file_mass_tol=60, within_file_rt_tol=30
[M+ACN+H, 2M+H, 2M+Na, M+ACN+2H, M+2ACN+2H, M+2H, M+H, M+H+NH4, M+2ACN+H, M+NH4, 2M+ACN+H, M+CH3OH+H, M+ACN+Na, M+Na]
Match mode 1
First stage clustering -- within_file_mass_tol=60.00, within_file_rt_tol=30.00, alpha=1.00


[Parallel(n_jobs=4)]: Done   1 out of   4 | elapsed:  4.1min remaining: 12.2min
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:  6.4min remaining:  6.4min
[Parallel(n_jobs=4)]: Done   3 out of   4 | elapsed:  7.4min remaining:  2.5min
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:  7.4min finished



File 0 clusters assignment 
Cluster ID 0
	peak_id 1 mass 106.070607 rt 106.020000 intensity 661.000000 (M+H 1.000)
Cluster ID 1
	peak_id 2 mass 83.089358 rt 18.000000 intensity 3035.000000 (M+H 1.000)
Cluster ID 2
	peak_id 3 mass 143.085005 rt 13.980000 intensity 1144.000000 (M+H 1.000)
	peak_id 23 mass 160.116552 rt 40.020000 intensity 144.000000 (M+NH4 1.000)
Cluster ID 3
	peak_id 4 mass 81.036160 rt 223.980000 intensity 665.000000 (M+H 0.740)
Cluster ID 4
	peak_id 5 mass 82.015049 rt 192.000000 intensity 11998.000000 (M+H 1.000)
Cluster ID 5
	peak_id 6 mass 82.013173 rt 226.020000 intensity 5858.000000 (M+H 1.000)
	peak_id 133 mass 145.018294 rt 220.020000 intensity 5749.000000 (M+ACN+Na 1.000)
Cluster ID 6
	peak_id 7 mass 144.089004 rt 10.020000 intensity 174.000000 (M+H 1.000)
Cluster ID 7
	peak_id 8 mass 80.038052 rt 196.020000 intensity 3070.000000 (M+H 1.000)
Cluster ID 8
	peak_id 9 mass 186.135355 rt 112.020000 intensity 473.000000 (M+H 1.000)
Cluster ID 10
	peak_id 11 mass 1

In [15]:
# Score the mode-1 alignment against the ground truth file using evaluation
# method 1. Each entry is a 7-tuple consumed positionally by the format string:
# (tp, fp, fn, precision, recall, f1, probability threshold).
match2_performance = aligner2.evaluate_performance(
    gt_file,
    verbose=False,
    print_TP=True,
    method=1,
)
for results in match2_performance:
    print("tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % results)

Loaded 906 ground truth entries
{2: 174, 3: 142, 4: 585, 5: 5}

tp=526, fp=380, fn=0, prec=0.581, rec=1.000, f1=0.735, th_prob=1.000


In [16]:
# Score the mode-1 alignment again, this time with evaluation method 2.
# NOTE: this rebinds match2_performance, replacing the method=1 results
# produced by the previous cell.
# Each entry is a 7-tuple consumed positionally by the format string:
# (tp, fp, fn, precision, recall, f1, probability threshold).
match2_performance = aligner2.evaluate_performance(
    gt_file,
    verbose=False,
    print_TP=True,
    method=2,
)
for results in match2_performance:
    print("tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % results)

Loaded 906 ground truth entries
{2: 174, 3: 142, 4: 585, 5: 5}

tp=3518, fp=771, fn=613, prec=0.820, rec=0.852, f1=0.836, th_prob=1.000
