<h1>Experiment with std1pos</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from discretisation.preprocessing import FileLoader
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

<h2>Experiment Parameters</h2>

Set up all the experiment parameters

In [5]:
input_dir = '/home/joewandy/git/metabolomics_tools/alignment/input/std1_csv_full_old'
transformation_file = '/home/joewandy/git/metabolomics_tools/alignment/pos_transformations_full.yml'
gt_file = '/home/joewandy/git/metabolomics_tools/alignment/input/std1_csv_full_old/ground_truth/ground_truth.txt'

In [6]:
hp = AlignmentHyperPars()    
hp.within_file_mass_tol = 5
hp.within_file_rt_tol = 30
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 120
hp.alpha_mass = 1.0
hp.dp_alpha = 1000.0
hp.t = 0.0
hp.mass_clustering_n_iterations = 200
hp.rt_clustering_nsamps = 400
hp.rt_clustering_burnin = 200

print hp

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
evaluation_method = 2

In [8]:
param_list = []
for mass_tol in range(3, 10, 3):
    for rt_tol in range(10, 101, 10):
        param_list.append((mass_tol, rt_tol))

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [9]:
def load_or_create_clustering(filename, input_dir, transformation_file, hp):
    try:
        with gzip.GzipFile(filename, 'rb') as f:
            combined_list = cPickle.load(f)
            print "Loaded from %s" % filename
    except (IOError, EOFError):
        loader = FileLoader()        
        data_list = loader.load_model_input(input_dir, synthetic=True)
        aligner = Aligner(data_list, None, transformation_file, 
                               hp, verbose=False, seed=1234567890, parallel=True, mh_biggest=True, use_vb=False)
        clustering_results = aligner._first_stage_clustering()
        combined_list = zip(data_list, clustering_results)
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(combined_list, f, protocol=cPickle.HIGHEST_PROTOCOL)        
        print "Saved to %s" % filename
        return combined_list

In [10]:
combined_list = load_or_create_clustering('std1_pos_clustering.p', input_dir, transformation_file, hp)

4999 features read from std1-file1.txt
4986 features read from std1-file2.txt
6836 features read from std1-file3.txt
9752 features read from std1-file4.txt
7076 features read from std1-file5.txt
4146 features read from std1-file6.txt
6319 features read from std1-file7.txt
4101 features read from std1-file8.txt
5485 features read from std1-file9.txt
5034 features read from std1-file10.txt
5317 features read from std1-file11.txt


[Parallel(n_jobs=4)]: Done   1 out of  11 | elapsed:  2.3min remaining: 23.1min
[Parallel(n_jobs=4)]: Done   3 out of  11 | elapsed:  4.1min remaining: 10.9min
[Parallel(n_jobs=4)]: Done   5 out of  11 | elapsed:  6.6min remaining:  8.0min
[Parallel(n_jobs=4)]: Done   7 out of  11 | elapsed:  8.4min remaining:  4.8min
[Parallel(n_jobs=4)]: Done   9 out of  11 | elapsed:  8.4min remaining:  1.9min
[Parallel(n_jobs=4)]: Done  11 out of  11 | elapsed:  9.3min finished


Saved to std1_pos_clustering.p
Created 4999 clusters
Created 4986 clusters
Created 6836 clusters
Created 9752 clusters
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/4999
Assigning possible transformations 0/4986
Assigning possible transformations 0/6836
Assigning possible transformations 0/9752
Assigning possible transformations 500/4999
Assigning possible transformations 500/4986
Assigning possible transformations 500/6836
Assigning possible transformations 500/9752
Assigning possible transformations 1000/4999
Assigning possible transformations 1000/4986
Assigning possible transformations 1000/6836
Assigning possible transformations 1000/9752
Assigning possible transformations 1500/4999
Assigning possible transformations 1500/4986
Assigning possible transformations 1500/6836
Assigning possible transformations 1500/9752
Assigning possible transformations 2000/4999
Assignin

<h2>Define Experimental Methods</h2>

In [52]:
def train(selected_data, param_list, hp, match_mode, evaluation_method):
    
    performances = []
    for param in param_list:

        # print "Parameter mass_tol=%f rt_tol=%f" % (param)
        hp.across_file_mass_tol = param[0]
        hp.across_file_rt_tol = param[1]
        selected_files = [x[0] for x in selected_data]  
        selected_clusterings = [x[1] for x in selected_data]            
        aligner = Aligner(selected_files, None, transformation_file, 
                               hp, verbose=False, seed=1234567890)
        aligner.run(match_mode, first_stage_clustering_results=selected_clusterings)

        res = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True, method=evaluation_method)
        output = param+res[0]
        performances.append(output)
    
    df = pd.DataFrame(performances, columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])
    
    sorted_df = df.sort_values(['F1', 'mass_tol', 'rt_tol'], ascending=[False, True, True])
    best_row = sorted_df.iloc[0]
    return df, best_row

In [53]:
def test(selected_data, best_row, hp, match_mode, evaluation_method):

    param = (best_row['mass_tol'], best_row['rt_tol'])
    hp.across_file_mass_tol = param[0]
    hp.across_file_rt_tol = param[1]
    selected_files = [x[0] for x in selected_data]
    selected_clusterings = [x[1] for x in selected_data]    
    aligner = Aligner(selected_files, None, transformation_file, 
                           hp, verbose=False, seed=1234567890)
    aligner.run(match_mode, first_stage_clustering_results=selected_clusterings)

    res = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True, method=evaluation_method)
    output = param+res[0]
    return output

In [43]:
def train_test(match_mode, training_list, testing_list):
    assert len(training_list) == len(testing_list)
    n_iter = len(training_list)
    exp_results = []
    for i in range(n_iter):

        training_data = training_list[i]
        print "Iteration %d" % i
        print "Training on %s" % [x[0].filename for x in training_data]
        training_df, best_training_row = train(training_data, param_list, hp, match_mode, evaluation_method)

        testing_data = testing_list[i]
        print "Testing on %s" % [x[0].filename for x in testing_data]
        match_res = test(testing_data, best_training_row, hp, match_mode, evaluation_method)
        output = (match_mode,) + match_res
        print "match_mode=%d, mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % output

        item = (training_data, training_df, best_training_row, match_res)
        exp_results.append(item)
        print
    return exp_results

In [46]:
def run_experiment(match_mode, training_list, testing_list, filename):
    try:
        with gzip.GzipFile(filename, 'rb') as f:        
            exp_results = cPickle.load(f)
            print "Loaded from %s" % filename
            return exp_results
    except (IOError, EOFError):
        exp_results = train_test(match_mode, training_list, testing_list)
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(exp_results, f, protocol=cPickle.HIGHEST_PROTOCOL)                        
        print "Saved to %s" % filename
    return exp_results

In [47]:
def load_or_create_filelist(filename, combined_list, n_iter, n_files):
    try:
        with gzip.GzipFile(filename, 'rb') as f:        
            item_list = cPickle.load(f)
            print "Loaded from %s" % filename
            for item in item_list:
                print "%s" % [x[0].filename for x in item]
            return item_list
    except (IOError, EOFError):
        item_list = []
        for i in range(n_iter):
            item = random.sample(combined_list, n_files)
            print "%s" % [x[0].filename for x in item]
            item_list.append(item)
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(item_list, f, protocol=cPickle.HIGHEST_PROTOCOL)                    
        print "Saved to %s" % filename
        return item_list

<h2>Run experiment with 2 random files</h2>

In [30]:
training_list = load_or_create_filelist('training_list_2.p', combined_list, 30, 2)

Loaded from training_list_2.p
['std1-file9.txt', 'std1-file6.txt']
['std1-file1.txt', 'std1-file3.txt']
['std1-file11.txt', 'std1-file6.txt']
['std1-file5.txt', 'std1-file6.txt']
['std1-file2.txt', 'std1-file8.txt']
['std1-file3.txt', 'std1-file4.txt']
['std1-file10.txt', 'std1-file11.txt']
['std1-file1.txt', 'std1-file7.txt']
['std1-file10.txt', 'std1-file1.txt']
['std1-file9.txt', 'std1-file6.txt']
['std1-file1.txt', 'std1-file5.txt']
['std1-file3.txt', 'std1-file1.txt']
['std1-file7.txt', 'std1-file8.txt']
['std1-file4.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file8.txt']
['std1-file11.txt', 'std1-file3.txt']
['std1-file4.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file3.txt']
['std1-file1.txt', 'std1-file5.txt']
['std1-file3.txt', 'std1-file2.txt']
['std1-file1.txt', 'std1-file4.txt']
['std1-file5.txt', 'std1-file11.txt']
['std1-file3.txt', 'std1-file4.txt']
['std1-file11.txt', 'std1-file9.txt']
['std1-file10.txt', 'std1-file5.txt

In [31]:
testing_list = load_or_create_filelist('testing_list_2.p', combined_list, 30, 2)

['std1-file8.txt', 'std1-file6.txt']
['std1-file6.txt', 'std1-file7.txt']
['std1-file8.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file1.txt']
['std1-file4.txt', 'std1-file5.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file6.txt']
['std1-file6.txt', 'std1-file11.txt']
['std1-file6.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file3.txt']
['std1-file4.txt', 'std1-file8.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file8.txt', 'std1-file3.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file11.txt', 'std1-file2.txt']
['std1-file6.txt', 'std1-file8.txt']
['std1-file10.txt', 'std1-file4.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file11.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file6.txt']
['std1-file3.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file1.txt']
['std1-file2.txt', 'std1-file8.txt']
['std1-file1.txt', 'std1-file9.txt']
['std1-file10.txt', 'std1-fi

In [48]:
exp_results_1a = run_experiment(0, training_list, testing_list, 'res_match_feature_2.p')

Iteration 0
Training on ['std1-file9.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file6.txt']
match_mode=0, mass_tol=3, rt_tol=40, tp=114, fp=7, fn=16, prec=0.942, rec=0.877, f1=0.908, th_prob=1.000

Iteration 1
Training on ['std1-file1.txt', 'std1-file3.txt']
Testing on ['std1-file6.txt', 'std1-file7.txt']
match_mode=0, mass_tol=3, rt_tol=80, tp=24, fp=5, fn=4, prec=0.828, rec=0.857, f1=0.842, th_prob=1.000

Iteration 2
Training on ['std1-file11.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file2.txt']
match_mode=0, mass_tol=3, rt_tol=80, tp=119, fp=6, fn=8, prec=0.952, rec=0.937, f1=0.944, th_prob=1.000

Iteration 3
Training on ['std1-file5.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file1.txt']
match_mode=0, mass_tol=3, rt_tol=70, tp=120, fp=9, fn=12, prec=0.930, rec=0.909, f1=0.920, th_prob=1.000

Iteration 4
Training on ['std1-file2.txt', 'std1-file8.txt']
Testing on ['std1-file4.txt', 'std1-file5.txt']
match_mode=0, mass_tol=6, rt_tol=

In [54]:
exp_results_1b = run_experiment(1, training_list, testing_list, 'res_match_cluster_2.p')

Iteration 0
Training on ['std1-file9.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file6.txt']
match_mode=1, mass_tol=3, rt_tol=40, tp=112, fp=2, fn=18, prec=0.982, rec=0.862, f1=0.918, th_prob=1.000

Iteration 1
Training on ['std1-file1.txt', 'std1-file3.txt']
Testing on ['std1-file6.txt', 'std1-file7.txt']
match_mode=1, mass_tol=3, rt_tol=80, tp=27, fp=1, fn=1, prec=0.964, rec=0.964, f1=0.964, th_prob=1.000

Iteration 2
Training on ['std1-file11.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file2.txt']
match_mode=1, mass_tol=3, rt_tol=80, tp=120, fp=3, fn=7, prec=0.976, rec=0.945, f1=0.960, th_prob=1.000

Iteration 3
Training on ['std1-file5.txt', 'std1-file6.txt']
Testing on ['std1-file8.txt', 'std1-file1.txt']
match_mode=1, mass_tol=3, rt_tol=60, tp=119, fp=7, fn=13, prec=0.944, rec=0.902, f1=0.922, th_prob=1.000

Iteration 4
Training on ['std1-file2.txt', 'std1-file8.txt']
Testing on ['std1-file4.txt', 'std1-file5.txt']
match_mode=1, mass_tol=6, rt_tol=

In [55]:
print exp_results_1a[0]

([(<discretisation.models.PeakData object at 0x7f23cf22a750>, <adduct_cluster.AdductCluster object at 0x7f23cec61690>), (<discretisation.models.PeakData object at 0x7f23d5210e90>, <adduct_cluster.AdductCluster object at 0x7f23ca66e490>)],     mass_tol  rt_tol   TP  FP  FN      Prec       Rec        F1  Threshold
0          3      10  117   3  18  0.975000  0.866667  0.917647          1
1          3      20  127   3   8  0.976923  0.940741  0.958491          1
2          3      30  129   4   6  0.969925  0.955556  0.962687          1
3          3      40  131   5   4  0.963235  0.970370  0.966790          1
4          3      50  131   5   4  0.963235  0.970370  0.966790          1
5          3      60  131   5   4  0.963235  0.970370  0.966790          1
6          3      70  131   5   4  0.963235  0.970370  0.966790          1
7          3      80  131   5   4  0.963235  0.970370  0.966790          1
8          3      90  131   5   4  0.963235  0.970370  0.966790          1
9          

In [56]:
<h2>Run experiment with 4 random files</h2>

SyntaxError: invalid syntax (<ipython-input-56-d1e8b4854525>, line 1)

In [None]:
training_list = load_or_create_filelist('training_list_4.p', combined_list, 30, 4)

In [None]:
testing_list = load_or_create_filelist('testing_list_4.p', combined_list, 30, 4)

In [None]:
exp_results_2a = run_experiment(0, training_list, testing_list, 'res_match_feature_4.p')

In [None]:
exp_results_2b = run_experiment(1, training_list, testing_list, 'res_match_cluster_4.p')

In [None]:
<h2>Run experiment with 6 random files</h2>

In [None]:
training_list = load_or_create_filelist('training_list_6.p', combined_list, 30, 6)

In [None]:
testing_list = load_or_create_filelist('testing_list_6.p', combined_list, 30, 6)

In [None]:
exp_results_3a = run_experiment(0, training_list, testing_list, 'res_match_feature_6.p')

In [None]:
exp_results_3b = run_experiment(1, training_list, testing_list, 'res_match_cluster_6.p')