<h1>Experiment with std1pos</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
basedir = '/home/joewandy/git/metabolomics_tools'
sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from discretisation.preprocessing import FileLoader
from alignment.shared_bin_matching import SharedBinMatching as Aligner
from alignment.ground_truth import GroundTruth

<h2>Experiment Parameters</h2>

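Set up all the experiment parameters: the input paths, the alignment hyperparameters, the evaluation method, and the grid of tolerances to search.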

In [5]:
# All inputs live under the repository checkout named by `basedir` (set in the
# sys.path cell above), so derive them from it instead of repeating the
# absolute prefix; the resulting strings are the same as before.
input_dir = os.path.join(basedir, 'alignment', 'input', 'std1_csv_full_old')
transformation_file = os.path.join(basedir, 'alignment', 'pos_transformations_full.yml')
# The ground truth sits inside the input directory.
gt_file = os.path.join(input_dir, 'ground_truth', 'ground_truth.txt')

In [6]:
# Start from the default alignment hyperparameters and override the values
# fixed for this experiment, in the order listed.
hp = AlignmentHyperPars()
hp_overrides = (
    ('within_file_mass_tol', 5),
    ('within_file_rt_tol', 30),
    # The two across-file tolerances are only starting values: train() and
    # test() overwrite them on this same object for every candidate they try.
    ('across_file_mass_tol', 10),
    ('across_file_rt_tol', 120),
    ('alpha_mass', 1.0),
    ('dp_alpha', 1000.0),
    ('t', 0.0),
    ('mass_clustering_n_iterations', 200),
    ('rt_clustering_nsamps', 400),
    ('rt_clustering_burnin', 200),
)
for hp_name, hp_value in hp_overrides:
    setattr(hp, hp_name, hp_value)

print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, rt_clustering_burnin=200, rt_clustering_nsamps=400, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
# Forwarded as `method=` to aligner.evaluate_performance() by train() and test().
evaluation_method = 2
# Number of random file sets drawn for every training list and testing list.
n_iter = 30

In [8]:
# Grid of candidate (mass_tol, rt_tol) pairs searched during training:
# mass_tol in 2, 4, ..., 10 and rt_tol in 5, 10, ..., 100, with rt_tol
# varying fastest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 11, 2)
    for rt_tol in range(5, 101, 5)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [9]:
def load_or_create_clustering(filename, input_dir, transformation_file, hp):
    """Return the first-stage clustering of every input file, cached on disk.

    If ``filename`` is a readable gzip-compressed pickle, its content is
    returned unchanged.  Otherwise the input files are loaded from
    ``input_dir``, the first-stage clustering is run on them, and the result
    is pickled to ``filename`` before being returned.

    Args:
        filename: Path of the gzip-compressed pickle used as the cache.
        input_dir: Directory passed to ``FileLoader.load_model_input``.
        transformation_file: Transformation file passed to the aligner.
        hp: Hyperparameter object passed to the aligner.

    Returns:
        A list of ``(data, clustering_result)`` pairs built by zipping the
        loaded data with the clustering results.

    Note:
        The cache is unpickled without any validation, and unpickling can
        execute arbitrary code: ``filename`` must only ever point at a file
        written by this notebook.
    """
    try:
        with gzip.GzipFile(filename, 'rb') as f:
            combined_list = cPickle.load(f)
            print("Loaded from %s" % filename)
            return combined_list
    except (IOError, EOFError):
        # Missing or unreadable cache: rebuild it.  Create the cache directory
        # *before* the expensive clustering so that an unwritable location
        # fails immediately instead of discarding the finished result.
        cache_dir = os.path.dirname(filename)
        if cache_dir and not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        loader = FileLoader()
        data_list = loader.load_model_input(input_dir, synthetic=True)
        aligner = Aligner(data_list, None, transformation_file,
                          hp, verbose=False, seed=1234567890, parallel=True,
                          mh_biggest=True, use_vb=False)
        clustering_results = aligner._first_stage_clustering()
        # list() keeps the result a picklable list on any Python version.
        combined_list = list(zip(data_list, clustering_results))
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(combined_list, f, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return combined_list

In [10]:
# (file data, first-stage clustering) pairs for every input file; this list is
# the pool from which the training and testing file sets are sampled below.
combined_list = load_or_create_clustering('pickles/std1_pos_clustering.p', input_dir, transformation_file, hp)

Loaded from pickles/std1_pos_clustering.p


<h2>Define Experimental Methods</h2>

In [11]:
def train(selected_data, param_list, hp, match_mode, evaluation_method):
    """Score every candidate tolerance pair on one training set and pick the best.

    For each entry of ``param_list`` the across-file tolerances of ``hp`` are
    overwritten, a fresh aligner is run on the selected files using their
    precomputed first-stage clusterings, and the alignment is evaluated
    against the ground truth.  The notebook-level ``transformation_file`` and
    ``gt_file`` are read directly.

    Args:
        selected_data: Sequence of ``(file_data, clustering_result)`` pairs.
        param_list: Candidate ``(mass_tol, rt_tol)`` tuples.
        hp: Hyperparameter object.  Its ``across_file_mass_tol`` and
            ``across_file_rt_tol`` are modified in place and still hold the
            last candidate's values when this function returns.
        match_mode: Forwarded to ``aligner.run``.
        evaluation_method: Forwarded as ``method`` to
            ``aligner.evaluate_performance``.

    Returns:
        ``(df, best_row)``.  ``df`` has one row per candidate, in
        ``param_list`` order; ``best_row`` is the row with the highest F1,
        ties going to the smallest ``mass_tol`` and then the smallest
        ``rt_tol``.
    """
    result_columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']

    score_rows = []
    for candidate in param_list:
        hp.across_file_mass_tol = candidate[0]
        hp.across_file_rt_tol = candidate[1]

        # Fresh lists for every run, so nothing is shared between aligners.
        files = [pair[0] for pair in selected_data]
        clusterings = [pair[1] for pair in selected_data]

        aligner = Aligner(files, None, transformation_file,
                          hp, verbose=False, seed=1234567890)
        aligner.run(match_mode, first_stage_clustering_results=clusterings)
        evaluation = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                                  method=evaluation_method)

        # assumes evaluation[0] is a tuple matching the TP..Threshold columns
        # -- TODO confirm against Aligner.evaluate_performance
        score_rows.append(candidate + evaluation[0])

    df = pd.DataFrame(score_rows, columns=result_columns)
    ranked = df.sort_values(by=['F1', 'mass_tol', 'rt_tol'], ascending=[False, True, True])
    return df, ranked.iloc[0]

In [12]:
def test(selected_data, best_row, hp, match_mode, evaluation_method):
    """Evaluate the tolerances chosen during training on one testing set.

    The notebook-level ``transformation_file`` and ``gt_file`` are read
    directly.

    Args:
        selected_data: Sequence of ``(file_data, clustering_result)`` pairs.
        best_row: Mapping-like row with ``'mass_tol'`` and ``'rt_tol'``
            entries, as returned by ``train``.
        hp: Hyperparameter object.  Its ``across_file_mass_tol`` and
            ``across_file_rt_tol`` are overwritten in place.
        match_mode: Forwarded to ``aligner.run``.
        evaluation_method: Forwarded as ``method`` to
            ``aligner.evaluate_performance``.

    Returns:
        The tuple ``(mass_tol, rt_tol)`` followed by the first element of the
        evaluation result.
    """
    mass_tol = best_row['mass_tol']
    rt_tol = best_row['rt_tol']
    hp.across_file_mass_tol = mass_tol
    hp.across_file_rt_tol = rt_tol

    files = [pair[0] for pair in selected_data]
    clusterings = [pair[1] for pair in selected_data]

    aligner = Aligner(files, None, transformation_file,
                      hp, verbose=False, seed=1234567890)
    aligner.run(match_mode, first_stage_clustering_results=clusterings)
    evaluation = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                              method=evaluation_method)
    return (mass_tol, rt_tol) + evaluation[0]

In [13]:
def train_test(match_mode, training_list, testing_list):
    """Run one train-then-test round for every paired training/testing set.

    Round ``i`` picks the best tolerances on ``training_list[i]`` with
    ``train`` and evaluates them on ``testing_list[i]`` with ``test``.  The
    notebook-level ``param_list``, ``hp`` and ``evaluation_method`` are used
    for every round.

    Args:
        match_mode: Forwarded to ``train`` and ``test``.
        training_list: List of training sets, each a sequence of
            ``(file_data, clustering_result)`` pairs.
        testing_list: List of testing sets of the same length.

    Returns:
        A list with one ``(training_data, training_df, best_training_row,
        match_res)`` tuple per round.
    """
    assert len(training_list) == len(testing_list)

    exp_results = []
    for i, training_data in enumerate(training_list):
        print("Iteration %d" % i)
        print("Training on %s" % [pair[0].filename for pair in training_data])
        training_df, best_training_row = train(training_data, param_list, hp,
                                               match_mode, evaluation_method)

        testing_data = testing_list[i]
        print("Testing on %s" % [pair[0].filename for pair in testing_data])
        match_res = test(testing_data, best_training_row, hp, match_mode, evaluation_method)

        summary = (match_mode,) + match_res
        print("match_mode=%d, mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % summary)

        exp_results.append((training_data, training_df, best_training_row, match_res))
        # Blank line between rounds.
        print("")
    return exp_results

In [14]:
def run_experiment(match_mode, training_list, testing_list, filename):
    """Return the train/test results for one experiment, cached on disk.

    If ``filename`` is a readable gzip-compressed pickle, its content is
    returned unchanged and nothing is recomputed.  Otherwise ``train_test``
    is run and its result is pickled to ``filename`` before being returned.

    Args:
        match_mode: Forwarded to ``train_test``.
        training_list: Forwarded to ``train_test``.
        testing_list: Forwarded to ``train_test``.
        filename: Path of the gzip-compressed pickle used as the cache.

    Returns:
        The list produced by ``train_test`` (or loaded from the cache).

    Note:
        A cache hit is returned without checking that it was produced with
        the same ``match_mode`` and file lists; delete the file to force a
        rerun.  The cache is also unpickled without validation, and
        unpickling can execute arbitrary code, so ``filename`` must only ever
        point at a file written by this notebook.
    """
    try:
        with gzip.GzipFile(filename, 'rb') as f:
            exp_results = cPickle.load(f)
            print("Loaded from %s" % filename)
            return exp_results
    except (IOError, EOFError):
        # Missing or unreadable cache: recompute.  Create the cache directory
        # *before* the long train/test run so that an unwritable location
        # fails immediately instead of discarding the finished results.
        cache_dir = os.path.dirname(filename)
        if cache_dir and not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        exp_results = train_test(match_mode, training_list, testing_list)
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(exp_results, f, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return exp_results

In [15]:
def load_or_create_filelist(filename, combined_list, n_iter, n_files):
    """Return a list of random file sets, cached on disk.

    If ``filename`` is a readable gzip-compressed pickle, its content is
    returned unchanged.  Otherwise ``n_iter`` sets of ``n_files`` distinct
    entries each are sampled from ``combined_list``, pickled to ``filename``
    and returned.  The file names of every set are printed in both cases.

    Args:
        filename: Path of the gzip-compressed pickle used as the cache.
        combined_list: Pool of ``(file_data, clustering_result)`` pairs to
            sample from; ``file_data`` must have a ``filename`` attribute.
        n_iter: Number of sets to draw.
        n_files: Number of entries per set.

    Returns:
        A list of ``n_iter`` lists, each holding ``n_files`` entries of
        ``combined_list``.

    Note:
        Sampling uses the global ``random`` state, which this function does
        not seed, so the sets are only reproducible through the cache file.
        The cache is unpickled without validation, and unpickling can execute
        arbitrary code, so ``filename`` must only ever point at a file
        written by this notebook.
    """
    try:
        with gzip.GzipFile(filename, 'rb') as f:
            item_list = cPickle.load(f)
            print("Loaded from %s" % filename)
            for item in item_list:
                print("%s" % [x[0].filename for x in item])
            return item_list
    except (IOError, EOFError):
        # Missing or unreadable cache: draw new sets.  Make sure the cache
        # directory exists so that the save below cannot fail on a fresh
        # checkout.
        cache_dir = os.path.dirname(filename)
        if cache_dir and not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        item_list = []
        for _ in range(n_iter):
            item = random.sample(combined_list, n_files)
            print("%s" % [x[0].filename for x in item])
            item_list.append(item)
        with gzip.GzipFile(filename, 'wb') as f:
            cPickle.dump(item_list, f, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return item_list

<h2>Run experiment with 2 random files</h2>

In [16]:
# Every training/testing set in this experiment is a random sample of this many files.
n_files = 2

In [None]:
# Load the cached training sets, or draw n_iter random sets of n_files files and cache them.
training_list = load_or_create_filelist('pickles/training_list_2.p', combined_list, n_iter, n_files)

Loaded from pickles/training_list_2.p
['std1-file5.txt', 'std1-file7.txt']
['std1-file5.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file5.txt']
['std1-file9.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file6.txt']
['std1-file10.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file2.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file1.txt']
['std1-file6.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file8.txt']
['std1-file2.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file8.txt']
['std1-file10.txt', 'std1-file7.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file7.txt', 'std1-file2.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file1.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file8.txt']
['std1-file3.txt', 'std1-fil

In [None]:
# NOTE(review): testing sets are drawn from the same pool independently of the
# training sets, so a testing set can contain files that were also trained on.
testing_list = load_or_create_filelist('pickles/testing_list_2.p', combined_list, n_iter, n_files)

In [19]:
# match_mode=0 (the cache name suggests feature matching -- confirm against Aligner.run).
exp_results_1a = run_experiment(0, training_list, testing_list, 'pickles/res_match_feature_2.p')

Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt']
match_mode=0, mass_tol=2, rt_tol=100, tp=119, fp=8, fn=11, prec=0.937, rec=0.915, f1=0.926, th_prob=1.000

Iteration 1
Training on ['std1-file5.txt', 'std1-file11.txt']
Testing on ['std1-file7.txt', 'std1-file11.txt']
match_mode=0, mass_tol=4, rt_tol=55, tp=24, fp=3, fn=7, prec=0.889, rec=0.774, f1=0.828, th_prob=1.000

Iteration 2
Training on ['std1-file10.txt', 'std1-file5.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt']
match_mode=0, mass_tol=2, rt_tol=75, tp=119, fp=11, fn=11, prec=0.915, rec=0.915, f1=0.915, th_prob=1.000

Iteration 3
Training on ['std1-file9.txt', 'std1-file2.txt']
Testing on ['std1-file4.txt', 'std1-file8.txt']
match_mode=0, mass_tol=2, rt_tol=25, tp=135, fp=4, fn=21, prec=0.971, rec=0.865, f1=0.915, th_prob=1.000

Iteration 4
Training on ['std1-file8.txt', 'std1-file5.txt']
Testing on ['std1-file4.txt', 'std1-file7.txt']
match_mode=0, mass_tol=6, rt

In [20]:
# match_mode=1 on the same file sets (the cache name suggests cluster matching -- confirm against Aligner.run).
exp_results_1b = run_experiment(1, training_list, testing_list, 'pickles/res_match_cluster_2.p')

Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt']
match_mode=1, mass_tol=2, rt_tol=100, tp=120, fp=2, fn=10, prec=0.984, rec=0.923, f1=0.952, th_prob=1.000

Iteration 1
Training on ['std1-file5.txt', 'std1-file11.txt']
Testing on ['std1-file7.txt', 'std1-file11.txt']
match_mode=1, mass_tol=2, rt_tol=55, tp=27, fp=1, fn=4, prec=0.964, rec=0.871, f1=0.915, th_prob=1.000

Iteration 2
Training on ['std1-file10.txt', 'std1-file5.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt']
match_mode=1, mass_tol=2, rt_tol=65, tp=125, fp=4, fn=5, prec=0.969, rec=0.962, f1=0.965, th_prob=1.000

Iteration 3
Training on ['std1-file9.txt', 'std1-file2.txt']
Testing on ['std1-file4.txt', 'std1-file8.txt']
match_mode=1, mass_tol=2, rt_tol=25, tp=126, fp=4, fn=30, prec=0.969, rec=0.808, f1=0.881, th_prob=1.000

Iteration 4
Training on ['std1-file8.txt', 'std1-file5.txt']
Testing on ['std1-file4.txt', 'std1-file7.txt']
match_mode=1, mass_tol=6, rt_t

<h2>Run experiment with 3 random files</h2>

In [21]:
# Every training/testing set in this experiment is a random sample of this many files.
n_files = 3

In [22]:
# Load the cached training sets, or draw n_iter random sets of n_files files and cache them.
training_list = load_or_create_filelist('pickles/training_list_3.p', combined_list, n_iter, n_files)

['std1-file10.txt', 'std1-file8.txt', 'std1-file4.txt']
['std1-file7.txt', 'std1-file9.txt', 'std1-file8.txt']
['std1-file4.txt', 'std1-file3.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file4.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file11.txt', 'std1-file1.txt']
['std1-file11.txt', 'std1-file4.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file2.txt', 'std1-file6.txt']
['std1-file9.txt', 'std1-file2.txt', 'std1-file10.txt']
['std1-file1.txt', 'std1-file4.txt', 'std1-file10.txt']
['std1-file10.txt', 'std1-file2.txt', 'std1-file7.txt']
['std1-file11.txt', 'std1-file2.txt', 'std1-file9.txt']
['std1-file1.txt', 'std1-file2.txt', 'std1-file4.txt']
['std1-file2.txt', 'std1-file3.txt', 'std1-file1.txt']
['std1-file6.txt', 'std1-file3.txt', 'std1-file9.txt']
['std1-file2.txt', 'std1-file3.txt', 'std1-file7.txt']
['std1-file5.txt', 'std1-file9.txt', 'std1-file6.txt']
['std1-file11.txt', 'std1-file7.txt', 'std1-file3.txt']
['std1-file7.txt', 'std1-file11.txt', 'std1-file2.txt']


In [23]:
# NOTE(review): testing sets are drawn from the same pool independently of the
# training sets, so a testing set can contain files that were also trained on.
testing_list = load_or_create_filelist('pickles/testing_list_3.p', combined_list, n_iter, n_files)

['std1-file1.txt', 'std1-file2.txt', 'std1-file7.txt']
['std1-file4.txt', 'std1-file6.txt', 'std1-file8.txt']
['std1-file6.txt', 'std1-file2.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file9.txt', 'std1-file3.txt']
['std1-file4.txt', 'std1-file9.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file4.txt', 'std1-file1.txt']
['std1-file11.txt', 'std1-file4.txt', 'std1-file6.txt']
['std1-file11.txt', 'std1-file1.txt', 'std1-file7.txt']
['std1-file3.txt', 'std1-file8.txt', 'std1-file6.txt']
['std1-file8.txt', 'std1-file10.txt', 'std1-file6.txt']
['std1-file5.txt', 'std1-file9.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file2.txt', 'std1-file11.txt']
['std1-file11.txt', 'std1-file3.txt', 'std1-file8.txt']
['std1-file11.txt', 'std1-file7.txt', 'std1-file4.txt']
['std1-file9.txt', 'std1-file4.txt', 'std1-file5.txt']
['std1-file1.txt', 'std1-file2.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file8.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file9.txt', 'std1-file5.txt']
['s

In [24]:
# match_mode=0 (the cache name suggests feature matching -- confirm against Aligner.run).
exp_results_2a = run_experiment(0, training_list, testing_list, 'pickles/res_match_feature_3.p')

Iteration 0
Training on ['std1-file10.txt', 'std1-file8.txt', 'std1-file4.txt']
Testing on ['std1-file1.txt', 'std1-file2.txt', 'std1-file7.txt']
match_mode=0, mass_tol=6, rt_tol=75, tp=192, fp=32, fn=21, prec=0.857, rec=0.901, f1=0.879, th_prob=1.000

Iteration 1
Training on ['std1-file7.txt', 'std1-file9.txt', 'std1-file8.txt']
Testing on ['std1-file4.txt', 'std1-file6.txt', 'std1-file8.txt']
match_mode=0, mass_tol=4, rt_tol=70, tp=401, fp=27, fn=27, prec=0.937, rec=0.937, f1=0.937, th_prob=1.000

Iteration 2
Training on ['std1-file4.txt', 'std1-file3.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt', 'std1-file7.txt']
match_mode=0, mass_tol=4, rt_tol=60, tp=160, fp=32, fn=27, prec=0.833, rec=0.856, f1=0.844, th_prob=1.000

Iteration 3
Training on ['std1-file2.txt', 'std1-file4.txt', 'std1-file10.txt']
Testing on ['std1-file1.txt', 'std1-file9.txt', 'std1-file3.txt']
match_mode=0, mass_tol=6, rt_tol=55, tp=418, fp=40, fn=46, prec=0.913, rec=0.901, f1=0.907, th_pr

In [25]:
# match_mode=1 on the same file sets (the cache name suggests cluster matching -- confirm against Aligner.run).
exp_results_2b = run_experiment(1, training_list, testing_list, 'pickles/res_match_cluster_3.p')

Iteration 0
Training on ['std1-file10.txt', 'std1-file8.txt', 'std1-file4.txt']
Testing on ['std1-file1.txt', 'std1-file2.txt', 'std1-file7.txt']
match_mode=1, mass_tol=2, rt_tol=75, tp=206, fp=20, fn=7, prec=0.912, rec=0.967, f1=0.938, th_prob=1.000

Iteration 1
Training on ['std1-file7.txt', 'std1-file9.txt', 'std1-file8.txt']
Testing on ['std1-file4.txt', 'std1-file6.txt', 'std1-file8.txt']
match_mode=1, mass_tol=4, rt_tol=70, tp=393, fp=21, fn=35, prec=0.949, rec=0.918, f1=0.933, th_prob=1.000

Iteration 2
Training on ['std1-file4.txt', 'std1-file3.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt', 'std1-file7.txt']
match_mode=1, mass_tol=4, rt_tol=60, tp=174, fp=20, fn=13, prec=0.897, rec=0.930, f1=0.913, th_prob=1.000

Iteration 3
Training on ['std1-file2.txt', 'std1-file4.txt', 'std1-file10.txt']
Testing on ['std1-file1.txt', 'std1-file9.txt', 'std1-file3.txt']
match_mode=1, mass_tol=4, rt_tol=55, tp=427, fp=21, fn=37, prec=0.953, rec=0.920, f1=0.936, th_pro

<h2>Run experiment with 4 random files</h2>

In [26]:
# Every training/testing set in this experiment is a random sample of this many files.
n_files = 4

In [27]:
# Load the cached training sets, or draw n_iter random sets of n_files files and cache them.
training_list = load_or_create_filelist('pickles/training_list_4.p', combined_list, n_iter, n_files)

['std1-file3.txt', 'std1-file2.txt', 'std1-file9.txt', 'std1-file8.txt']
['std1-file6.txt', 'std1-file3.txt', 'std1-file5.txt', 'std1-file11.txt']
['std1-file1.txt', 'std1-file8.txt', 'std1-file4.txt', 'std1-file2.txt']
['std1-file1.txt', 'std1-file6.txt', 'std1-file5.txt', 'std1-file8.txt']
['std1-file9.txt', 'std1-file4.txt', 'std1-file1.txt', 'std1-file8.txt']
['std1-file4.txt', 'std1-file11.txt', 'std1-file7.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file2.txt', 'std1-file4.txt', 'std1-file5.txt']
['std1-file11.txt', 'std1-file5.txt', 'std1-file6.txt', 'std1-file3.txt']
['std1-file1.txt', 'std1-file10.txt', 'std1-file9.txt', 'std1-file2.txt']
['std1-file6.txt', 'std1-file11.txt', 'std1-file8.txt', 'std1-file10.txt']
['std1-file5.txt', 'std1-file10.txt', 'std1-file11.txt', 'std1-file6.txt']
['std1-file8.txt', 'std1-file4.txt', 'std1-file11.txt', 'std1-file7.txt']
['std1-file8.txt', 'std1-file2.txt', 'std1-file10.txt', 'std1-file11.txt']
['std1-file7.txt', 'std1-file10.txt', '

In [28]:
# NOTE(review): testing sets are drawn from the same pool independently of the
# training sets, so a testing set can contain files that were also trained on.
testing_list = load_or_create_filelist('pickles/testing_list_4.p', combined_list, n_iter, n_files)

['std1-file10.txt', 'std1-file8.txt', 'std1-file7.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file1.txt', 'std1-file11.txt', 'std1-file9.txt']
['std1-file9.txt', 'std1-file4.txt', 'std1-file7.txt', 'std1-file10.txt']
['std1-file6.txt', 'std1-file1.txt', 'std1-file8.txt', 'std1-file5.txt']
['std1-file1.txt', 'std1-file7.txt', 'std1-file10.txt', 'std1-file9.txt']
['std1-file2.txt', 'std1-file10.txt', 'std1-file6.txt', 'std1-file1.txt']
['std1-file5.txt', 'std1-file11.txt', 'std1-file7.txt', 'std1-file4.txt']
['std1-file3.txt', 'std1-file7.txt', 'std1-file11.txt', 'std1-file5.txt']
['std1-file2.txt', 'std1-file10.txt', 'std1-file1.txt', 'std1-file11.txt']
['std1-file6.txt', 'std1-file4.txt', 'std1-file9.txt', 'std1-file1.txt']
['std1-file11.txt', 'std1-file10.txt', 'std1-file9.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file4.txt', 'std1-file11.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file2.txt', 'std1-file7.txt', 'std1-file1.txt']
['std1-file2.txt', 'std1-file6.txt', '

In [29]:
# match_mode=0 (the cache name suggests feature matching -- confirm against Aligner.run).
exp_results_3a = run_experiment(0, training_list, testing_list, 'pickles/res_match_feature_4.p')

Iteration 0
Training on ['std1-file3.txt', 'std1-file2.txt', 'std1-file9.txt', 'std1-file8.txt']
Testing on ['std1-file10.txt', 'std1-file8.txt', 'std1-file7.txt', 'std1-file5.txt']
match_mode=0, mass_tol=4, rt_tol=80, tp=511, fp=90, fn=26, prec=0.850, rec=0.952, f1=0.898, th_prob=1.000

Iteration 1
Training on ['std1-file6.txt', 'std1-file3.txt', 'std1-file5.txt', 'std1-file11.txt']
Testing on ['std1-file7.txt', 'std1-file1.txt', 'std1-file11.txt', 'std1-file9.txt']
match_mode=0, mass_tol=6, rt_tol=70, tp=450, fp=90, fn=70, prec=0.833, rec=0.865, f1=0.849, th_prob=1.000

Iteration 2
Training on ['std1-file1.txt', 'std1-file8.txt', 'std1-file4.txt', 'std1-file2.txt']
Testing on ['std1-file9.txt', 'std1-file4.txt', 'std1-file7.txt', 'std1-file10.txt']
match_mode=0, mass_tol=4, rt_tol=85, tp=507, fp=106, fn=50, prec=0.827, rec=0.910, f1=0.867, th_prob=1.000

Iteration 3
Training on ['std1-file1.txt', 'std1-file6.txt', 'std1-file5.txt', 'std1-file8.txt']
Testing on ['std1-file6.txt', 'std

In [30]:
# match_mode=1 on the same file sets (the cache name suggests cluster matching -- confirm against Aligner.run).
exp_results_3b = run_experiment(1, training_list, testing_list, 'pickles/res_match_cluster_4.p')

Iteration 0
Training on ['std1-file3.txt', 'std1-file2.txt', 'std1-file9.txt', 'std1-file8.txt']
Testing on ['std1-file10.txt', 'std1-file8.txt', 'std1-file7.txt', 'std1-file5.txt']
match_mode=1, mass_tol=4, rt_tol=80, tp=488, fp=82, fn=49, prec=0.856, rec=0.909, f1=0.882, th_prob=1.000

Iteration 1
Training on ['std1-file6.txt', 'std1-file3.txt', 'std1-file5.txt', 'std1-file11.txt']
Testing on ['std1-file7.txt', 'std1-file1.txt', 'std1-file11.txt', 'std1-file9.txt']
match_mode=1, mass_tol=4, rt_tol=70, tp=488, fp=42, fn=32, prec=0.921, rec=0.938, f1=0.930, th_prob=1.000

Iteration 2
Training on ['std1-file1.txt', 'std1-file8.txt', 'std1-file4.txt', 'std1-file2.txt']
Testing on ['std1-file9.txt', 'std1-file4.txt', 'std1-file7.txt', 'std1-file10.txt']
match_mode=1, mass_tol=4, rt_tol=95, tp=505, fp=133, fn=52, prec=0.792, rec=0.907, f1=0.845, th_prob=1.000

Iteration 3
Training on ['std1-file1.txt', 'std1-file6.txt', 'std1-file5.txt', 'std1-file8.txt']
Testing on ['std1-file6.txt', 'std

<h2>Run experiment with 5 random files</h2>

In [31]:
# Every training/testing set in this experiment is a random sample of this many files.
n_files = 5

In [32]:
# Load the cached training sets, or draw n_iter random sets of n_files files and cache them.
training_list = load_or_create_filelist('pickles/training_list_5.p', combined_list, n_iter, n_files)

['std1-file4.txt', 'std1-file7.txt', 'std1-file9.txt', 'std1-file5.txt', 'std1-file3.txt']
['std1-file6.txt', 'std1-file2.txt', 'std1-file11.txt', 'std1-file9.txt', 'std1-file5.txt']
['std1-file5.txt', 'std1-file6.txt', 'std1-file10.txt', 'std1-file2.txt', 'std1-file7.txt']
['std1-file9.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file1.txt', 'std1-file3.txt']
['std1-file5.txt', 'std1-file11.txt', 'std1-file7.txt', 'std1-file3.txt', 'std1-file10.txt']
['std1-file10.txt', 'std1-file9.txt', 'std1-file1.txt', 'std1-file8.txt', 'std1-file11.txt']
['std1-file6.txt', 'std1-file9.txt', 'std1-file3.txt', 'std1-file2.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file6.txt', 'std1-file7.txt', 'std1-file2.txt', 'std1-file9.txt']
['std1-file2.txt', 'std1-file4.txt', 'std1-file8.txt', 'std1-file7.txt', 'std1-file1.txt']
['std1-file11.txt', 'std1-file5.txt', 'std1-file7.txt', 'std1-file10.txt', 'std1-file6.txt']
['std1-file3.txt', 'std1-file9.txt', 'std1-file10.txt', 'std1-file1.txt', 'std1-fi

In [33]:
# NOTE(review): testing sets are drawn from the same pool independently of the
# training sets, so a testing set can contain files that were also trained on.
testing_list = load_or_create_filelist('pickles/testing_list_5.p', combined_list, n_iter, n_files)

['std1-file8.txt', 'std1-file4.txt', 'std1-file10.txt', 'std1-file6.txt', 'std1-file2.txt']
['std1-file6.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file9.txt']
['std1-file3.txt', 'std1-file2.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file4.txt']
['std1-file11.txt', 'std1-file10.txt', 'std1-file1.txt', 'std1-file9.txt', 'std1-file4.txt']
['std1-file6.txt', 'std1-file7.txt', 'std1-file5.txt', 'std1-file11.txt', 'std1-file8.txt']
['std1-file7.txt', 'std1-file11.txt', 'std1-file10.txt', 'std1-file8.txt', 'std1-file3.txt']
['std1-file2.txt', 'std1-file8.txt', 'std1-file9.txt', 'std1-file6.txt', 'std1-file11.txt']
['std1-file11.txt', 'std1-file10.txt', 'std1-file9.txt', 'std1-file1.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file3.txt', 'std1-file1.txt']
['std1-file6.txt', 'std1-file1.txt', 'std1-file8.txt', 'std1-file5.txt', 'std1-file10.txt']
['std1-file3.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file9.txt', 'std1

In [34]:
# match_mode=0 (the cache name suggests feature matching -- confirm against Aligner.run).
exp_results_4a = run_experiment(0, training_list, testing_list, 'pickles/res_match_feature_5.p')

Iteration 0
Training on ['std1-file4.txt', 'std1-file7.txt', 'std1-file9.txt', 'std1-file5.txt', 'std1-file3.txt']
Testing on ['std1-file8.txt', 'std1-file4.txt', 'std1-file10.txt', 'std1-file6.txt', 'std1-file2.txt']
match_mode=0, mass_tol=6, rt_tol=60, tp=1292, fp=131, fn=107, prec=0.908, rec=0.924, f1=0.916, th_prob=1.000

Iteration 1
Training on ['std1-file6.txt', 'std1-file2.txt', 'std1-file11.txt', 'std1-file9.txt', 'std1-file5.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file9.txt']
match_mode=0, mass_tol=4, rt_tol=70, tp=1260, fp=142, fn=142, prec=0.899, rec=0.899, f1=0.899, th_prob=1.000

Iteration 2
Training on ['std1-file5.txt', 'std1-file6.txt', 'std1-file10.txt', 'std1-file2.txt', 'std1-file7.txt']
Testing on ['std1-file3.txt', 'std1-file2.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file4.txt']
match_mode=0, mass_tol=2, rt_tol=70, tp=1306, fp=164, fn=289, prec=0.888, rec=0.819, f1=0.852, th_prob=1.000

Iteration 3
Tra

In [35]:
# match_mode=1 on the same file sets (the cache name suggests cluster matching -- confirm against Aligner.run).
exp_results_4b = run_experiment(1, training_list, testing_list, 'pickles/res_match_cluster_5.p')

Iteration 0
Training on ['std1-file4.txt', 'std1-file7.txt', 'std1-file9.txt', 'std1-file5.txt', 'std1-file3.txt']
Testing on ['std1-file8.txt', 'std1-file4.txt', 'std1-file10.txt', 'std1-file6.txt', 'std1-file2.txt']
match_mode=1, mass_tol=4, rt_tol=60, tp=1269, fp=90, fn=130, prec=0.934, rec=0.907, f1=0.920, th_prob=1.000

Iteration 1
Training on ['std1-file6.txt', 'std1-file2.txt', 'std1-file11.txt', 'std1-file9.txt', 'std1-file5.txt']
Testing on ['std1-file6.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file9.txt']
match_mode=1, mass_tol=4, rt_tol=65, tp=1298, fp=83, fn=104, prec=0.940, rec=0.926, f1=0.933, th_prob=1.000

Iteration 2
Training on ['std1-file5.txt', 'std1-file6.txt', 'std1-file10.txt', 'std1-file2.txt', 'std1-file7.txt']
Testing on ['std1-file3.txt', 'std1-file2.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file4.txt']
match_mode=1, mass_tol=4, rt_tol=60, tp=1460, fp=112, fn=135, prec=0.929, rec=0.915, f1=0.922, th_prob=1.000

Iteration 3
Train

<h2>Run experiment with 6 random files</h2>

In [36]:
# Every training/testing set in this experiment is a random sample of this many files.
n_files = 6

In [37]:
# Load the cached training sets, or draw n_iter random sets of n_files files and cache them.
training_list = load_or_create_filelist('pickles/training_list_6.p', combined_list, n_iter, n_files)

['std1-file8.txt', 'std1-file6.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file4.txt', 'std1-file5.txt']
['std1-file9.txt', 'std1-file6.txt', 'std1-file8.txt', 'std1-file4.txt', 'std1-file11.txt', 'std1-file3.txt']
['std1-file4.txt', 'std1-file6.txt', 'std1-file7.txt', 'std1-file9.txt', 'std1-file3.txt', 'std1-file10.txt']
['std1-file1.txt', 'std1-file2.txt', 'std1-file3.txt', 'std1-file9.txt', 'std1-file4.txt', 'std1-file11.txt']
['std1-file2.txt', 'std1-file8.txt', 'std1-file1.txt', 'std1-file6.txt', 'std1-file11.txt', 'std1-file4.txt']
['std1-file4.txt', 'std1-file1.txt', 'std1-file5.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file9.txt']
['std1-file6.txt', 'std1-file1.txt', 'std1-file4.txt', 'std1-file3.txt', 'std1-file9.txt', 'std1-file8.txt']
['std1-file10.txt', 'std1-file1.txt', 'std1-file2.txt', 'std1-file8.txt', 'std1-file5.txt', 'std1-file6.txt']
['std1-file2.txt', 'std1-file4.txt', 'std1-file7.txt', 'std1-file3.txt', 'std1-file8.txt', 'std1-file1.txt']
['std1-file8.t

In [38]:
# NOTE(review): testing sets are drawn from the same pool independently of the
# training sets, so a testing set can contain files that were also trained on.
testing_list = load_or_create_filelist('pickles/testing_list_6.p', combined_list, n_iter, n_files)

['std1-file6.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file5.txt', 'std1-file10.txt']
['std1-file6.txt', 'std1-file10.txt', 'std1-file9.txt', 'std1-file4.txt', 'std1-file8.txt', 'std1-file3.txt']
['std1-file3.txt', 'std1-file1.txt', 'std1-file2.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file7.txt']
['std1-file10.txt', 'std1-file4.txt', 'std1-file11.txt', 'std1-file1.txt', 'std1-file3.txt', 'std1-file6.txt']
['std1-file2.txt', 'std1-file8.txt', 'std1-file6.txt', 'std1-file4.txt', 'std1-file3.txt', 'std1-file10.txt']
['std1-file6.txt', 'std1-file3.txt', 'std1-file8.txt', 'std1-file9.txt', 'std1-file5.txt', 'std1-file2.txt']
['std1-file4.txt', 'std1-file2.txt', 'std1-file10.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file11.txt']
['std1-file9.txt', 'std1-file10.txt', 'std1-file7.txt', 'std1-file4.txt', 'std1-file2.txt', 'std1-file5.txt']
['std1-file9.txt', 'std1-file5.txt', 'std1-file7.txt', 'std1-file10.txt', 'std1-file3.txt', 'std1-file8.txt']
['std1-fi

In [None]:
# match_mode=0 (the cache name suggests feature matching -- confirm against Aligner.run).
exp_results_5a = run_experiment(0, training_list, testing_list, 'pickles/res_match_feature_6.p')

Iteration 0
Training on ['std1-file8.txt', 'std1-file6.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file4.txt', 'std1-file5.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file5.txt', 'std1-file10.txt']
match_mode=0, mass_tol=4, rt_tol=60, tp=1465, fp=184, fn=150, prec=0.888, rec=0.907, f1=0.898, th_prob=1.000

Iteration 1
Training on ['std1-file9.txt', 'std1-file6.txt', 'std1-file8.txt', 'std1-file4.txt', 'std1-file11.txt', 'std1-file3.txt']
Testing on ['std1-file6.txt', 'std1-file10.txt', 'std1-file9.txt', 'std1-file4.txt', 'std1-file8.txt', 'std1-file3.txt']
match_mode=0, mass_tol=4, rt_tol=80, tp=2030, fp=217, fn=190, prec=0.903, rec=0.914, f1=0.909, th_prob=1.000

Iteration 2
Training on ['std1-file4.txt', 'std1-file6.txt', 'std1-file7.txt', 'std1-file9.txt', 'std1-file3.txt', 'std1-file10.txt']
Testing on ['std1-file3.txt', 'std1-file1.txt', 'std1-file2.txt', 'std1-file5.txt', 'std1-file10.txt', 'std1-file7.txt']
match_mode=0, ma

In [None]:
# match_mode=1 on the same file sets (the cache name suggests cluster matching -- confirm against Aligner.run).
exp_results_5b = run_experiment(1, training_list, testing_list, 'pickles/res_match_cluster_6.p')

Iteration 0
Training on ['std1-file8.txt', 'std1-file6.txt', 'std1-file3.txt', 'std1-file7.txt', 'std1-file4.txt', 'std1-file5.txt']
