<h1>Experiment with adding noise on the std1pos data</h1>

In [4]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
import os
import sys
basedir = '/home/joewandy/git/metabolomics_tools'
sys.path.append(basedir)

In [89]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [54]:
from alignment.models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from discretisation.preprocessing import FileLoader
from alignment.shared_bin_matching import SharedBinMatching as Aligner
from alignment.ground_truth import GroundTruth
from discretisation.models import Feature, PeakData

<h2>Experiment Parameters</h2>

Set up all the experiment parameters

In [8]:
# All inputs live under the project checkout in `basedir`; derive them from it so
# the machine-specific prefix is written in exactly one place.
alignment_dir = os.path.join(basedir, 'alignment')
# Directory holding the std1 positive-mode peak lists.
input_dir = os.path.join(alignment_dir, 'input', 'std1_csv_full_old')
# Transformation definitions handed to the aligner.
transformation_file = os.path.join(alignment_dir, 'pos_transformations_full.yml')
# Ground-truth file used by evaluate_performance(); it sits inside the input directory.
gt_file = os.path.join(input_dir, 'ground_truth', 'ground_truth.txt')

In [9]:
# Hyperparameters shared by the first-stage clustering and by every aligner run below.
# Units are not visible in this notebook -- presumably ppm for the mass tolerances
# and seconds for the RT tolerances; TODO confirm against alignment.models.HyperPars.
hp = AlignmentHyperPars()    
# Tolerances used within a single file.
hp.within_file_mass_tol = 5
hp.within_file_rt_tol = 30
# Tolerances used across files; train() and test() overwrite these two on `hp`
# with the candidate being evaluated.
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 60
# Remaining model settings; their meaning is defined by the alignment package.
hp.alpha_mass = 1.0
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0
# Iteration / sample counts for the mass and RT clustering steps.
hp.mass_clustering_n_iterations = 200
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Echo the full setting; attributes not assigned above (e.g. matching_alpha) keep
# their defaults.
print hp

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=60, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [10]:
# Forwarded as `method=` to Aligner.evaluate_performance() by train() and test();
# what value 2 selects is defined in the alignment package.
evaluation_method = 2
# Number of random file sets drawn for the training list and for the testing list.
n_iter = 30

In [11]:
# Full candidate grid of (mass_tol, rt_tol) pairs: mass 2..10 in steps of 2,
# RT 5..100 in steps of 5, mass varying slowest.
param_list = [(mass_tol, rt_tol)
              for mass_tol in range(2, 11, 2)
              for rt_tol in range(5, 101, 5)]

# overwrite: only the single pair currently stored on `hp` is evaluated, so the
# "training" step has exactly one candidate to choose from.
param_list = [(hp.across_file_mass_tol, hp.across_file_rt_tol)]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [12]:
def load_or_create_clustering(filename, input_dir, transformation_file, hp):
    """Return the per-file first-stage clustering, cached as a gzip pickle.

    If `filename` can be opened and unpickled, its content is returned as-is
    (the other arguments are then ignored). Otherwise the input files in
    `input_dir` are loaded, the aligner's first-stage clustering is run with
    the hyperparameters `hp`, and the result is written to `filename`.

    Returns a list of (peak data, clustering result) pairs, one per input file.
    Note that a stale cache is never detected: changing `hp` or the inputs
    requires deleting the pickle by hand.
    """
    try:
        with gzip.GzipFile(filename, 'rb') as cache:
            cached = cPickle.load(cache)
            print("Loaded from %s" % filename)
            return cached
    except (IOError, EOFError):
        # Cache missing, unreadable or truncated: fall through and rebuild it.
        pass

    data_list = FileLoader().load_model_input(input_dir, synthetic=True)
    aligner = Aligner(data_list, None, transformation_file, hp,
                      verbose=False, seed=1234567890, parallel=True,
                      mh_biggest=True, use_vb=False)
    # Pair every loaded file with its own clustering result.
    pairs = zip(data_list, aligner._first_stage_clustering())
    with gzip.GzipFile(filename, 'wb') as cache:
        cPickle.dump(pairs, cache, protocol=cPickle.HIGHEST_PROTOCOL)
    print("Saved to %s" % filename)
    return pairs

In [13]:
# One (peak data, first-stage clustering) pair per input file, loaded from the
# pickle when it exists and computed (then cached) otherwise.
combined_list = load_or_create_clustering('pickles/std1_pos_clustering.p', input_dir, transformation_file, hp)

Loaded from pickles/std1_pos_clustering.p


<h2>Define Experimental Methods</h2>

In [14]:
def train(selected_data, param_list, hp, match_mode, evaluation_method):
    """Score every (mass_tol, rt_tol) candidate on `selected_data` and pick the best.

    Parameters:
        selected_data: sequence of (peak data, first-stage clustering) pairs.
        param_list: iterable of (mass_tol, rt_tol) tuples to try.
        hp: hyperparameter object; its across_file_mass_tol / across_file_rt_tol
            are overwritten in place with each candidate.
        match_mode: forwarded to Aligner.run().
        evaluation_method: forwarded as `method=` to evaluate_performance().

    Also reads the notebook-level `transformation_file` and `gt_file`.

    Returns (df, best_row): `df` has one row per candidate; `best_row` is the row
    with the highest F1, ties broken by the smaller mass_tol and then rt_tol.
    """
    column_names = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']

    rows = []
    for candidate in param_list:
        hp.across_file_mass_tol, hp.across_file_rt_tol = candidate[0], candidate[1]

        peak_files = [entry[0] for entry in selected_data]
        clusterings = [entry[1] for entry in selected_data]
        aligner = Aligner(peak_files, None, transformation_file, hp,
                          verbose=False, seed=1234567890)
        aligner.run(match_mode, first_stage_clustering_results=clusterings)

        scores = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                              method=evaluation_method)
        # The first evaluation entry supplies the TP..Threshold columns.
        rows.append(candidate + scores[0])

    df = pd.DataFrame(rows, columns=column_names)
    ranked = df.sort_values(by=['F1', 'mass_tol', 'rt_tol'], ascending=[False, True, True])
    return df, ranked.iloc[0]

In [15]:
def test(selected_data, best_row, hp, match_mode, evaluation_method):
    """Align `selected_data` with the tolerances chosen in training and score the result.

    `best_row` must provide 'mass_tol' and 'rt_tol' (as returned by train()).
    The two values are written onto `hp` in place. Also reads the notebook-level
    `transformation_file` and `gt_file`.

    Returns the tuple (mass_tol, rt_tol) followed by the first entry of the
    aligner's evaluation result.
    """
    chosen = (best_row['mass_tol'], best_row['rt_tol'])
    hp.across_file_mass_tol, hp.across_file_rt_tol = chosen

    peak_files = [entry[0] for entry in selected_data]
    clusterings = [entry[1] for entry in selected_data]
    aligner = Aligner(peak_files, None, transformation_file, hp,
                      verbose=False, seed=1234567890)
    aligner.run(match_mode, first_stage_clustering_results=clusterings)

    scores = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                          method=evaluation_method)
    return chosen + scores[0]

In [103]:
def train_test_single(match_mode, training_data, testing_data, i):
    """Run one train/test iteration and return its results.

    Parameters:
        match_mode: forwarded to train() and test().
        training_data: list of (peak data, first-stage clustering) pairs used to
            select the tolerances.
        testing_data: list of (peak data, first-stage clustering) pairs the
            selected tolerances are evaluated on.
        i: iteration number, used for the progress message only.

    Reads the notebook-level `param_list`, `hp` and `evaluation_method`.

    Returns (training_data, training_df, best_training_row, match_res).
    """
    # Report the peak count of every file: training files first, then testing
    # files. Looping avoids assuming that each set holds exactly two files.
    for dataset in list(training_data) + list(testing_data):
        print(dataset[0].num_peaks)

    print("Iteration %d" % i)
    print("Training on %s" % [x[0].filename for x in training_data])
    training_df, best_training_row = train(training_data, param_list, hp, match_mode, evaluation_method)

    # Evaluate on the testing_data that was passed in. It used to be replaced by
    # the global testing_list[i], which silently ignored the caller's argument.
    print("Testing on %s" % [x[0].filename for x in testing_data])
    match_res = test(testing_data, best_training_row, hp, match_mode, evaluation_method)
    output = (match_mode,) + match_res
    print("match_mode=%d, mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % output)

    item = (training_data, training_df, best_training_row, match_res)
    return item

In [17]:
def train_test(match_mode, training_list, testing_list, idx=None):
    """Run train/test iterations over paired training and testing sets.

    Parameters:
        match_mode: forwarded to train_test_single().
        training_list, testing_list: equally long lists; entry i of each forms
            the data for iteration i.
        idx: when None (default) every iteration is run; otherwise only the
            iteration with this index.

    Returns a list with one train_test_single() result per executed iteration.
    Raises AssertionError when the two lists differ in length.
    """
    assert len(training_list) == len(testing_list)

    # The branches used to be swapped: the default idx=None indexed the lists
    # with None (TypeError), while an explicit idx ran every iteration.
    indices = range(len(training_list)) if idx is None else [idx]

    exp_results = []
    for i in indices:
        item = train_test_single(match_mode, training_list[i], testing_list[i], i)
        exp_results.append(item)
        print("")  # blank line between iterations

    return exp_results

In [18]:
def run_experiment(match_mode, training_list, testing_list, filename):
    """Return the experiment results for `match_mode`, cached as a gzip pickle.

    When `filename` can be opened and unpickled its content is returned and
    nothing is computed. Otherwise train_test() is run over the given lists and
    the results are written to `filename` before being returned.
    """
    try:
        with gzip.GzipFile(filename, 'rb') as cache:
            cached = cPickle.load(cache)
            print("Loaded from %s" % filename)
            return cached
    except (IOError, EOFError):
        # No usable cache: run the experiment and store its results.
        fresh = train_test(match_mode, training_list, testing_list)
        with gzip.GzipFile(filename, 'wb') as cache:
            cPickle.dump(fresh, cache, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return fresh

In [19]:
def load_or_create_filelist(filename, combined_list, n_iter, n_files):
    """Return `n_iter` random file sets of size `n_files`, cached as a gzip pickle.

    When `filename` can be opened and unpickled, the stored sets are returned
    (`combined_list`, `n_iter` and `n_files` are then ignored). Otherwise each
    set is drawn without replacement from `combined_list` using the global
    `random` state, and the list of sets is written to `filename`.

    The file names of every set are printed in both cases.
    """
    def show(file_set):
        # Print the file names of one set as a Python list literal.
        print("%s" % [entry[0].filename for entry in file_set])

    try:
        with gzip.GzipFile(filename, 'rb') as cache:
            stored = cPickle.load(cache)
            print("Loaded from %s" % filename)
            for file_set in stored:
                show(file_set)
            return stored
    except (IOError, EOFError):
        drawn = []
        for _ in range(n_iter):
            file_set = random.sample(combined_list, n_files)
            show(file_set)
            drawn.append(file_set)
        with gzip.GzipFile(filename, 'wb') as cache:
            cPickle.dump(drawn, cache, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return drawn

<h2>Run experiment with 2 random files</h2>

In [20]:
# Number of files sampled into each training set and each testing set.
n_files = 2

In [212]:
# n_iter random sets of n_files entries from combined_list, used for training.
# Reloaded from the pickle when it exists, so the sets stay fixed between runs.
training_list = load_or_create_filelist('pickles/training_list_2.p', combined_list, n_iter, n_files)

Loaded from pickles/training_list_2.p
['std1-file5.txt', 'std1-file7.txt']
['std1-file5.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file5.txt']
['std1-file9.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file6.txt']
['std1-file10.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file2.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file1.txt']
['std1-file6.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file8.txt']
['std1-file2.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file8.txt']
['std1-file10.txt', 'std1-file7.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file7.txt', 'std1-file2.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file1.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file8.txt']
['std1-file3.txt', 'std1-fil

In [22]:
# n_iter random sets of n_files entries from combined_list, used for testing.
# NOTE(review): these are sampled independently of training_list, so a testing set
# can share files with the training set of the same iteration (the second pair
# listed below shares std1-file11.txt with its training set).
testing_list = load_or_create_filelist('pickles/testing_list_2.p', combined_list, n_iter, n_files)

Loaded from pickles/testing_list_2.p
['std1-file6.txt', 'std1-file8.txt']
['std1-file7.txt', 'std1-file11.txt']
['std1-file6.txt', 'std1-file2.txt']
['std1-file4.txt', 'std1-file8.txt']
['std1-file4.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file1.txt']
['std1-file3.txt', 'std1-file10.txt']
['std1-file8.txt', 'std1-file11.txt']
['std1-file5.txt', 'std1-file3.txt']
['std1-file7.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file10.txt']
['std1-file11.txt', 'std1-file9.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file3.txt', 'std1-file7.txt']
['std1-file11.txt', 'std1-file10.txt']
['std1-file6.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file2.txt']
['std1-file4.txt', 'std1-file10.txt']
['std1-file10.txt', 'std1-file4.txt']
['std1-file8.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file4.txt']
['std1-file3.txt', 'std1-file11.txt']
['std1-file9.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file4.txt']
['std1-file3.txt', 'std1

In [187]:
# Pick the first train/test pair for a baseline run without added noise.
# Both names refer to the lists stored in training_list / testing_list (no copy).
idx = 0
training_data = training_list[idx]
testing_data = testing_list[idx]

In [188]:
# Baseline: match_mode 0 on the unmodified data. The result is the tuple
# (training_data, training_df, best_training_row, match_res).
exp_results = train_test_single(0,  training_data, testing_data, idx)

7076
6319
4146
4101
Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt']
match_mode=0, mass_tol=10, rt_tol=60, tp=117, fp=7, fn=13, prec=0.944, rec=0.900, f1=0.921, th_prob=1.000


In [190]:
# exp_results[2] is the best training row returned by train().
print "Training result"
print exp_results[2]

Training result
mass_tol     10.000000
rt_tol       60.000000
TP           25.000000
FP            3.000000
FN            8.000000
Prec          0.892857
Rec           0.757576
F1            0.819672
Threshold     1.000000
Name: 0, dtype: float64


In [191]:
# exp_results[3] is the tuple returned by test(); wrap it in a one-row frame so it
# prints with the same labels as the training result.
print "Testing result"
df = pd.DataFrame([exp_results[3]], columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])
print df.iloc[0]

Testing result
mass_tol      10.000000
rt_tol        60.000000
TP           117.000000
FP             7.000000
FN            13.000000
Prec           0.943548
Rec            0.900000
F1             0.921260
Threshold      1.000000
Name: 0, dtype: float64


In [213]:
# Re-select the first train/test pair before modifying the training data.
# NOTE: training_data is the very list stored in training_list[idx], not a copy,
# so assigning into training_data[...] also changes training_list.
idx = 0
training_data = training_list[idx]
testing_data = testing_list[idx]

In [200]:
def add_noisy_peaks(peakdata, num_add):
    """Return a new PeakData with all original features plus up to `num_add` extra ones.

    Each extra feature copies the mass and RT of an existing feature (sampled
    without replacement) and gets an intensity drawn uniformly between the
    smallest and largest intensity in `peakdata`. Feature ids continue from the
    id of the last existing feature; the file id is taken from that feature too.

    Parameters:
        peakdata: source peak data; it is not modified.
        num_add: number of features to add, capped at peakdata.num_peaks because
            the templates are sampled without replacement.

    Uses both the `random` and `np.random` global states, so results vary
    between calls unless both are seeded.
    """
    last_feature = peakdata.features[-1]
    next_id = last_feature.feature_id
    file_id = last_feature.file_id
    min_intensity, max_intensity = np.min(peakdata.intensity), np.max(peakdata.intensity)

    num_add = min(num_add, peakdata.num_peaks)

    templates = random.sample(peakdata.features, num_add)
    to_add = list(peakdata.features)
    for template in templates:
        next_id += 1
        intensity = np.random.uniform(low=min_intensity, high=max_intensity)
        to_add.append(Feature(next_id, template.mass, template.rt, intensity, file_id))

    return PeakData(to_add, peakdata.filename)

In [201]:
# Upper bound on the number of noisy peaks added to each training file.
num_add = 10000

# Add noise to every training file. The first-stage clustering of each file is
# replaced by None because it no longer matches the enlarged peak list.
# A new list is built rather than assigning into training_data[...]:
# training_data is the same list object as training_list[idx], so in-place
# assignment would also alter training_list and could not be undone by
# re-selecting training_list[idx].
training_data = [(add_noisy_peaks(single_set[0], num_add), None)
                 for single_set in training_data]

Add a single noisy peak to the second training file

In [227]:
# Append one hand-made peak to the second training file.
single_set = training_data[1]
peakdata = single_set[0]
to_add = list(peakdata.features)
last_feature = peakdata.features[-1]
next_id = last_feature.feature_id + 1
# file_id was never assigned at notebook level (it is only a local variable
# inside add_noisy_peaks), so take it from the file the peak is added to.
file_id = last_feature.file_id
mz = 126.0219
rt = 1000
intensity = 475694
new_feature = Feature(next_id, mz, rt, intensity, file_id)
to_add.append(new_feature)
new_peakdata = PeakData(to_add, peakdata.filename)
# Work on a copy of the list: training_data may be the very list stored in
# training_list[idx], and assigning into it would change training_list as well.
training_data = list(training_data)
training_data[1] = (new_peakdata, None)

In [229]:
# Show the tail of the second training file's feature list.
# NOTE: the slice [-5:-1] prints four features and leaves out the very last one;
# use [-5:] to include it.
print training_data[1][0].features[-5:-1]

[id=(6320,4) mass=126.0219 rt=1000.00 int=475690.72, id=(6321,4) mass=126.0219 rt=1000.00 int=475691.00, id=(6322,4) mass=126.0219 rt=1000.00 int=475692.00, id=(6323,4) mass=126.0219 rt=1000.00 int=475693.00]


In [230]:
# Repeat the run with match_mode 0 on the modified training data; the testing
# data is unchanged.
exp_results = train_test_single(0,  training_data, testing_data, idx)

7076
6324
4146
4101
Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
TP
id=(472,4) mass=419.1511 rt=844.11 int=330838.97 id=(1651,6) mass=419.1511 rt=868.82 int=53371.18
id=(1757,4) mass=187.0380 rt=1009.00 int=37857.54 id=(2439,6) mass=187.0380 rt=994.31 int=22681.91
id=(1098,4) mass=148.0036 rt=988.13 int=84804.91 id=(2559,6) mass=148.0037 rt=1028.53 int=20248.03
id=(1207,4) mass=167.0482 rt=983.63 int=71338.05 id=(2486,6) mass=167.0484 rt=1028.53 int=21731.50
id=(207,4) mass=339.0159 rt=1010.44 int=991850.38 id=(1500,6) mass=339.0158 rt=991.12 int=63851.89
id=(69,4) mass=330.0594 rt=921.04 int=2748293.50 id=(265,6) mass=330.0592 rt=955.90 int=653659.31
id=(233,4) mass=441.1332 rt=845.51 int=857483.19 id=(439,6) mass=441.1332 rt=868.82 int=355520.81
id=(1037,4) mass=207.1125 rt=751.21 int=95757.73 id=(2388,6) mass=207.1125 rt=771.41 int=24067.34
id=(125,4) mass=232.0611 rt=842.71 int=1696881.62 id=(269,6) mass=232.0611 rt=865.77 int=648982.25
id=(190,4) mass=360.1497 rt=8

In [231]:
# exp_results[2] is the best training row returned by train() for the noisy run.
print "Training result"
print exp_results[2]

Training result
mass_tol     10.000000
rt_tol       60.000000
TP           24.000000
FP            5.000000
FN            9.000000
Prec          0.827586
Rec           0.727273
F1            0.774194
Threshold     1.000000
Name: 0, dtype: float64


In [223]:
# exp_results[3] is the tuple returned by test(); shown as a one-row frame so the
# labels match the training result.
print "Testing result"
df = pd.DataFrame([exp_results[3]], columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])
print df.iloc[0]

Testing result
mass_tol      10.000000
rt_tol        60.000000
TP           117.000000
FP             7.000000
FN            13.000000
Prec           0.943548
Rec            0.900000
F1             0.921260
Threshold      1.000000
Name: 0, dtype: float64
