<h1>Experiment with adding noise on the std1pos data</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
basedir = '/home/joewandy/git/metabolomics_tools'
sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak, Possible
from discretisation import utils
from discretisation.preprocessing import FileLoader
from alignment.shared_bin_matching import SharedBinMatching as Aligner
from alignment.ground_truth import GroundTruth
from discretisation.models import Feature, PeakData

<h2>Experiment Parameters</h2>

Set up all the experiment parameters

In [5]:
# Absolute locations of the input directory (read by FileLoader.load_model_input),
# the transformation file (passed to Aligner) and the ground-truth file (passed to
# Aligner.evaluate_performance).
# NOTE(review): these repeat the checkout path already stored in `basedir`, so the
# notebook only runs on a machine with exactly this directory layout.
input_dir = '/home/joewandy/git/metabolomics_tools/alignment/input/std1_csv_full_old'
transformation_file = '/home/joewandy/git/metabolomics_tools/alignment/pos_transformations_full.yml'
gt_file = '/home/joewandy/git/metabolomics_tools/alignment/input/std1_csv_full_old/ground_truth/ground_truth.txt'

In [43]:
# Hyperparameter object handed to the first-stage clustering and to every Aligner
# built in train() / test().
# NOTE(review): the units of the tolerances are not visible in this notebook
# (presumably ppm for mass and seconds for RT) — confirm in alignment.models.HyperPars.
hp = AlignmentHyperPars()    
hp.within_file_mass_tol = 5
hp.within_file_rt_tol = 30
# The two across-file tolerances are only starting values: train() and test()
# overwrite them in place on this same object.
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 120
hp.alpha_mass = 1.0
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0
hp.mass_clustering_n_iterations = 200
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Echo the full configuration so it is recorded next to the results.
print hp

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=120, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=200, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [7]:
# Forwarded as `method=` to Aligner.evaluate_performance() by train() and test().
evaluation_method = 2
# Number of random file sets that load_or_create_filelist() draws per list.
n_iter = 30

In [8]:
# Grid of candidate (mass_tol, rt_tol) pairs that train() can search over.
param_list = []
for mass_tol in range(2, 11, 2):
    for rt_tol in range(5, 101, 5):
        param_list.append((mass_tol, rt_tol))

# Overwrite: the grid built above is discarded and only the across-file tolerances
# currently set on hp are evaluated.
# NOTE(review): this snapshots hp when the cell runs. The recorded outputs further
# down report rt_tol=60 although hp.across_file_rt_tol is set to 120, so they appear
# to come from an older hp — re-run from a fresh kernel to confirm.
param_list = [(hp.across_file_mass_tol, hp.across_file_rt_tol)]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [9]:
def load_or_create_clustering(filename, input_dir, transformation_file, hp):
    """Return the (file, first-stage clustering) pairs, using a gzip-pickle cache.

    If `filename` can be read as a gzipped pickle its content is returned as is.
    On IOError / EOFError the input files in `input_dir` are loaded, the aligner's
    first-stage clustering is run on them, and the zipped (data, clustering) pairs
    are written to `filename` before being returned.

    Parameters:
        filename: path of the gzipped pickle cache.
        input_dir: directory passed to FileLoader.load_model_input().
        transformation_file: transformation file passed to the Aligner.
        hp: hyperparameter object passed to the Aligner.

    Returns:
        The list of (data, clustering_result) pairs.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only read caches you created.
        with gzip.GzipFile(filename, 'rb') as cache:
            cached_pairs = cPickle.load(cache)
            print("Loaded from %s" % filename)
            return cached_pairs
    except (IOError, EOFError):
        # Cache missing or truncated: compute the clustering from scratch.
        peak_files = FileLoader().load_model_input(input_dir, synthetic=True)
        first_stage = Aligner(peak_files, None, transformation_file, hp,
                              verbose=False, seed=1234567890, parallel=True,
                              mh_biggest=True, use_vb=False)
        cached_pairs = zip(peak_files, first_stage._first_stage_clustering())
        with gzip.GzipFile(filename, 'wb') as cache:
            cPickle.dump(cached_pairs, cache, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return cached_pairs

In [10]:
# Load the cached (file, first-stage clustering) pairs for all input files, or
# compute them and write the cache if it cannot be read.
combined_list = load_or_create_clustering('pickles/std1_pos_clustering.p', input_dir, transformation_file, hp)

Loaded from pickles/std1_pos_clustering.p


<h2>Define Experimental Methods</h2>

In [11]:
def train(selected_data, param_list, hp, match_mode, evaluation_method):
    """Evaluate every (mass_tol, rt_tol) candidate and pick the best by F1.

    For each candidate the across-file tolerances on `hp` are overwritten in place,
    an Aligner is run on the selected files with their first-stage clusterings, and
    the first entry of evaluate_performance() is recorded. The module-level
    `transformation_file` and `gt_file` are used.

    Parameters:
        selected_data: sequence of (file, first_stage_clustering) pairs.
        param_list: sequence of (mass_tol, rt_tol) tuples.
        hp: hyperparameter object; mutated (left at the last candidate).
        match_mode: forwarded to Aligner.run().
        evaluation_method: forwarded as `method=` to evaluate_performance().

    Returns:
        (df, best_row): the DataFrame of all results and its top row after
        sorting by F1 descending, then mass_tol and rt_tol ascending.
    """
    column_names = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
    rows = []
    for candidate in param_list:
        hp.across_file_mass_tol = candidate[0]
        hp.across_file_rt_tol = candidate[1]
        peak_files = [pair[0] for pair in selected_data]
        clusterings = [pair[1] for pair in selected_data]

        matcher = Aligner(peak_files, None, transformation_file, hp,
                          verbose=False, seed=1234567890)
        matcher.run(match_mode, first_stage_clustering_results=clusterings)
        scores = matcher.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                              method=evaluation_method)
        # Candidate tuple followed by the first performance tuple forms one row.
        rows.append(candidate + scores[0])

    df = pd.DataFrame(rows, columns=column_names)
    ranked = df.sort_values(['F1', 'mass_tol', 'rt_tol'], ascending=[False, True, True])
    return df, ranked.iloc[0]

In [12]:
def test(selected_data, best_row, hp, match_mode, evaluation_method):

    param = (best_row['mass_tol'], best_row['rt_tol'])
    hp.across_file_mass_tol = param[0]
    hp.across_file_rt_tol = param[1]
    selected_files = [x[0] for x in selected_data]
    selected_clusterings = [x[1] for x in selected_data]    
    aligner = Aligner(selected_files, None, transformation_file, 
                           hp, verbose=False, seed=1234567890)
    aligner.run(match_mode, first_stage_clustering_results=selected_clusterings)

    res = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True, method=evaluation_method)
    output = param+res[0]
    return output

In [56]:
def train_test_single(match_mode, training_data, testing_data, i):
    """Train on one file set, test on another, and report both results.

    The tolerance candidates, hyperparameters and evaluation method come from the
    module-level `param_list`, `hp` and `evaluation_method`.

    Parameters:
        match_mode: forwarded to train() and test().
        training_data: sequence of (file, first_stage_clustering) pairs to train on.
        testing_data: sequence of (file, first_stage_clustering) pairs to test on.
            It is used exactly as passed in; no module-level list is consulted.
        i: iteration number, used only in the progress message.

    Returns:
        (training_data, training_df, best_training_row, match_res), where
        training_df and best_training_row come from train() and match_res from test().
    """
    # Show the peak count of every file so added noise is visible in the log.
    for entry in training_data:
        print(entry[0].num_peaks)
    for entry in testing_data:
        print(entry[0].num_peaks)

    print("Iteration %d" % i)
    print("Training on %s" % [x[0].filename for x in training_data])
    training_df, best_training_row = train(training_data, param_list, hp, match_mode, evaluation_method)

    print("Testing on %s" % [x[0].filename for x in testing_data])
    match_res = test(testing_data, best_training_row, hp, match_mode, evaluation_method)
    output = (match_mode,) + match_res
    print("match_mode=%d, mass_tol=%d, rt_tol=%d, tp=%d, fp=%d, fn=%d, prec=%.3f, rec=%.3f, f1=%.3f, th_prob=%.3f" % output)

    return (training_data, training_df, best_training_row, match_res)

In [14]:
def train_test(match_mode, training_list, testing_list, idx=None):
    """Run train_test_single() over all splits, or over a single chosen split.

    Parameters:
        match_mode: forwarded to train_test_single().
        training_list: list of training file sets.
        testing_list: list of testing file sets; must have the same length.
        idx: when None (the default) every split is processed in order;
            otherwise only the split at position `idx` is processed.

    Returns:
        A list with one train_test_single() result per processed split.

    Raises:
        AssertionError: if the two lists differ in length.
    """
    assert len(training_list) == len(testing_list)
    # idx=None means "all splits"; the branches used to be swapped, which made the
    # default call index the lists with None.
    if idx is None:
        selected = range(len(training_list))
    else:
        selected = [idx]

    exp_results = []
    for i in selected:
        item = train_test_single(match_mode, training_list[i], testing_list[i], i)
        exp_results.append(item)
        # Blank line between iterations in the log.
        print("")

    return exp_results

In [15]:
def run_experiment(match_mode, training_list, testing_list, filename):
    """Return the experiment results, reading them from a gzip-pickle cache if possible.

    If `filename` can be read as a gzipped pickle its content is returned. On
    IOError / EOFError the experiment is run with train_test() over the given
    lists, and the results are written to `filename` before being returned.

    Parameters:
        match_mode: forwarded to train_test().
        training_list: list of training file sets.
        testing_list: list of testing file sets.
        filename: path of the gzipped pickle cache.

    Returns:
        The list of results, either loaded or freshly computed.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only read caches you created.
        with gzip.GzipFile(filename, 'rb') as cache:
            cached_results = cPickle.load(cache)
            print("Loaded from %s" % filename)
            return cached_results
    except (IOError, EOFError):
        # No usable cache: run the experiment and store the outcome.
        fresh_results = train_test(match_mode, training_list, testing_list)
        with gzip.GzipFile(filename, 'wb') as cache:
            cPickle.dump(fresh_results, cache, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return fresh_results

In [16]:
def load_or_create_filelist(filename, combined_list, n_iter, n_files):
    """Return `n_iter` random file sets, reading them from a gzip-pickle cache if possible.

    If `filename` can be read as a gzipped pickle its content is returned. On
    IOError / EOFError, `n_iter` sets of `n_files` entries are drawn from
    `combined_list` with random.sample() and written to `filename`. In both cases
    the file names of every set are printed.

    Parameters:
        filename: path of the gzipped pickle cache.
        combined_list: list of (file, first_stage_clustering) pairs to sample from.
        n_iter: number of sets to draw.
        n_files: number of entries per set.

    Returns:
        The list of file sets.
    """
    try:
        # NOTE: unpickling can execute arbitrary code; only read caches you created.
        with gzip.GzipFile(filename, 'rb') as cache:
            file_sets = cPickle.load(cache)
            print("Loaded from %s" % filename)
            for file_set in file_sets:
                print("%s" % [pair[0].filename for pair in file_set])
            return file_sets
    except (IOError, EOFError):
        # No usable cache: draw the sets now; the cache keeps them fixed afterwards.
        file_sets = []
        for _ in range(n_iter):
            drawn = random.sample(combined_list, n_files)
            print("%s" % [pair[0].filename for pair in drawn])
            file_sets.append(drawn)
        with gzip.GzipFile(filename, 'wb') as cache:
            cPickle.dump(file_sets, cache, protocol=cPickle.HIGHEST_PROTOCOL)
        print("Saved to %s" % filename)
        return file_sets

<h2>Methods to add noisy peaks</h2>

In [17]:
def add_noisy_peaks(peakdata, num_add):
    """Return a new PeakData holding the original features plus `num_add` random ones.

    Each added feature draws its m/z, RT and intensity independently from a uniform
    distribution spanning the minimum and maximum of the corresponding values in
    `peakdata`. Ids continue from the feature_id of the last existing feature, and
    the file_id of that last feature is reused. `peakdata` itself is not modified.

    Parameters:
        peakdata: object exposing `features`, `mass`, `rt`, `intensity`, `filename`.
        num_add: number of random features to append.

    Returns:
        A PeakData built from the extended feature list and the original filename.
    """
    last_feature = peakdata.features[-1]
    next_id = last_feature.feature_id
    file_id = last_feature.file_id

    mz_lo, mz_hi = np.min(peakdata.mass), np.max(peakdata.mass)
    rt_lo, rt_hi = np.min(peakdata.rt), np.max(peakdata.rt)
    int_lo, int_hi = np.min(peakdata.intensity), np.max(peakdata.intensity)

    # Work on a copy so the caller's feature list stays untouched.
    extended = list(peakdata.features)
    for _ in range(num_add):
        next_id += 1
        # Draw order (m/z, RT, intensity) is kept fixed so seeded runs stay comparable.
        noisy_mz = np.random.uniform(low=mz_lo, high=mz_hi)
        noisy_rt = np.random.uniform(low=rt_lo, high=rt_hi)
        noisy_intensity = np.random.uniform(low=int_lo, high=int_hi)
        extended.append(Feature(next_id, noisy_mz, noisy_rt, noisy_intensity, file_id))

    return PeakData(extended, peakdata.filename)

In [31]:
def add_noisy_peaks_to_set(training_data, set_idx, num_add):
    """Replace every entry of `training_data`, in place, with a noisy version.

    Each (peakdata, clustering) entry becomes (add_noisy_peaks(peakdata, num_add),
    None), so the previous first-stage clustering is dropped.

    Parameters:
        training_data: mutable list of (peakdata, clustering) pairs; modified in place.
        set_idx: not used by this function.
        num_add: number of random peaks to add to each file.

    Returns:
        None.
    """
    for position, entry in enumerate(training_data):
        training_data[position] = (add_noisy_peaks(entry[0], num_add), None)

<h2>Run experiment with 2 random files</h2>

<h3>Load data</h3>

In [44]:
# Number of files drawn for each random training / testing set.
n_files = 2

In [45]:
# n_iter random sets of n_files (file, clustering) pairs used for training; read
# from the pickle cache when it exists so the splits stay fixed between runs.
training_list = load_or_create_filelist('pickles/training_list_2.p', combined_list, n_iter, n_files)

Loaded from pickles/training_list_2.p
['std1-file5.txt', 'std1-file7.txt']
['std1-file5.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file5.txt']
['std1-file9.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file6.txt']
['std1-file10.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file2.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file1.txt', 'std1-file2.txt']
['std1-file8.txt', 'std1-file1.txt']
['std1-file6.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file8.txt']
['std1-file2.txt', 'std1-file7.txt']
['std1-file2.txt', 'std1-file8.txt']
['std1-file10.txt', 'std1-file7.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file9.txt', 'std1-file4.txt']
['std1-file7.txt', 'std1-file2.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file2.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file1.txt']
['std1-file1.txt', 'std1-file10.txt']
['std1-file9.txt', 'std1-file8.txt']
['std1-file3.txt', 'std1-fil

In [46]:
# n_iter random sets of n_files (file, clustering) pairs used for testing.
# NOTE(review): these are sampled from the same combined_list independently of the
# training sets, so a testing set can share files with the training set at the
# same index.
testing_list = load_or_create_filelist('pickles/testing_list_2.p', combined_list, n_iter, n_files)

Loaded from pickles/testing_list_2.p
['std1-file6.txt', 'std1-file8.txt']
['std1-file7.txt', 'std1-file11.txt']
['std1-file6.txt', 'std1-file2.txt']
['std1-file4.txt', 'std1-file8.txt']
['std1-file4.txt', 'std1-file7.txt']
['std1-file7.txt', 'std1-file11.txt']
['std1-file10.txt', 'std1-file1.txt']
['std1-file3.txt', 'std1-file10.txt']
['std1-file8.txt', 'std1-file11.txt']
['std1-file5.txt', 'std1-file3.txt']
['std1-file7.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file10.txt']
['std1-file11.txt', 'std1-file9.txt']
['std1-file2.txt', 'std1-file10.txt']
['std1-file3.txt', 'std1-file7.txt']
['std1-file11.txt', 'std1-file10.txt']
['std1-file6.txt', 'std1-file9.txt']
['std1-file11.txt', 'std1-file2.txt']
['std1-file4.txt', 'std1-file10.txt']
['std1-file10.txt', 'std1-file4.txt']
['std1-file8.txt', 'std1-file3.txt']
['std1-file8.txt', 'std1-file4.txt']
['std1-file3.txt', 'std1-file11.txt']
['std1-file9.txt', 'std1-file5.txt']
['std1-file7.txt', 'std1-file4.txt']
['std1-file3.txt', 'std1

<h3>Test performance before adding noise</h3>

In [47]:
# Work with the first random split only.
idx = 0
# These names refer to the list objects stored inside training_list / testing_list
# (no copy is made), so in-place changes to them also change those lists.
training_data = training_list[idx]
testing_data = testing_list[idx]

In [48]:
# Baseline run on the unmodified files, with match_mode=0.
exp_results = train_test_single(0,  training_data, testing_data, idx)

7076
6319
4146
4101
Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt']
match_mode=0, mass_tol=10, rt_tol=60, tp=117, fp=7, fn=13, prec=0.944, rec=0.900, f1=0.921, th_prob=1.000


In [49]:
# exp_results is the tuple returned by train_test_single(); index 2 is the best
# training row selected by train().
print "Training result"
print exp_results[2]

Training result
mass_tol     10.000000
rt_tol       60.000000
TP           25.000000
FP            3.000000
FN            8.000000
Prec          0.892857
Rec           0.757576
F1            0.819672
Threshold     1.000000
Name: 0, dtype: float64


In [50]:
# Index 3 of exp_results is the result tuple returned by test(); it is wrapped in a
# one-row DataFrame so the values are printed with their column labels.
print "Testing result"
df = pd.DataFrame([exp_results[3]], columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])
print df.iloc[0]

Testing result
mass_tol      10.000000
rt_tol        60.000000
TP           117.000000
FP             7.000000
FN            13.000000
Prec           0.943548
Rec            0.900000
F1             0.921260
Threshold      1.000000
Name: 0, dtype: float64


<h3>Add some noise</h3>

In [66]:
# Append num_add uniformly random peaks to every file of the selected training and
# testing sets.
# NOTE(review): both calls modify the lists in place and replace the first-stage
# clusterings with None, so this cell is not idempotent: every re-run adds another
# num_add peaks. The peak counts recorded further down are 50000 higher than the
# baseline counts, not 10000.
num_add = 10000
add_noisy_peaks_to_set(training_data, idx, num_add)
add_noisy_peaks_to_set(testing_data, idx, num_add)

<h3>Add a single noisy peak</h3>

In [52]:
# single_set = training_data[1]
# peakdata = single_set[0]
# to_add = list(peakdata.features)
# next_id = peakdata.features[-1].feature_id
# next_id = next_id + 1
# file_id = peakdata.features[-1].file_id
# mz = 126.0219
# rt = 1000
# intensity = 475694
# new_feature = Feature(next_id, mz, rt, intensity, file_id)
# to_add.append(new_feature)
# new_peakdata = PeakData(to_add, peakdata.filename)
# training_data[1] = (new_peakdata, None)

In [53]:
# single_set = training_data[1]
# peakdata = single_set[0]
# to_add = list(peakdata.features)
# next_id = peakdata.features[-1].feature_id
# next_id = next_id + 1
# file_id = peakdata.features[-1].file_id
# mz = 251.0361
# rt = 2000
# intensity = 475694
# new_feature = Feature(next_id, mz, rt, intensity, file_id)
# to_add.append(new_feature)
# new_peakdata = PeakData(to_add, peakdata.filename)
# training_data[1] = (new_peakdata, None)

In [54]:
# print new_feature
# print new_peakdata.features[-1]

<h3>Test performance after adding noise</h3>

In [67]:
# Repeat the baseline experiment on the noisy file sets.
# NOTE(review): the metrics recorded below are identical to the baseline run even
# though the peak counts changed — verify that the added peaks actually reach the
# aligner before drawing conclusions from this comparison.
match_mode = 0
exp_results = train_test_single(match_mode,  training_data, testing_data, idx)

57076
56319
54146
54101
Iteration 0
Training on ['std1-file5.txt', 'std1-file7.txt']
Testing on ['std1-file6.txt', 'std1-file8.txt']
match_mode=0, mass_tol=10, rt_tol=60, tp=117, fp=7, fn=13, prec=0.944, rec=0.900, f1=0.921, th_prob=1.000


In [68]:
# Index 2 of the tuple returned by train_test_single() is the best training row,
# here for the run on the noisy data.
print "Training result"
print exp_results[2]

Training result
mass_tol     10.000000
rt_tol       60.000000
TP           25.000000
FP            3.000000
FN            8.000000
Prec          0.892857
Rec           0.757576
F1            0.819672
Threshold     1.000000
Name: 0, dtype: float64


In [69]:
# Index 3 of the tuple returned by train_test_single() is the result tuple from
# test(), here for the noisy data; shown as a labelled one-row DataFrame.
print "Testing result"
df = pd.DataFrame([exp_results[3]], columns=['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold'])
print df.iloc[0]

Testing result
mass_tol      10.000000
rt_tol        60.000000
TP           117.000000
FP             7.000000
FN            13.000000
Prec           0.943548
Rec            0.900000
F1             0.921260
Threshold      1.000000
Name: 0, dtype: float64
