<h1>Experiment with beer3pos</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
# Root of the metabolomics_tools checkout; appended to sys.path so that the
# `alignment` package imported further down can be found.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
basedir = '/home/joewandy/git/metabolomics_tools'
sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

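Set up all the experiment parameters: the input paths, the alignment hyperparameters, the evaluation settings and the parameter grids to search over.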

In [5]:
# Experiment inputs, all located inside the repository checkout that `basedir`
# (set in the path-setup cell) points to. Deriving them from `basedir` keeps the
# checkout location in one place instead of repeating the absolute prefix.
# Directory holding the beer3pos input files.
input_dir = os.path.join(basedir, 'alignment', 'input', 'beer3pos')
# Transformation definitions handed to the clustering and experiment helpers.
transformation_file = os.path.join(basedir, 'alignment', 'pos_transformations_full.yml')
# Ground-truth file handed to run_experiment() for evaluation.
gt_file = os.path.join(input_dir, 'ground_truth', 'beer3.positive.dat')

In [6]:
# Hyperparameter object shared by the clustering and experiment calls below.
# Each assignment overrides an attribute of a freshly constructed instance.
hp = AlignmentHyperPars()    
# Mass and retention-time tolerances, within a single file and across files.
# NOTE(review): units are not visible in this notebook (presumably ppm and
# seconds) — confirm against alignment.models.
hp.within_file_mass_tol = 3
hp.within_file_rt_tol = 10
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 60
# Model parameters; their exact meaning is defined in alignment.models and is
# not visible here.
hp.alpha_mass = 1
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0
# Presumably the iteration / sample / burn-in counts of the two clustering
# stages — TODO confirm.
hp.mass_clustering_n_iterations = 400
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Echo the settings in use. matching_alpha is not assigned in this cell; the
# printed summary shows it at 0.3 (presumably the class default).
print hp

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=60, alpha_mass=1, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=400, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=3, within_file_rt_tol=10


In [7]:
# Passed unchanged to every run_experiment() call in this notebook.
# NOTE(review): what method code 2 selects is defined in alignment.experiment
# and is not visible here — confirm.
evaluation_method = 2
# Passed to load_or_create_filelist() when the training and testing file lists
# are built (presumably the number of lists / iterations — confirm).
n_iter = 1

In [8]:
# Search grid of (mass_tol, rt_tol) pairs: mass_tol takes 2, 4, 6 and, for each
# of those, rt_tol takes 5, 10, ..., 30 (rt_tol varies fastest).
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
]

In [9]:
# Larger search grid of (mass_tol, rt_tol, group_tol, alpha) tuples. The first
# two entries span the same values as param_list; group_tol takes 2, 4, ..., 10
# and alpha takes 0.0, 0.2, ..., 1.0 (alpha varies fastest).
# NOTE(review): none of the visible cells reference this list — the run that
# writes res_mwg.p is given param_list. Confirm whether that is intended.
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha / 10.0)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
    for group_tol in range(2, 11, 2)
    for alpha in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [10]:
# First-stage clustering of the files read from input_dir, using the
# hyperparameters in hp. The log below reports the result being saved to the
# pickle path given here.
# NOTE(review): the helper's name suggests an existing pickle is loaded instead
# of recomputing — confirm in alignment.experiment.
combined_list = load_or_create_clustering('pickles/beer3pos/clustering.p', input_dir, transformation_file, hp)

7553 features read from beer3-file1.csv
7579 features read from beer3-file2.csv
7240 features read from beer3-file3.csv


[Parallel(n_jobs=4)]: Done   1 out of   3 | elapsed:  2.9min remaining:  5.9min
[Parallel(n_jobs=4)]: Done   2 out of   3 | elapsed:  3.0min remaining:  1.5min
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  3.0min finished


Saved to pickles/beer3pos/clustering.p
Created 7553 clusters
Created 7579 clusters
Created 7240 clusters
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/7553
Assigning possible transformations 0/7579
Assigning possible transformations 0/7240
Assigning possible transformations 500/7553
Assigning possible transformations 500/7579
Assigning possible transformations 500/7240
Assigning possible transformations 1000/7553
Assigning possible transformations 1000/7579
Assigning possible transformations 1000/7240
Assigning possible transformations 1500/7553
Assigning possible transformations 1500/7579
Assigning possible transformations 1500/7240
Assigning possible transformations 2000/7553
Assigning possible transformations 2000/7579
Assigning possible transformations 2000/7240
Assigning possible transformations 2500/7553
Assigning possible transformations 2500/7579
Assigning possible transformations 2500/7240
Assi

<h2>Run experiment with beer3pos</h2>

In [11]:
# Number of files handed to load_or_create_filelist(); equals the three
# beer3 input files read during clustering.
n_files = 3

In [12]:
# File list used for training, drawn from the clustered files in combined_list.
# NOTE(review): the printed order differs from the testing list built from the
# same arguments, so the selection looks randomised; no seed is set in the
# visible cells — confirm reproducibility.
training_list = load_or_create_filelist('pickles/beer3pos/training_list.p', combined_list, n_iter, n_files)

['beer3-file3.csv', 'beer3-file1.csv', 'beer3-file2.csv']
Saved to pickles/beer3pos/training_list.p


In [13]:
# File list used for testing, built from the same combined_list, n_iter and
# n_files as training_list.
# NOTE(review): the output shows the same three files as the training list, only
# in a different order, so training and testing use the same data — confirm
# this is intended.
testing_list = load_or_create_filelist('pickles/beer3pos/testing_list.p', combined_list, n_iter, n_files)

['beer3-file2.csv', 'beer3-file3.csv', 'beer3-file1.csv']
Saved to pickles/beer3pos/testing_list.p


In [14]:
# Experiment with method code 0 over the (mass_tol, rt_tol) grid in param_list.
# NOTE(review): the pickle name suggests code 0 selects feature matching —
# confirm in alignment.experiment.
exp_results_1a = run_experiment(0, training_list, testing_list, param_list, 'pickles/beer3pos/res_match_feature.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer3-file3.csv', 'beer3-file1.csv', 'beer3-file2.csv']
mass_tol=2, rt_tol=5, tp=231, fp=23, fn=45, prec=0.909, rec=0.837, f1=0.872, th_prob=1.000
mass_tol=2, rt_tol=10, tp=249, fp=28, fn=27, prec=0.899, rec=0.902, f1=0.901, th_prob=1.000
mass_tol=2, rt_tol=15, tp=253, fp=28, fn=23, prec=0.900, rec=0.917, f1=0.908, th_prob=1.000
mass_tol=2, rt_tol=20, tp=253, fp=30, fn=23, prec=0.894, rec=0.917, f1=0.905, th_prob=1.000
mass_tol=2, rt_tol=25, tp=253, fp=30, fn=23, prec=0.894, rec=0.917, f1=0.905, th_prob=1.000
mass_tol=2, rt_tol=30, tp=253, fp=30, fn=23, prec=0.894, rec=0.917, f1=0.905, th_prob=1.000
mass_tol=4, rt_tol=5, tp=231, fp=23, fn=45, prec=0.909, rec=0.837, f1=0.872, th_prob=1.000
mass_tol=4, rt_tol=10, tp=249, fp=28, fn=27, prec=0.899, rec=0.902, f1=0.901, th_prob=1.000
mass_tol=4, rt_tol=15, tp=253, fp=28, fn=23, prec=0.900, rec=0.917, f1=0.908, th_prob=1.000
mass_tol=4, rt_tol=20, tp=253, fp=30, fn=23, prec=0.894, rec=0.917, f1=0.905, th_prob=1.000


In [15]:
# Experiment with method code 1 over the (mass_tol, rt_tol) grid in param_list.
# NOTE(review): the pickle name suggests code 1 selects cluster matching —
# confirm in alignment.experiment.
exp_results_1b = run_experiment(1, training_list, testing_list, param_list, 'pickles/beer3pos/res_match_cluster.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer3-file3.csv', 'beer3-file1.csv', 'beer3-file2.csv']
mass_tol=2, rt_tol=5, tp=223, fp=20, fn=53, prec=0.918, rec=0.808, f1=0.859, th_prob=1.000
mass_tol=2, rt_tol=10, tp=241, fp=22, fn=35, prec=0.916, rec=0.873, f1=0.894, th_prob=1.000
mass_tol=2, rt_tol=15, tp=247, fp=22, fn=29, prec=0.918, rec=0.895, f1=0.906, th_prob=1.000
mass_tol=2, rt_tol=20, tp=247, fp=24, fn=29, prec=0.911, rec=0.895, f1=0.903, th_prob=1.000
mass_tol=2, rt_tol=25, tp=247, fp=24, fn=29, prec=0.911, rec=0.895, f1=0.903, th_prob=1.000
mass_tol=2, rt_tol=30, tp=247, fp=25, fn=29, prec=0.908, rec=0.895, f1=0.901, th_prob=1.000
mass_tol=4, rt_tol=5, tp=223, fp=20, fn=53, prec=0.918, rec=0.808, f1=0.859, th_prob=1.000
mass_tol=4, rt_tol=10, tp=241, fp=22, fn=35, prec=0.916, rec=0.873, f1=0.894, th_prob=1.000
mass_tol=4, rt_tol=15, tp=247, fp=22, fn=29, prec=0.918, rec=0.895, f1=0.906, th_prob=1.000
mass_tol=4, rt_tol=20, tp=247, fp=24, fn=29, prec=0.911, rec=0.895, f1=0.903, th_prob=1.000


In [16]:
# Experiment with method code 3 over the (mass_tol, rt_tol) grid in param_list.
# NOTE(review): the pickle name suggests code 3 selects the MWG method, yet the
# grid passed is param_list rather than param_list_mwg, and the log shows only
# mass_tol / rt_tol being varied — confirm this is intended.
exp_results_1c = run_experiment(3, training_list, testing_list, param_list, 'pickles/beer3pos/res_mwg.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer3-file3.csv', 'beer3-file1.csv', 'beer3-file2.csv']
mass_tol=2, rt_tol=5, tp=233, fp=23, fn=43, prec=0.910, rec=0.844, f1=0.876, th_prob=1.000
mass_tol=2, rt_tol=10, tp=253, fp=26, fn=23, prec=0.907, rec=0.917, f1=0.912, th_prob=1.000
mass_tol=2, rt_tol=15, tp=258, fp=25, fn=18, prec=0.912, rec=0.935, f1=0.923, th_prob=1.000
mass_tol=2, rt_tol=20, tp=258, fp=27, fn=18, prec=0.905, rec=0.935, f1=0.920, th_prob=1.000
mass_tol=2, rt_tol=25, tp=258, fp=27, fn=18, prec=0.905, rec=0.935, f1=0.920, th_prob=1.000
mass_tol=2, rt_tol=30, tp=258, fp=27, fn=18, prec=0.905, rec=0.935, f1=0.920, th_prob=1.000
mass_tol=4, rt_tol=5, tp=233, fp=23, fn=43, prec=0.910, rec=0.844, f1=0.876, th_prob=1.000
mass_tol=4, rt_tol=10, tp=253, fp=26, fn=23, prec=0.907, rec=0.917, f1=0.912, th_prob=1.000
mass_tol=4, rt_tol=15, tp=258, fp=25, fn=18, prec=0.912, rec=0.935, f1=0.923, th_prob=1.000
mass_tol=4, rt_tol=20, tp=258, fp=27, fn=18, prec=0.905, rec=0.935, f1=0.920, th_prob=1.000


<h2>Plotting</h2>

In [None]:
def plot_density(exp_res, title):
    """Draw a joint recall/precision density plot for one experiment.

    Every item of ``exp_res`` is unpacked as a 4-tuple
    ``(training_data, training_df, best_training_row, match_res)``; only the
    ``training_df`` entries are used. They are stacked row-wise, re-indexed
    from zero, and their "Rec" and "Prec" columns are drawn on a seaborn
    JointGrid: a KDE in the joint panel and shaded KDEs in the margins, with
    both axes limited to 0.7-1.0.

    exp_res -- iterable of 4-tuples as described above
    title   -- text set as the title of the top marginal axis
    """
    frames = [training_df for _, training_df, _, _ in exp_res]
    stacked = pd.concat(frames, axis=0).reset_index(drop=True)

    axis_range = (0.7, 1.0)
    grid = sns.JointGrid(x="Rec", y="Prec", data=stacked, xlim=axis_range, ylim=axis_range)
    grid = grid.plot_joint(sns.kdeplot)
    grid = grid.plot_marginals(sns.kdeplot, shade=True)

    joint_ax = grid.ax_joint
    joint_ax.set_xlabel('Rec')
    joint_ax.set_ylabel('Prec')
    # The title is attached to the x-marginal axis rather than the joint axis.
    grid.ax_marg_x.set_title(title)

In [None]:
# One density figure per experiment result, titled with the matching method.
for results, figure_title in [
    (exp_results_1a, 'Feature matching'),
    (exp_results_1b, 'Cluster matching'),
]:
    plot_density(results, figure_title)