<h1>Experiment with beerallpos</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
# Root of the metabolomics_tools checkout. It is put on sys.path so that the
# project imports in the following cells (e.g. `alignment`) can be resolved.
basedir = '/home/joewandy/git/metabolomics_tools'
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if basedir not in sys.path:
    sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

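Set up all the experiment parameters: input paths, clustering hyperparameters, and the tolerance grids searched in the experiments below.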

In [5]:
# All inputs live under the project checkout, so build them from `basedir`
# (set in the sys.path cell above) instead of repeating the absolute prefix.
# The resulting strings are identical to the previously hard-coded paths.
input_dir = os.path.join(basedir, 'alignment', 'input', 'beerallpos')
transformation_file = os.path.join(basedir, 'alignment', 'pos_transformations_full.yml')
gt_file = os.path.join(input_dir, 'ground_truth', 'beerall.positive.dat')

In [6]:
# Hyperparameters shared by the clustering and experiment calls below. Fields
# not assigned here keep whatever AlignmentHyperPars() sets by default (the
# printout shows matching_alpha=0.3, which is never assigned in this cell).
hp = AlignmentHyperPars()

# Within-file and across-file mass / RT tolerances.
# NOTE(review): units are not visible in this notebook -- confirm in alignment.models.
hp.within_file_mass_tol = 3
hp.within_file_rt_tol = 10
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 60

# Model / sampler settings; their exact semantics are defined by the alignment
# package, not shown here.
hp.alpha_mass = 1
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0
hp.mass_clustering_n_iterations = 400
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Parenthesised form prints the same text under Python 2 and is also valid
# Python 3 syntax.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=60, alpha_mass=1, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=400, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=3, within_file_rt_tol=10


In [7]:
# Passed unchanged to every run_experiment(...) call below. The meaning of the
# value 2 is decided inside run_experiment, which is not shown here -- TODO confirm.
evaluation_method = 2
# Passed to load_or_create_filelist(...) below; presumably the number of
# train/test iterations (the experiment logs show a single "Iteration 0").
n_iter = 1

In [8]:
# Grid of (mass_tol, rt_tol) pairs handed to run_experiment below:
# mass_tol in {2, 4, 6} crossed with rt_tol in {5, 10, ..., 30},
# with rt_tol varying fastest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
]

In [9]:
# Grid of (mass_tol, rt_tol, group_tol, alpha) tuples: the same mass/RT grid as
# param_list, crossed with group_tol in {2, 4, ..., 10} and alpha in
# {0.0, 0.2, ..., 1.0}; alpha varies fastest.
# NOTE(review): no call visible in this notebook uses param_list_mwg.
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha / 10.0)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
    for group_tol in range(2, 11, 2)
    for alpha in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [10]:
# First-stage clustering of the files in input_dir, using the transformation
# file and hyperparameters set above. The log below reports the result being
# saved to the pickle path given here.
# NOTE(review): the name suggests an existing pickle is reloaded instead of
# recomputed -- confirm in alignment.experiment.
combined_list = load_or_create_clustering('pickles/beerallpos/clustering.p', input_dir, transformation_file, hp)

6733 features read from beerall-file1.csv
6997 features read from beerall-file2.csv
7553 features read from beerall-file3.csv


[Parallel(n_jobs=4)]: Done   1 out of   3 | elapsed:  2.2min remaining:  4.4min
[Parallel(n_jobs=4)]: Done   2 out of   3 | elapsed:  2.3min remaining:  1.2min
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  2.7min finished


Saved to pickles/beerallpos/clustering.p
Created 6733 clusters
Created 6997 clusters
Created 7553 clusters
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/6733
Assigning possible transformations 0/6997
Assigning possible transformations 0/7553
Assigning possible transformations 500/6733
Assigning possible transformations 500/6997
Assigning possible transformations 500/7553
Assigning possible transformations 1000/6733
Assigning possible transformations 1000/6997
Assigning possible transformations 1000/7553
Assigning possible transformations 1500/6733
Assigning possible transformations 1500/6997
Assigning possible transformations 1500/7553
Assigning possible transformations 2000/6733
Assigning possible transformations 2000/6997
Assigning possible transformations 2000/7553
Assigning possible transformations 2500/6733
Assigning possible transformations 2500/6997
Assigning possible transformations 2500/7553
As

<h2>Run experiment with beerallpos</h2>

In [11]:
# Passed to load_or_create_filelist(...) below; presumably the number of files
# per training/testing set. Three input files were read by the clustering step.
n_files = 3

In [12]:
# File selection used for training, built from combined_list with n_iter and
# n_files; the output below shows the chosen file names and that the list is
# saved to this pickle path.
training_list = load_or_create_filelist('pickles/beerallpos/training_list.p', combined_list, n_iter, n_files)

['beerall-file3.csv', 'beerall-file1.csv', 'beerall-file2.csv']
Saved to pickles/beerallpos/training_list.p


In [13]:
# File selection used for testing, built with the same arguments as the
# training list but cached under a separate pickle path.
# NOTE(review): the outputs show the same three files in both lists (only the
# order differs), so training and testing run on the same data -- confirm
# this is intended.
testing_list = load_or_create_filelist('pickles/beerallpos/testing_list.p', combined_list, n_iter, n_files)

['beerall-file1.csv', 'beerall-file2.csv', 'beerall-file3.csv']
Saved to pickles/beerallpos/testing_list.p


In [14]:
# Experiment 1a: first argument 0; results are cached as res_match_feature.p
# and plotted later under the title 'Feature matching'.
# NOTE(review): the mapping from this integer to a matching method lives in
# run_experiment, which is not shown here -- confirm.
exp_results_1a = run_experiment(0, training_list, testing_list, param_list, 'pickles/beerallpos/res_match_feature.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beerall-file3.csv', 'beerall-file1.csv', 'beerall-file2.csv']
mass_tol=2, rt_tol=5, tp=183, fp=34, fn=44, prec=0.843, rec=0.806, f1=0.824, th_prob=1.000
mass_tol=2, rt_tol=10, tp=198, fp=36, fn=29, prec=0.846, rec=0.872, f1=0.859, th_prob=1.000
mass_tol=2, rt_tol=15, tp=198, fp=37, fn=29, prec=0.843, rec=0.872, f1=0.857, th_prob=1.000
mass_tol=2, rt_tol=20, tp=199, fp=39, fn=28, prec=0.836, rec=0.877, f1=0.856, th_prob=1.000
mass_tol=2, rt_tol=25, tp=203, fp=39, fn=24, prec=0.839, rec=0.894, f1=0.866, th_prob=1.000
mass_tol=2, rt_tol=30, tp=203, fp=39, fn=24, prec=0.839, rec=0.894, f1=0.866, th_prob=1.000
mass_tol=4, rt_tol=5, tp=183, fp=34, fn=44, prec=0.843, rec=0.806, f1=0.824, th_prob=1.000
mass_tol=4, rt_tol=10, tp=198, fp=36, fn=29, prec=0.846, rec=0.872, f1=0.859, th_prob=1.000
mass_tol=4, rt_tol=15, tp=198, fp=37, fn=29, prec=0.843, rec=0.872, f1=0.857, th_prob=1.000
mass_tol=4, rt_tol=20, tp=199, fp=39, fn=28, prec=0.836, rec=0.877, f1=0.856, th_prob=

In [15]:
# Experiment 1b: first argument 1; results are cached as res_match_cluster.p
# and plotted later under the title 'Cluster matching'.
# NOTE(review): the mapping from this integer to a matching method lives in
# run_experiment, which is not shown here -- confirm.
exp_results_1b = run_experiment(1, training_list, testing_list, param_list, 'pickles/beerallpos/res_match_cluster.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beerall-file3.csv', 'beerall-file1.csv', 'beerall-file2.csv']
mass_tol=2, rt_tol=5, tp=167, fp=30, fn=60, prec=0.848, rec=0.736, f1=0.788, th_prob=1.000
mass_tol=2, rt_tol=10, tp=183, fp=32, fn=44, prec=0.851, rec=0.806, f1=0.828, th_prob=1.000
mass_tol=2, rt_tol=15, tp=183, fp=33, fn=44, prec=0.847, rec=0.806, f1=0.826, th_prob=1.000
mass_tol=2, rt_tol=20, tp=185, fp=34, fn=42, prec=0.845, rec=0.815, f1=0.830, th_prob=1.000
mass_tol=2, rt_tol=25, tp=187, fp=36, fn=40, prec=0.839, rec=0.824, f1=0.831, th_prob=1.000
mass_tol=2, rt_tol=30, tp=187, fp=36, fn=40, prec=0.839, rec=0.824, f1=0.831, th_prob=1.000
mass_tol=4, rt_tol=5, tp=167, fp=30, fn=60, prec=0.848, rec=0.736, f1=0.788, th_prob=1.000
mass_tol=4, rt_tol=10, tp=183, fp=32, fn=44, prec=0.851, rec=0.806, f1=0.828, th_prob=1.000
mass_tol=4, rt_tol=15, tp=183, fp=33, fn=44, prec=0.847, rec=0.806, f1=0.826, th_prob=1.000
mass_tol=4, rt_tol=20, tp=185, fp=34, fn=42, prec=0.845, rec=0.815, f1=0.830, th_prob=

In [16]:
# Experiment 1c: first argument 3; results are cached as res_mwg.p.
# NOTE(review): this call passes param_list (mass_tol, rt_tol pairs), while the
# 4-field grid param_list_mwg defined earlier is never used -- confirm which
# grid method 3 is supposed to receive.
exp_results_1c = run_experiment(3, training_list, testing_list, param_list, 'pickles/beerallpos/res_mwg.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beerall-file3.csv', 'beerall-file1.csv', 'beerall-file2.csv']
mass_tol=2, rt_tol=5, tp=181, fp=36, fn=46, prec=0.834, rec=0.797, f1=0.815, th_prob=1.000
mass_tol=2, rt_tol=10, tp=196, fp=38, fn=31, prec=0.838, rec=0.863, f1=0.850, th_prob=1.000
mass_tol=2, rt_tol=15, tp=199, fp=37, fn=28, prec=0.843, rec=0.877, f1=0.860, th_prob=1.000
mass_tol=2, rt_tol=20, tp=198, fp=40, fn=29, prec=0.832, rec=0.872, f1=0.852, th_prob=1.000
mass_tol=2, rt_tol=25, tp=202, fp=40, fn=25, prec=0.835, rec=0.890, f1=0.861, th_prob=1.000
mass_tol=2, rt_tol=30, tp=202, fp=40, fn=25, prec=0.835, rec=0.890, f1=0.861, th_prob=1.000
mass_tol=4, rt_tol=5, tp=181, fp=36, fn=46, prec=0.834, rec=0.797, f1=0.815, th_prob=1.000
mass_tol=4, rt_tol=10, tp=196, fp=38, fn=31, prec=0.838, rec=0.863, f1=0.850, th_prob=1.000
mass_tol=4, rt_tol=15, tp=199, fp=37, fn=28, prec=0.843, rec=0.877, f1=0.860, th_prob=1.000
mass_tol=4, rt_tol=20, tp=198, fp=40, fn=29, prec=0.832, rec=0.872, f1=0.852, th_prob=

<h2>Plotting</h2>

In [None]:
def plot_density(exp_res, title):
    """Draw a joint KDE plot of the "Rec" and "Prec" columns over all runs.

    exp_res -- iterable of 4-item entries (training_data, training_df,
               best_training_row, match_res); only training_df is used.
    title   -- text set as the title of the top marginal axis.

    The training dataframes are stacked row-wise with a fresh index, then
    plotted on a seaborn JointGrid with both axes limited to (0.7, 1.0).
    Nothing is returned.
    """
    # Unpacking each entry into four names keeps the requirement that every
    # item has exactly four fields.
    stacked = pd.concat(
        [training_df for _data, training_df, _best_row, _match in exp_res],
        axis=0
    ).reset_index(drop=True)

    grid = (
        sns.JointGrid(x="Rec", y="Prec", data=stacked,
                      xlim=(0.7, 1.0), ylim=(0.7, 1.0))
        .plot_joint(sns.kdeplot)
        .plot_marginals(sns.kdeplot, shade=True)
    )

    grid.ax_joint.set_xlabel('Rec')
    grid.ax_joint.set_ylabel('Prec')
    # The title goes on the top marginal axis so it sits above the whole figure.
    grid.ax_marg_x.set_title(title)

In [None]:
# One density plot per experiment result set.
# NOTE(review): exp_results_1c is computed above but not plotted here --
# confirm whether that is intentional.
plot_density(exp_results_1a, 'Feature matching')
plot_density(exp_results_1b, 'Cluster matching')