<h1>Experiment with beer2pos</h1>

In [1]:
# Load IPython's autoreload extension in mode 2 (re-import modules before
# running each cell) and draw matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
# Root of the local metabolomics_tools checkout, added to sys.path so that the
# `alignment` package can be imported by the cells below.
# NOTE: this is a machine-specific absolute path; edit it when running the
# notebook on another machine.
basedir = '/home/joewandy/git/metabolomics_tools'
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if basedir not in sys.path:
    sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

Set up all the experiment parameters: input paths, hyperparameters, and the tolerance grids to search.

In [15]:
# Every input lives under the checkout rooted at `basedir` (defined in the
# setup cell at the top of the notebook), so the paths are derived from it
# rather than repeating the absolute prefix three times.
input_dir = os.path.join(basedir, 'alignment', 'input', 'beer2pos')
transformation_file = os.path.join(basedir, 'alignment', 'pos_transformations_full.yml')
# The ground-truth file sits inside the input directory.
gt_file = os.path.join(input_dir, 'ground_truth', 'beer2.positive.dat')

In [6]:
# Start from a fresh AlignmentHyperPars and override the settings listed
# below, in this order.  Anything not listed keeps its constructor value
# (e.g. matching_alpha, which appears in the printed summary but is not set
# here).
hp = AlignmentHyperPars()
hp_overrides = (
    ('within_file_mass_tol', 3),
    ('within_file_rt_tol', 10),
    ('across_file_mass_tol', 10),
    ('across_file_rt_tol', 60),
    ('alpha_mass', 1),
    ('dp_alpha', 1000.0),
    ('beta', 0.1),
    ('t', 0.0),
    ('mass_clustering_n_iterations', 400),
    ('rt_clustering_nsamps', 100),
    ('rt_clustering_burnin', 0),
)
for hp_name, hp_value in hp_overrides:
    setattr(hp, hp_name, hp_value)

# Echo the resulting configuration so it is recorded with the notebook output.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=60, alpha_mass=1, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=400, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=3, within_file_rt_tol=10


In [7]:
# Both values are handed unchanged to the experiment helpers further down:
# evaluation_method to run_experiment, n_iter to load_or_create_filelist.
# NOTE(review): what evaluation method code 2 stands for is not visible in
# this notebook -- presumably defined in alignment.experiment; confirm there.
evaluation_method = 2
n_iter = 1

In [8]:
# Search grid of (mass_tol, rt_tol) pairs: mass_tol in {2, 4, 6} crossed with
# rt_tol in {5, 10, ..., 30}, i.e. 18 pairs with rt_tol varying fastest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
]

In [9]:
# Extended search grid of (mass_tol, rt_tol, group_tol, alpha) tuples: the
# same mass/rt tolerances as param_list, crossed with group_tol in
# {2, 4, ..., 10} and alpha in {0.0, 0.2, ..., 1.0} -- 540 tuples, with alpha
# varying fastest.
# NOTE(review): no cell in the visible part of this notebook uses this list.
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha / 10.0)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
    for group_tol in range(2, 11, 2)
    for alpha in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [10]:
# First-stage clustering of every file in input_dir using the hyperparameters
# in hp.  In the recorded run this read three beer2 files and reported
# "Saved to pickles/beer2pos/clustering.p".
# NOTE(review): presumably an existing pickle is loaded instead of being
# recomputed, in which case it must be deleted after changing hp -- confirm
# in alignment.experiment.
combined_list = load_or_create_clustering('pickles/beer2pos/clustering.p', input_dir, transformation_file, hp)

6997 features read from beer2-file1.csv
7354 features read from beer2-file2.csv
7620 features read from beer2-file3.csv


[Parallel(n_jobs=4)]: Done   1 out of   3 | elapsed:  2.3min remaining:  4.7min
[Parallel(n_jobs=4)]: Done   2 out of   3 | elapsed:  2.6min remaining:  1.3min
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  2.8min finished


Saved to pickles/beer2pos/clustering.p
Created 6997 clusters
Created 7354 clusters
Created 7620 clusters
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/6997
Assigning possible transformations 0/7354
Assigning possible transformations 0/7620
Assigning possible transformations 500/6997
Assigning possible transformations 500/7354
Assigning possible transformations 500/7620
Assigning possible transformations 1000/6997
Assigning possible transformations 1000/7354
Assigning possible transformations 1000/7620
Assigning possible transformations 1500/6997
Assigning possible transformations 1500/7354
Assigning possible transformations 1500/7620
Assigning possible transformations 2000/6997
Assigning possible transformations 2000/7354
Assigning possible transformations 2000/7620
Assigning possible transformations 2500/6997
Assigning possible transformations 2500/7354
Assigning possible transformations 2500/7620
Assi

<h2>Run experiment with beer2pos</h2>

In [11]:
# Passed to load_or_create_filelist below.  The recorded clustering run read
# exactly three beer2 files, and the file lists printed below each contain all
# three of them.
n_files = 3

In [12]:
# Training file sets built from combined_list using n_iter and n_files.  The
# recorded run printed one set holding all three beer2 files and saved it to
# the pickle path given here.
training_list = load_or_create_filelist('pickles/beer2pos/training_list.p', combined_list, n_iter, n_files)

['beer2-file1.csv', 'beer2-file2.csv', 'beer2-file3.csv']
Saved to pickles/beer2pos/training_list.p


In [13]:
# Testing file sets, built the same way as the training sets but stored in a
# separate pickle.
# NOTE(review): in the recorded run the testing set holds the same three files
# as the training set (only the order differs), so the experiments below are
# evaluated on the files they were tuned on -- confirm this is intended.
testing_list = load_or_create_filelist('pickles/beer2pos/testing_list.p', combined_list, n_iter, n_files)

['beer2-file3.csv', 'beer2-file2.csv', 'beer2-file1.csv']
Saved to pickles/beer2pos/testing_list.p


In [16]:
# Experiment 1a: run the (mass_tol, rt_tol) grid in param_list with method
# code 0.
# NOTE(review): code 0 is presumably feature matching, judging by the results
# pickle name and the 'Feature matching' plot title used for this result --
# confirm against run_experiment.
exp_results_1a = run_experiment(0, training_list, testing_list, param_list, 'pickles/beer2pos/res_match_feature.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer2-file1.csv', 'beer2-file2.csv', 'beer2-file3.csv']
mass_tol=2, rt_tol=5, tp=233, fp=36, fn=41, prec=0.866, rec=0.850, f1=0.858, th_prob=1.000
mass_tol=2, rt_tol=10, tp=252, fp=37, fn=22, prec=0.872, rec=0.920, f1=0.895, th_prob=1.000
mass_tol=2, rt_tol=15, tp=253, fp=40, fn=21, prec=0.863, rec=0.923, f1=0.892, th_prob=1.000
mass_tol=2, rt_tol=20, tp=253, fp=41, fn=21, prec=0.861, rec=0.923, f1=0.891, th_prob=1.000
mass_tol=2, rt_tol=25, tp=253, fp=43, fn=21, prec=0.855, rec=0.923, f1=0.888, th_prob=1.000
mass_tol=2, rt_tol=30, tp=253, fp=43, fn=21, prec=0.855, rec=0.923, f1=0.888, th_prob=1.000
mass_tol=4, rt_tol=5, tp=233, fp=36, fn=41, prec=0.866, rec=0.850, f1=0.858, th_prob=1.000
mass_tol=4, rt_tol=10, tp=252, fp=37, fn=22, prec=0.872, rec=0.920, f1=0.895, th_prob=1.000
mass_tol=4, rt_tol=15, tp=253, fp=40, fn=21, prec=0.863, rec=0.923, f1=0.892, th_prob=1.000
mass_tol=4, rt_tol=20, tp=253, fp=41, fn=21, prec=0.861, rec=0.923, f1=0.891, th_prob=1.000


In [17]:
# Experiment 1b: same grid and inputs as 1a, with method code 1.
# NOTE(review): code 1 is presumably cluster matching, judging by the results
# pickle name and the 'Cluster matching' plot title used for this result --
# confirm against run_experiment.
exp_results_1b = run_experiment(1, training_list, testing_list, param_list, 'pickles/beer2pos/res_match_cluster.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer2-file1.csv', 'beer2-file2.csv', 'beer2-file3.csv']
mass_tol=2, rt_tol=5, tp=221, fp=28, fn=53, prec=0.888, rec=0.807, f1=0.845, th_prob=1.000
mass_tol=2, rt_tol=10, tp=242, fp=29, fn=32, prec=0.893, rec=0.883, f1=0.888, th_prob=1.000
mass_tol=2, rt_tol=15, tp=243, fp=32, fn=31, prec=0.884, rec=0.887, f1=0.885, th_prob=1.000
mass_tol=2, rt_tol=20, tp=243, fp=33, fn=31, prec=0.880, rec=0.887, f1=0.884, th_prob=1.000
mass_tol=2, rt_tol=25, tp=243, fp=34, fn=31, prec=0.877, rec=0.887, f1=0.882, th_prob=1.000
mass_tol=2, rt_tol=30, tp=243, fp=34, fn=31, prec=0.877, rec=0.887, f1=0.882, th_prob=1.000
mass_tol=4, rt_tol=5, tp=221, fp=28, fn=53, prec=0.888, rec=0.807, f1=0.845, th_prob=1.000
mass_tol=4, rt_tol=10, tp=242, fp=29, fn=32, prec=0.893, rec=0.883, f1=0.888, th_prob=1.000
mass_tol=4, rt_tol=15, tp=243, fp=32, fn=31, prec=0.884, rec=0.887, f1=0.885, th_prob=1.000
mass_tol=4, rt_tol=20, tp=243, fp=33, fn=31, prec=0.880, rec=0.887, f1=0.884, th_prob=1.000


In [18]:
# Experiment 1c: same inputs as 1a/1b, with method code 3 (results stored in
# res_mwg.p).
# NOTE(review): this run searches param_list, i.e. (mass_tol, rt_tol) pairs,
# although param_list_mwg -- which adds group_tol and alpha -- is defined
# above and is not used by any visible cell.  Check whether param_list_mwg
# was intended here.
exp_results_1c = run_experiment(3, training_list, testing_list, param_list, 'pickles/beer2pos/res_mwg.p', hp, evaluation_method, transformation_file, gt_file)

Iteration 0
Training on ['beer2-file1.csv', 'beer2-file2.csv', 'beer2-file3.csv']
mass_tol=2, rt_tol=5, tp=231, fp=38, fn=43, prec=0.859, rec=0.843, f1=0.851, th_prob=1.000
mass_tol=2, rt_tol=10, tp=250, fp=39, fn=24, prec=0.865, rec=0.912, f1=0.888, th_prob=1.000
mass_tol=2, rt_tol=15, tp=251, fp=42, fn=23, prec=0.857, rec=0.916, f1=0.885, th_prob=1.000
mass_tol=2, rt_tol=20, tp=251, fp=43, fn=23, prec=0.854, rec=0.916, f1=0.884, th_prob=1.000
mass_tol=2, rt_tol=25, tp=251, fp=45, fn=23, prec=0.848, rec=0.916, f1=0.881, th_prob=1.000
mass_tol=2, rt_tol=30, tp=253, fp=44, fn=21, prec=0.852, rec=0.923, f1=0.886, th_prob=1.000
mass_tol=4, rt_tol=5, tp=231, fp=38, fn=43, prec=0.859, rec=0.843, f1=0.851, th_prob=1.000
mass_tol=4, rt_tol=10, tp=250, fp=39, fn=24, prec=0.865, rec=0.912, f1=0.888, th_prob=1.000
mass_tol=4, rt_tol=15, tp=251, fp=42, fn=23, prec=0.857, rec=0.916, f1=0.885, th_prob=1.000
mass_tol=4, rt_tol=20, tp=251, fp=43, fn=23, prec=0.854, rec=0.916, f1=0.884, th_prob=1.000


<h2>Plotting</h2>

In [None]:
def plot_density(exp_res, title):
    """Plot the joint density of recall against precision for a set of results.

    exp_res -- iterable of 4-tuples (training_data, training_df,
               best_training_row, match_res).  Only training_df is used; the
               frames are stacked and must provide 'Rec' and 'Prec' columns.
    title   -- text set as the title of the top marginal axis.

    Both axes are limited to the range 0.7 to 1.0.
    Raises ValueError if exp_res is empty.
    """
    # Unpacking four fields keeps the requirement that every item is a 4-tuple.
    training_dfs = [training_df for _, training_df, _, _ in exp_res]
    if not training_dfs:
        # pd.concat would raise ValueError here as well, with a less helpful
        # message.
        raise ValueError('plot_density: exp_res contains no results to plot')
    combined = pd.concat(training_dfs, axis=0).reset_index(drop=True)

    # Joint KDE of recall vs. precision with shaded marginal KDEs.
    g = sns.JointGrid(x="Rec", y="Prec", data=combined, xlim=(0.7, 1.0), ylim=(0.7, 1.0))
    g = g.plot_joint(sns.kdeplot)
    g = g.plot_marginals(sns.kdeplot, shade=True)
    g.ax_joint.set_xlabel('Rec')
    g.ax_joint.set_ylabel('Prec')
    # The title goes on the top marginal axis so it sits above the whole grid.
    g.ax_marg_x.set_title(title)

In [None]:
# Recall/precision density plots for experiments 1a and 1b.
# NOTE(review): exp_results_1c is not plotted in the visible cells.
plot_density(exp_results_1a, 'Feature matching')
plot_density(exp_results_1b, 'Cluster matching')