<h1>Experiment with beer1pos</h1>

In [1]:
# Reload edited project modules automatically so code changes are picked up live.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import os
import sys
# Root of the metabolomics_tools checkout; it is put on sys.path so that the
# project packages imported in later cells (e.g. `alignment`) can be resolved.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the notebook elsewhere.
basedir = '/home/joewandy/git/metabolomics_tools'
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if basedir not in sys.path:
    sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip



In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

Set up all the experiment parameters: input paths, hyperparameters, and the tolerance grids to search.

In [5]:
# All inputs for this experiment sit under the project's alignment directory.
# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
_alignment_dir = '/home/joewandy/git/metabolomics_tools/alignment'

# Directory passed to load_or_create_clustering as the source of input files.
input_dir = _alignment_dir + '/input/beer1pos'
# Transformations file handed to both the clustering step and run_experiment.
transformation_file = _alignment_dir + '/pos_transformations_full.yml'
# Ground-truth file handed to run_experiment.
gt_file = input_dir + '/ground_truth/ground_truth.txt'

In [17]:
# Hyperparameter object shared by the clustering step and every run_experiment
# call below. Any attribute not assigned here keeps the class default (the
# recorded output shows matching_alpha=0.3, which is never set in this cell).
hp = AlignmentHyperPars()

# Tolerances used within a single file and across files.
# NOTE(review): units are not visible from this notebook -- confirm in
# alignment.models before changing them.
hp.within_file_mass_tol = 5
hp.within_file_rt_tol = 30
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 60

# Model hyperparameters (semantics are defined by AlignmentHyperPars).
hp.alpha_mass = 1.0
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0

# Iteration / sampling settings for the mass and RT clustering steps.
hp.mass_clustering_n_iterations = 400
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Parenthesised form prints the same text on Python 2 and also parses on
# Python 3, unlike the bare `print hp` statement.
print(hp)

Hyperparameters across_file_mass_tol=10, across_file_rt_tol=60, alpha_mass=1.0, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=400, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=100, t=0.0, within_file_mass_tol=5, within_file_rt_tol=30


In [18]:
# Iteration count forwarded to load_or_create_filelist when the training and
# testing file lists are built.
n_iter = 1
# Evaluation selector forwarded to every run_experiment call.
# NOTE(review): the meaning of the value 2 is defined in alignment.experiment -- confirm there.
evaluation_method = 2

In [19]:
# Grid of (mass_tol, rt_tol) pairs: mass_tol in 2, 4, ..., 10 crossed with
# rt_tol in 5, 10, ..., 60 (60 combinations, mass_tol varying slowest).
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 11, 2)
    for rt_tol in range(5, 61, 5)
]

In [20]:
# Grid of (mass_tol, rt_tol, group_tol, alpha) tuples. The first two fields use
# the same ranges as param_list; group_tol runs over 2, 4, ..., 10 and alpha
# over 0.0, 0.2, ..., 1.0 (integer steps divided by 10.0). 1800 tuples in total.
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha_step / 10.0)
    for mass_tol in range(2, 11, 2)
    for rt_tol in range(5, 61, 5)
    for group_tol in range(2, 11, 2)
    for alpha_step in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [21]:
# First-stage clustering of every input file. The recorded run took roughly
# six minutes and saved its result to the pickle path given here.
# NOTE(review): the helper name suggests the pickle is reused when it already
# exists -- confirm in alignment.experiment.
combined_list = load_or_create_clustering(
    'pickles/beer1pos/beer1_pos_clustering.p',
    input_dir,
    transformation_file,
    hp
)

6733 features read from beer1-file1.csv
7586 features read from beer1-file2.csv
6823 features read from beer1-file3.csv


[Parallel(n_jobs=4)]: Done   1 out of   3 | elapsed:  5.1min remaining: 10.1min
[Parallel(n_jobs=4)]: Done   2 out of   3 | elapsed:  5.8min remaining:  2.9min
[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:  5.8min finished


Saved to pickles/beer1pos/beer1_pos_clustering.p
Created 6733 clusters
Created 7586 clusters
Created 6823 clusters
Binning with mh_biggest = True
Binning with mh_biggest = True
Binning with mh_biggest = True
Assigning possible transformations 0/6733
Assigning possible transformations 0/7586
Assigning possible transformations 0/6823
Assigning possible transformations 500/6733
Assigning possible transformations 500/7586
Assigning possible transformations 500/6823
Assigning possible transformations 1000/6733
Assigning possible transformations 1000/7586
Assigning possible transformations 1000/6823
Assigning possible transformations 1500/6733
Assigning possible transformations 1500/7586
Assigning possible transformations 1500/6823
Assigning possible transformations 2000/6733
Assigning possible transformations 2000/7586
Assigning possible transformations 2000/6823
Assigning possible transformations 2500/6733
Assigning possible transformations 2500/7586
Assigning possible transformations 2500

<h2>Run experiment with beer1pos</h2>

In [22]:
# Forwarded to load_or_create_filelist below; the recorded training and
# testing lists each contain 3 file names.
n_files = 3

In [23]:
# File list used for training, built from the clustered inputs. The recorded
# run saved it to this pickle path.
# NOTE(review): the recorded file order differs from the input order, so the
# helper presumably shuffles; no random seed is set in this notebook -- confirm.
training_list = load_or_create_filelist(
    'pickles/beer1pos/training_list.p',
    combined_list,
    n_iter,
    n_files
)

['beer1-file3.csv', 'beer1-file1.csv', 'beer1-file2.csv']
Saved to pickles/beer1pos/training_list.p


In [24]:
# File list used for testing, built from the same clustered inputs as the
# training list. The recorded run saved it to this pickle path.
# NOTE(review): with 3 input files and n_files = 3, the recorded training and
# testing lists contain the same files -- confirm this is intended.
testing_list = load_or_create_filelist(
    'pickles/beer1pos/testing_list.p',
    combined_list,
    n_iter,
    n_files
)

['beer1-file1.csv', 'beer1-file2.csv', 'beer1-file3.csv']
Saved to pickles/beer1pos/testing_list.p


In [27]:
# Experiment with method selector 0 over the (mass_tol, rt_tol) grid; results
# are plotted later under the title 'Feature matching'.
# NOTE: the recorded run of this cell was stopped by a KeyboardInterrupt after
# three parameter combinations, so it did not finish in that session.
exp_results_1a = run_experiment(
    0,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_match_feature.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file
)

Iteration 0
Training on ['beer1-file3.csv', 'beer1-file1.csv', 'beer1-file2.csv']
mass_tol=2, rt_tol=5, tp=233, fp=20, fn=44, prec=0.921, rec=0.841, f1=0.879, th_prob=1.000
mass_tol=2, rt_tol=10, tp=251, fp=23, fn=26, prec=0.916, rec=0.906, f1=0.911, th_prob=1.000
mass_tol=2, rt_tol=15, tp=258, fp=26, fn=19, prec=0.908, rec=0.931, f1=0.920, th_prob=1.000


KeyboardInterrupt: 

In [None]:
# Experiment with method selector 1 over the (mass_tol, rt_tol) grid; results
# are plotted later under the title 'Cluster matching'.
exp_results_1b = run_experiment(
    1,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_match_cluster.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file
)

In [None]:
# Experiment with method selector 3; its results pickle is named res_mwg.p.
# NOTE(review): this call receives `param_list` (mass_tol, rt_tol pairs), while
# the 4-field `param_list_mwg` grid built earlier is not used by any visible
# cell -- confirm which grid method 3 expects.
exp_results_1c = run_experiment(
    3,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_mwg.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file
)

<h2>Plotting</h2>

In [None]:
def plot_density(exp_res, title):
    """Draw a joint precision/recall density plot for a set of experiment results.

    Parameters
    ----------
    exp_res : iterable of 4-item sequences
        Only the second item of each entry is used; it must be a DataFrame
        with 'Rec' and 'Prec' columns.
    title : str
        Title placed on the top marginal axis.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # Collect the training DataFrame (second field) of every result entry and
    # stack them into one frame with a fresh 0..n-1 index.
    frames = [training_df for _, training_df, _, _ in exp_res]
    combined = pd.concat(frames, axis=0).reset_index(drop=True)

    # Both axes are fixed to the 0.7-1.0 window.
    grid = sns.JointGrid(x="Rec", y="Prec", data=combined, xlim=(0.7, 1.0), ylim=(0.7, 1.0))
    grid = grid.plot_joint(sns.kdeplot).plot_marginals(sns.kdeplot, shade=True)

    grid.ax_joint.set_xlabel('Rec')
    grid.ax_joint.set_ylabel('Prec')
    grid.ax_marg_x.set_title(title)

In [None]:
# One density figure per experiment, drawn in the same order as before.
for results, figure_title in [
    (exp_results_1a, 'Feature matching'),
    (exp_results_1b, 'Cluster matching'),
]:
    plot_density(results, figure_title)