<h1>Experiment with beer1pos</h1>

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import os
import sys

# Root of the metabolomics_tools checkout. The original hard-coded location is
# kept as the default; set METABOLOMICS_TOOLS_DIR to point the notebook at a
# different checkout without editing this cell.
basedir = os.environ.get('METABOLOMICS_TOOLS_DIR',
                         '/home/joewandy/git/metabolomics_tools')

# Put the checkout on the import path. The guard keeps sys.path free of
# duplicate entries when this cell is re-run in the same kernel.
if basedir not in sys.path:
    sys.path.append(basedir)

In [3]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip



In [4]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

Set up all the experiment parameters

In [5]:
# Build every input location from `basedir` (defined in the path-setup cell)
# so the checkout location is stated in one place only. With the default
# basedir these evaluate to the same strings as the original literals.
input_dir = os.path.join(basedir, 'alignment', 'input', 'beer1pos')
transformation_file = os.path.join(basedir, 'alignment', 'pos_transformations_full.yml')
# The ground-truth file sits inside the input directory.
gt_file = os.path.join(input_dir, 'ground_truth', 'beer1.positive.dat')

In [6]:
# Start from the library defaults, then override the fields tuned for this
# experiment. Overrides are applied in the order listed.
hp = AlignmentHyperPars()
for hp_field, hp_value in (
        ('within_file_mass_tol', 3),
        ('within_file_rt_tol', 5),
        ('across_file_mass_tol', 6),
        ('across_file_rt_tol', 30),
        ('alpha_mass', 1),
        ('dp_alpha', 1000.0),
        ('beta', 0.1),
        ('t', 0.0),
        ('mass_clustering_n_iterations', 400),
        ('rt_clustering_nsamps', 200),
        ('rt_clustering_burnin', 0)):
    setattr(hp, hp_field, hp_value)

# Echo the complete parameter set so the recorded output documents the run.
print(hp)

Hyperparameters across_file_mass_tol=6, across_file_rt_tol=30, alpha_mass=1, beta=0.1, dp_alpha=1000.0, mass_clustering_n_iterations=400, matching_alpha=0.3, rt_clustering_burnin=0, rt_clustering_nsamps=200, t=0.0, within_file_mass_tol=3, within_file_rt_tol=5


In [7]:
# Forwarded to run_experiment and, as `method=`, to aligner.evaluate_performance.
evaluation_method = 2
# Passed to load_or_create_filelist when the training/testing lists are built.
n_iter = 1

In [8]:
# Grid of (mass_tol, rt_tol) pairs: mass_tol in 2, 4, 6 and rt_tol in 5, 10, ..., 30.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
]

In [9]:
# Grid of (mass_tol, rt_tol, group_tol, alpha) tuples; alpha is scaled from the
# integer steps 0, 2, ..., 10 down to 0.0, 0.2, ..., 1.0.
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha / 10.0)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
    for group_tol in range(2, 11, 2)
    for alpha in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [10]:
# First-stage clustering for the input files; the recorded output shows the
# helper reporting "Loaded from ..." when the pickle is already present.
combined_list = load_or_create_clustering(
    'pickles/beer1pos/clustering.p',
    input_dir,
    transformation_file,
    hp)

Loaded from pickles/beer1pos/clustering.p


<h2>Run experiment with beer1pos</h2>

In [11]:
# Passed as the last argument to both load_or_create_filelist calls below.
n_files = 3

In [12]:
# Training file sets, backed by a pickle under pickles/beer1pos/.
training_list = load_or_create_filelist(
    'pickles/beer1pos/training_list.p',
    combined_list,
    n_iter,
    n_files)

Loaded from pickles/beer1pos/training_list.p
['beer1-file1.csv', 'beer1-file3.csv', 'beer1-file2.csv']


In [13]:
# Testing file sets, backed by a pickle under pickles/beer1pos/.
testing_list = load_or_create_filelist(
    'pickles/beer1pos/testing_list.p',
    combined_list,
    n_iter,
    n_files)

Loaded from pickles/beer1pos/testing_list.p
['beer1-file1.csv', 'beer1-file3.csv', 'beer1-file2.csv']


In [14]:
# Experiment 1a. The leading 0 presumably selects the matching mode (results
# are stored as res_match_feature.p) -- TODO confirm against run_experiment.
exp_results_1a = run_experiment(
    0,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_match_feature.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file)

Loaded from pickles/beer1pos/res_match_feature.p


In [None]:
# Experiment 1b. The leading 1 presumably selects the matching mode (results
# are stored as res_match_cluster.p) -- TODO confirm against run_experiment.
exp_results_1b = run_experiment(
    1,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_match_cluster.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file)

In [None]:
# Experiment 1c (results stored as res_mwg.p).
# NOTE(review): param_list_mwg is built earlier in this notebook but is never
# passed anywhere visible; this run receives the two-field param_list.
# Confirm that is intended.
exp_results_1c = run_experiment(
    3,
    training_list,
    testing_list,
    param_list,
    'pickles/beer1pos/res_mwg.p',
    hp,
    evaluation_method,
    transformation_file,
    gt_file)

In [None]:
def second_stage_clustering(training_list, i):
    """Align one training set with match mode 2 and tabulate its evaluation.

    Reads the notebook globals ``hp``, ``transformation_file``, ``gt_file``
    and ``evaluation_method``.

    Args:
        training_list: indexable collection; ``training_list[i]`` is a
            sequence of pairs whose first item exposes ``.filename`` and whose
            second item is handed to the aligner as that file's first-stage
            clustering result.
        i: index of the training set to process.

    Returns:
        pandas.DataFrame with the columns mass_tol, rt_tol, TP, FP, FN, Prec,
        Rec, F1 and Threshold. The first two columns repeat the across-file
        tolerances from ``hp``; the rest come from
        ``aligner.evaluate_performance``.
    """
    training_data = training_list[i]
    print("Iteration %d" % i)
    print("Training on %s" % [pair[0].filename for pair in training_data])

    # Captured up front: every output row is prefixed with these two values.
    tolerances = (hp.across_file_mass_tol, hp.across_file_rt_tol)
    files = [pair[0] for pair in training_data]
    clusterings = [pair[1] for pair in training_data]

    aligner = Aligner(files, None, transformation_file,
                      hp, verbose=False, seed=1234567890, parallel=False)
    aligner.run(2, first_stage_clustering_results=clusterings)

    results = aligner.evaluate_performance(gt_file, verbose=False, print_TP=True,
                                           method=evaluation_method)
    rows = [tolerances + result for result in results]
    columns = ['mass_tol', 'rt_tol', 'TP', 'FP', 'FN', 'Prec', 'Rec', 'F1', 'Threshold']
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Second-stage results for training set 0; df1 is pickled and overlaid by plot_scatter further down.
df1 = second_stage_clustering(training_list, 0)

Iteration 0
Training on ['beer1-file1.csv', 'beer1-file3.csv', 'beer1-file2.csv']
n 0	cluster_list=1	last_K = 1
n 1	cluster_list=7	last_K = 3


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    0.3s


n 2	cluster_list=1	last_K = 1
n 3	cluster_list=3	last_K = 2
n 4	cluster_list=6	last_K = 2
n 5	cluster_list=3	last_K = 1
n 6	cluster_list=3	last_K = 1
n 7	cluster_list=1	last_K = 1


[Parallel(n_jobs=1)]: Done   5 jobs       | elapsed:    0.6s
[Parallel(n_jobs=1)]: Done   8 jobs       | elapsed:    0.7s


n 8	cluster_list=1	last_K = 1
n 9	cluster_list=1	last_K = 1
n 10	cluster_list=15	last_K = 12
n 11	cluster_list=6	last_K = 3
n 12	cluster_list=1	last_K = 1
n 13	cluster_list=7	last_K = 3
n 14	cluster_list=1	last_K = 1
n 15	cluster_list=1	last_K = 1
n 16	cluster_list=2	last_K = 1
n 17	cluster_list=4	last_K = 3


[Parallel(n_jobs=1)]: Done  13 jobs       | elapsed:    2.0s
[Parallel(n_jobs=1)]: Done  18 jobs       | elapsed:    2.5s


n 18	cluster_list=5	last_K = 2
n 19	cluster_list=1	last_K = 1
n 20	cluster_list=2	last_K = 1
n 21	cluster_list=2	last_K = 1
n 22	cluster_list=2	last_K = 2
n 23	cluster_list=14	last_K = 6
n 24	cluster_list=1	last_K = 1
n 25	cluster_list=3	last_K = 3
n 26	cluster_list=2	last_K = 2
n 27	cluster_list=18	last_K = 7
n 28	cluster_list=18	last_K = 11
n 29	cluster_list=17	last_K = 17
n 30	cluster_list=12	last_K = 6
n 31	cluster_list=1	last_K = 1


[Parallel(n_jobs=1)]: Done  25 jobs       | elapsed:    3.5s
[Parallel(n_jobs=1)]: Done  32 jobs       | elapsed:    7.7s


n 32	cluster_list=2	last_K = 1
n 33	cluster_list=3	last_K = 3
n 34	cluster_list=8	last_K = 4
n 35	cluster_list=3	last_K = 1
n 36	cluster_list=1	last_K = 1
n 37	cluster_list=4	last_K = 2
n 38	cluster_list=2	last_K = 1
n 39	cluster_list=13	last_K = 6
n 40	cluster_list=1	last_K = 1
n 41	cluster_list=5	last_K = 2
n 42	cluster_list=1	last_K = 1
n 43	cluster_list=15	last_K = 9
n 44	cluster_list=1	last_K = 1
n 45	cluster_list=3	last_K = 1
n 46	cluster_list=2	last_K = 2
n 47	cluster_list=2	last_K = 2
n 48	cluster_list=4	last_K = 2
n 49	cluster_list=7	last_K = 4


[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:   10.8s


n 50	cluster_list=5	last_K = 3
n 51	cluster_list=2	last_K = 1
n 52	cluster_list=11	last_K = 5
n 53	cluster_list=2	last_K = 2
n 54	cluster_list=4	last_K = 4
n 55	cluster_list=2	last_K = 2
n 56	cluster_list=5	last_K = 3
n 57	cluster_list=3	last_K = 1
n 58	cluster_list=4	last_K = 3
n 59	cluster_list=14	last_K = 13
n 60	cluster_list=9	last_K = 4
n 61	cluster_list=1	last_K = 1
n 62	cluster_list=3	last_K = 1
n 63	cluster_list=1	last_K = 1
n 64	cluster_list=3	last_K = 1
n 65	cluster_list=3	last_K = 2
n 66	cluster_list=3	last_K = 1
n 67	cluster_list=3	last_K = 1
n 68	cluster_list=1	last_K = 1
n 69	cluster_list=5	last_K = 2
n 70	cluster_list=5	last_K = 3
n 71	cluster_list=1	last_K = 1


[Parallel(n_jobs=1)]: Done  61 jobs       | elapsed:   13.7s
[Parallel(n_jobs=1)]: Done  72 jobs       | elapsed:   14.5s


n 72	cluster_list=3	last_K = 1
n 73	cluster_list=17	last_K = 8
n 74	cluster_list=7	last_K = 5
n 75	cluster_list=5	last_K = 2
n 76	cluster_list=1	last_K = 1
n 77	cluster_list=2	last_K = 2
n 78	cluster_list=1	last_K = 1
n 79	cluster_list=9	last_K = 4
n 80	cluster_list=18	last_K = 8
n 81	cluster_list=11	last_K = 6
n 82	cluster_list=14	last_K = 10
n 83	cluster_list=1	last_K = 1
n 84	cluster_list=10	last_K = 5
n 85	cluster_list=2	last_K = 2
n 86	cluster_list=3	last_K = 1
n 87	cluster_list=6	last_K = 3
n 88	cluster_list=26	last_K = 23
n 89	cluster_list=3	last_K = 3
n 90	cluster_list=12	last_K = 4
n 91	cluster_list=6	last_K = 5
n 92	cluster_list=5	last_K = 2
n 93	cluster_list=16	last_K = 6
n 94	cluster_list=3	last_K = 1
n 95	cluster_list=2	last_K = 1
n 96	cluster_list=1	last_K = 1
n 97	cluster_list=11	last_K = 10


[Parallel(n_jobs=1)]: Done  85 jobs       | elapsed:   19.1s
[Parallel(n_jobs=1)]: Done  98 jobs       | elapsed:   24.9s


n 98	cluster_list=7	last_K = 3
n 99	cluster_list=5	last_K = 3
n 100	cluster_list=5	last_K = 3
n 101	cluster_list=1	last_K = 1
n 102	cluster_list=3	last_K = 1
n 103	cluster_list=5	last_K = 4
n 104	cluster_list=4	last_K = 3
n 105	cluster_list=4	last_K = 2
n 106	cluster_list=1	last_K = 1
n 107	cluster_list=1	last_K = 1
n 108	cluster_list=3	last_K = 1
n 109	cluster_list=3	last_K = 1
n 110	cluster_list=24	last_K = 15
n 111	cluster_list=1	last_K = 1
n 112	cluster_list=5	last_K = 5
n 113	cluster_list=1	last_K = 1
n 114	cluster_list=2	last_K = 1
n 115	cluster_list=3	last_K = 1
n 116	cluster_list=4	last_K = 3
n 117	cluster_list=1	last_K = 1
n 118	cluster_list=3	last_K = 2
n 119	cluster_list=3	last_K = 2
n 120	cluster_list=2	last_K = 2
n 121	cluster_list=14	last_K = 6
n 122	cluster_list=23	last_K = 9
n 123	cluster_list=7	last_K = 3
n 124	cluster_list=1	last_K = 1
n 125	cluster_list=3	last_K = 3
n 126	cluster_list=5	last_K = 3
n 127	cluster_list=1	last_K = 1


[Parallel(n_jobs=1)]: Done 113 jobs       | elapsed:   28.2s
[Parallel(n_jobs=1)]: Done 128 jobs       | elapsed:   31.3s


n 128	cluster_list=1	last_K = 1
n 129	cluster_list=5	last_K = 2
n 130	cluster_list=11	last_K = 5
n 131	cluster_list=3	last_K = 3
n 132	cluster_list=5	last_K = 3
n 133	cluster_list=2	last_K = 2
n 134	cluster_list=7	last_K = 3
n 135	cluster_list=3	last_K = 1
n 136	cluster_list=19	last_K = 13
n 137	cluster_list=7	last_K = 3
n 138	cluster_list=2	last_K = 1
n 139	cluster_list=6	last_K = 2
n 140	cluster_list=5	last_K = 3
n 141	cluster_list=4	last_K = 2
n 142	cluster_list=2	last_K = 1
n 143	cluster_list=1	last_K = 1
n 144	cluster_list=6	last_K = 3
n 145	cluster_list=3	last_K = 2
n 146	cluster_list=9	last_K = 5
n 147	cluster_list=4	last_K = 2
n 148	cluster_list=2	last_K = 1
n 149	cluster_list=9	last_K = 4
n 150	cluster_list=1	last_K = 1
n 151	cluster_list=3	last_K = 1
n 152	cluster_list=3	last_K = 2
n 153	cluster_list=1	last_K = 1
n 154	cluster_list=9	last_K = 5
n 155	cluster_list=3	last_K = 2
n 156	cluster_list=1	last_K = 1
n 157	cluster_list=3	last_K = 2
n 158	cluster_list=2	last_K = 2
n 159

[Parallel(n_jobs=1)]: Done 145 jobs       | elapsed:   35.1s
[Parallel(n_jobs=1)]: Done 162 jobs       | elapsed:   37.9s


n 162	cluster_list=3	last_K = 1
n 163	cluster_list=3	last_K = 1
n 164	cluster_list=18	last_K = 7
n 165	cluster_list=1	last_K = 1
n 166	cluster_list=10	last_K = 6
n 167	cluster_list=13	last_K = 5
n 168	cluster_list=14	last_K = 10
n 169	cluster_list=2	last_K = 2
n 170	cluster_list=1	last_K = 1
n 171	cluster_list=6	last_K = 4
n 172	cluster_list=13	last_K = 7
n 173	cluster_list=9	last_K = 3
n 174	cluster_list=2	last_K = 2
n 175	cluster_list=6	last_K = 4
n 176	cluster_list=12	last_K = 12
n 177	cluster_list=37	last_K = 16
n 178	cluster_list=5	last_K = 3
n 179	cluster_list=6	last_K = 3
n 180	cluster_list=8	last_K = 3
n 181	cluster_list=4	last_K = 2
n 182	cluster_list=6	last_K = 2
n 183	cluster_list=11	last_K = 8
n 184	cluster_list=8	last_K = 6
n 185	cluster_list=5	last_K = 3
n 186	cluster_list=1	last_K = 1
n 187	cluster_list=1	last_K = 1
n 188	cluster_list=1	last_K = 1
n 189	cluster_list=1	last_K = 1
n 190	cluster_list=3	last_K = 1
n 191	cluster_list=14	last_K = 6
n 192	cluster_list=1	last_K 

[Parallel(n_jobs=1)]: Done 181 jobs       | elapsed:   46.8s
[Parallel(n_jobs=1)]: Done 200 jobs       | elapsed:   54.9s


n 200	cluster_list=9	last_K = 7
n 201	cluster_list=6	last_K = 4
n 202	cluster_list=3	last_K = 1
n 203	cluster_list=1	last_K = 1
n 204	cluster_list=1	last_K = 1
n 205	cluster_list=21	last_K = 19
n 206	cluster_list=1	last_K = 1
n 207	cluster_list=8	last_K = 4
n 208	cluster_list=5	last_K = 4
n 209	cluster_list=1	last_K = 1
n 210	cluster_list=1	last_K = 1
n 211	cluster_list=2	last_K = 2
n 212	cluster_list=6	last_K = 3
n 213	cluster_list=9	last_K = 3
n 214	cluster_list=1	last_K = 1
n 215	cluster_list=3	last_K = 1
n 216	cluster_list=1	last_K = 1
n 217	cluster_list=1	last_K = 1
n 218	cluster_list=15	last_K = 6
n 219	cluster_list=1	last_K = 1
n 220	cluster_list=3	last_K = 1
n 221	cluster_list=1	last_K = 1
n 222	cluster_list=20	last_K = 17
n 223	cluster_list=4	last_K = 4
n 224	cluster_list=3	last_K = 1
n 225	cluster_list=2	last_K = 2
n 226	cluster_list=6	last_K = 4
n 227	cluster_list=5	last_K = 3
n 228	cluster_list=1	last_K = 1
n 229	cluster_list=3	last_K = 1
n 230	cluster_list=3	last_K = 1
n 2

In [None]:
# Persist the second-stage results.
# NOTE(review): nothing in the visible notebook reads this pickle back, so the
# plotting cells still need the clustering cell to have run in this kernel.
df1.to_pickle('pickles/beer1pos/df1.p')

<h2>Plotting</h2>

In [None]:
def plot_density(exp_res, title):
    """Plot the joint and marginal KDE of recall vs. precision.

    Args:
        exp_res: iterable of 4-item results; the second item of each is a
            DataFrame with 'Rec' and 'Prec' columns. All of them are pooled.
        title: text set as the title of the top marginal axis.
    """
    frames = [frame for _data, frame, _best_row, _match in exp_res]
    combined = pd.concat(frames, axis=0).reset_index(drop=True)

    # Both axes are limited to the 0.7-1.0 range.
    grid = sns.JointGrid(x="Rec", y="Prec", data=combined, xlim=(0.7, 1.0), ylim=(0.7, 1.0))
    grid = grid.plot_joint(sns.kdeplot)
    grid = grid.plot_marginals(sns.kdeplot, shade=True)

    joint_ax = grid.ax_joint
    joint_ax.set_xlabel('Rec')
    joint_ax.set_ylabel('Prec')
    # The title goes on the top marginal axis so it sits above the whole grid.
    grid.ax_marg_x.set_title(title)

In [None]:
# One density figure per matching strategy; exp_results_1b comes from the
# run_experiment(1, ...) cell, which must have been executed first.
plot_density(exp_results_1a, 'Feature matching')
plot_density(exp_results_1b, 'Cluster matching')

In [None]:
# Raw dump of the feature-matching results for inspection.
print(exp_results_1a)

In [None]:
# Raw dump of the cluster-matching results for inspection.
print(exp_results_1b)

In [None]:
def plot_scatter(exp_res, idx, df, title, ylim=(0.7, 1.0)):
    """Scatter one experiment's recall/precision and overlay a second result.

    Args:
        exp_res: indexable collection of 4-item results; the second item of
            ``exp_res[idx]`` is a DataFrame with 'Rec' and 'Prec' columns.
        idx: index of the result in ``exp_res`` to plot.
        df: object with ``Rec`` and ``Prec`` attributes, drawn on top of the
            scatter as a red line with point markers.
        title: text set as the title of the top marginal axis.
        ylim: y-axis limits for the joint axes. Defaults to the original
            hard-coded (0.7, 1.0); pass None to leave the limits untouched.
    """
    _data, training_df, _best_row, _match = exp_res[idx]
    training_df = training_df.reset_index(drop=True)

    g = sns.JointGrid(x="Rec", y="Prec", data=training_df)
    g = g.plot_joint(plt.scatter, color=".5", edgecolor="white")

    # Draw the overlay on the joint axes explicitly instead of relying on
    # whichever figure/axes pyplot currently treats as active.
    joint_ax = g.ax_joint
    joint_ax.plot(df.Rec, df.Prec, '.r-')

    g = g.plot_marginals(sns.kdeplot, shade=True)

    joint_ax.set_xlabel('Rec')
    joint_ax.set_ylabel('Prec')
    if ylim is not None:
        joint_ax.set_ylim(ylim)
    # The title goes on the top marginal axis so it sits above the whole grid.
    g.ax_marg_x.set_title(title)

In [None]:
# Feature-matching scatter with the second-stage results (df1) overlaid.
# NOTE(review): 'Title' looks like a placeholder -- confirm the intended figure title.
plot_scatter(exp_results_1a, 0, df1, 'Title')

In [None]:
# NOTE(review): identical to the preceding cell (same results, same index, same
# placeholder title) -- confirm whether another result set was meant here.
plot_scatter(exp_results_1a, 0, df1, 'Title')