<h1>Experiment with beer3pos</h1>

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import os
import sys
# Root of the metabolomics_tools checkout. It is put on sys.path so the
# `alignment` imports further down can resolve (presumably the package lives
# under this directory -- confirm).
basedir = '/home/joewandy/git/metabolomics_tools'
# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if basedir not in sys.path:
    sys.path.append(basedir)

In [None]:
import numpy as np
import pylab as plt
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
from IPython.display import display, HTML

import cPickle
import random
import copy
import glob
import gzip

In [None]:
from alignment.models import HyperPars as AlignmentHyperPars
from alignment.experiment import *

<h2>Experiment Parameters</h2>

Set up the experiment parameters: input paths, clustering hyperparameters, evaluation settings, and the parameter grids to search.

In [None]:
# Input locations, derived from `basedir` (set in the path-setup cell above)
# instead of repeating the absolute prefix, so relocating the checkout only
# requires changing `basedir`. The resulting strings are the same as the
# previous hard-coded literals on POSIX systems.
alignment_dir = os.path.join(basedir, 'alignment')
# Directory handed to load_or_create_clustering as the data source.
input_dir = os.path.join(alignment_dir, 'input', 'beer3pos')
# YAML file passed as `transformation_file` to the experiment helpers.
transformation_file = os.path.join(alignment_dir, 'pos_transformations_full.yml')
# File passed as `gt_file` to the experiment/evaluation helpers
# (ground truth, judging by its path).
gt_file = os.path.join(input_dir, 'ground_truth', 'beer3.positive.dat')

In [None]:
# Hyperparameter object that is passed as `hp` to the clustering, experiment
# and evaluation calls later in this notebook.
hp = AlignmentHyperPars()    
# Mass / retention-time tolerances, separately for within-file and across-file use.
# NOTE(review): units are not visible here (presumably ppm for mass and
# seconds for RT) -- confirm against alignment.models.HyperPars.
hp.within_file_mass_tol = 3
hp.within_file_rt_tol = 10
hp.across_file_mass_tol = 10
hp.across_file_rt_tol = 60
# Model settings; their exact meaning is defined by HyperPars and its
# consumers, not in this notebook -- verify there before tuning.
hp.alpha_mass = 1
hp.dp_alpha = 1000.0
hp.beta = 0.1
hp.t = 0.0
# Iteration / sample counts for the mass and RT clustering steps
# (burn-in is set to 0 here).
hp.mass_clustering_n_iterations = 10000
hp.rt_clustering_nsamps = 100
hp.rt_clustering_burnin = 0

# Python 2 print statement: show the configured hyperparameters.
print hp

In [None]:
# Evaluation mode handed to run_experiment / second_stage_clustering below.
# NOTE(review): a later cell reassigns this to 3, so the value seen by
# subsequent cells depends on which cells have been executed.
evaluation_method = 2
# Passed to load_or_create_filelist together with n_files
# (presumably the number of repeated file selections -- confirm).
n_iter = 1

In [None]:
# Grid of (mass_tol, rt_tol) pairs: mass_tol in 2, 4, 6 and
# rt_tol in 5, 10, ..., 30, with mass_tol varying slowest.
param_list = [
    (mass_tol, rt_tol)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
]

In [None]:
# Grid of (mass_tol, rt_tol, group_tol, alpha) tuples. The tolerance ranges
# match param_list; group_tol runs over 2, 4, ..., 10 and alpha over
# 0.0, 0.2, ..., 1.0 (integer steps divided by 10.0).
param_list_mwg = [
    (mass_tol, rt_tol, group_tol, alpha / 10.0)
    for mass_tol in range(2, 7, 2)
    for rt_tol in range(5, 31, 5)
    for group_tol in range(2, 11, 2)
    for alpha in range(0, 11, 2)
]

<h2>Create the first-stage clustering for all input files -- Gibbs, mh_biggest=True</h2>

In [None]:
# First-stage clustering for the files in input_dir, using hp and the
# transformation file. Judging by the helper's name, the result is loaded from
# the pickle path when available and created otherwise -- confirm in
# alignment.experiment.
combined_list = load_or_create_clustering('pickles/beer3pos/clustering.p', input_dir, transformation_file, hp)

<h2>Run experiment with beer3pos</h2>

In [None]:
# Passed to load_or_create_filelist below (presumably the number of input
# files per training/testing set -- confirm).
n_files = 3

In [None]:
# Training file list built from combined_list with n_iter and n_files.
# NOTE(review): the pickle lives under 'pickles/test/' while the clustering
# pickle is under 'pickles/beer3pos/' -- confirm this is intended.
training_list = load_or_create_filelist('pickles/test/training_list.p', combined_list, n_iter, n_files)

In [None]:
# Testing file list. The arguments are identical to the training call except
# for the pickle path, so whether the two lists differ depends on the helper
# (e.g. random selection) -- confirm.
testing_list = load_or_create_filelist('pickles/test/testing_list.p', combined_list, n_iter, n_files)

In [None]:
# Experiment 1a: first argument 0 (presumably the matching-method selector),
# searched over param_list. Plotted later under the title
# 'Feature matching (MW)'.
exp_results_1a = run_experiment(0, training_list, testing_list, param_list, 'pickles/test/res_match_feature.p', hp, evaluation_method, transformation_file, gt_file)

In [None]:
# Experiment 1b: first argument 1, same parameter grid as 1a. Plotted later
# under the title 'Cluster matching (Cluster-Match)'.
exp_results_1b = run_experiment(1, training_list, testing_list, param_list, 'pickles/test/res_match_cluster.p', hp, evaluation_method, transformation_file, gt_file)

In [None]:
# Repeat experiment 1b with evaluation method 3 and q=3.
# NOTE(review): this overwrites the global evaluation_method (previously 2),
# so every cell executed after this one also uses 3. If that is not intended,
# pass the value locally instead of reassigning the global.
evaluation_method = 3
exp_results_1b_q3 = run_experiment(1, training_list, testing_list, param_list, 'pickles/test/res_match_cluster_q3.p', 
                                   hp, evaluation_method, transformation_file, gt_file, q=3)

In [None]:
# Experiment 1c: first argument 3, searched over the larger param_list_mwg
# grid. Plotted later under the title 'Modified feature matching (MWG)'.
# NOTE(review): evaluation_method is 3 here if the preceding cell has run,
# whereas experiments 1a and 1b were run with 2 -- confirm the results are
# comparable.
exp_results_1c = run_experiment(3, training_list, testing_list, param_list_mwg, 'pickles/test/res_mwg.p', hp, evaluation_method, transformation_file, gt_file)

In [None]:
# Second-stage clustering on training_list with the adduct likelihood enabled.
# The clustering is written to clustering_out (re-loaded in the next cell) and
# the result table to df_out. The meaning of the positional 0 is not visible
# here -- check second_stage_clustering's signature.
cluster_cluster = second_stage_clustering(hp, training_list, 0, evaluation_method, transformation_file, gt_file, 
                                          clustering_out='pickles/test/cc_adduct.p', df_out='pickles/test/cluster_cluster.p',
                                          use_adduct_likelihood=True)

In [None]:
filename = 'pickles/test/cc_adduct.p'
with gzip.GzipFile(filename, 'rb') as f:
    ac = cPickle.load(f)
    print "Loaded from %s" % filename

In [None]:
# Hand-written toy peaksets used by the scratch cells below. Each entry is a
# list of 2-tuples; the cells below sort and compare on the second element
# (presumably (peak id, file index) -- confirm).
peaksets = [
    [(210, 0), (470, 2), (217, 1), (1001, 3), (10, 4)],
    [(1816, 0), (2059, 2), (2161, 1)],
    [(242, 0), (237, 2)]
]

In [None]:
import itertools
import operator

In [None]:
# Entries that a combination must contain at least one of to be kept by the
# filtering cell below. Both are members of the first toy peakset.
whitelist = [(210, 0), (470, 2)]

In [None]:
q = 2
results = []
for ps in peaksets:
    print 'ps = %s' % ps
    for combi in itertools.combinations(ps, q):
        temp = list(combi)
        temp.sort(key=operator.itemgetter(1))
        temp = tuple(temp)
        found = False
        for to_check in temp:
            if to_check in whitelist:
                found = True
                break
        if found:
            print temp
            results.append(temp)

In [None]:
results = []
for ps in peaksets:
    print ps
    if len(ps) == 1:
        results.append(ps)
    else:
        for item1 in ps:
            for item2 in ps:
                if item1 == item2:
                    continue
                elif item1[1] > item2[1]:
                    continue
                else:
                    print (item1, item2)
                    results.append((item1, item2))

In [None]:
# Evaluate the re-loaded clustering `ac` against the ground truth for q=2 and
# q=3. The positional 3 is presumably the evaluation method -- confirm against
# evaluate_performance's signature.
cluster_cluster_2_q2 = evaluate_performance(hp, ac, gt_file, 3, q=2)
cluster_cluster_2_q3 = evaluate_performance(hp, ac, gt_file, 3, q=3)

In [None]:
# Scatter plots combining the experiment 1b results with the q=2 and q=3
# evaluations of the second-stage clustering.
# NOTE(review): the first plot passes exp_results_1b (run with q unset) rather
# than exp_results_1b_q3 for the q=3 comparison -- confirm that is intended.
plot_scatter(exp_results_1b, 0, cluster_cluster_2_q2, 'Cluster-Cluster (Beer) -- 2')
plot_scatter(exp_results_1b, 0, cluster_cluster_2_q3, 'Cluster-Cluster (Beer) -- 3')

In [None]:
# Scatter plot of experiment 1b together with the adduct-likelihood
# second-stage result. The same call is repeated in the Plotting section.
plot_scatter(exp_results_1b, 0, cluster_cluster, 'Cluster-Cluster (Beer)')

In [None]:
# Same second-stage clustering as for cluster_cluster, but with the adduct
# likelihood disabled and separate output pickles, for comparison.
cluster_cluster_no_adduct = second_stage_clustering(hp, training_list, 0, evaluation_method, transformation_file, gt_file, 
                                          clustering_out='pickles/test/cc_no_adduct.p', df_out='pickles/test/cluster_cluster_no_adduct.p',
                                          use_adduct_likelihood=False)

<h2>Plotting</h2>

In [None]:
# Plot appearance for the figures below: "notebook" context with doubled font
# scale and thicker lines, on seaborn's white-grid style.
sns.set_context("notebook", font_scale=2.0, rc={"lines.linewidth": 2.5})
sns.set_style("whitegrid")

In [None]:
# Density plots for the three experiments, drawn with identical axis limits
# so the panels can be compared directly.
plot_density(exp_results_1a, 'Feature matching (MW)', xlim=(0.75, 0.95), ylim=(0.85, 1.0))
plot_density(exp_results_1b, 'Cluster matching (Cluster-Match)', xlim=(0.75, 0.95), ylim=(0.85, 1.0))
plot_density(exp_results_1c, 'Modified feature matching (MWG)', xlim=(0.75, 0.95), ylim=(0.85, 1.0))

In [None]:
# Experiment 1b together with the second-stage result that uses the adduct
# likelihood. This repeats an earlier cell, now drawn with the plot styling
# set at the top of this section.
plot_scatter(exp_results_1b, 0, cluster_cluster, 'Cluster-Cluster (Beer)')

In [None]:
# Same comparison as the previous plot, using the second-stage result computed
# without the adduct likelihood.
plot_scatter(exp_results_1b, 0, cluster_cluster_no_adduct, 'Cluster-Cluster -- Without Adduct Likelihood (Beer)')