<h1>Checking the low recall when aligning M1 data with method #1</h1>

In [1]:
# Enable the autoreload extension in mode 2 (re-import modules before each
# cell executes) and render matplotlib figures inline in the notebook.
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import sys
import os
# Insert the parent directory of sys.path[0] at position 1 of the module
# search path -- presumably so the project packages imported in the next
# cell resolve when the notebook lives in a sub-directory (confirm layout).
sys.path.insert(1, os.path.join(sys.path[0], '..'))

In [3]:
from discretisation.preprocessing import FileLoader
from models import HyperPars as AlignmentHyperPars
from discretisation.adduct_cluster import AdductCluster, Peak
from shared_bin_matching import SharedBinMatching as Aligner
from ground_truth import GroundTruth

<h2>0. Precursor Clustering on each file</h2>

Define input parameters

In [None]:
# Directory handed to FileLoader.load_model_input below.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
input_dir = '/home/joewandy/git/metabolomics_tools/alignment/input/M1_4'
# No database file is passed to the loader.
database_file = None
# File passed to AdductCluster as its `transformation_file` argument.
transformation_file = '/home/joewandy/git/metabolomics_tools/discretisation/mulsubs/pos_transformations.yml'

In [None]:
# Hyperparameters used by the clustering cell below. The settings are kept in
# one ordered table and applied in that order.
hp = AlignmentHyperPars()
hp_settings = [
    # tolerances used when clustering peaks within a single file
    ('within_file_mass_tol', 10),
    ('within_file_rt_tol', 5),
    # tolerances used when matching across files
    ('across_file_mass_tol', 30),
    ('across_file_rt_tol', 100),
    ('alpha_mass', 1.0),
    ('dp_alpha', 100.0),
    ('t', 0),
    # iteration / sampling budgets
    ('mass_clustering_n_iterations', 100),
    ('rt_clustering_nsamps', 200),
    ('rt_clustering_burnin', 100),
]
for hp_name, hp_value in hp_settings:
    setattr(hp, hp_name, hp_value)

print(hp)

In [None]:
# Load the model input from input_dir; the result is iterated one entry per
# file by the clustering cell below.
# NOTE(review): the two positional zeros are not explained in this notebook --
# confirm their meaning against FileLoader.load_model_input.
loader = FileLoader()
data_list = loader.load_model_input(input_dir, database_file, 0, 0, make_bins=False)

For some reason, the cell below, which performs precursor clustering for each file, takes **a lot** longer to run inside the notebook than when the same code is run outside it. Not sure why yet.

In [None]:
# Run precursor (adduct) clustering independently on every loaded file and
# keep the fitted clusterer of each file, in the same order as data_list.
clustering_results = []
# `j` is the position of the file in data_list; it is only used in the
# progress message (it was previously referenced without being defined).
for j, peak_data in enumerate(data_list):

    ac = AdductCluster(mass_tol=hp.within_file_mass_tol, rt_tol=hp.within_file_rt_tol,
                       alpha=hp.alpha_mass, mh_biggest=True, transformation_file=transformation_file, verbose=2)

    peak_list = peak_data.features
    ac.init_from_list(peak_list)

    # Initialise, then run the configured number of VB steps for this file.
    ac.init_vb()
    for n in range(hp.mass_clustering_n_iterations):
        print("VB step %d file %d " % (n, j))
        sys.stdout.flush()  # push the progress line out immediately
        ac.vb_step()

    clustering_results.append(ac)

<hr/>

<h2>Checking</h2>

In [4]:
# Resume a previously saved alignment project from disk (see the
# "Project loaded from ..." message in the cell output).
# NOTE(review): machine-specific absolute path.
aligner = Aligner.resume_from('/home/joewandy/git/metabolomics_tools/alignment/input/M1_4/results.project')

Project loaded from /home/joewandy/git/metabolomics_tools/alignment/input/M1_4/results.project time taken = 19.7304940224


In [6]:
# Pull out the parts of the resumed project that the checks below rely on.
# Note: this rebinds `data_list` and `hp`, replacing any values set by the
# cells in section 0.
data_list = aligner.data_list
hp = aligner.hp
file_adduct_clusterers = aligner.clustering_results # list of adduct clusterer for each file
file_clusterings = aligner.file_data # dict of file idx to the list of clusters in that file
print hp

Hyperparameters across_file_mass_tol=30.0, across_file_rt_tol=100.0, alpha_mass=1.0, beta=0.1, dp_alpha=100.0, mass_clustering_n_iterations=100, rt_clustering_burnin=20, rt_clustering_nsamps=40, t=0.0, within_file_mass_tol=10.0, within_file_rt_tol=5.0


Find some big clusters in the first file. We have performed MAP assignment of each peak feature into its most likely cluster.

In [None]:
def plot_biggest(file_idx, threshold):
    """Summarise the cluster sizes of one file and plot its big clusters.

    Prints the number of singleton clusters (N == 1), the number of clusters
    with at least `threshold` members, and the size of the largest cluster in
    the file. Then calls `cluster_plot` on the file's adduct clusterer for
    every cluster that reached the threshold.

    Args:
        file_idx: index into the module-level `file_adduct_clusterers` and
            `file_clusterings`.
        threshold: minimum member count (`cluster.N`) for a cluster to be
            counted as big and plotted.
    """
    ac = file_adduct_clusterers[file_idx]
    clusters_list = file_clusterings[file_idx]

    singleton_count = sum(1 for cluster in clusters_list if cluster.N == 1)
    big_clusters = [cluster for cluster in clusters_list if cluster.N >= threshold]

    # The maximum is taken over *all* clusters, not only those above the
    # threshold, so the reported size is correct even when nothing qualifies
    # as big. A file without clusters reports 0 instead of raising IndexError.
    biggest_size = max(cluster.N for cluster in clusters_list) if clusters_list else 0

    print("Singleton count {}".format(singleton_count))
    print("{} big clusters found".format(len(big_clusters)))
    print("Biggest has {} members".format(biggest_size))

    for c in big_clusters:
        ac.cluster_plot(c)

In [None]:
# File index 0: report cluster sizes and plot clusters with at least 4 members.
plot_biggest(file_idx=0, threshold=4)

In [None]:
# File index 1: report cluster sizes and plot clusters with at least 4 members.
plot_biggest(file_idx=1, threshold=4)

Collect all the aligned peaksets into a list

In [None]:
# One peakset per alignment result, in the order stored on the aligner.
# Iterates the results directly; the unused `i = 0` counter is gone.
aligned_peaksets = [result.peakset for result in aligner.alignment_results]

Load the ground truth and check the annotations

In [None]:
# Build the ground truth from its text file, together with the project's
# file list and the loaded per-file data.
# NOTE(review): machine-specific absolute path.
file_list = aligner.file_list
gt_file = '/home/joewandy/git/metabolomics_tools/alignment/input/M1_4/ground_truth/ground_truth.txt'
gt = GroundTruth(gt_file, file_list, data_list)

In [None]:
def found_in(gt_entry, aligned_peaksets):
    """Return True if one aligned peakset contains every feature of `gt_entry`.

    Features are compared by the value of their `_get_key()` method. Keys are
    collected into sets, so they must be hashable (they are already used as
    dictionary keys for `aligner.annotations` elsewhere in this notebook).

    Args:
        gt_entry: iterable of features forming one ground-truth group.
        aligned_peaksets: iterable of peaksets, each an iterable of features.

    Returns:
        True when at least one peakset holds all keys of the group. False
        otherwise, including when `gt_entry` is empty or there are no peaksets.
    """
    gt_keys = set(f._get_key() for f in gt_entry)
    if not gt_keys:
        # An empty group cannot be meaningfully "found".
        return False

    for ps in aligned_peaksets:
        # The check is evaluated afresh for every peakset. Previously the
        # `all_found` flag was never initialised or reset, so a single miss
        # made every later peakset fail as well.
        ps_keys = set(f._get_key() for f in ps)
        if gt_keys.issubset(ps_keys):
            return True
    return False

In [None]:
# Split the ground-truth groups into those recovered by some aligned peakset
# and those that were not.
groups = gt.gt_features
found_list = []
not_found_list = []
for group in groups:
    found = found_in(group, aligned_peaksets)
    destination = found_list if found else not_found_list
    destination.append(group)

In [None]:
# Report how many ground-truth groups were / were not recovered.
n_groups = len(groups)
n_found = len(found_list)
n_missing = len(not_found_list)
print("Aligned peaksets that agree with ground truth = %d/%d" % (n_found, n_groups))
print("Aligned peaksets that disagree with ground truth = %d/%d" % (n_missing, n_groups))

Print the found ones

In [None]:
# List every recovered ground-truth group with the annotation stored for each
# of its features.
i = 0  # module-level running index; ends at len(found_list)
for matched_group in found_list:
    print("Group %d" % i)
    for feature in matched_group:
        feature_key = feature._get_key()
        annotation = aligner.annotations[feature_key]
        line = "- id %s mass %.4f rt %.2f MAP_trans %s" % (
            feature_key, feature.mass, feature.rt, annotation)
        print(line)
    i += 1

Print the not-found ones

In [None]:
def find_overlap(gt_entry, aligned_peaksets):
    """Return the aligned peaksets that share at least one feature with `gt_entry`.

    Features are matched on the value of `_get_key()`. The result preserves
    the order of `aligned_peaksets`.
    """
    def shares_a_feature(peakset):
        keys_in_peakset = [member._get_key() for member in peakset]
        return any(feature._get_key() in keys_in_peakset for feature in gt_entry)

    return [peakset for peakset in aligned_peaksets if shares_a_feature(peakset)]

def print_peakset(peakset):
    """Print a tab-indented listing of one peakset.

    Emits a "Peakset" header, then one line per feature showing its key, mass,
    retention time and the annotation stored under that key in
    `aligner.annotations`.
    """
    print("\tPeakset")
    for feature in peakset:
        feature_key = feature._get_key()
        annotation = aligner.annotations[feature_key]
        fields = (feature_key, feature.mass, feature.rt, annotation)
        print("\t- id %s mass %.4f rt %.2f MAP_trans %s" % fields)

In [None]:
# For every ground-truth group that was not recovered, print its features and
# then every aligned peakset that contains at least one of them.
# The group index comes from enumerate(), so numbering always starts at 0 and
# no longer depends on whatever value `i` was left at by an earlier cell.
for group_idx, group in enumerate(not_found_list):

    print("Ground Truth Group %d" % group_idx)
    for f in group:
        key = f._get_key()
        print("- id %s mass %.4f rt %.2f" % (key, f.mass, f.rt))

    print("Overlapping peaksets:")
    overlap = find_overlap(group, aligned_peaksets)
    for ps in overlap:
        print_peakset(ps)
    print("")  # blank line between groups

Some peaks seem to have disappeared from the output aligned peaksets. This looks like a bug, which would explain the lower recall. For example, see peak (1327, 1) below:

In [None]:
# Trace a single peak through the three stages: raw input, per-file
# clustering, and the final aligned peaksets.
target_key = (1327, 1)  # value compared against f._get_key()
target_file_idx = 1     # index used for both data_list and file_clusterings

print("Found in input file?")
for feature in data_list[target_file_idx].features:
    feature_key = feature._get_key()
    if feature_key == target_key:
        print("- id %s mass %.4f rt %.2f" % (feature_key, feature.mass, feature.rt))

# check in the output of first-stage clustering
print("\nAnd also in the clustering")
target_file_clusterings = file_clusterings[target_file_idx]
for cluster in target_file_clusterings:
    # cluster.members yields (feature, poss) pairs; only the feature is used here
    member_keys = [member._get_key() for member, poss in cluster.members]
    if target_key in member_keys:
        print("Cluster %d %.4f %.2f" % (cluster.id, cluster.mu_mass, cluster.mu_rt))
        print(member_keys)
        for member, poss in cluster.members:
            print("- id %s mass %.4f rt %.2f" % (member._get_key(), member.mass, member.rt))

# check in the output aligned peaksets
print("\nBut missing in the output ??!!")
for peakset in aligned_peaksets:
    peakset_keys = [member._get_key() for member in peakset]
    if target_key in peakset_keys:
        print(peakset_keys)

<hr/>