Positive mode voting with the direct xcms output
===

In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import numpy as np
from scipy import stats

# Project root directory. The trailing slash matters: paths are built from it
# by plain string concatenation.
base_dir = '/Users/simon/git/ms1fun/'
# Make the project's code/ and dbs/ directories importable (order preserved).
sys.path.extend([base_dir + 'code', base_dir + 'dbs'])

from corr_cluster import Peak,BetaLike,CorrCluster
from formula import Formula
import pylab as plt
%matplotlib inline



In [13]:
# Input data sets, one tuple per file: (directory prefix, base file name, flag).
# The flag is forwarded as the `correct` argument of CorrCluster further down.
files = [
    (
        '/Users/simon/Dropbox/BioResearch/Meta_clustering/ms1fundata/Beer/PositiveMode/Beer3Full/xcms/',
        'Beer_3_Full1',
        False,
    ),
    (
        '/Users/simon/Dropbox/BioResearch/Meta_clustering/ms1fundata/urine/pos/xcms/',
        'Urine_37_fullscan1_POS',
        False,
    ),
]

In [14]:
import transformation
# Load the positive-mode transformation definitions shipped in dbs/.
transformations_path = base_dir + 'dbs/pos_transformations_reduced.yml'
transformations = transformation.load_from_file(transformations_path)
# Report how many were loaded as a quick sanity check.
print("Loaded " + str(len(transformations)) + " transformations")

Loaded 87 transformations


In [16]:
# A single BetaLike instance is shared by every CorrCluster run below.
bl = BetaLike()
# Maps base file name -> CorrCluster object built from that file's peak list.
clusterings = {}
for prefix, filename, correct in files:
    print("Running file " + filename)
    csvfile = prefix + filename + '.xcms.csv'
    # NOTE(review): greedy_thresh=0.7 and rt_thresh=2 are hard-coded here;
    # their units/meaning are defined by CorrCluster -- confirm before tuning.
    coc = CorrCluster(
        bl,
        csvfile,
        greedy_thresh=0.7,
        correct=correct,
        file_type='xcms.csv',
        rt_thresh=2
    )
    clusterings[filename] = coc

Running file Beer_3_Full1
Loaded 8394 peaks
Greedy clustering done, resulting in 439 clusters
Running file Urine_37_fullscan1_POS
Loaded 8388 peaks
Greedy clustering done, resulting in 473 clusters


In [17]:
from voter import Voter,PeakGroup
# Maps base file name -> flat list of the groups produced by the Voter for
# every cluster found in that file.
groups = {}
for entry in files:
    # A fresh Voter is created per file from the loaded transformations.
    v = Voter(transformations)
    filename = entry[1]
    print("Performing voting on file {}".format(filename))
    file_groups = []
    for cluster in clusterings[filename].clusters:
        # make_groups is called once per cluster; results are concatenated.
        file_groups.extend(v.make_groups(cluster.members))
    groups[filename] = file_groups

Performing voting on file Beer_3_Full1
Performing voting on file Urine_37_fullscan1_POS


In [18]:
from databases import Standards
st = Standards()
# Maps base file name -> result of Standards.get_group_hits for that file's
# groups (called with use_max_vote=True).
all_hits = dict(
    (entry[1], st.get_group_hits(groups[entry[1]], use_max_vote=True))
    for entry in files
)

    

In [19]:
# Write per-file text reports of the voting results and collect usage counts.
#
# For every entry in `files` this cell writes, under xcms_output/pos/:
#   <name>_by_vote.txt          groups sorted by vote, highest first
#   <name>_by_M.txt             groups sorted by M, ascending
#   <name>_matched_std.txt      groups returned by Standards.get_group_hits
#   <name>_tran_counts.txt      transformation, count, fraction of total
#   <name>_adduct_counts.txt    adduct, count, fraction of total
#   <name>_fragment_counts.txt  fragment, count, fraction of total
# The count dictionaries are additionally stored per file in the global_* dicts.

OUTPUT_DIR = 'xcms_output/pos/'
MEMBER_HEADER = '\tPeak m/z,Peak rt,Peak intensity,transformation (transformed mass,vote)\n'


def _fraction(count, total):
    """Return count / total as a float, or 0.0 when total is zero."""
    # Guard against ZeroDivisionError when nothing was counted at all.
    return count / (1.0 * total) if total else 0.0


def _vote_mass_title(group):
    """Return the 'vote: ..., M: ...' title line for a group."""
    return "vote: {}, M: {}\n".format(group.vote, group.M)


def _write_group_report(path, titled_groups):
    """Write one report block per (title_line, group) pair to `path`.

    Each block is the title line, a column header, one line per group member
    (members ordered by their transformation's vote, highest first) and a
    trailing blank line.
    """
    with open(path, 'w') as out:
        for title, group in titled_groups:
            out.write(title)
            out.write(MEMBER_HEADER)
            members = sorted(group.members, key=lambda m: m[1].vote, reverse=True)
            # Local name `trans` avoids rebinding the imported `transformation`
            # module, which the original inline loops overwrote.
            for peak, trans, transmass in members:
                out.write("\t{:.4f},{:.4f},{:.2e},{} ({:.4f},{})\n".format(
                    peak.mass, peak.rt, peak.intensity, trans, transmass, trans.vote))
            out.write('\n')


def _write_counts(path, keys, counts, total):
    """Write 'key,count,fraction' lines to `path`, one per key, in `keys` order."""
    with open(path, 'w') as out:
        for key in keys:
            out.write("{},{},{:.4f}\n".format(key, counts[key], _fraction(counts[key], total)))


# Create the output directory up front so the open() calls below cannot fail
# just because it is missing.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

global_adduct_counts = {}
global_fragment_counts = {}
global_transformation_counts = {}

# When False, groups with a single member are skipped in the transformation
# counts. NOTE(review): the adduct/fragment counts below include every group
# regardless of this flag, exactly as before -- confirm that is intended.
include_singletons = False

for file_entry in files:
    filename = file_entry[1]
    outpre = OUTPUT_DIR + filename

    # Groups ordered by vote, top ones first.
    temp_groups = sorted(groups[filename], key=lambda g: g.vote, reverse=True)
    _write_group_report(outpre + '_by_vote.txt',
                        ((_vote_mass_title(g), g) for g in temp_groups))

    # Groups ordered by M, ascending.
    temp_groups = sorted(groups[filename], key=lambda g: g.M)
    _write_group_report(outpre + '_by_M.txt',
                        ((_vote_mass_title(g), g) for g in temp_groups))

    # Groups that matched a standard, in the iteration order of the hits dict.
    hits = all_hits[filename]
    _write_group_report(outpre + '_matched_std.txt',
                        (("{} (vote={})\n".format(mol, hits[mol].vote), hits[mol]) for mol in hits))

    # Count how often each transformation was used.
    trans_counts = dict((tr, 0) for tr in transformations)
    tot = 0
    for group in groups[filename]:
        if not include_singletons and len(group.members) == 1:
            continue
        for member in group.members:
            trans_counts[member[1]] += 1
            tot += 1

    _write_counts(outpre + '_tran_counts.txt',
                  sorted(transformations, key=lambda tr: tr.vote, reverse=True),
                  trans_counts, tot)

    # Count the individual adducts / fragments that make up the transformations.
    # Every adduct/fragment known to any transformation starts at zero so that
    # unused ones still appear in the output.
    frag_counts = {}
    adduct_counts = {}
    for tr in transformations:
        for frag in tr.fragments:
            frag_counts.setdefault(frag, 0)
        for adduct in tr.adducts:
            adduct_counts.setdefault(adduct, 0)

    adduct_tot = 0
    frag_tot = 0
    for group in groups[filename]:
        for member in group.members:
            trans = member[1]
            for frag in trans.fragments:
                frag_counts[frag] += 1
                frag_tot += 1
            for adduct in trans.adducts:
                adduct_counts[adduct] += 1
                adduct_tot += 1

    _write_counts(outpre + '_adduct_counts.txt', list(adduct_counts), adduct_counts, adduct_tot)
    _write_counts(outpre + '_fragment_counts.txt', list(frag_counts), frag_counts, frag_tot)

    global_adduct_counts[filename] = adduct_counts
    global_fragment_counts[filename] = frag_counts
    global_transformation_counts[filename] = trans_counts