In [3]:
import xml.etree.ElementTree
import pandas as pd
import os, sys
import pickle

MNET_PATH = '/Volumes/Transcend2/git/molnet/code'
sys.path.append(MNET_PATH)

In [4]:
WIN = 'D:/'
MAC = '/Volumes/Transcend2/'

osp = MAC

In [5]:
sys.path.append('..')
import alignment

## Create metabolites dictionary

Parse metabolites list downloaded from HMDB in xml format

In [3]:
path_to_hmdbfile = osp+ 'Metabolomics_datasets/serum_metabolites.xml'
et = xml.etree.ElementTree.parse(path_to_hmdbfile)
root = et.getroot()


In [4]:
metabolites = {}
stringtag = '{http://www.hmdb.ca}'
for metabolite in root.findall(stringtag+'metabolite'):
    accid = metabolite.find(stringtag+'accession').text
    name = metabolite.find(stringtag+'name').text
    mass = metabolite.find(stringtag+'average_molecular_weight').text
    chemform = metabolite.find(stringtag+'chemical_formula').text
    
    metabolites[accid] = (mass, chemform, name)  
    
with open(os.path.join(osp,'Metabolomics_datasets/metabolites.dict'), 'wb') as handle:
    pickle.dump(metabolites, handle)  

In [6]:
with open(os.path.join(osp,'Metabolomics_datasets/metabolites.dict'), 'rb') as file:
    metabolites = pickle.load(file)

## Load annotated data

In [7]:
annotated_data = pd.read_csv(osp+'pymz/mzmine/221119/annotated_data.csv', sep = ',', index_col = 0)

## Filter out data with no spectra

In [8]:
with open(os.path.join(osp,'pymz/mzmine/aligner_gp.txt'), 'rb') as file:
    aligner_gp = pickle.load(file)   

In [9]:
peaksets = []
annotated_data['spectra'] = ['nan']*len(annotated_data)
for peakid in annotated_data['custom_id']:
    new_peakid = peakid - 1
    np = aligner_gp.peaksets[new_peakid].n_peaks
    spectra = {}
    if np > 1:
        for i in range(np-1):
            source = aligner_gp.peaksets[new_peakid].peaks[i+1].source_file
            msms = aligner_gp.peaksets[new_peakid].peaks[i+1].ms2_spectrum
            spectra[source] = msms
        annotated_data.at[peakid,'spectra'] = spectra


In [29]:
annotated_data_with_spectra = annotated_data[annotated_data['spectra'] != 'nan']

## Add possible adducts for each peak and their respective HMDB accession id

In [10]:
def compute_adducts(mz, positive = True):
    
    PROTON = 1.00727646677
    if positive:
        addList = [(mz - PROTON, 'M+H[1+]', ''),
             ((mz - PROTON)*2, 'M+2H[2+]', ''),
             ((mz - PROTON)*3, 'M+3H[3+]', ''),
             (mz - 1.0034 - PROTON, 'M(C13)+H[1+]', 'C'),
             ((mz - 0.5017 - PROTON)*2, 'M(C13)+2H[2+]', 'C'),
             ((mz - 0.3344 - PROTON)*3, 'M(C13)+3H[3+]', 'C'),
             (mz -1.9958 - PROTON, 'M(S34)+H[1+]', 'S'),
             (mz -1.9972 - PROTON, 'M(Cl37)+H[1+]', 'Cl'),
             (mz - 21.9820 - PROTON, 'M+Na[1+]', ''), 
             ((mz - 10.991 - PROTON)*2, 'M+H+Na[2+]', ''),
             (mz - 37.9555 - PROTON, 'M+K[1+]', ''), 
             (mz - 18.0106 - PROTON, 'M+H2O+H[1+]', ''), 
             (mz + 18.0106 - PROTON, 'M-H2O+H[1+]', 'H2O'), 
             (mz + 36.0212 - PROTON, 'M-H4O2+H[1+]', 'H4O2'),
             (mz + 17.0265 - PROTON, 'M-NH3+H[1+]', 'NH3'),
             (mz + 27.9950 - PROTON, 'M-CO+H[1+]', 'CO'),
             (mz + 43.9898 - PROTON, 'M-CO2+H[1+]', 'CO2'),
             (mz + 46.0054 - PROTON, 'M-HCOOH+H[1+]', 'H2CO2'),
             (mz - 67.9874 - PROTON, 'M+HCOONa[1+]', ''),
             (mz + 67.9874 - PROTON, 'M-HCOONa+H[1+]', 'HCO2Na'),
             (mz - 57.9586 - PROTON, 'M+NaCl[1+]', ''), 
             (mz + 72.0211 - PROTON, 'M-C3H4O2+H[1+]', 'C3H4O2'),
             (mz - 83.9613 - PROTON, 'M+HCOOK[1+]', ''),
             (mz + 83.9613 - PROTON, 'M-HCOOK+H[1+]', 'HCO2K'),
             ] 

    return addList

In [12]:
def annotate_adduct(adducts, metabolites, tolerance = 0.01):
    ids = []
    names = []
    for adduct in adducts:
        mw = adduct[0]
        for key in metabolites:
            if metabolites[key][0] != None:
                if mw > float(metabolites[key][0]) - tolerance and mw < float(metabolites[key][0]) + tolerance:
                    ids.append(key)
                    names.append(metabolites[key][2])
    return ids, names
        

In [11]:
annotated_data['adducts'] = ['nan']*len(annotated_data)
for i,row in annotated_data.iterrows():
    mz = row['row m/z']
    annotated_data.at[i, 'adducts'] = compute_adducts(mz)

In [13]:
annotated_data['hmdbacc']=['nan']*len(annotated_data)
annotated_data['hmdbname']=['nan']*len(annotated_data)
for i,row in annotated_data.iterrows():
    adducts = row['adducts']
    
    
    ids, names = annotate_adduct(adducts, metabolites, 0.2)
    
    annotated_data.at[i, 'hmdbacc'] = ids
    annotated_data.at[i, 'hmdbname'] = names
        

In [22]:
metabolites['HMDB0000936']

('830.7469', 'C40H38N4O16', 'Uroporphyrin I')

## Get ms2 data from HMDB ms2 xml files

In [23]:
def get_ms2_spec_from_hmdb(file, parent_mz):
    import mnet
    path_to_hmdbfile = file
    et = xml.etree.ElementTree.parse(path_to_hmdbfile)
    element = et.getroot()
    mode = element.find('ionization-mode').text
    if mode == 'positive':
        instrtype = element.find('instrument-type').text
        filename = element.find('database-id').text
        np = element.find('peak-counter').text
        peaks = []
        
        for msms in element.find('ms-ms-peaks'):
            mz = float(msms.find('mass-charge').text)
            intensity = float(msms.find('intensity').text)
            
            peaks.append((mz, intensity))
            
        
        ms2_spectrum = mnet.Spectrum(peaks, filename, None, None, parent_mz, parent_mz, metadata = (instrtype, mode))
        return ms2_spectrum
    

In [61]:
def return_best_match_metabolite(ms2specdata, hmdbacclist):
    import glob
    from scoring_functions import fast_cosine
   
    best_score_metab = 0.0
    best_metab_tuple = (0,0,0)
    for acc in hmdbacclist:
        
        files = glob.glob(os.path.join(osp+ 'Metabolomics_datasets/hmdb_experimental_msms_spectra/', acc+'*'))
        if len(files)>0:
            best_score_acc = 0.0
            best_ms2_acc = 'nan'
            for file in files:
                ms2spec = get_ms2_spec_from_hmdb(file, float(metabolites[acc][0]))
                if ms2spec != None: #msms is positive
                    score, used_matches = fast_cosine(ms2spec, ms2specdata, 0.2, 2)
                    
                    if score > best_score_acc:
                        best_score_acc = score
                        best_ms2_acc = ms2spec
            
            best_acc_tuple = (acc, best_score_acc, best_ms2_acc)
            
            if best_score_acc > best_score_metab:
                best_score_metab = best_score_acc
                best_metab_tuple = best_acc_tuple
                
    return best_metab_tuple


In [None]:
annotated_data_with_spectra['matchesHMDB'] = ['nan']*len(annotated_data_with_spectra)
for i, row in annotated_data_with_spectra.iterrows():
    print(i)
    spectra_list = row['spectra']
    hmdbacc_list = row['hmdbacc']
    best_matches = []
    for spec in spectra_list:
        ms2specdata = spectra_list[spec]
        best_metab_tuple = return_best_match_metabolite(ms2specdata, hmdbacc_list)
        best_matches.append(best_metab_tuple)
    annotated_data_with_spectra.at[i, 'matchesHMDB'] = best_matches
    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



1401
2664
613
324
3108
2989
2936
2980
2960
2995
365
3025
1993
2997
2915
2722
2693
