In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import mist_cf.common as common
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# This notebook requires two input files:
#   data/bile_acid/bile_acid_refined.csv
#   data/bile_acid/Refined_24d96e55.mgf

In [3]:
# Every input and output of this notebook lives in the same data directory.
data_dir = "../../data/bile_acid"

# Inputs: the annotation table and the spectra (mgf) file.
quant_file = f"{data_dir}/bile_acid_refined.csv"
mgf_file = f"{data_dir}/Refined_24d96e55.mgf"

# Outputs: processed spectra, processed annotation table, and label table.
processed_stem = f"{data_dir}/bile_acid_refined_processed"
export_mgf = f"{processed_stem}.mgf"
export_csv = f"{processed_stem}.csv"
export_labels = f"{processed_stem}_labels.tsv"

In [4]:
# Load the annotation table. The first data row of the CSV carries the real
# column names, so it is promoted to the header and then removed; any record
# with a missing value in any column is discarded afterwards.
raw_table = pd.read_csv(quant_file)
raw_table.columns = raw_table.iloc[0]
quant_df = raw_table.drop(raw_table.index[0]).dropna()


def _canonical_adduct(adduct):
    """Return the remapped adduct name if one exists, else the name unchanged."""
    if adduct in common.ion_remap:
        return common.ion_remap[adduct]
    return adduct


# Normalise adduct names: entries listed in common.ion_remap are replaced by
# their mapped value, all other adducts are kept as they are.
quant_df['Adduct'] = quant_df['Adduct'].map(_canonical_adduct)

In [5]:
# Show the records whose adduct is NOT in common.ION_LST, then keep only the
# records whose adduct is in that list.
has_listed_adduct = quant_df["Adduct"].isin(common.ION_LST)
display(quant_df[~has_listed_adduct])
quant_df = quant_df[has_listed_adduct]

Unnamed: 0,SpectrumID,is_BA,Compound_Name,Adduct,Precursor_MZ,ExactMass,Smiles,INCHI,IonMode,#Scan#,SharedPeaks,LibMZ,SpecMZ,molecular_formula,TotalIonsMGF,TotalIonsPerSharedPeaks
8,CCMSLIB00005464718,yes,"(4R)-4-((3S,5R,9S,10S,13R,14S,17R)-3-hydroxy-1...",2M+H,781.561,390.277,[H][C@@]([C@@H]([C@@]12C)CC[C@@H]1[C@@H](CCC(O...,InChI=1S/C24H38O4/c1-14(4-9-22(27)28)18-7-8-19...,Positive,2832,12,781.561,781.561,C24H38O4,21,57.14285714
9,CCMSLIB00005464718,yes,"(4R)-4-((3S,5R,9S,10S,13R,14S,17R)-3-hydroxy-1...",2M+H,781.561,390.277,[H][C@@]([C@@H]([C@@]12C)CC[C@@H]1[C@@H](CCC(O...,InChI=1S/C24H38O4/c1-14(4-9-22(27)28)18-7-8-19...,Positive,2837,7,781.561,781.561,C24H38O4,19,36.84210526
11,CCMSLIB00005464789,yes,"(R)-4-((3S,5S,7R,8R,9S,10S,13R,14S,17R)-3,7-di...",2M+H,813.551,406.272,C[C@@H]([C@H]1CC[C@]2([H])[C@]1(C)C(C[C@@]3([H...,InChI=1S/C24H38O5/c1-13(4-7-21(28)29)16-5-6-17...,Positive,1182,12,813.551,813.551,C24H38O5,22,54.54545455
12,CCMSLIB00005464754,yes,"(R)-4-((5R,8R,9S,10S,13R,14S,17R)-10,13-dimeth...",2M+H,749.571,374.282,C[C@@H]([C@H]1CC[C@]2([H])[C@]1(C)CC[C@@]3([H]...,InChI=1S/C24H38O3/c1-15(4-9-22(26)27)19-7-8-20...,Positive,999,13,749.571,749.571,C24H38O3,30,43.33333333
13,CCMSLIB00005464803,yes,"(R)-4-((5S,8S,9S,10R,13R,14S,17R)-10,13-dimeth...",2M+H,777.53,388.261,C[C@@H]([C@H]1CC[C@]2([H])[C@]1(C)CC[C@@]3([H]...,InChI=1S/C24H36O4/c1-14(4-7-22(27)28)17-5-6-18...,Positive,1176,8,777.53,777.531,C24H36O4,35,22.85714286
734,CCMSLIB00005435564,yes,taurocholic acid,M-3H2O+H,462.266,515.292,C[C@H](CCC(=O)NCCS(=O)(=O)O)[C@H]1CC[C@@H]2[C@...,"1S/C26H45NO7S/c1-15(4-7-23(31)27-10-11-35(32,3...",Positive,1228,12,462.266,462.267,C26H45NO7S,21,57.14285714


In [6]:
# Eliminate records whose precursor m/z is inconsistent with their adduct and
# molecular formula (these were already double checked against the mgf file).
MZ_TOLERANCE = 0.02  # a record is kept only if its mz_difference is below this


def _mz_difference(record):
    """Return |Precursor_MZ - adduct mass - formula mass| for one table row."""
    return abs(float(record.Precursor_MZ)
               - common.ion_to_mass[record.Adduct]
               - common.formula_mass(record.molecular_formula))


# assign() returns a new frame, so no column is written into a filtered slice.
quant_df = quant_df.assign(mz_difference=quant_df.apply(_mz_difference, axis=1))

# Use one mask for both the display and the filter so that every dropped
# record is shown (previously rows exactly at the threshold, or with a NaN
# difference, were dropped without being displayed).
within_tolerance = quant_df["mz_difference"] < MZ_TOLERANCE
display(quant_df[~within_tolerance])
quant_df = quant_df[within_tolerance]

# Sort by scan number. The '#Scan#' column holds strings (the header row was
# promoted from data), so sort on the numeric value rather than lexicographically.
quant_df = quant_df.sort_values(by=['#Scan#'], key=pd.to_numeric)
quant_df.to_csv(export_csv, index=False)

Unnamed: 0,SpectrumID,is_BA,Compound_Name,Adduct,Precursor_MZ,ExactMass,Smiles,INCHI,IonMode,#Scan#,SharedPeaks,LibMZ,SpecMZ,molecular_formula,TotalIonsMGF,TotalIonsPerSharedPeaks,mz_difference
1054,CCMSLIB00005467948,yes,Tyrosine conjugated deoxycholic acid putative,[M-H2O+H]+,556.364,0.0,[H]C12C(O)CC3CC(CCC3(C1CCC4(C2CCC4C(C)CCC(NC(C...,InChI=1S/C33H49NO6/c1-19(4-11-29(38)34-27(31(3...,Positive,937,11,556.364,556.362,C33H49NO6,19,57.89473684,18.0113
1055,CCMSLIB00005467948,yes,Tyrosine conjugated deoxycholic acid putative,[M-H2O+H]+,556.364,0.0,[H]C12C(O)CC3CC(CCC3(C1CCC4(C2CCC4C(C)CCC(NC(C...,InChI=1S/C33H49NO6/c1-19(4-11-29(38)34-27(31(3...,Positive,931,12,556.364,556.362,C33H49NO6,27,44.44444444,18.0113
1056,CCMSLIB00005467948,yes,Tyrosine conjugated deoxycholic acid putative,[M-H2O+H]+,556.364,0.0,[H]C12C(O)CC3CC(CCC3(C1CCC4(C2CCC4C(C)CCC(NC(C...,InChI=1S/C33H49NO6/c1-19(4-11-29(38)34-27(31(3...,Positive,930,13,556.364,556.362,C33H49NO6,26,50.0,18.0113
1057,CCMSLIB00005467948,yes,Tyrosine conjugated deoxycholic acid putative,[M-H2O+H]+,556.364,0.0,[H]C12C(O)CC3CC(CCC3(C1CCC4(C2CCC4C(C)CCC(NC(C...,InChI=1S/C33H49NO6/c1-19(4-11-29(38)34-27(31(3...,Positive,2722,13,556.364,556.362,C33H49NO6,27,48.14814815,18.0113


In [7]:
# (number of records, number of columns) remaining after the filters above.
quant_df.shape

(945, 17)

In [8]:
# List the distinct ion-mode labels present in the remaining records.
# NOTE(review): the recorded output shows both 'Positive' and 'positive';
# the casing is not normalised anywhere in this notebook — confirm that is fine.
np.unique(quant_df.IonMode.values)

array(['Positive', 'positive'], dtype=object)

# Construct MGF output

In [9]:
# Parse the mgf file, then index each parsed entry by the 'SCANS' value found
# in its first element. If two entries share a 'SCANS' value the later one wins.
parsed_mgf = common.parse_spectra_mgf(mgf_file)
feat_id_to_entry = {
    mgf_entry[0]['SCANS']: mgf_entry
    for mgf_entry in parsed_mgf
}

5720it [00:00, 30371.08it/s]


In [10]:
# Check that every scan in the annotation table has an entry in the parsed
# mgf; each scan without one is printed, and nothing is printed otherwise.
# The loop variable is deliberately not called `id`: at notebook top level
# that would shadow the builtin for the rest of the kernel session.
for scan_id in quant_df['#Scan#'].values:
    # Look up by the string form of the scan, as the entry-collection cell does.
    if str(scan_id) not in feat_id_to_entry:
        print(scan_id)

In [11]:
# Check that each 'Precursor_MZ' in quant_df corresponds to the 'PEPMASS' of
# the mgf entry with the same scan; every disagreement is printed as
# "<Precursor_MZ> <PEPMASS>" followed by the scan.
PEPMASS_TOLERANCE = 0.02  # maximum accepted absolute m/z disagreement

for scan, mz in zip(quant_df['#Scan#'].values, quant_df['Precursor_MZ'].values):
    measured_mz = feat_id_to_entry[str(scan)][0]['PEPMASS']
    # Compare the absolute difference: without abs(), a Precursor_MZ that is
    # smaller than PEPMASS by more than the tolerance would never be reported.
    if abs(float(mz) - float(measured_mz)) > PEPMASS_TOLERANCE:
        print(f'{float(mz)} {float(measured_mz)}')
        print(scan)

In [12]:
# Collect, in quant_df order, the mgf entry of every retained record and tag
# it with a 'FEATURE_ID' equal to its scan. Scans without an mgf entry are
# printed and counted in `ctr`.
keep_entries = []
ctr = 0
for scan_value in quant_df['#Scan#']:
    feat_id = str(scan_value)
    if feat_id not in feat_id_to_entry:
        print(f"Missing feature ID: {feat_id}")
        ctr += 1
        continue
    sub_entry = feat_id_to_entry[feat_id]
    # Note: this writes into the entry that feat_id_to_entry also references.
    sub_entry[0]['FEATURE_ID'] = feat_id
    assert feat_id == sub_entry[0]['SCANS']
    keep_entries.append(sub_entry)

In [13]:
# Serialise the retained entries to mgf text and write it to `export_mgf`,
# replacing any existing file at that path.
out_str = common.build_mgf_str(keep_entries)
with open(export_mgf, mode='w') as mgf_handle:
    mgf_handle.write(out_str)

100%|██████████| 945/945 [00:00<00:00, 16981.72it/s]


# Construct output labels

In [14]:
# Build the label table: one record per retained row of quant_df, with columns
#   dataset  spec  name  formula  ionization  smiles  inchikey  instrument
# 'dataset' and 'instrument' are constants; the rest are copied from quant_df.
# NOTE(review): 'inchikey' is filled from the 'INCHI' column, which in the
# tables displayed earlier in this notebook holds InChI strings rather than
# InChIKeys — confirm this is what consumers of the label file expect.
label_source_columns = ['#Scan#', 'SpectrumID', 'molecular_formula',
                        'Adduct', 'Smiles', 'INCHI']
new_df = [
    {
        "dataset": "bile_acid_refined",
        "spec": scan,
        "name": spectrum_id,
        "formula": formula,
        "ionization": adduct,
        "smiles": smiles,
        "inchikey": inchi,
        "instrument": "ion trap",
    }
    for scan, spectrum_id, formula, adduct, smiles, inchi
    in zip(*(quant_df[column] for column in label_source_columns))
]
out_df = pd.DataFrame(new_df)

In [15]:
# Write the label table to `export_labels` as tab-separated text, without
# the DataFrame index.
out_df.to_csv(export_labels, sep="\t", index=False)