# Match feature table with compounds annotated by Compound Discoverer
- Operator: Minghao Gong
- Run time: 05/21/2021
- Data: Rafi Ahmed, Chris, CD8 exhausted T cell project
- Note: 
  * The notebook is modified from the in-house library annotation pipeline

In [46]:
import pandas as pd
import os

# Input the file path

In [47]:
# Compound Discoverer exported compound table
# (parsed further down as CSV or tab-separated text, depending on its extension)
file1_wt_path = "../../../data/input/CD_exported/042821_Rafi_HILICpos_AX_DeepScan_CDexport/Compounds_unhidden_background.txt"

# xcms feature table file path (read with pd.read_csv further down)
file2_wt_path = "../../../data/output/MS2_Spectra_massbank_match_05152021/HILICpos/Significant_list/sorted_filt_med_16.6_padj_0.05_FC_1_ttest_R5posvsR5neg.csv"

In [48]:
# Directory that receives the merged output table; it is created below if missing
output_path = "../../../data/output/annotation-with-CD-exported-tables/sorted_filt_med_16.6_padj_0.05_FC_1/"

In [49]:
# create the output_path
import os
import errno

# Create the output directory together with any missing parents.
# exist_ok=True makes re-running the cell harmless, while a regular file
# sitting at that path still raises instead of being silently accepted.
os.makedirs(output_path, exist_ok=True)

In [50]:
# Keep only the file names; infile1 decides how the CD export is parsed below.
# os.path.basename is separator-aware, unlike a manual split on "/".
infile1 = os.path.basename(file1_wt_path)
infile2 = os.path.basename(file2_wt_path)

### match  function with both m/z and Rtime

In [83]:
# default PPM_tolerance = 10
# default RTime_tolerance = 100       
                            # seconds in retention time, usually a small number
                            # more lenient for diff instruments
                            # and possible diff void volume

def match2MzRt(F1, F2, PPM_tolerance=10, RTime_tolerance=100):
    """Decide whether two features agree in m/z and, optionally, retention time.

    Args:
        F1: Reference feature as a ``(mz, rt)`` pair; its m/z is the
            denominator of the relative mass error.
        F2: Candidate feature as a ``(mz, rt)`` pair.
        PPM_tolerance: Upper bound (exclusive) on the relative m/z
            difference, expressed in parts per million.
        RTime_tolerance: Upper bound (exclusive) on the absolute
            retention-time difference, in the same unit as the ``rt``
            entries. Pass ``False`` (``None`` and ``0`` behave the same)
            to compare m/z only.

    Returns:
        bool: ``True`` when every enabled criterion is satisfied.
    """
    mz_ok = abs(F1[0] - F2[0]) / F1[0] < PPM_tolerance * (10 ** (-6))
    if not RTime_tolerance:
        # Retention-time check disabled: the rt entries are never read,
        # so m/z-only tuples are accepted here.
        return bool(mz_ok)
    # `and` short-circuits, so rt is only inspected once m/z already agrees.
    return bool(mz_ok and abs(F1[1] - F2[1]) < RTime_tolerance)

In [84]:
# Sanity check: 1000 vs 1001 differ by 1000 ppm, far outside the 5 ppm window; RT is ignored (False)
match2MzRt((1000,100),(1001,46),5,False)

False

In [86]:
# Sanity check: about 1 ppm apart in m/z and 54 apart in RT, inside both the 5 ppm and the 100 RT windows
match2MzRt((1000,100),(1000.0010,46),5,100)

True

### match function with only m/z

In [87]:
def match2Mz(F1, F2, PPM_tolerance):
    """Tell whether two m/z values agree within a ppm window.

    F1 and F2 are plain m/z numbers. Their absolute difference is taken
    relative to F1 and must be strictly below ``PPM_tolerance`` parts per
    million for the pair to count as a match.
    """
    relative_error = abs(F1 - F2) / F1
    ppm_window = PPM_tolerance * (10 ** (-6))
    within_window = relative_error < ppm_window
    return True if within_window else False

In [90]:
match2Mz(1000,1000.0010,10)

True

---

----

# Read the table and visualize the data structure

### CD exported dataframe

In [56]:
infile1

'Compounds_unhidden_background.txt'

In [57]:
# Load the Compound Discoverer export; the delimiter follows the file extension.
if ".csv" in infile1:
    cpEp_df = pd.read_csv(file1_wt_path)
elif ".txt" in infile1:
    cpEp_df = pd.read_csv(file1_wt_path, sep="\t")
else:
    # Fail with a clear message rather than a NameError on cpEp_df below.
    raise ValueError(
        f"Unsupported CD export format: {infile1!r} (expected .csv or .txt)"
    )
print(cpEp_df.shape)
# Preview the first rows to confirm the column layout; the matching cells
# further down rely on the "Compounds ID" and "m/z" columns.
cpEp_df.head()

(247, 26)


Unnamed: 0,Compounds ID,Checked,Name,Formula,Annotation Source: Predicted Compositions,Annotation Source: mzCloud Search,Annotation Source: Metabolika Search,Annotation Source: ChemSpider Search,FISh Coverage,Molecular Weight,...,Metabolika Pathways,mzCloud Best Match,mzCloud Best Sim. Match,MS2,Background,Norm. Area: Blank_01.raw (F1),Norm. Area: Blank_02.raw (F2),Norm. Area: Blank_03.raw (F3),Norm. Area: Blank_04.raw (F4),Norm. Area: Sample_01.raw (F12)
0,1,True,,C12 H21 N3,Full match,No results,No results,No results,,207.17388,...,,,,PreferredDDA,True,631171000.0,63409190.0,7572468.0,5328103.0,2656655.0
1,157,True,,C7 H16 N2,Full match,No results,No results,No results,,128.13146,...,,,,PreferredDDA,True,7730415.0,8578198.0,4734613.0,5602371.0,6743121.0
2,158,True,,C15 H28 N4,Full match,No results,No results,No results,,264.23168,...,,,,PreferredDDA,True,3697681.0,7704188.0,4267816.0,1750758.0,1413146.0
3,159,True,,C12 H23 N3 O,Full match,No results,No results,No results,,225.18445,...,,,,PreferredDDA,True,7663994.0,5976299.0,1927373.0,1150319.0,905809.3
4,160,True,(2S)-3-Hydroxy-2-(octanoyloxy)propyl decanoate,C21 H40 O5,Full match,No results,No results,Full match,,372.28792,...,,,,OtherDDA,True,2169354.0,1774173.0,2619920.0,2850015.0,7497879.0


In [61]:
cpEp_df["m/z"]

0      208.18114
1      129.13875
2      265.23895
3      226.19173
4      373.29520
         ...    
242    137.10745
243    175.09796
244    124.08704
245    153.11353
246    158.19043
Name: m/z, Length: 247, dtype: float64

-------

## xcms feature table

In [64]:
# Load the xcms feature table (CSV); later cells read its "X" and "mzmed" columns
xcms_df = pd.read_csv(file2_wt_path)

In [65]:
xcms_df

Unnamed: 0,X,mzmed,mzmin,mzmax,rtmed,rtmin,rtmax,npeaks,G1_Naive,G2_R5pos,...,pval,padj,Median,mean_G1,mean_G2,mean_G3,log2FC_R5posvsR5neg,log2FC_R5posvsNaive,log2FC_R5negvsNaive,abs_log2FC_R5posvsR5neg
0,FT0079,98.984056,98.984019,98.984090,82.824403,80.895142,87.400208,12,3,3,...,0.002103,0.024507,31.559718,31.698055,32.166539,30.860295,1.306244,0.468484,-0.837760,1.306244
1,FT0078,98.984037,98.984005,98.984080,71.554264,68.552124,72.795082,12,3,3,...,0.003002,0.024515,29.703925,29.825963,30.696946,27.921746,2.775199,0.870983,-1.904217,2.775199
2,FT1000,209.072494,209.072472,209.072566,51.194403,50.866180,51.380032,12,3,3,...,0.003073,0.024635,29.602925,29.579819,29.178349,29.935523,-0.757174,-0.401470,0.355704,0.757174
3,FT1995,415.210700,415.210595,415.210827,26.567628,25.837378,26.800518,12,3,3,...,0.003250,0.024716,28.395786,28.671290,29.393605,27.856347,1.537258,0.722316,-0.814942,1.537258
4,FT0022,90.525781,90.525754,90.525815,76.246063,75.016243,78.078156,12,3,3,...,0.001668,0.024507,27.983579,28.149316,27.571623,28.587093,-1.015470,-0.577693,0.437777,1.015470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
923,FT1467,290.909365,290.909300,290.909470,72.898315,72.162308,73.741318,11,3,3,...,0.008722,0.035591,17.698629,17.256158,17.858234,17.164337,0.693897,0.602076,-0.091821,0.693897
924,FT2192,472.133516,472.133272,472.133739,37.349213,36.731449,38.067780,11,3,3,...,0.014126,0.047379,17.695261,17.592903,17.809136,17.643975,0.165161,0.216233,0.051072,0.165161
925,FT1004,209.200592,209.200536,209.200623,39.743437,38.795712,40.376534,10,2,2,...,0.007053,0.032148,17.605663,17.537103,17.248014,18.919227,-1.671213,-0.289089,1.382124,1.671213
926,FT0031,91.039650,91.039609,91.039681,45.105955,44.541294,46.052765,12,3,3,...,0.008605,0.035418,17.503207,17.665896,17.260824,18.309150,-1.048325,-0.405071,0.643254,1.048325


In [79]:
xcms_df["mzmed"]

0       98.984056
1       98.984037
2      209.072494
3      415.210700
4       90.525781
          ...    
923    290.909365
924    472.133516
925    209.200592
926     91.039650
927    200.237180
Name: mzmed, Length: 928, dtype: float64

-------

In [77]:
# Sanity check of the matcher: 1000 ppm apart with a 5 ppm window, RT check disabled.
# (`match2` is not defined in this notebook; the function is named match2MzRt.)
match2MzRt((1000,100),(1001,46),5,False)

False

In [80]:
cpEp_df

Unnamed: 0,Compounds ID,Checked,Name,Formula,Annotation Source: Predicted Compositions,Annotation Source: mzCloud Search,Annotation Source: Metabolika Search,Annotation Source: ChemSpider Search,FISh Coverage,Molecular Weight,...,Metabolika Pathways,mzCloud Best Match,mzCloud Best Sim. Match,MS2,Background,Norm. Area: Blank_01.raw (F1),Norm. Area: Blank_02.raw (F2),Norm. Area: Blank_03.raw (F3),Norm. Area: Blank_04.raw (F4),Norm. Area: Sample_01.raw (F12)
0,1,True,,C12 H21 N3,Full match,No results,No results,No results,,207.17388,...,,,,PreferredDDA,True,6.311710e+08,6.340919e+07,7.572468e+06,5.328103e+06,2.656655e+06
1,157,True,,C7 H16 N2,Full match,No results,No results,No results,,128.13146,...,,,,PreferredDDA,True,7.730415e+06,8.578198e+06,4.734613e+06,5.602371e+06,6.743121e+06
2,158,True,,C15 H28 N4,Full match,No results,No results,No results,,264.23168,...,,,,PreferredDDA,True,3.697681e+06,7.704188e+06,4.267816e+06,1.750758e+06,1.413146e+06
3,159,True,,C12 H23 N3 O,Full match,No results,No results,No results,,225.18445,...,,,,PreferredDDA,True,7.663994e+06,5.976299e+06,1.927373e+06,1.150319e+06,9.058093e+05
4,160,True,(2S)-3-Hydroxy-2-(octanoyloxy)propyl decanoate,C21 H40 O5,Full match,No results,No results,Full match,,372.28792,...,,,,OtherDDA,True,2.169354e+06,1.774173e+06,2.619920e+06,2.850015e+06,7.497879e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,88,True,Betahistine,C8 H12 N2,No results,No results,No results,Full match,,136.10017,...,,,,PreferredDDA,True,9.030385e+06,2.297748e+07,2.554827e+07,5.054564e+07,4.457351e+07
243,89,True,,C9 H10 N4,Full match,No results,No results,Invalid mass,,174.09073,...,,,,PreferredDDA,True,9.447119e+05,4.752418e+06,8.896223e+06,1.955586e+07,1.581618e+07
244,90,True,,C6 H9 N3,Full match,No results,No results,No results,,123.07993,...,,,,PreferredDDA,True,6.174119e+05,1.915424e+07,2.720712e+07,5.983326e+07,5.692569e+07
245,92,True,,C7 H12 N4,Full match,No results,No results,No results,,152.10625,...,,,,PreferredDDA,True,3.037519e+05,6.079902e+06,9.731779e+06,1.928176e+07,1.885147e+07


In [93]:
# Check that every Compounds ID in the CD exported table occurs exactly once;
# the ID is used as a dictionary key and as the merge key further down.
cpEp_df['Compounds ID'].is_unique

True

In [155]:
# Map each CD Compounds ID to a (still empty) list of matched xcms feature IDs
xcms2cpEp_dict = {
    cpEp_df.loc[row, 'Compounds ID']: []
    for row in range(cpEp_df.shape[0])
}

In [156]:
# Match every CD compound against every xcms feature by m/z only (10 ppm window).
# The columns are pulled out once so the inner loop does not pay for a
# label-based DataFrame lookup on every single comparison.
xcms_ids = xcms_df["X"].tolist()
xcms_mzs = xcms_df["mzmed"].tolist()
cp_ids = cpEp_df["Compounds ID"].tolist()
cp_mzs = cpEp_df["m/z"].tolist()
for cp_id, cp_mz in zip(cp_ids, cp_mzs):
    matched = xcms2cpEp_dict[cp_id]
    for ft_id, ft_mz in zip(xcms_ids, xcms_mzs):
        if match2Mz(cp_mz, ft_mz, 10):
            matched.append(ft_id)

In [157]:
cpEp_df.columns

Index(['Compounds ID', 'Checked', 'Name', 'Formula',
       'Annotation Source: Predicted Compositions',
       'Annotation Source: mzCloud Search',
       'Annotation Source: Metabolika Search',
       'Annotation Source: ChemSpider Search', 'FISh Coverage',
       'Molecular Weight', 'm/z', 'RT [min]', 'Area (Max.)',
       '# ChemSpider Results', '# mzCloud Results', '# Metabolika Pathways',
       'Metabolika Pathways', 'mzCloud Best Match', 'mzCloud Best Sim. Match',
       'MS2', 'Background', 'Norm. Area: Blank_01.raw (F1)',
       'Norm. Area: Blank_02.raw (F2)', 'Norm. Area: Blank_03.raw (F3)',
       'Norm. Area: Blank_04.raw (F4)', 'Norm. Area: Sample_01.raw (F12)'],
      dtype='object')

In [158]:
# Report every CD compound that matched at least one xcms feature and count them.
count = 0
for k, v in xcms2cpEp_dict.items():
    if not v:
        continue  # no xcms feature within tolerance for this compound
    print(f"CD exported Compound ID of {k} matched with {v}")
    # m/z reported by Compound Discoverer for this compound
    print(cpEp_df.loc[cpEp_df["Compounds ID"] == k, "m/z"])
    # m/z of all matched xcms features, not only the first one in the list
    print(xcms_df.loc[xcms_df["X"].isin(v), "mzmed"])
    count += 1

CD exported Compound ID of 160 matched with ['FT1799']
4    373.2952
Name: m/z, dtype: float64
268    373.294162
Name: mzmed, dtype: float64
CD exported Compound ID of 178 matched with ['FT0661']
22    162.11258
Name: m/z, dtype: float64
155    162.112346
Name: mzmed, dtype: float64
CD exported Compound ID of 182 matched with ['FT0618']
26    156.07227
Name: m/z, dtype: float64
14    156.072005
Name: mzmed, dtype: float64
CD exported Compound ID of 126 matched with ['FT0960']
32    204.12453
Name: m/z, dtype: float64
281    204.122863
Name: mzmed, dtype: float64
CD exported Compound ID of 135 matched with ['FT1595']
41    326.37872
Name: m/z, dtype: float64
916    326.37748
Name: mzmed, dtype: float64
CD exported Compound ID of 151 matched with ['FT0818']
57    183.07831
Name: m/z, dtype: float64
11    183.078119
Name: mzmed, dtype: float64
CD exported Compound ID of 228 matched with ['FT0422']
72    138.04868
Name: m/z, dtype: float64
79    138.048487
Name: mzmed, dtype: float64
CD ex

In [159]:
cpEp_df["Compounds ID"]==k

0      False
1      False
2      False
3      False
4      False
       ...  
242    False
243    False
244    False
245    False
246     True
Name: Compounds ID, Length: 247, dtype: bool

In [160]:
cpEp_df.loc[cpEp_df["Compounds ID"]==k,"m/z"]

246    158.19043
Name: m/z, dtype: float64

In [161]:
count

17

In [162]:
# Tag each xcms feature with the Compounds ID it matched; unmatched rows keep "".
# NOTE(review): a feature matched by several compounds keeps only the last
# compound written here, so the merged table holds at most one CD compound
# per feature. Confirm that this is intended.
xcms_df['CD_annot'] = ""
for k, v in xcms2cpEp_dict.items():
    if v:
        # One boolean-mask assignment replaces scanning the table row by row.
        xcms_df.loc[xcms_df["X"].isin(v), 'CD_annot'] = k

In [163]:
# Inner join: keep only the xcms features that carry a CD annotation and
# attach the matching Compound Discoverer columns to them
m_df = xcms_df.merge(
    cpEp_df,
    left_on="CD_annot",
    right_on="Compounds ID",
)

In [164]:
# Write the merged table into the output directory; os.path.join builds a
# valid path whether or not output_path ends with a separator.
m_df.to_csv(os.path.join(output_path, "merged_matched_xcms_CDexported.csv"), index=False)

---

## Note

The LC-MS method used in this experiment does not generally distinguish L and D isomers. Thus, chiral notation taken from a library (e.g. an L-/D- or (2S) prefix) should be removed when reporting the identity of a compound.