In [14]:
#Inverse labeling detection
#Input: .csv alignment from mzMine2 (first 4 columns in this order: RT, then m/z for 13CM-12C-13C)
#Output: .csv of features that show inverse labeling of desired mass offset
#Author: Aaron Puri (a.puri@utah.edu)
#Thanks to Daniel Petras (functionalmetabolomics@gmail.com) and
#Wout Bittremieux (wbittremieux@health.ucsd.edu) for offset alignment code

In [15]:
import pandas as pd
import numpy as np

In [16]:
#Import
#Load the alignment table and keep only its first four columns, relabelled as
#RT followed by the m/z value in the 13CM, 12C and 13C conditions.
feature_columns = ['RT', 'mz_13CM', 'mz_12C', 'mz_13C']
df = pd.read_csv("XXX.csv").iloc[:, 0:4]  #Define mzMine2 alignment table
df.columns = feature_columns
#Zeros become NaN so the notnull() checks in later cells skip them
#(presumably a 0 means the feature was not detected in that condition - confirm)
df = df.replace(0, np.nan)
df

Unnamed: 0,RT,mz_13CM,mz_12C,mz_13C
0,10.511533,105.099998,,105.099998
1,10.520950,106.099998,,106.099998
2,9.654525,108.099998,,108.099998
3,0.519450,110.000000,,
4,7.921683,119.099998,,
...,...,...,...,...
1252,21.125183,,,775.299988
1253,12.837683,,,776.250000
1254,22.085783,,,860.299988
1255,22.066933,,,861.299988


In [17]:
#Drop features that aligned in 12C and 13C conditions
#A row is removed when it has an m/z value in both the 12C and the 13C column.
in_both_12C_13C = df['mz_12C'].notna() & df['mz_13C'].notna()
df = df.loc[~in_both_12C_13C]
df.shape

(970, 4)

In [18]:
#Make individual feature lists for each condition
#Each list holds the RT column plus one condition's m/z column, restricted to
#the rows where that condition has a value. Columns are picked by name rather
#than by position, and .copy() makes every list an independent frame so that
#helper columns can be added to it later without SettingWithCopyWarning.

featurelist_13CM = df.loc[df['mz_13CM'].notna(), ['RT', 'mz_13CM']].copy()

featurelist_12C = df.loc[df['mz_12C'].notna(), ['RT', 'mz_12C']].copy()

featurelist_13C = df.loc[df['mz_13C'].notna(), ['RT', 'mz_13C']].copy()

In [19]:
#Matching parameters used when pairing features between conditions
rt_tolerance = 0.1   # Largest accepted RT difference between paired features [min]
mz_tolerance = 0.3   # Largest accepted deviation from the expected m/z difference [m/z]
mz_offset = 4        # Expected mass offset added to the 13CM m/z before comparing to 13C [m/z]

In [20]:
#Pair every 13CM feature with every 13C feature by merging on a constant key.
#The '_temp' key is attached with .assign(), which returns new frames, so the
#feature lists themselves are not modified and no SettingWithCopyWarning is
#triggered by writing into a slice of df.
#Only the shared 'RT' column receives the suffixes; '_temp' stays in the result.
combined = featurelist_13CM.assign(_temp=0).merge(
    featurelist_13C.assign(_temp=0),
    how='outer',
    on='_temp',
    suffixes=('_13CM', '_13C'))

In [21]:
#Keep only the 13CM/13C pairs whose m/z differs by the desired offset
#(within mz_tolerance) and whose retention times agree (within rt_tolerance)
mz_error = (combined['mz_13CM'] + mz_offset - combined['mz_13C']).abs()
rt_error = (combined['RT_13CM'] - combined['RT_13C']).abs()
combined = combined.loc[(mz_error < mz_tolerance) & (rt_error < rt_tolerance)]

In [22]:
#Remove the merge helper column and order the pairs by 13CM retention time
combined = (
    combined
    .drop(columns=['_temp'])
    .sort_values(by=['RT_13CM'])
)

#combined.round(2)

In [23]:
#Comparing features with desired mass offset to features in 12C condition
#Every retained 13CM/13C pair is combined with every 12C feature by merging on
#a constant key. The key is attached with .assign(), which returns new frames,
#so neither `combined` nor `featurelist_12C` is modified. The key is dropped
#again and the 12C list's 'RT' column is relabelled 'RT_12C'.
combined12C = (
    combined.assign(_temp=0)
    .merge(featurelist_12C.assign(_temp=0), how='outer', on='_temp')
    .drop(columns=['_temp'])
    .rename(columns={'RT': 'RT_12C'})
)

In [24]:
#Keep triples where the 12C feature co-elutes with the 13C feature, is not
#heavier than the 13CM feature, and lies at most max_mz_gap below the 13C feature
max_mz_gap = 30
rt_gap_12C_13C = (combined12C['RT_12C'] - combined12C['RT_13C']).abs()
mz_gap_13CM_12C = combined12C['mz_13CM'] - combined12C['mz_12C']
mz_gap_13C_12C = combined12C['mz_13C'] - combined12C['mz_12C']

combined12C = combined12C.loc[
    (rt_gap_12C_13C < rt_tolerance)
    & (mz_gap_13CM_12C >= 0)
    & (mz_gap_13C_12C <= max_mz_gap)
]

#combined12C.round(2)

In [25]:
#Narrowing down hits by mass range and retention time
#Both windows include their end points: RT_12C in [2, 32], mz_12C in [150, 600]
rt_in_window = combined12C['RT_12C'].between(2, 32)
mz_in_window = combined12C['mz_12C'].between(150, 600)
combined12C = combined12C.loc[rt_in_window & mz_in_window]

#combined12C
#Write every hit that passed the filters, before hits are grouped per 12C feature
combined12C.to_csv('20210422_PA1MA_fulloutput.csv', index=False)

In [26]:
#Group hits that match to same 12C feature
#A 12C feature is identified by its (RT_12C, mz_12C) pair.
#  same_12C_count : number of hits sharing that 12C feature
#  idx            : True for the hit(s) carrying the largest mz_13C in each group
#The aggregation is named by the string 'max' rather than the Python builtin,
#which newer pandas versions warn about when it is passed to transform().
#.assign() adds the count column to a new frame instead of writing into the
#filtered slice produced by the previous cell.
#Note: this cell overwrites combined12C, so re-run the preceding cells before
#running it a second time.
hits_by_12C = combined12C.groupby(['RT_12C','mz_12C'])
combined12C = combined12C.assign(same_12C_count=hits_by_12C['mz_12C'].transform('size'))
idx = hits_by_12C['mz_13C'].transform('max') == combined12C['mz_13C']
combined12C = (
    combined12C[idx]
    .round(2)
    .sort_values(by=['same_12C_count'], ascending=False)
)
#combined12C

Unnamed: 0,RT_13CM,mz_13CM,RT_13C,mz_13C,RT_12C,mz_12C,same_12C_count
634,10.41,219.1,10.43,223.1,10.49,205.1,5
8008,18.8,592.2,18.8,596.3,18.78,568.3,5
12922,20.48,558.2,20.47,562.2,20.45,541.1,4
7236,18.7,338.2,18.66,342.2,18.66,324.1,3
16288,22.82,322.2,22.8,326.3,22.79,308.2,3
4086,15.4,356.2,15.38,360.2,15.39,342.2,3
4181,15.4,356.2,15.38,360.2,15.37,340.7,3
15917,22.82,459.3,22.8,463.3,22.9,449.2,2
15718,22.82,459.3,22.8,463.3,22.79,445.3,2
9486,18.94,338.2,18.92,342.2,18.93,318.1,2


In [27]:
#Define output table
output_table = 'YYY.csv'
combined12C.to_csv(output_table, index=False)