# Annotation by Emory IROA library (Rafi Ahmed's exhausted CD8 T cell)
- Operator: Minghao Gong
- Run time: 11/27/2020
- Data: Rafi Ahmed, Chris, CD8 exhausted T cell project
- Note: 
  * The notebook is modified from Shuzhao Li's previous annotation project
  * The LC-MS method used in this experiment does not generally distinguish L and D isomers. The chiral notation (e.g. the "L-" or "D-" prefix) carried over from the library should therefore be removed when reporting the identity of a compound.

In [34]:
import pandas as pd
from emory_iroa_libs import HILIC_pos, C18_neg #import the json files which have annotation information wrapped.

#### You can convert the JSON library back into a DataFrame

In [35]:
#pd.DataFrame(HILIC_pos['compounds']).to_csv("HILIC_pos_annot_2017_KenLiu.csv")
#pd.DataFrame(C18_neg['compounds']).to_csv("C18_neg_annot_2017_KenLiu.csv")

### HILIC_pos annotation library: metadata structure

In [36]:
# HILIC_pos['metadata'] is a dictionary of library specifics; print one
# "key : value" line per entry.
for key, value in HILIC_pos['metadata'].items():
    print(" : ".join((key, value)))

library : IROA 
instrument : Orbitrap QE
chromatography_column : HILIC
chromatography_time : 10 minutes
ionization : ESI
ionization_mode : positive
expt_contact : Ken Liu
expt_data_generated : 2017
data_processed_by : Shuzhao Li
version : 2020-08-02


In [37]:
# HILIC_pos['compounds'] is a list of dictionaries; show every field of the
# entry at index 32 as an example of one compound record.
for field, value in HILIC_pos['compounds'][32].items():
    print(" : ".join((field, value)))

name : L-CYSTINE
observed_ion : M+H
observed_rtime : 211
parent_KEGG : C01420
HMDB : HMDB00192
HMDB_conc_range (uM) : 8-68
observed_mz : 241.03165


# Input the file path

In [38]:
# Feature tables to annotate (HILIC-positive and C18) and the output folder.
file1_wt_path = "../data/processed/result_apLCMS_HilicPos_Rafi_M345_v2_labelled_cells.txt"
file2_wt_path = "../data/processed/result_apLCMS_C18_Rafi_M345_v2_labelled_cells.txt"
output_path = "../data/processed/"

# Bare file names (the text after the last "/"); later cells use them to pick
# a parser and to name the output files.
infile1 = file1_wt_path.rpartition("/")[2]
infile2 = file2_wt_path.rpartition("/")[2]

### match2 function 

In [41]:
PPM_tolerance = 0.000010    # relative m/z tolerance as a fraction (10 ppm)
RTime_tolerance = 100       # seconds in retention time, usually a small number
                            # more lenient for diff instruments
                            # and possible diff void volume


def match2(F1, F2, *, ppm_tolerance=None, rtime_tolerance=None):
    """Return True when two features agree in both m/z and retention time.

    Parameters
    ----------
    F1, F2 : iterable
        Features given as (mz, rt). Any iterable whose first two items are
        the m/z and the retention time works (tuple, list, pandas Series);
        the values are taken by position, never by label.
    ppm_tolerance : float, optional
        Relative m/z tolerance as a fraction of F1's m/z. Defaults to the
        module-level PPM_tolerance, read at call time.
    rtime_tolerance : float, optional
        Absolute retention-time tolerance. Defaults to the module-level
        RTime_tolerance, read at call time.

    Returns
    -------
    bool
        True if |mz1 - mz2| / mz1 < ppm_tolerance and
        |rt1 - rt2| < rtime_tolerance, otherwise False.
    """
    if ppm_tolerance is None:
        ppm_tolerance = PPM_tolerance
    if rtime_tolerance is None:
        rtime_tolerance = RTime_tolerance

    # Unpack by iteration rather than F[0] / F[1]: integer keys on a pandas
    # Series with string labels are a label lookup, not a positional one.
    mz1, rt1, *_ = F1
    mz2, rt2, *_ = F2

    # Same test as |mz1 - mz2| / mz1 < tolerance for positive m/z, written as
    # a product so that an m/z of zero yields False instead of a division
    # error.
    if not abs(mz1 - mz2) < ppm_tolerance * mz1:
        return False
    return bool(abs(rt1 - rt2) < rtime_tolerance)

---

----

# Step 1: First do HILIC_pos

### Read the table and visualize the data structure

In [39]:
infile1

'result_apLCMS_HilicPos_Rafi_M345_v2_labelled_cells.txt'

In [42]:
# Pick the parser from the file name: comma-separated for .csv,
# tab-separated for .txt.
if ".csv" in infile1:
    hilic = pd.read_csv(file1_wt_path)
elif ".txt" in infile1:
    hilic = pd.read_csv(file1_wt_path, sep="\t")
else:
    # Stop here with a clear message rather than leaving `hilic` undefined
    # and failing with a NameError further down.
    raise ValueError("Unsupported file type (expected .csv or .txt): " + infile1)
print(hilic.shape)
hilic.head()  # check the column order: the matching step assumes the first two columns are mz and time

(6328, 11)


Unnamed: 0,mz,time,G2_R5pos1_1,G2_R5pos2_1,G2_R5pos3_1,G1_Naive1_1,G1_Naive2_1,G1_Naive3_1,G3_R5neg1a_1,G3_R5neg2a_1,G3_R5neg3a_1
0,85.028385,512.954463,4137588.0,2081908.0,270235.6,0.0,1022535.0,2492206.0,1380191.0,2156244.0,1683666.0
1,85.047678,60.262585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,85.063203,61.252789,3041608.0,2739611.0,3320548.0,2928846.0,3058198.0,3977670.0,4058808.0,3675497.0,4022472.0
3,85.064743,419.26946,266012.9,1700608.0,702902.7,0.0,2150794.0,3018997.0,3647949.0,2074642.0,0.0
4,85.077739,63.327942,402303800.0,2345690000.0,471743700.0,452457600.0,471796400.0,572863500.0,697987800.0,684553300.0,679992600.0


In [43]:
hilic.columns

Index(['mz', 'time', 'G2_R5pos1_1', 'G2_R5pos2_1', 'G2_R5pos3_1',
       'G1_Naive1_1', 'G1_Naive2_1', 'G1_Naive3_1', 'G3_R5neg1a_1',
       'G3_R5neg2a_1', 'G3_R5neg3a_1'],
      dtype='object')

### Execute the matching and export a tab-separated table

In [52]:
# Annotate the HILIC feature table against the HILIC_pos library and write a
# tab-separated file: six library columns followed by every column of `hilic`.
# The matching assumes columns 0 and 1 of `hilic` are mz and retention time;
# shift the positions below if the table starts with an index column.
library_fields = ['name', 'parent_KEGG', 'observed_ion', 'observed_mz',
                  'observed_rtime', 'HMDB']
out_lines = ['Name\tKEGG ID\tion\tlibrary_mz\tlib_retention_time\tHMDB\t' +
             '\t'.join(list(hilic.columns))]

# Convert the library m/z and retention time to float once, instead of once
# per (feature, compound) pair.
library = [((float(cpd['observed_mz']), float(cpd['observed_rtime'])), cpd)
           for cpd in HILIC_pos['compounds']]

# itertuples yields each row as a plain tuple of values in column order.
for feature in hilic.itertuples(index=False, name=None):
    F1 = (feature[0], feature[1])   # (mz, rt) of the measured feature
    for F2, cpd in library:
        if match2(F1, F2):
            # One output line per matching (feature, compound) pair.
            out_lines.append('\t'.join([cpd[x] for x in library_fields] +
                                       [str(x) for x in feature]))

# Join once at the end; every line, including the last, ends with a newline.
s = '\n'.join(out_lines) + '\n'

# rsplit strips only the final extension, so dots inside the file name are
# kept in the output name.
with open(output_path + "hilic_annotate_" + infile1.rsplit(".", 1)[0] + ".txt", 'w') as file:
    file.write(s)

---

---

# Step 2: now do C18

In [53]:
infile2

'result_apLCMS_C18_Rafi_M345_v2_labelled_cells.txt'

In [54]:
# Pick the parser from the file name: comma-separated for .csv,
# tab-separated for .txt.
if ".csv" in infile2:
    c18 = pd.read_csv(file2_wt_path)
elif ".txt" in infile2:
    c18 = pd.read_csv(file2_wt_path, sep="\t")
else:
    # Stop here with a clear message rather than leaving `c18` undefined
    # and failing with a NameError further down.
    raise ValueError("Unsupported file type (expected .csv or .txt): " + infile2)
print(c18.shape)
c18.head()  # check the column order: the matching step assumes the first two columns are mz and time

(6788, 11)


Unnamed: 0,mz,time,G2_R5pos1_2,G2_R5pos2_2,G2_R5pos3_2,G1_Naive1_2,G1_Naive2_2,G1_Naive3_2,G3_R5neg1a_2,G3_R5neg2a_2,G3_R5neg3a_2
0,85.004359,23.403797,255885.7816,241225.4365,261371.3941,233516.7742,200427.0297,1108122.0,754709.5351,349273.1576,307586.2969
1,85.029443,496.707362,832199.4676,699751.9193,892857.4503,545464.8925,691852.3427,105021.4,717910.4874,509590.9577,396236.3903
2,86.024788,24.34441,161684.2699,358662.9546,161028.7944,226776.3941,484026.6393,375881.2,610733.8297,530186.234,537828.7317
3,86.125346,52.196204,0.0,0.0,0.0,0.0,0.0,0.0,444088.2758,0.0,0.0
4,86.929766,462.739243,461946.2655,97022.95061,608263.9383,114798.1548,656612.2372,103686.1,103740.9231,138806.2424,566692.054


In [56]:
# Annotate the C18 feature table against the C18_neg library and write a
# tab-separated file: six library columns followed by every column of `c18`.
# The matching assumes columns 0 and 1 of `c18` are mz and retention time;
# shift the positions below if the table starts with an index column.
library_fields = ['name', 'parent_KEGG', 'observed_ion', 'observed_mz',
                  'observed_rtime', 'HMDB']
out_lines = ['Name\tKEGG ID\tion\tlibrary_mz\tlib_retention_time\tHMDB\t' +
             '\t'.join(list(c18.columns))]

# Convert the library m/z and retention time to float once, instead of once
# per (feature, compound) pair.
library = [((float(cpd['observed_mz']), float(cpd['observed_rtime'])), cpd)
           for cpd in C18_neg['compounds']]

# itertuples yields each row as a plain tuple of values in column order.
for feature in c18.itertuples(index=False, name=None):
    F1 = (feature[0], feature[1])   # (mz, rt) of the measured feature
    for F2, cpd in library:
        if match2(F1, F2):
            # One output line per matching (feature, compound) pair.
            out_lines.append('\t'.join([cpd[x] for x in library_fields] +
                                       [str(x) for x in feature]))

# Join once at the end; every line, including the last, ends with a newline.
s = '\n'.join(out_lines) + '\n'

# rsplit strips only the final extension, so dots inside the file name are
# kept in the output name.
with open(output_path + "c18_annotate_" + infile2.rsplit(".", 1)[0] + ".txt", 'w') as file:
    file.write(s)

## Note

The LC-MS method used in this experiment does not generally distinguish L and D isomers. The chiral notation (e.g. the "L-" or "D-" prefix) carried over from the library should therefore be removed when reporting the identity of a compound.