In [214]:
import os
import pandas as pd
import re
import numpy as np
import time
import tritonclient.grpc as grpcclient
from tqdm import tqdm
import openpyxl

In [215]:
def set_il_ic_peptide(df):
    """Overwrite the I/L residues of ``ic_peptide`` with those of ``InferredPeptide``.

    For every row, the I and L characters of ``InferredPeptide`` are collected
    in order and written, one by one, over the I/L positions of ``ic_peptide``.
    All other characters of ``ic_peptide`` (including any modification text)
    are kept as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide string columns ``ic_peptide`` and ``InferredPeptide``.
        The ``ic_peptide`` column is replaced in place.

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    * If ``InferredPeptide`` contains no I/L at all, ``ic_peptide`` is left
      untouched.
    * If ``ic_peptide`` has more I/L positions than ``InferredPeptide``
      supplies, the surplus positions keep their original residue (this
      situation used to raise ``IndexError``).
    """
    il_peptides = []
    for ic_peptide, in_peptide in zip(df['ic_peptide'], df['InferredPeptide']):
        ## select only I or L characters in the inferred peptide
        il_only = re.sub(r'[^IL]', '', in_peptide)
        if il_only:
            il_iter = iter(il_only)
            ## reflect I/L characters to ic_peptide; next(..., residue) falls
            ## back to the original residue once the inferred I/Ls run out
            ic_peptide = ''.join(
                next(il_iter, residue) if residue in ('I', 'L') else residue
                for residue in ic_peptide
            )

        il_peptides.append(ic_peptide)
    df['ic_peptide'] = il_peptides


In [216]:
def generate_prosit_input_array(df, ce):
    """Build the peptide / charge / collision-energy input arrays for a Prosit model.

    Steps:
      1. ``set_il_ic_peptide(df)`` rewrites the I/L residues of ``ic_peptide``.
      2. The mass tags ``+15.995`` and ``+57.021`` are rewritten to
         ``[UNIMOD:35]`` and ``[UNIMOD:4]``.
      3. Any peptide that still contains ``+`` or ``-`` afterwards gets its
         charge set to -1 so that it is dropped by the ``> 0`` filter below.
      4. Duplicate (peptide, charge, collision energy) rows are removed and
         only rows with a positive charge are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``ic_peptide``, ``ic_charge`` and ``InferredPeptide``.
        It is modified in place: ``ic_peptide`` is rewritten and three
        ``prosit_*`` columns are added.
    ce : number
        Collision energy assigned to every row.

    Returns
    -------
    tuple
        ``(filtered_df, inputs)`` where ``inputs`` maps
        ``'peptide_sequences'`` (object), ``'precursor_charges'`` (int32) and
        ``'collision_energies'`` (float32) to arrays reshaped to one column.
    """
    set_il_ic_peptide(df)
    peptides = []
    charges = []
    for ic_peptide, charge in zip(df['ic_peptide'], df['ic_charge']):
        ## Literal replacement: the previous regex patterns used an invalid
        ## '\+' escape in a non-raw string and an unescaped '.' wildcard.
        ic_peptide = ic_peptide.replace('+15.995', '[UNIMOD:35]')
        ic_peptide = ic_peptide.replace('+57.021', '[UNIMOD:4]')

        ## Any remaining mass tag is unsupported: flag the row for removal.
        if '+' in ic_peptide or '-' in ic_peptide:
            charge = -1

        peptides.append(ic_peptide)
        charges.append(charge)

    df['prosit_peptide_sequences'] = peptides
    df['prosit_precursor_charges'] = charges
    df['prosit_collision_energies'] = [ce] * len(peptides)
    df = df.drop_duplicates(subset=['prosit_peptide_sequences','prosit_precursor_charges','prosit_collision_energies'])
    df = df[df['prosit_precursor_charges'] > 0]

    inputs = {
        'peptide_sequences': np.array(df['prosit_peptide_sequences'], dtype=np.dtype("O")).reshape(-1,1),
        'precursor_charges': np.array(df['prosit_precursor_charges'], dtype=np.dtype("int32")).reshape(-1,1),
        'collision_energies': np.array(df['prosit_collision_energies'], dtype=np.dtype("float32")).reshape(-1,1),
    }

    return df, inputs


In [234]:
def write_mgf(predictions, file_name):
    """Write predicted spectra to ``file_name`` as MGF-style ``BEGIN IONS`` records.

    Parameters
    ----------
    predictions : dict
        Reads the keys ``'peptide_sequences'``, ``'precursor_charges'``,
        ``'mz'`` and ``'intensities'``; entry ``i`` of each describes
        spectrum ``i``. The per-spectrum ``mz`` / ``intensities`` entries are
        indexed with a boolean mask, so they are expected to be numpy arrays.
    file_name : str
        Output path; the file is opened in ``'w'`` mode (overwritten).

    Each record has a ``TITLE=<peptide>|<charge>`` line, a ``CHARGE=<charge>+``
    line and one ``<mz>\\t<intensity * 1000>`` line (4 decimals each) per peak
    whose intensity is greater than 0.0001.
    """
    with open(file_name, 'w') as handle:
        n_spectra = len(predictions['peptide_sequences'])
        for row in range(n_spectra):
            ## Drop zero peaks
            intensities = predictions['intensities'][row]
            keep = intensities > 0.0001
            kept_mz = predictions['mz'][row][keep]
            kept_int = intensities[keep]

            charge = "".join(str(value) for value in predictions['precursor_charges'][row])
            title = "".join(predictions['peptide_sequences'][row])

            record = [
                "BEGIN IONS\n",
                f"TITLE={title}|{charge}\n",
                f"CHARGE={charge}+\n",
            ]
            record.extend(
                f"{mz:.4f}\t{1000 * inten:.4f}\n"
                for mz, inten in zip(kept_mz, kept_int)
            )
            record.append("END IONS\n")
            handle.writelines(record)

In [235]:
#model_name = 'Prosit_2023_intensity_timsTOF'
#model_name = 'Prosit_2020_intensity_HCD'
def predict_save_spectra(pXg_file, model_name, ce, output_file, server_url = 'koina.proteomicsdb.org:443', batch_size = 1000):
    """Predict spectra for the peptides of a pXg table and save them as MGF.

    The tab-delimited ``pXg_file`` is loaded, converted to model inputs with
    ``generate_prosit_input_array``, sent in batches to ``model_name`` on a
    Triton gRPC inference server (SSL enabled) and the returned ``mz`` /
    ``intensities`` arrays are written with ``write_mgf``.

    Parameters
    ----------
    pXg_file : str
        Path to the tab-delimited input table.
    model_name : str
        Name of the model to query on the inference server.
    ce : number
        Collision energy passed to every prediction.
    output_file : str
        Path of the MGF file to write.
    server_url : str, optional
        ``host:port`` of the inference server.
    batch_size : int, optional
        Number of peptides sent per inference request (default 1000).

    Notes
    -----
    * When no peptide survives input preparation, an empty output file is
      written and the server is not contacted (stacking an empty list of
      batches would otherwise raise ``ValueError``).
    * The gRPC client is closed even if an inference request fails.
    """
    ## Load pXg file
    df = pd.read_csv(pXg_file, delimiter='\t')

    ## numpy dtype -> Triton datatype name
    nptype_convert = {
        np.dtype('float32'): 'FP32',
        np.dtype('O'): 'BYTES',
        np.dtype('int16'): 'INT16',
        np.dtype('int32'): 'INT32',
        np.dtype('int64'): 'INT64',
    }

    df, inputs = generate_prosit_input_array(df, ce)

    outputs = [ 'intensities',  'mz',  'annotation' ]
    len_inputs = list(inputs.values())[0].shape[0]

    ## print model_name
    print("Prosit model information:", model_name)
    print("The number of inputs:", len(df))

    if len_inputs == 0:
        ## Nothing to predict: emit an empty MGF instead of failing in np.vstack.
        write_mgf({'peptide_sequences': inputs['peptide_sequences']}, output_file)
        return

    koina_outputs = [grpcclient.InferRequestedOutput(name) for name in outputs]
    predictions = {name: [] for name in outputs}

    triton_client = grpcclient.InferenceServerClient(url=server_url, ssl=True)
    try:
        for i in tqdm(range(0, len_inputs, batch_size), desc="Processing", unit="iteration"):
            koina_inputs = []
            for iname, iarr in inputs.items():
                islice = iarr[i:i+batch_size]
                koina_inputs.append(
                    grpcclient.InferInput(iname, islice.shape, nptype_convert[iarr.dtype])
                )
                koina_inputs[-1].set_data_from_numpy(islice)

            prediction = triton_client.infer(model_name, inputs=koina_inputs, outputs=koina_outputs)

            for name in outputs:
                predictions[name].append(prediction.as_numpy(name))
    finally:
        ## Release the gRPC channel whether or not inference succeeded.
        triton_client.close()

    ## aggregate the per-batch results into one array per output
    predictions['intensities'] = np.vstack(predictions['intensities']).astype(float)
    predictions['mz'] = np.vstack(predictions['mz']).astype(float)
    predictions['annotation'] = np.vstack(predictions['annotation'])
    ## the inputs are already single-column 2-D arrays, so they are reused as-is
    predictions['peptide_sequences'] = inputs['peptide_sequences']
    predictions['precursor_charges'] = inputs['precursor_charges']
    predictions['collision_energies'] = inputs['collision_energies']
    write_mgf(predictions, output_file)


In [237]:
import glob
if __name__ == "__main__":
    folder="/Users/seunghyukchoi/Documents/1_Projects/2023_Spliceosome/1.Search/1.All_pXg"
    
    model_name='Prosit_2023_intensity_timsTOF'
    for file in glob.glob(os.path.join(folder, 'Bruker_*.pXg')):
        input_file = file
        output_file = input_file.replace(".pXg", "."+model_name+".mgf")
        predict_save_spectra(input_file, model_name, 25, output_file)

    if(False):
        model_name='Prosit_2020_intensity_HCD'
        for file in glob.glob(os.path.join(folder, 'Thermo_*.pXg')):
            input_file = file
            output_file = input_file.replace(".pXg", "."+model_name+".mgf")
            predict_save_spectra(input_file, model_name, 30, output_file)
            
        for file in glob.glob(os.path.join(folder, 'BCM_*.pXg')):
            input_file = file
            output_file = input_file.replace(".pXg", "."+model_name+".mgf")
            predict_save_spectra(input_file, model_name, 30, output_file)

Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 45450


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 46/46 [00:44<00:00,  1.03iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 55674


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 56/56 [00:54<00:00,  1.03iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 48048


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 49/49 [00:47<00:00,  1.03iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 80448


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 81/81 [01:17<00:00,  1.04iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 75421


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 76/76 [01:09<00:00,  1.10iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 79248


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 80/80 [01:11<00:00,  1.12iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 95984


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 96/96 [01:25<00:00,  1.13iteration/s]


Prosit model information: Prosit_2023_intensity_timsTOF
The number of inputs: 80858


Processing: 100%|███████████████████████████████████████████████████████████████████████████████| 81/81 [01:15<00:00,  1.07iteration/s]
