In [61]:
import pandas as pd 
from Bio import SeqIO
import argparse
import os 
from collections import defaultdict
import numpy as np
import timeit
import argparse
from pathlib import Path

In [3]:
def get_pep_ids_df(fa_path):
    """Load a peptide FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fa_path : str or path-like
        FASTA file handed to ``SeqIO.parse(fa_path, 'fasta')``.

    Returns
    -------
    pandas.DataFrame
        One row per record, in file order, with the columns
        ``'protein id'`` (the record id) and ``'sequence'`` (the record
        sequence converted to ``str``). An empty FASTA gives an empty
        frame that still carries both columns.

    Raises
    ------
    ValueError
        If the same sequence appears in more than one record.
    """
    pep_to_id = {}
    for record in SeqIO.parse(fa_path, 'fasta'):
        sequence = str(record.seq)
        # Compare plain strings: the dict keys are str, so the duplicate
        # check must not depend on how the sequence object hashes/compares.
        if sequence in pep_to_id:
            raise ValueError(
                f'duplicate peptide sequence {sequence!r} '
                f'(records {pep_to_id[sequence]!r} and {record.id!r})'
            )
        pep_to_id[sequence] = record.id

    return pd.DataFrame({
        'protein id': list(pep_to_id.values()),
        'sequence': list(pep_to_id.keys()),
    })


In [5]:
def reconstruct_experiment_FDR(select_rows_pipeline, df_search, save_folder, sample, create_sample_subfolder):
    """Write one tab-separated file per experiment with its selected rows.

    Parameters
    ----------
    select_rows_pipeline : mapping
        Experiment id -> iterable of row labels to pick from ``df_search``
        (labels of the frame after ``reset_index()``).
    df_search : pandas.DataFrame
        Table to select rows from. Its index is reset first, so the former
        index is written out as a regular column.
    save_folder : str or path-like
        Base output folder. A leading ``~`` is expanded to the user's home.
    sample : str
        Sample name; used as a sub-folder of ``save_folder``.
    create_sample_subfolder : str
        Optional extra sub-folder below the sample folder; a falsy value
        (e.g. ``''``) means no extra level.

    Returns
    -------
    None
        Files are written to ``<save_folder>/<sample>[/<subfolder>]/tsearch-<experiment_id>.txt``.
    """
    # The target folder does not depend on the experiment: resolve it once.
    # expanduser() is needed because neither os.path.join nor mkdir expands '~'.
    out_dir = Path(save_folder).expanduser() / sample
    if create_sample_subfolder:
        out_dir = out_dir / create_sample_subfolder

    df_search_i = df_search.reset_index()
    for experiment_id, row_ids in select_rows_pipeline.items():
        print(f'.....{experiment_id}')

        # Sort the labels so the output order is reproducible instead of
        # following set iteration order.
        df_experiment = df_search_i.loc[sorted(row_ids)].drop_duplicates()

        out_dir.mkdir(parents=True, exist_ok=True)
        path_save = os.path.join(out_dir, f'tsearch-{experiment_id}.txt')
        print(path_save)
        df_experiment.to_csv(path_save, sep='\t', index=False)

In [6]:
def select_search_result(id_to_exp, id_to_SearchRow):
    """Collect, per experiment, the rows belonging to its peptides.

    Parameters
    ----------
    id_to_exp : mapping
        Peptide id -> iterable of experiment ids.
    id_to_SearchRow : mapping
        Peptide id -> iterable of row indices. Peptides that are missing
        or map to an empty collection are skipped.

    Returns
    -------
    collections.defaultdict
        Experiment id -> ``set`` of row indices (union over all peptides
        listed for that experiment). Experiments whose peptides have no
        rows do not appear.
    """
    select_rows = defaultdict(set)
    for pep_idx, exp_list in id_to_exp.items():
        # .get() rather than [] so that a defaultdict argument is not filled
        # with empty entries as a side effect, and a plain dict with a
        # missing peptide does not raise KeyError.
        peptide_rows = id_to_SearchRow.get(pep_idx)
        if not peptide_rows:
            continue
        for experiment in exp_list:
            select_rows[experiment].update(peptide_rows)
    return select_rows

In [7]:
def search_result_rows(df_search):
    """Index the rows of a table by the peptide ids named in ``'protein id'``.

    Each ``'protein id'`` entry is split on ``','``; parts that do not
    contain ``'pepID'`` are ignored. For the remaining parts the peptide id
    is the integer between the first ``'-'`` and an optional ``'('``.

    Parameters
    ----------
    df_search : pandas.DataFrame or mapping
        Anything where ``df_search['protein id']`` is an iterable of strings.

    Returns
    -------
    collections.defaultdict
        Peptide id (``int``) -> list of 0-based row positions, in row order.

    Raises
    ------
    ValueError
        If an entry is not a string (e.g. NaN for a missing value).
    """
    id_to_row = defaultdict(list)
    for i, idx in enumerate(df_search['protein id']):
        # A missing value is a float NaN, which has no .split(); an identity
        # test against np.nan is not reliable, so check the type and stop
        # with the intended message instead of an AttributeError.
        if not isinstance(idx, str):
            raise ValueError(
                'ERROR: Search not successful on all fractions of sample. Please RERUN '
                f'(row {i} has protein id {idx!r})'
            )
        for name_ in idx.split(','):
            if 'pepID' not in name_:
                continue
            pep_ix = int(name_.split('-')[1].split('(')[0])
            id_to_row[pep_ix].append(i)
    return id_to_row


In [8]:
def reader_experiments(list_experiments):
    """Read a text file of paths and key each path by a short sample name.

    The key is derived from the path's basename: take the second
    ``'_'``-separated field, then keep its first three ``'-'``-separated
    parts joined by ``'-'``.

    Parameters
    ----------
    list_experiments : str or path-like
        Text file with one path per line. Blank lines are ignored.

    Returns
    -------
    dict
        Short sample name -> path (whitespace-stripped). If two paths give
        the same key, the later line wins.

    Raises
    ------
    IndexError
        If a non-blank line's basename contains no ``'_'``.
    """
    path_dict = {}
    with open(list_experiments, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                # An empty line has no basename to parse; skip it rather
                # than fail on the '_' split below.
                continue
            sample_name = os.path.basename(path).split('_')[1]
            sample_short = '-'.join(sample_name.split('-')[0:3])
            path_dict[sample_short] = path
    return path_dict


In [9]:
def experiments_maps(path):
    """Build the peptide/experiment lookup tables from a tab-separated file.

    The file must provide the columns ``peptide_id``, ``peptide_sequence``
    and ``experiment_ids`` (a ``';'``-separated string).

    Returns a 3-tuple:
      * dict peptide id -> peptide sequence
      * dict peptide id -> list of experiment ids
      * defaultdict(list) experiment id -> list of peptide ids
    """
    table = pd.read_csv(path, sep='\t')

    id_to_pep, id_to_exp = {}, {}
    for record in table.to_dict(orient='records'):
        pep_id = record['peptide_id']
        id_to_pep[pep_id] = record['peptide_sequence']
        id_to_exp[pep_id] = record['experiment_ids'].split(';')

    # Invert peptide -> experiments into experiment -> peptides.
    exp_to_id = defaultdict(list)
    for pep_id, experiments in id_to_exp.items():
        for exp_name in experiments:
            exp_to_id[exp_name].append(pep_id)

    return id_to_pep, id_to_exp, exp_to_id

In [59]:
# INPUTS
# Hard-coded run configuration read by the cells below.
# FASTA file of peptides; parsed by get_pep_ids_df.
# NOTE(review): this path sits inside the TCGA-61-2008 folder, so it presumably
# has to change together with `samples` — confirm.
fa_path= '/cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-61-2008/trypsine_digest/peptide-extracted-filter-unique.fasta'
#ETH_tryptic_peptides = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-61-2008/trypsine_digest/
# Text file with one path per line; parsed by reader_experiments.
list_experiments='/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/current_OHSU_experiments_per_peptides_list.txt'
# Sample names to process.
samples = ['TCGA-61-2008']
# Optional extra folder level below <save_folder>/<sample>; '' means none.
create_sample_subfolder = ''
# Base output folder. This is the literal string '~'; nothing in this cell
# expands it to the home directory.
save_folder = '~'

In [52]:
# Binds `sample` for the cells below: once the loop finishes it holds the last
# entry of `samples`.
# NOTE(review): the loop body is empty, so the following cells run for that one
# sample only; with an empty `samples` list, `sample` is never defined.
for sample in samples:
    pass

In [53]:
# Short sample name -> path, as read from the `list_experiments` file.
exp_all = reader_experiments(list_experiments)

In [54]:
# Driver for a single sample. Relies on `fa_path`, `sample`, `exp_all`,
# `save_folder` and `create_sample_subfolder` set in the cells above.
print('...read tryptic peptides')
# One row per FASTA record: columns 'protein id' and 'sequence'.
df_pep_sample = get_pep_ids_df(fa_path)

print('...process experiment map')
# Lookup tables read from this sample's experiments file.
id_to_pep, id_to_exp, exp_to_id = experiments_maps(exp_all[sample])
print(len(id_to_exp))

print('...extract rows IDS corresponding to peptides')
# Peptide id -> row positions in df_pep_sample (the FASTA-derived table).
id_to_SearchRow = search_result_rows(df_pep_sample)
print(len(id_to_SearchRow))

print('...extract rows needed to reconstruct experiments')
# Experiment id -> set of df_pep_sample row positions.
select_rows = select_search_result(id_to_exp, id_to_SearchRow)
print(len(select_rows))


print('...save experiments')
# Writes one tsearch-<experiment_id>.txt file per experiment.
reconstruct_experiment_FDR(select_rows, df_pep_sample, save_folder, sample, create_sample_subfolder=create_sample_subfolder)

...read tryptic peptides


In [None]:
# Command-line entry point.
# NOTE(review): `tide_pipeline_split` is not defined in any visible cell, and
# the four options below do not match the inputs used by the cells above
# (fa_path, list_experiments, samples, save_folder). This block looks like it
# was carried over from another script — confirm before relying on it.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='separates a joint reindexed file into one file per pipeline with the original indexes')
    parser.add_argument("--file-joint", help='tide search file on union of pipelines to separate in two pipelines')
    parser.add_argument("--map-eth-file", help='file for eth containing the mapping table between original ids and shared ids')
    parser.add_argument("--map-ohsu-file",help='file for ohsu containing the mapping table between original ids and shared ids')
    parser.add_argument("--save-folder",help='base folder to save results')
    args = parser.parse_args()
    print(args)
    # Forwards the four parsed options positionally.
    tide_pipeline_split(args.file_joint, args.map_eth_file, args.map_ohsu_file, args.save_folder)

In [11]:
# SAVE 
# Write the wrapper
# Write the command line