In [53]:
import os 
from pathlib import Path
import pandas as pd
from collections import defaultdict
import glob
import timeit
import argparse
import numpy as np

In [54]:
def reader_FDR_results(search_out_folder):
    '''Map each sample name to its `assign-confidence.target.txt` file.

    `search_out_folder` is handed to `glob`, so it may contain wildcards.
    The sample name is the third-from-last '/'-separated component of each
    matched path, i.e. the parent of the directory that holds the file.
    '''
    pattern = os.path.join(search_out_folder, 'assign-confidence.target.txt')
    # If two matches share the same sample component, the later match wins.
    return {hit.split('/')[-3]: hit for hit in glob.glob(pattern)}


def reader_experiments(list_experiments):
    '''Read a text file listing one path per line and index the paths by sample.

    The key for each path is derived from the path's basename: the second
    '_'-separated field is taken, and its first three '-'-separated parts
    are re-joined with '-'.

    Blank (or whitespace-only) lines are skipped; previously such a line
    produced an empty basename and raised IndexError.

    Parameters
    ----------
    list_experiments : str
        Path to the text file containing one path per line.

    Returns
    -------
    dict
        Derived sample key -> stripped path. If several paths yield the same
        key, the last one in the file wins.

    Raises
    ------
    IndexError
        If a non-blank line has a basename without any '_' separator.
    '''
    path_dict = {}
    with open(list_experiments, 'r') as f:
        for line in f:
            exp_path = line.strip()
            if not exp_path:
                # Tolerate empty / trailing lines in the list file.
                continue
            sample_name = os.path.basename(exp_path).split('_')[1]
            sample_short = '-'.join(sample_name.split('-')[0:3])
            path_dict[sample_short] = exp_path
    return path_dict


def experiments_maps(path):
    '''Build peptide/experiment lookup tables from a tab-separated file.

    The file must provide the columns `peptide_id`, `peptide_sequence` and
    `experiment_ids`; `experiment_ids` holds ';'-separated experiment ids.

    Parameters
    ----------
    path : str
        Path to the tab-separated table.

    Returns
    -------
    id_to_pep : dict
        peptide_id -> peptide_sequence.
    id_to_exp : dict
        peptide_id -> list of experiment ids (the split `experiment_ids`).
    exp_to_id : collections.defaultdict(list)
        experiment id -> list of peptide_ids that reference it.

    If a peptide_id occurs on several rows, the last row wins in all three
    mappings.
    '''
    df = pd.read_csv(path, sep='\t')
    id_to_pep = {}
    id_to_exp = {}
    # Iterate the three columns in lock-step instead of DataFrame.iterrows(),
    # which materialises a Series per row and is very slow on large tables.
    for pep_id, pep_seq, exp_ids in zip(df['peptide_id'],
                                        df['peptide_sequence'],
                                        df['experiment_ids']):
        id_to_pep[pep_id] = pep_seq
        id_to_exp[pep_id] = exp_ids.split(';')

    # Invert id_to_exp only after it is complete so that duplicated
    # peptide ids contribute just their final experiment list.
    exp_to_id = defaultdict(list)
    for pep_id, experiments in id_to_exp.items():
        for experiment in experiments:
            exp_to_id[experiment].append(pep_id)
    return id_to_pep, id_to_exp, exp_to_id


def search_result_rows(df_search):
    '''Map peptide ids to the positions of the search rows that mention them.

    Each value of the `protein id` column is a ','-separated list of names.
    Names containing 'pepID' are parsed as `<prefix>-<number>` (an optional
    '(1)' suffix is removed from the number); all other names are ignored.

    Parameters
    ----------
    df_search : pandas.DataFrame
        Search result table with a `protein id` column.

    Returns
    -------
    collections.defaultdict(list)
        Integer peptide id -> list of 0-based row positions (enumeration
        order of the `protein id` column) in which that id appears.

    Raises
    ------
    ValueError
        If a `protein id` value is missing (NaN/None).
    '''
    id_to_row = defaultdict(list)
    for i, idx in enumerate(df_search['protein id']):
        # pd.isna catches every missing-value flavour; the former
        # `idx is np.nan` identity test could miss NaNs and, when it did
        # match, execution still fell through to `.split` and crashed.
        if pd.isna(idx):
            raise ValueError(
                'ERROR: Search not successful on all fractions of sample. '
                f'Please RERUN (missing "protein id" at row {i})'
            )
        for name_ in idx.split(','):
            if 'pepID' not in name_:
                continue
            pep_ix = int(name_.split('-')[1].replace('(1)', ''))
            id_to_row[pep_ix].append(i)
    return id_to_row


def select_search_result(id_to_exp, id_to_SearchRow):
    '''Collect, per experiment, the search rows of all its peptides.

    Parameters
    ----------
    id_to_exp : dict
        peptide id -> list of experiment ids.
    id_to_SearchRow : mapping
        peptide id -> list of search-result row positions. Peptides that
        are absent (or map to an empty list) contribute nothing.

    Returns
    -------
    collections.defaultdict(set)
        experiment id -> set of row positions. Experiments for which no
        peptide has any search row do not appear as keys.
    '''
    select_rows = defaultdict(set)
    for pep_idx, exp_list in id_to_exp.items():
        # .get() rather than [] so a defaultdict argument is not filled with
        # empty entries for unmatched peptides, and a plain dict works too.
        peptide_rows = id_to_SearchRow.get(pep_idx)
        if not peptide_rows:
            continue
        for experiment in exp_list:
            select_rows[experiment].update(peptide_rows)
    return select_rows


def reconstruct_experiment_FDR(select_rows_pipeline, df_search, save_folder, sample, create_sample_subfolder=True):
    '''Write one tab-separated file per experiment with its selected search rows.

    Parameters
    ----------
    select_rows_pipeline : mapping
        experiment id -> iterable (set or list) of row positions to keep.
    df_search : pandas.DataFrame
        Full search result table. Its index is reset first, so the row
        positions refer to 0-based row order, and the previous index is
        kept as an extra leading column in the written files.
    save_folder : str
        Root output directory.
    sample : str
        Sample name; only used when `create_sample_subfolder` is True.
    create_sample_subfolder : bool, default True
        If True, write into `<save_folder>/<sample>/assign_conf_pooled_FDR`,
        otherwise directly into `save_folder`.

    Each file is named `tsearch-<experiment id>.txt`. The experiment id and
    the output path are printed as progress information.
    '''
    df_search_i = df_search.reset_index()

    # The destination directory does not depend on the experiment.
    if create_sample_subfolder:
        out_dir = os.path.join(save_folder, sample, 'assign_conf_pooled_FDR')
    else:
        out_dir = os.path.join(save_folder)

    for experiment_id, rows in select_rows_pipeline.items():
        print(f'.....{experiment_id}')

        # `.loc` no longer accepts a set as indexer (deprecated in pandas 1.5,
        # an error since 2.0). A sorted list also makes the row order of the
        # output deterministic.
        df_experiment = df_search_i.loc[sorted(rows)]
        df_experiment = df_experiment.drop_duplicates()

        # Created lazily so that nothing is touched when there is no experiment.
        Path(out_dir).mkdir(parents=True, exist_ok=True)
        path_save = os.path.join(out_dir, f'tsearch-{experiment_id}.txt')
        print(path_save)
        df_experiment.to_csv(path_save, sep='\t', index=False)


In [55]:
# Text file with one path per line; it is parsed by reader_experiments.
list_experiments = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Oct2023_data/OHSU_experiments_per_peptides_list.txt'
# list_experiments_eth = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/ETH_Oct2023_data/ETH_experiments_per_peptides_list.txt'
# eth_or_ohsu = 'joint_search' #which experiment file to use
# Glob pattern (note the `*` path component) under which reader_FDR_results
# looks for `assign-confidence.target.txt` files.
search_out_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/*/tide_search'

# Root directory handed to reconstruct_experiment_FDR for the per-experiment files.
save_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU'
# True: write to <save_folder>/<sample>/assign_conf_pooled_FDR; False: write directly into save_folder.
create_sample_subfolder = True
# NOTE(review): rerank_psm is not referenced by any cell visible here — confirm whether it is still needed.
rerank_psm = False

In [56]:
# Driver cell: for each sample with a search result, rebuild the per-experiment
# result files from the pooled search output.
# exp_all: sample key -> experiments table path; FDR_res: sample -> search result file.
exp_all = reader_experiments(list_experiments)
FDR_res = reader_FDR_results(search_out_folder)

# Cap on the number of samples handled by this cell: the counter is decremented
# once per sample and the loop stops as soon as it drops below 1.
n_samples_process = 1
for sample, FDR_file in FDR_res.items():
    print(sample)
    n_samples_process -= 1
    print(n_samples_process)

    print('...read FDR result')
    df_FDR = pd.read_csv(FDR_file, sep = '\t')
    print(df_FDR.shape)

    print('...extract rows IDS corresponding to peptides')
    # peptide id -> row positions in df_FDR
    id_to_SearchRow = search_result_rows(df_FDR)
    print(len(id_to_SearchRow))

    print('...process experiment map')
    # Raises KeyError if the sample has no entry in exp_all.
    id_to_pep, id_to_exp, exp_to_id = experiments_maps(exp_all[sample])
    print(len(id_to_exp))

    print('...select experiment rows')
    # experiment id -> set of df_FDR row positions
    select_rows = select_search_result(id_to_exp, id_to_SearchRow)
    print(len(select_rows))

    print('...save experiments')
    reconstruct_experiment_FDR(select_rows, df_FDR, save_folder, sample, create_sample_subfolder=create_sample_subfolder)


    # Stop once the configured number of samples has been processed.
    if n_samples_process < 1:
        break

TCGA-25-1319
0
...read FDR result
(173901, 17)
...extract rows IDS corresponding to peptides
49475
...process experiment map
145419
...select experiment rows
180
...save experiments
.....J0AN01PC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0AN01PC.txt
.....J0AN12PC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0AN12PC.txt
.....J0AN32PC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0AN32PC.txt
.....J0ANA2PC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0ANA2PC.txt
.....J0AN1XPC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0AN1XPC.txt
.....J0AN3XPC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0AN3XPC.txt
.....J0ANAXPC


.....J0A1AXPA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0A1AXPA.txt
.....J0A11APA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0A11APA.txt
.....J0A13APA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0A13APA.txt
.....J02101PA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02101PA.txt
.....J02112PA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02112PA.txt
.....J02132PA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02132PA.txt
.....J021A2PA
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J021A2PA.txt
.....J0211XPA
/cluster/work/grlab/projects/projects2020_OHSU/p

.....J02101GC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02101GC.txt
.....J02112GC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02112GC.txt
.....J02132GC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J02132GC.txt
.....J021A2GC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J021A2GC.txt
.....J0211XGC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0211XGC.txt
.....J0213XGC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0213XGC.txt
.....J0211AGC
/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-25-1319/assign_conf_pooled_FDR/tsearch-J0211AGC.txt
.....J0213AGC
/cluster/work/grlab/projects/projects2020_OHSU/p

# TEST

In [49]:
# Scratch check: compare the peptide ids found in a result table against a
# reference list of ids.
# NOTE(review): `split_exp` is not defined in any cell visible here, so this cell
# fails on a fresh kernel — confirm where it comes from (presumably one of the
# written tsearch-*.txt tables loaded as a DataFrame).
idx_list = set()
for name_ in split_exp['protein id']:
    for single in name_.split(','):
        if 'pepID' in  single:
            # Same id parsing as in search_result_rows.
            idx_list.add(int(single.split('-')[1].replace('(1)', '')))
            

# Reference values are read from the first column of a header-less file.
# NOTE(review): the content of tmp_file is not documented here — presumably the
# expected peptide ids for the experiment; verify.
foo = pd.read_csv('/cluster/work/grlab/projects/projects2020_OHSU/tmp_file', header = None)

full_exp = set(foo[0])


# Number of reference values that were not found among the parsed ids.
len(full_exp.difference(idx_list))

# Number of parsed ids that are absent from the reference values.
len(idx_list.difference(full_exp))