In [17]:
import os 
from pathlib import Path
import pandas as pd
from collections import defaultdict
import glob
import timeit
import numpy as np

In [18]:
### INPUTS ### (WRITE PARSER)
# Text file read by reader_experiments(): one experiment-file path per line.
list_experiments = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Oct2023_data/OHSU_experiments_per_peptides_list.txt'
# list_experiments_eth = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/ETH_Oct2023_data/ETH_experiments_per_peptides_list.txt'
# eth_or_ohsu = 'joint_search' # which experiment file to use
# Glob pattern handed to reader_tide_results(), which appends '*/tide-search.txt' to it.
search_out_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/*/tide_search'

# Base output folder for reconstruct_experiment() (its call is currently commented out).
save_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU'
# Read as a global by reconstruct_experiment(): when True, files are written to
# <save_folder>/<sample>/tide_search_per_experiment, otherwise directly to <save_folder>.
create_sample_subfolder = True
# Value meant for reconstruct_experiment(rerank=...); only used by the commented-out call.
rerank_psm = True

In [19]:
def reader_tide_results(search_out_folder):
    '''Collect the tide-search.txt result files, grouped by sample.

    search_out_folder: path (may contain glob wildcards); every
        `<search_out_folder>/*/tide-search.txt` file is considered.
    Returns a defaultdict(list) mapping the sample name (4th path component
    from the end) to the list of matching result paths. Paths containing
    'fA' or 'POOL' are left out.
    '''
    pattern = os.path.join(search_out_folder, '*', 'tide-search.txt')
    results_by_sample = defaultdict(list)
    for result_path in glob.glob(pattern):  # samples, partitions
        sample_name = result_path.split('/')[-4]
        is_calibration = 'fA' in result_path or 'POOL' in result_path
        if not is_calibration:  # calibration files are excluded
            results_by_sample[sample_name].append(result_path)
    return results_by_sample

In [20]:
def reader_experiments(list_experiments):
    '''Read a file listing one experiment-file path per line.

    list_experiments: path to the text file with the path list.
    Returns a dict mapping the short sample name to the listed path. The short
    name is built from the second '_'-separated field of the file's basename,
    keeping its first three '-'-separated parts. If two lines give the same
    short name, the later line wins.
    Blank lines are skipped.
    '''
    path_dict = {}
    with open(list_experiments, 'r') as f:
        for line in f:
            exp_path = line.strip()
            if not exp_path:
                # A blank/trailing empty line has no '_' field and would raise IndexError
                continue
            sample_name = os.path.basename(exp_path).split('_')[1]
            sample_short = '-'.join(sample_name.split('-')[0:3])
            path_dict[sample_short] = exp_path
    return path_dict


In [21]:
def experiments_maps(path):
    '''Build the peptide/experiment lookup maps from a tab-separated file.

    The file must provide the columns 'peptide_id', 'peptide_sequence' and
    'experiment_ids' (a ';'-separated string).
    Returns (id_to_pep, id_to_exp, exp_to_id):
        id_to_pep: peptide_id -> peptide_sequence
        id_to_exp: peptide_id -> list of experiment ids
        exp_to_id: experiment id -> list of peptide_ids (defaultdict(list))
    '''
    table = pd.read_csv(path, sep='\t')
    pep_ids = table['peptide_id'].tolist()

    id_to_pep = dict(zip(pep_ids, table['peptide_sequence'].tolist()))
    id_to_exp = {
        pep_id: exp_field.split(';')
        for pep_id, exp_field in zip(pep_ids, table['experiment_ids'].tolist())
    }

    # Invert the peptide -> experiments map
    exp_to_id = defaultdict(list)
    for pep_id, experiments in id_to_exp.items():
        for exp_name in experiments:
            exp_to_id[exp_name].append(pep_id)
    return id_to_pep, id_to_exp, exp_to_id

In [34]:
def search_result_rows(df_search):
    '''Map peptide IDs to the positions of the search rows that mention them.

    df_search: search-result table with a 'protein id' column holding
        comma-separated names.
    Returns a defaultdict(list): integer peptide ID -> list of positional row
    numbers (0-based, in iteration order of the column, not index labels).
    Only names containing 'pepID' are used; the ID is the text after the first
    '-' in the name, with any '(1)' removed.
    Raises ValueError if a row has a missing 'protein id'.
    '''
    id_to_row = defaultdict(list)
    for i, idx in enumerate(df_search['protein id']):
        # pd.isna catches every NaN/None; an identity test against np.nan does not,
        # and a missing value cannot be split below anyway.
        if pd.isna(idx):
            raise ValueError('ERROR: Search not successful on all fractions of sample. '
                             f'Please RERUN (missing protein id at row {i})')
        for name_ in idx.split(','):
            if 'pepID' not in name_:
                continue
            pep_ix = int(name_.split('-')[1].replace('(1)', ''))
            id_to_row[pep_ix].append(i)
    return id_to_row

In [29]:
def select_search_result(id_to_exp, id_to_SearchRow):
    '''Collect, per experiment, the search rows of all its peptides.

    id_to_exp: peptide ID -> list of experiment ids.
    id_to_SearchRow: peptide ID -> list of search-result row numbers.
    Returns a defaultdict(set): experiment id -> set of row numbers.
    Peptides without any search row contribute nothing, and
    id_to_SearchRow is not modified.
    '''
    select_rows = defaultdict(set)
    for pep_idx, exp_list in id_to_exp.items():
        # .get() avoids inserting empty entries into a defaultdict input and
        # also accepts a plain dict; the lookup does not depend on the experiment.
        peptide_rows = id_to_SearchRow.get(pep_idx)
        if not peptide_rows:
            continue
        for experiment in exp_list:
            select_rows[experiment].update(peptide_rows)
    return select_rows


In [30]:
def reconstruct_experiment(select_rows_pipeline, df_search, save_folder, sample, rerank=True,
                           create_subfolder=None):
    '''Write one tab-separated search-result file per experiment.

    select_rows_pipeline: experiment id -> collection of positional row numbers
        of df_search belonging to that experiment.
    df_search: search-result table. It is re-indexed with reset_index(), so the
        row numbers are positions; the previous index is kept as a column.
    save_folder: base output folder.
    sample: sample name, used for the sub-folder.
    rerank: when True, 'xcorr rank' is recomputed inside each experiment from
        'xcorr score' (descending, ties by row order), grouped by
        'original target sequence', 'file' and 'scan'.
    create_subfolder: when True, write to
        <save_folder>/<sample>/tide_search_per_experiment, otherwise directly to
        <save_folder>. None (default) falls back to the module-level
        `create_sample_subfolder` switch, or True if that is not defined.

    Each file is named tsearch-<experiment id>.txt. Nothing is returned.
    '''
    if create_subfolder is None:
        # Backward compatible with callers relying on the notebook-level switch
        create_subfolder = globals().get('create_sample_subfolder', True)

    if create_subfolder:
        out_dir = os.path.join(save_folder, sample, 'tide_search_per_experiment')
    else:
        out_dir = os.path.join(save_folder)

    df_search_i = df_search.reset_index()
    for experiment_id, row_ids in select_rows_pipeline.items():
        print(f'.....{experiment_id}')

        # .loc needs a list-like (sets are rejected by pandas >= 2.0); sorting keeps
        # the rows in their original order. copy() so the column assignment below
        # acts on an independent frame rather than a slice of df_search_i.
        df_experiment = df_search_i.loc[sorted(row_ids)].copy()

        if rerank:
            df_experiment['xcorr rank'] = (
                df_experiment
                .groupby(['original target sequence', 'file', 'scan'])['xcorr score']
                .rank(method='first', ascending=False)
            )

        df_experiment = df_experiment.drop_duplicates()

        Path(out_dir).mkdir(parents=True, exist_ok=True)
        path_save = os.path.join(out_dir, f'tsearch-{experiment_id}.txt')
        print(path_save)
        df_experiment.to_csv(path_save, sep='\t', index=False)

In [31]:
# NOTE(review): this cell assigns the same names again as the INPUTS cell near the top of the notebook.
# Text file read by reader_experiments(): one experiment-file path per line.
list_experiments = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Oct2023_data/OHSU_experiments_per_peptides_list.txt'
# list_experiments_eth = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/ETH_Oct2023_data/ETH_experiments_per_peptides_list.txt'
# eth_or_ohsu = 'joint_search' # which experiment file to use
# Glob pattern handed to reader_tide_results(), which appends '*/tide-search.txt' to it.
search_out_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/*/tide_search'

# Base output folder for reconstruct_experiment() (its call is currently commented out).
save_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU'
# Read as a global by reconstruct_experiment() to choose the output sub-folder layout.
create_sample_subfolder = True
# Value meant for reconstruct_experiment(rerank=...); only used by the commented-out call.
rerank_psm = True

In [33]:
# def psm_to_experiments(list_experiments, search_out_folder, save_folder, create_sample_subfolder, rerank_psm):
# Driver cell: for each sample with a complete set of partitions, load and concatenate
# its search results, map peptide IDs to rows and load the experiment maps.
# The selection and saving steps are currently commented out.
exp_all = reader_experiments(list_experiments)
search_res = reader_tide_results(search_out_folder)

# NOTE(review): presumably the number of partitions of a fully searched sample - confirm.
n_partitions_expected = 24
n_samples_process = 10  # stop after this many samples have been processed
for sample, partitions in search_res.items():
    if len(partitions) == n_partitions_expected:
        if sample not in exp_all:
            # exp_all[sample] below would raise KeyError for a sample missing from the list
            print(f'skip {sample}: no experiment file listed')
            continue
        print(sample)
        n_samples_process -= 1
#         print(n_samples_process)

        print('...read search result')
        df_search = pd.concat([pd.read_csv(part, sep = '\t') for part in partitions])

        print('...extract rows IDS corresponding to peptides')
        id_to_SearchRow = search_result_rows(df_search)

        print('...process experiment map')
        id_to_pep, id_to_exp, exp_to_id = experiments_maps(exp_all[sample])


#         print('...select experiment rows')
#         select_rows = select_search_result(id_to_exp, id_to_SearchRow)


#         print('...save experiments')
#         reconstruct_experiment(select_rows, df_search, save_folder, sample, rerank=rerank_psm)


        if n_samples_process < 1:
            break

#     else: 
#         print(f'skip {sample}')


TCGA-25-1319
9
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-24-2298
8
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-A2-A0SX
7
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-BH-A18V
6
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-AO-A0JM
5
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-25-1313
4
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-61-2008
3
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-A2-A0D2
2
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-24-1431
1
...read search result
...extract rows IDS corresponding to peptides
...process experiment map
TCGA-C8-A12P
0
...r

In [35]:
# Preview the first rows of the search table left in df_search by the processing loop.
df_search.head()

Unnamed: 0,file,scan,charge,spectrum precursor m/z,spectrum neutral mass,peptide mass,delta_cn,delta_lcn,xcorr score,b/y ions matched,...,b/y ions fraction,xcorr rank,distinct matches/spectrum,sequence,modifications,unmodified sequence,protein id,flanking aa,target/decoy,original target sequence
0,/cluster/work/grlab/projects/TCGA/PanCanAtlas/...,4484,2,329.6764,657.3383,657.3568,0.000634,0.000634,0.034609,2,...,0.2,1,2,GAAAQP,1_S_144.1021_n,GAAAQP,GAPGGR_GAAAQP_0.2857_657.3680_657.3568_0.0112_...,--,target,GAAAQP
1,/cluster/work/grlab/projects/TCGA/PanCanAtlas/...,4484,2,329.6764,657.3383,657.3568,0.0,0.0,0.033975,2,...,0.2,2,2,GAAQAP,1_S_144.1021_n,GAAQAP,decoy_GAPGGR_GAAAQP_0.2857_657.3680_657.3568_0...,--,decoy,GAAAQP
2,/cluster/work/grlab/projects/TCGA/PanCanAtlas/...,5809,2,330.6873,659.3601,659.3836,0.006463,0.084185,0.4366,6,...,0.6,1,5,AAAGAR,1_S_144.1021_n,AAAGAR,decoy_AGAAAR_AAAAGR_0.2857_659.3836_659.3836_0...,--,decoy,AAAAGR
3,/cluster/work/grlab/projects/TCGA/PanCanAtlas/...,5809,2,330.6873,659.3601,659.3836,0.010108,0.077722,0.430137,5,...,0.5,2,5,AAGAAR,1_S_144.1021_n,AAGAAR,AGAAAR_AAGAAR_0.8571_659.3836_659.3836_0.0000_...,--,target,AAGAAR
4,/cluster/work/grlab/projects/TCGA/PanCanAtlas/...,5809,2,330.6873,659.3601,659.3836,0.046867,0.067614,0.420029,4,...,0.4,3,5,AGAAAR,1_S_144.1021_n,AGAAAR,AGAAAR_AGAAAR_1.0_659.3836_659.3836_0.0000_0.0...,--,target,AGAAAR


In [43]:
# Top-level copy of the search_result_rows() loop, so that the loop variables
# (i, idx, name_, pep_ix) remain available in the kernel for inspection.
id_to_row = defaultdict(list)
for i, idx in enumerate(df_search['protein id']):
    # pd.isna catches every NaN/None; a missing value cannot be split below,
    # so stop here with a clear message instead of an AttributeError.
    if pd.isna(idx):
        raise ValueError('ERROR: Search not successful on all fractions of sample. '
                         f'Please RERUN (missing protein id at row {i})')
    for name_ in idx.split(','):
        if 'pepID' not in name_:
            continue
        pep_ix = int(name_.split('-')[1].replace('(1)', ''))
        id_to_row[pep_ix].append(i)


In [51]:
# Show the tide-search.txt paths of the sample most recently iterated by the processing loop.
partitions

['/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f16/tide-search.txt',
 '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f12/tide-search.txt',
 '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f04/tide-search.txt',
 '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f02/tide-search.txt',
 '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f07/tide-search.txt',
 '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/OHSU/TCGA-C8-A12P/tide_search/TCGA_C8-A12P_BH-A0C1_A2-A0EY_117C_W_BI_20130622_H-PM_f19/tide-search.txt',
 '/cluster

In [49]:
# Print every row position and 'protein id' value that mentions 'pepID'.
for a, b in enumerate(df_search['protein id']):
    # Missing values are floats (NaN); `in` on them would raise TypeError
    if isinstance(b, str) and 'pepID' in b:
        print(a, b)

In [41]:
# Re-evaluate the ID-parsing expression on whatever `name_` currently holds in the kernel.
# NOTE(review): the recorded result '17.0378' is not an integer literal, so int() on it would raise ValueError.
name_.split('-')[1].replace('(1)', '')

'17.0378'

In [44]:
# Number of distinct peptide IDs collected in id_to_row (the recorded output is 0).
len(id_to_row)

0

In [None]:
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description='Takes results from tide search and splits them between experimental conditions')
#     parser.add_argument("--list-experiments", help='file containing the paths to the experiment files per sample')
#     parser.add_argument("--search-out-folder",help='path (with wildcards) of the tide search results')
#     parser.add_argument("--save-folder",help='base folder to save results')
#     parser.add_argument("--create-sample-subfolder", default=True, action='store_false', 
#                         help='whether to create a subfolder with the sample name when saving')
#     parser.add_argument("--rerank-psm", default=True, 
#                         action='store_false',  
#                         help='whether to apply re-ranking of the psm within condition and partition')
#     args = parser.parse_args()
#     psm_to_experiments(args.list_experiments, args.search_out_folder, args.save_folder,
#                        args.create_sample_subfolder, args.rerank_psm)

In [24]:
# n_samples_process = 1 #TEMPORARY
# for sample, partitions in search_res.items():
#     if len(partitions) == 24:
#         print(sample)
#         n_samples_process -= 1
#         print(n_samples_process)
#         print('...read search result')
#         df_search = pd.concat([pd.read_csv(part, sep = '\t') for part in partitions])
#         print('...extract rows IDS corresponding to peptides')
#         id_to_SearchRow = search_result_rows(df_search)
        
#         print('...process experiment map ohsu')
#         id_to_pep_ohsu, id_to_exp_ohsu, exp_to_id_ohsu = experiments_maps(exp_ohsu[sample])
# #         print('...process experiment map eth')
# #         id_to_pep_eth, id_to_exp_eth, exp_to_id_eth = experiments_maps(exp_eth[sample])
        
#         print('...select experiment rows ohsu')
#         select_rows_ohsu = select_search_result(id_to_exp_ohsu, id_to_SearchRow)
# #         print('...select experiment rows eth')
# #         select_rows_eth = select_search_result(id_to_exp_eth, id_to_SearchRow)
        
#         print('...save experiments ohsu')
#         reconstruct_experiment(select_rows_ohsu, df_search, save_folder, sample)
# #         print('...save experiments eth')
# #         reconstruct_experiment(select_rows_eth, df_search, save_folder, sample)

# #     if n_samples_process < 1:
# #         break
        
#     else: 
#         print(f'skip {sample}')
    

TCGA-25-1319
0
...read search result
...extract rows IDS corresponding to peptides
...process experiment map ohsu
...process experiment map eth
...select experiment rows ohsu
...select experiment rows eth
TCGA-24-2298
-1
...read search result
...extract rows IDS corresponding to peptides
...process experiment map ohsu
...process experiment map eth
...select experiment rows ohsu
...select experiment rows eth
skip TCGA-A2-A0SX
TCGA-BH-A18V
-2
...read search result
...extract rows IDS corresponding to peptides


AttributeError: 'float' object has no attribute 'split'

## TEST


In [None]:
# Experiment ID used by the spot-check cells of this TEST section.
experiment = 'J0251XGC'

In [None]:
# NOTE(review): `exp_to_id_ohsu` is only assigned in a commented-out cell of this notebook,
# so this fails on a fresh kernel - confirm which map is intended.
exp_to_id_ohsu[experiment] # Protein IDS in initial exp

In [None]:
# Print the value stored in id_to_pep_ohsu for every ID listed for the experiment.
# NOTE(review): both `*_ohsu` maps are only assigned in a commented-out cell of this notebook - confirm.
for idx in exp_to_id_ohsu[experiment]:# Protein SEQ in initial exp
    print(id_to_pep_ohsu[idx])

In [None]:
# Shape of the rows selected for the experiment.
# The stored row IDs are positions (enumerate order), while df_search is a concat of
# partitions with repeated index labels, so select by position; sets are not valid indexers.
df_search.iloc[sorted(select_rows_ohsu[experiment])].shape

In [None]:
# Row IDs are positions, not index labels (df_search has repeated labels after concat),
# hence .iloc with a sorted list instead of .loc with a set.
for i in df_search.iloc[sorted(select_rows_ohsu[experiment])]['protein id'].unique(): # Protein IDs in validated EXP
    print(i)

In [None]:
# Row IDs are positions, not index labels (df_search has repeated labels after concat),
# hence .iloc with a sorted list instead of .loc with a set.
df_search.iloc[sorted(select_rows_ohsu[experiment])]['unmodified sequence'].unique()  # Protein SEQ in validated EXP

In [None]:
# Distinct (sequence, protein id) pairs among the selected rows.
# Row IDs are positions, not index labels (df_search has repeated labels after concat),
# hence .iloc with a sorted list instead of .loc with a set.
df_search.iloc[sorted(select_rows_ohsu[experiment])][['unmodified sequence', 'protein id']].drop_duplicates()

In [None]:
# some entries are missing! These are decoys

In [None]:
# List the column names of the search table.
df_search.columns