In [None]:
import os 
import pathlib
import pandas as pd
from collections import defaultdict
import glob

In [None]:
### INPUTS ### (TODO: replace these hard-coded values with a command-line parser)
# Text files listing one experiment-map path per line (read by reader_experiments).
list_experiments_ohsu = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/OHSU_Oct2023_data/OHSU_experiments_per_peptides_list.txt'
list_experiments_eth = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/ETH_Oct2023_data/ETH_experiments_per_peptides_list.txt'
eth_or_ohsu = 'joint_search' # which experiment file to use -- NOTE(review): not referenced by any cell visible in this notebook
search_out_folder = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics/tide_search_joint' # root folder; globbed as <sample>/<partition>/tide-search.txt

In [None]:
def reader_experiments(list_experiments):
    '''Read a text file that lists one experiment-map path per line.

    The sample key is taken from each path's file name: the second
    '_'-separated field of the basename, truncated to its first three
    '-'-separated parts (e.g. a field 'TCGA-25-1319-01A' gives the key
    'TCGA-25-1319').

    Parameters
    ----------
    list_experiments : str
        Path to the text file containing the list of paths.

    Returns
    -------
    dict
        Maps the short sample name to the (stripped) path. If two paths
        yield the same short name, the later line wins.

    Raises
    ------
    IndexError
        If a non-blank line has a basename without any '_' separator.
    '''
    path_dict = {}
    with open(list_experiments, 'r') as f:
        for line in f:
            path = line.strip()
            if not path:
                # Blank lines (typically a trailing newline at the end of the
                # list) carry no path and would otherwise raise IndexError.
                continue
            sample_name = os.path.basename(path).split('_')[1]
            sample_short = '-'.join(sample_name.split('-')[0:3])
            path_dict[sample_short] = path
    return path_dict


In [None]:
def experiments_maps(path):
    '''Build the peptide/experiment lookup tables from one experiment-map file.

    The file is read as a tab-separated table and must provide the columns
    'peptide_id', 'peptide_sequence' and 'experiment_ids' (the latter being
    a ';'-separated string).

    Returns
    -------
    tuple
        (id_to_pep, id_to_exp, exp_to_id) where
        id_to_pep maps peptide id -> peptide sequence,
        id_to_exp maps peptide id -> list of experiment ids,
        exp_to_id (a defaultdict(list)) maps experiment id -> peptide ids.
    '''
    table = pd.read_csv(path, sep = '\t')

    id_to_pep = {}
    id_to_exp = {}
    for record in table.to_dict(orient='records'):
        peptide_id = record['peptide_id']
        id_to_pep[peptide_id] = record['peptide_sequence']
        id_to_exp[peptide_id] = record['experiment_ids'].split(';')

    # Invert only after the first pass, so a repeated peptide id contributes
    # just its last row to the experiment -> peptide mapping.
    exp_to_id = defaultdict(list)
    for peptide_id, experiments in id_to_exp.items():
        for experiment_id in experiments:
            exp_to_id[experiment_id].append(peptide_id)

    return id_to_pep, id_to_exp, exp_to_id

In [None]:
def reader_tide_results(search_out_folder):
    '''Collect the tide-search result files of every sample.

    Files are looked up as <search_out_folder>/<sample>/<partition>/tide-search.txt.
    Entries whose sample/partition part contains 'fA' or 'POOL' are excluded
    (calibration files).

    Parameters
    ----------
    search_out_folder : str
        Root folder of the search output.

    Returns
    -------
    collections.defaultdict(list)
        Maps the sample folder name to the sorted list of its result paths.
    '''
    search_res = defaultdict(list)
    pattern = os.path.join(search_out_folder, '*', '*', 'tide-search.txt') #samples, partitions
    # glob returns matches in filesystem order; sort so the partition order
    # (and therefore the row order of the concatenated table) is reproducible.
    for path in sorted(glob.glob(pattern)):
        partition_dir = os.path.dirname(path)
        sample = os.path.basename(os.path.dirname(partition_dir))
        # Test only the part below the root: a root folder whose own path
        # happens to contain 'fA' or 'POOL' must not exclude every sample.
        below_root = os.path.join(sample, os.path.basename(partition_dir), os.path.basename(path))
        if 'fA' in below_root or 'POOL' in below_root: #calibration files exclude
            continue
        search_res[sample].append(path)
    return search_res

In [6]:
def search_result_rows(df_search):
    '''Index the search-result rows by peptide id.

    Every value of the 'protein id' column is split on ','; each entry is
    expected to look like '<prefix>-<integer>', optionally followed by the
    literal '(1)', which is stripped before the integer conversion.

    Returns
    -------
    collections.defaultdict(list)
        Maps the integer peptide id to the positional row numbers (0-based,
        in iteration order of the column) where it occurs.
    '''
    rows_by_peptide = defaultdict(list)
    for row_pos, protein_field in enumerate(df_search['protein id']):
        entries = protein_field.split(',')
        peptide_ids = [int(entry.split('-')[1].replace('(1)', '')) for entry in entries]
        for peptide_id in peptide_ids:
            rows_by_peptide[peptide_id].append(row_pos)
    return rows_by_peptide

In [8]:
def select_search_result(id_to_exp, id_to_SearchRow):
    '''Gather, per experiment, the search-result rows of its peptides.

    Parameters
    ----------
    id_to_exp : dict
        Maps peptide id -> list of experiment ids.
    id_to_SearchRow : dict or defaultdict
        Maps peptide id -> list of search-result row numbers. It is only
        read, never modified.

    Returns
    -------
    collections.defaultdict(set)
        Maps experiment id -> set of row numbers. Experiments none of whose
        peptides appear in the search result are absent from the mapping.
    '''
    select_rows = defaultdict(set)
    for pep_idx, exp_list in id_to_exp.items():
        # .get() rather than indexing: indexing a defaultdict would insert an
        # empty list for every peptide missing from the search result (growing
        # the caller's mapping), and a plain dict would raise KeyError.
        peptide_rows = id_to_SearchRow.get(pep_idx)
        if not peptide_rows:
            continue
        for experiment in exp_list:
            select_rows[experiment].update(peptide_rows)
    return select_rows


In [9]:
# Short sample name -> path of that sample's experiment-map file (OHSU and ETH lists).
exp_ohsu = reader_experiments(list_experiments_ohsu)
exp_eth = reader_experiments(list_experiments_eth)
# Sample folder name -> list of tide-search.txt paths found under search_out_folder.
search_res = reader_tide_results(search_out_folder)

In [None]:
# Process the samples whose search is complete, i.e. that have the expected
# number of partition result files. The results of the last processed sample
# stay in the notebook namespace for the exploration cells further down.
n_samples_process = 1 #TEMPORARY: maximum number of samples to process
n_partitions_expected = 24 # a sample is complete when it has this many partitions
for sample, partitions in search_res.items():
    if n_samples_process < 1:
        # Limit reached: stop here instead of silently processing (and
        # overwriting the results with) every remaining complete sample.
        break
    if len(partitions) != n_partitions_expected:
        print(f'skip {sample}')
        continue

    print(sample)
    n_samples_process -= 1
    print('...read search result')
    # NOTE: the concatenated frame keeps each partition's own row index; reset
    # the index before using the positional rows collected below with .loc.
    df_search = pd.concat([pd.read_csv(part, sep = '\t') for part in partitions])
    print('...extract rows IDS corresponding to peptides')
    id_to_SearchRow = search_result_rows(df_search)
    print('...process experiment map ohsu')
    id_to_pep_ohsu, id_to_exp_ohsu, exp_to_id_ohsu = experiments_maps(exp_ohsu[sample])
    print('...process experiment map eth')
    id_to_pep_eth, id_to_exp_eth, exp_to_id_eth = experiments_maps(exp_eth[sample])
    print('...select experiment rows ohsu')
    select_rows_ohsu = select_search_result(id_to_exp_ohsu, id_to_SearchRow)
    print('...select experiment rows eth')
    select_rows_eth = select_search_result(id_to_exp_eth, id_to_SearchRow)
    

TCGA-25-1319
...read search result
...extract rows IDS corresponding to peptides
...process experiment map ohsu
...process experiment map eth
...select experiment rows ohsu


In [None]:
# Give the concatenated search table a unique 0..n-1 index so that the
# positional row numbers collected by search_result_rows can be used with .loc.
# NOTE(review): without drop=True the previous index is kept as a column, so
# re-running this cell adds a further column each time -- run it once per load.
df_search = df_search.reset_index()

In [None]:
# TODO SAVE

In [None]:
# for exp, experiment_rows in select_rows_ohsu.items():
#     print(exp, df_search.loc[experiment_rows].shape)

## TEST


In [None]:
# Experiment id inspected by the cells below.
experiment = 'J0251XGC'

In [None]:
exp_to_id_ohsu[experiment] # peptide ids listed for this experiment in the OHSU experiment map

In [None]:
for idx in exp_to_id_ohsu[experiment]:# peptide sequences listed for this experiment in the OHSU experiment map
    print(id_to_pep_ohsu[idx])

In [None]:
# Shape of the search-result rows selected for this experiment.
# .loc rejects a set indexer in pandas >= 2.0, so pass a sorted list of rows.
df_search.loc[sorted(select_rows_ohsu[experiment])].shape

In [None]:
# 'protein id' values found in the search result for this experiment.
# .loc rejects a set indexer in pandas >= 2.0, so pass a sorted list of rows.
for i in df_search.loc[sorted(select_rows_ohsu[experiment])]['protein id'].unique():
    print(i)

In [None]:
# Unmodified peptide sequences found in the search result for this experiment.
# .loc rejects a set indexer in pandas >= 2.0, so pass a sorted list of rows.
df_search.loc[sorted(select_rows_ohsu[experiment])]['unmodified sequence'].unique()

In [None]:
# Distinct (sequence, protein id) pairs found in the search result for this experiment.
# .loc rejects a set indexer in pandas >= 2.0, so pass a sorted list of rows.
df_search.loc[sorted(select_rows_ohsu[experiment]), ['unmodified sequence', 'protein id']].drop_duplicates()

In [None]:
# some entries are missing! These are decoys

In [None]:
# Column names available in the concatenated search table.
df_search.columns