In [51]:
import pandas as pd
import glob
import os 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
%load_ext autoreload
%autoreload 2
#from ..helpers_barplot_intersection import reader_assign_conf_pep, plot_text, plot_intersection_bars

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
def reader_assign_conf_pep(path, FDR_threshold, col_seq, col_qval, input_trypPep):
    print(f'Reading {path}')
    if os.path.isfile(path):
        # Read
        df = pd.read_csv(path, sep = '\t')
        df_all_pep = pd.read_csv(input_trypPep, sep = '\t')
        
        # Total
        tot_peptides = len(df_all_pep['sequence'].unique())
        
        print(f'Shape of FDR file: {df.shape[0]}')
        print(f'Total input tryptic junction peptides: {tot_peptides}')
        assert('sequence' in df.columns)
        
        # Validated peptides
        df_filtered = df.loc[df[col_qval] < FDR_threshold]
        peptides = set(df_filtered[col_seq])
        val = len(peptides)

        # Validation rate
        if tot_peptides:
            val_rate = np.round(val / tot_peptides * 100 , 2)
        else:
            val_rate = 0.0
        
        print(f'Number of validated unique peptides: {val}')
        print(f'Validation Rate: {val_rate } percent')
        
        return val, val_rate, peptides, df_filtered, tot_peptides
    else:
        return 0, 0.0, set(), None, None

In [54]:
# INPUTS 
read_from_disk = True # or reload
proteomicsdir = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024'
#proteomicsdir = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics'

pipelines = ['OHSU', 'ETH']

FDR_limit = 0.05
MS_FDR = '_crux' #choices: '_crema' '_crux' or ''
MS_strategy = 'pool' #Choices: 'pool' 'joint', 'single' 

#pool: pool all experiments per pipeline
#joint: pool all experiments for both pipeline
#single: per experiment per pipeline

if MS_FDR == '_crema':
    FDR_file = 'crema.peptides.txt'
    col_seq = 'sequence'
    col_qvalue = 'crema q-value'
elif MS_FDR == '_crux' or  MS_FDR == '':
    FDR_file = 'assign-confidence.target.txt'
    col_seq = 'unmodified sequence'
    col_qvalue = 'tdc q-value'
else:
    print(f'ERROR: wrong input for {FDR_file}')

    

sample_plot_dir = {}
run_type_plot_dir = {}    
all_samples = []

# BRCA
samples = ['TCGA-C8-A12P-01A-11R-A115-07',
                  'TCGA-AO-A0JM-01A-21R-A056-07',
                  'TCGA-BH-A18V-01A-11R-A12D-07',
                  'TCGA-A2-A0D2-01A-21R-A034-07',
                  'TCGA-A2-A0SX-01A-12R-A084-07']
all_samples.extend(samples)
basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
filter_dir = os.path.join(basedir, 'filtering_samples', 'filters_19May_order_5ge_wAnnot_GPstar')

for sample in samples:
    sample_plot_dir[sample] = os.path.join(filter_dir, 'plots')
    run_type_plot_dir[sample] = 'BRCA'
    
# OV
samples = ['TCGA-25-1319-01A-01R-1565-13',
                  'TCGA-25-1313-01A-01R-1565-13',
                  'TCGA-61-2008-01A-02R-1568-13',
                  'TCGA-24-1431-01A-01R-1566-13',
                  'TCGA-24-2298-01A-01R-1569-13']
all_samples.extend(samples)
basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
filter_dir = os.path.join(basedir, 'filtering_samples', 'filters_19May_order_5ge_wAnnot_GPstar')

for sample in samples:
    sample_plot_dir[sample] = os.path.join(filter_dir, 'plots')
    run_type_plot_dir[sample] = 'OV'
    
    
    
# PLOTTING Parameters 
ticks_fontsize = 12
axislabels_fontsize = 12
legend_fontsize = 12
axes_fontsize = 15
save = True
sample = None

# Parse: results for all experiments

In [60]:
## Get peptides
if read_from_disk:
    samples_store_pep = {}
    samples_store_rates = {}
    samples_store_total = {}
#     experiment_pipelines_peptides =  
#     experiments_pipelines_rates = defaultdict(dict)


    for sample in all_samples:

        sample_short = '-'.join(sample.split('-')[0:3])
        samples_store_pep[sample] = defaultdict(dict)
        samples_store_rates[sample] = defaultdict(dict)
        samples_store_total[sample] = defaultdict(dict)
        for pipeline in pipelines:
            path_single = os.path.join(proteomicsdir, pipeline, sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')
            path_pool_pipeline = os.path.join(proteomicsdir, pipeline, sample_short, 
                                              f'assign_conf_pooled_FDR{MS_FDR}')
            path_pool_union = os.path.join(proteomicsdir, 
                                           f'assign_conf_joint_to_{pipeline}{MS_FDR}', sample_short)
            path_TEST_OHSU = os.path.join(proteomicsdir, 'OHSU', sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')
            path_TEST_ETH = os.path.join(proteomicsdir, 'ETH', sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')
            base_input_trypPep = os.path.join(proteomicsdir, pipeline, sample_short, 
                                             'trypsine_digest_per_experiment')


            experiment_list = [ i.split('/')[-1] for i in glob.glob(path_single + '/*')] #check

            for experiment in experiment_list:
                if pipeline == 'OHSU':
                    original_name = experiment
                    cut_name = experiment[1:]
                else:
                    original_name = experiment
                    cut_name = experiment          

                if os.path.isfile(os.path.join(path_TEST_OHSU, 'J' + cut_name, FDR_file)) and \
                     os.path.isfile(os.path.join(path_TEST_ETH, cut_name, FDR_file)): #Commun experiments               

                    # search 1 experiment, 1 pipeline  
                    if MS_strategy == 'single':
                        df = os.path.join(path_single, original_name, FDR_file)
                    # search all experiments, 1 pipeline
                    if MS_strategy == 'pool':
                        df = os.path.join(path_pool_pipeline, f'tsearch-{original_name}.txt')
                    # search all experiments, 1 union of pipelines
                    if MS_strategy == 'joint':
                        df = os.path.join(path_pool_union, f'tsearch-{original_name}.txt')
                    
                    input_trypPep = os.path.join(base_input_trypPep, f'tsearch-{original_name}.txt')
                    
                    val, val_rate, peptides, _, tot = reader_assign_conf_pep(df, FDR_limit, col_seq, col_qvalue, input_trypPep)
                    samples_store_pep[sample][cut_name][pipeline] = peptides
                    samples_store_rates[sample][cut_name][pipeline] = val_rate
                    samples_store_total[sample][cut_name][pipeline] = tot

                    print('\n')


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A53XGA.txt
Shape of FDR file: 1469
Total input tryptic junction peptides: 33
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A13AGA.txt
Shape of FDR file: 9545
Total input tryptic junction peptides: 273
Number of validated unique peptides: 4
Validation Rate: 1.47 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A51XGA.txt
Shape of FDR file: 389
Total input tryptic junction peptides: 12
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-

Shape of FDR file: 42540
Total input tryptic junction peptides: 1220
Number of validated unique peptides: 9
Validation Rate: 0.74 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J02132GA.txt
Shape of FDR file: 1021
Total input tryptic junction peptides: 28
Number of validated unique peptides: 1
Validation Rate: 3.57 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0211XGA.txt
Shape of FDR file: 712
Total input tryptic junction peptides: 17
Number of validated unique peptides: 1
Validation Rate: 5.88 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0AN1XGA.txt
Shape of FDR file: 28548
Total input tryptic junction peptides: 817
Number of validated unique peptides: 6
Validation Rate: 0.73 perce

Shape of FDR file: 626
Total input tryptic junction peptides: 10
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-0A53XGA.txt
Shape of FDR file: 825
Total input tryptic junction peptides: 12
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-0251AGA.txt
Shape of FDR file: 20
Total input tryptic junction peptides: 2
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-0A51AGA.txt
Shape of FDR file: 116
Total input tryptic junction peptides: 4
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster

Shape of FDR file: 17581
Total input tryptic junction peptides: 422
Number of validated unique peptides: 6
Validation Rate: 1.42 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-0211AGA.txt
Shape of FDR file: 1
Total input tryptic junction peptides: 1
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-0211XGA.txt
Shape of FDR file: 1
Total input tryptic junction peptides: 1
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-02532GA.txt
Shape of FDR file: 10
Total input tryptic junction peptides: 2
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/

Shape of FDR file: 12154
Total input tryptic junction peptides: 460
Number of validated unique peptides: 9
Validation Rate: 1.96 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0A532GA.txt
Shape of FDR file: 380
Total input tryptic junction peptides: 19
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0A11XGA.txt
Shape of FDR file: 1517
Total input tryptic junction peptides: 58
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0253XGA.txt
Shape of FDR file: 162
Total input tryptic junction peptides: 9
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Rea

Shape of FDR file: 344
Total input tryptic junction peptides: 5
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-0AN01GA.txt
Shape of FDR file: 925
Total input tryptic junction peptides: 14
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-0AN1XGA.txt
Shape of FDR file: 925
Total input tryptic junction peptides: 14
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-0AN32GA.txt
Shape of FDR file: 1521
Total input tryptic junction peptides: 27
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /clus

Shape of FDR file: 28401
Total input tryptic junction peptides: 855
Number of validated unique peptides: 16
Validation Rate: 1.87 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0AN12GA.txt
Shape of FDR file: 14947
Total input tryptic junction peptides: 457
Number of validated unique peptides: 5
Validation Rate: 1.09 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0A11AGA.txt
Shape of FDR file: 1333
Total input tryptic junction peptides: 48
Number of validated unique peptides: 1
Validation Rate: 2.08 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0A101GA.txt
Shape of FDR file: 1229
Total input tryptic junction peptides: 46
Number of validated unique peptides: 1
Validation Rate: 2.17 perc

Shape of FDR file: 194
Total input tryptic junction peptides: 6
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-0AN3AGA.txt
Shape of FDR file: 2999
Total input tryptic junction peptides: 48
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-0AN3XGA.txt
Shape of FDR file: 2026
Total input tryptic junction peptides: 34
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-0A532GA.txt
Shape of FDR file: 26
Total input tryptic junction peptides: 2
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /clust

Shape of FDR file: 2221
Total input tryptic junction peptides: 81
Number of validated unique peptides: 2
Validation Rate: 2.47 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J0A53AGA.txt
Shape of FDR file: 1452
Total input tryptic junction peptides: 65
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J02101GA.txt
Shape of FDR file: 322
Total input tryptic junction peptides: 10
Number of validated unique peptides: 1
Validation Rate: 10.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J02112GA.txt
Shape of FDR file: 322
Total input tryptic junction peptides: 10
Number of validated unique peptides: 1
Validation Rate: 10.0 percent


Re

Shape of FDR file: 2313
Total input tryptic junction peptides: 52
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-0A51XGA.txt


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-02501GA.txt


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-0AN3XGA.txt
Shape of FDR file: 1799
Total input tryptic junction peptides: 39
Number of validated unique peptides: 0
Validation Rate: 0.0 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-0A532GA.txt
Shape of FDR file: 138
Total input tryptic junction peptides: 3
Number of validated unique peptides: 0
Validation Rate: 0.0 perc

Shape of FDR file: 103901
Total input tryptic junction peptides: 44923
Number of validated unique peptides: 552
Validation Rate: 1.23 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-J02532GA.txt
Shape of FDR file: 9106
Total input tryptic junction peptides: 3805
Number of validated unique peptides: 101
Validation Rate: 2.65 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-J0A13XGA.txt
Shape of FDR file: 74824
Total input tryptic junction peptides: 31893
Number of validated unique peptides: 525
Validation Rate: 1.65 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-J0253AGA.txt
Shape of FDR file: 12247
Total input tryptic junction peptides: 5046
Number of validated unique peptides: 132
Validatio

Shape of FDR file: 71386
Total input tryptic junction peptides: 7091
Number of validated unique peptides: 109
Validation Rate: 1.54 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-02132GA.txt
Shape of FDR file: 75543
Total input tryptic junction peptides: 7439
Number of validated unique peptides: 116
Validation Rate: 1.56 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-0251XGA.txt
Shape of FDR file: 36140
Total input tryptic junction peptides: 3531
Number of validated unique peptides: 56
Validation Rate: 1.59 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1319/assign_conf_pooled_FDR_crux/tsearch-0253AGA.txt
Shape of FDR file: 64353
Total input tryptic junction peptides: 6154
Number of validated unique peptides: 98
Validation Rate: 1.

Shape of FDR file: 83999
Total input tryptic junction peptides: 20931
Number of validated unique peptides: 366
Validation Rate: 1.75 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-J0A51AGA.txt
Shape of FDR file: 22263
Total input tryptic junction peptides: 5382
Number of validated unique peptides: 126
Validation Rate: 2.34 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-J0AN12GA.txt
Shape of FDR file: 68566
Total input tryptic junction peptides: 17387
Number of validated unique peptides: 283
Validation Rate: 1.63 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-J0A11AGA.txt
Shape of FDR file: 41982
Total input tryptic junction peptides: 10440
Number of validated unique peptides: 213
Validati

Shape of FDR file: 50177
Total input tryptic junction peptides: 3803
Number of validated unique peptides: 60
Validation Rate: 1.58 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-0A512GA.txt
Shape of FDR file: 44772
Total input tryptic junction peptides: 3340
Number of validated unique peptides: 52
Validation Rate: 1.56 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-0AN1AGA.txt
Shape of FDR file: 56541
Total input tryptic junction peptides: 4299
Number of validated unique peptides: 66
Validation Rate: 1.54 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-0A13AGA.txt
Shape of FDR file: 82564
Total input tryptic junction peptides: 6251
Number of validated unique peptides: 92
Validation Rate: 1.47

Shape of FDR file: 44803
Total input tryptic junction peptides: 3345
Number of validated unique peptides: 52
Validation Rate: 1.55 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-02512GA.txt
Shape of FDR file: 26623
Total input tryptic junction peptides: 1944
Number of validated unique peptides: 23
Validation Rate: 1.18 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-25-1313/assign_conf_pooled_FDR_crux/tsearch-02112GA.txt
Shape of FDR file: 43411
Total input tryptic junction peptides: 3282
Number of validated unique peptides: 53
Validation Rate: 1.61 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-J0A53XGA.txt
Shape of FDR file: 46999
Total input tryptic junction peptides: 21345
Number of validated unique peptides: 545
Validation Rate: 

Shape of FDR file: 7799
Total input tryptic junction peptides: 3636
Number of validated unique peptides: 86
Validation Rate: 2.37 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-J0A132GA.txt
Shape of FDR file: 70886
Total input tryptic junction peptides: 34513
Number of validated unique peptides: 715
Validation Rate: 2.07 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-J0A53AGA.txt
Shape of FDR file: 48268
Total input tryptic junction peptides: 21810
Number of validated unique peptides: 564
Validation Rate: 2.59 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-J02101GA.txt
Shape of FDR file: 19550
Total input tryptic junction peptides: 9339
Number of validated unique peptides: 216
Validation 

Shape of FDR file: 91195
Total input tryptic junction peptides: 11111
Number of validated unique peptides: 193
Validation Rate: 1.74 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-0213AGA.txt
Shape of FDR file: 105096
Total input tryptic junction peptides: 12309
Number of validated unique peptides: 214
Validation Rate: 1.74 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-0A13XGA.txt
Shape of FDR file: 115936
Total input tryptic junction peptides: 13754
Number of validated unique peptides: 241
Validation Rate: 1.75 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-61-2008/assign_conf_pooled_FDR_crux/tsearch-0A53AGA.txt
Shape of FDR file: 108115
Total input tryptic junction peptides: 12715
Number of validated unique peptides: 231
Validation

Shape of FDR file: 47732
Total input tryptic junction peptides: 12048
Number of validated unique peptides: 261
Validation Rate: 2.17 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-J0A501GA.txt
Shape of FDR file: 25081
Total input tryptic junction peptides: 6220
Number of validated unique peptides: 170
Validation Rate: 2.73 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-J0211AGA.txt
Shape of FDR file: 18144
Total input tryptic junction peptides: 4458
Number of validated unique peptides: 103
Validation Rate: 2.31 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-J02512GA.txt
Shape of FDR file: 7808
Total input tryptic junction peptides: 1853
Number of validated unique peptides: 56
Validation R

Shape of FDR file: 79315
Total input tryptic junction peptides: 5883
Number of validated unique peptides: 89
Validation Rate: 1.51 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-0A501GA.txt
Shape of FDR file: 50267
Total input tryptic junction peptides: 3704
Number of validated unique peptides: 59
Validation Rate: 1.59 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-0A11AGA.txt
Shape of FDR file: 57879
Total input tryptic junction peptides: 4354
Number of validated unique peptides: 67
Validation Rate: 1.54 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-1431/assign_conf_pooled_FDR_crux/tsearch-0AN01GA.txt
Shape of FDR file: 56557
Total input tryptic junction peptides: 4250
Number of validated unique peptides: 64
Validation Rate: 1.51

Shape of FDR file: 60479
Total input tryptic junction peptides: 39155
Number of validated unique peptides: 723
Validation Rate: 1.85 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J0A51XGA.txt
Shape of FDR file: 23140
Total input tryptic junction peptides: 14710
Number of validated unique peptides: 315
Validation Rate: 2.14 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
Shape of FDR file: 22379
Total input tryptic junction peptides: 13849
Number of validated unique peptides: 284
Validation Rate: 2.05 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J0AN01GA.txt
Shape of FDR file: 76538
Total input tryptic junction peptides: 51644
Number of validated unique peptides: 753
Validat

Shape of FDR file: 92682
Total input tryptic junction peptides: 61816
Number of validated unique peptides: 952
Validation Rate: 1.54 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J02132GA.txt
Shape of FDR file: 17850
Total input tryptic junction peptides: 11254
Number of validated unique peptides: 216
Validation Rate: 1.92 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J0211XGA.txt
Shape of FDR file: 15255
Total input tryptic junction peptides: 9650
Number of validated unique peptides: 183
Validation Rate: 1.9 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-J0AN1XGA.txt
Shape of FDR file: 77811
Total input tryptic junction peptides: 52435
Number of validated unique peptides: 771
Validatio

Shape of FDR file: 29393
Total input tryptic junction peptides: 4767
Number of validated unique peptides: 54
Validation Rate: 1.13 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-0AN3XGA.txt
Shape of FDR file: 96367
Total input tryptic junction peptides: 15528
Number of validated unique peptides: 204
Validation Rate: 1.31 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-0A532GA.txt
Shape of FDR file: 65033
Total input tryptic junction peptides: 10396
Number of validated unique peptides: 140
Validation Rate: 1.35 percent


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-24-2298/assign_conf_pooled_FDR_crux/tsearch-0A112GA.txt
Shape of FDR file: 62668
Total input tryptic junction peptides: 10006
Number of validated unique peptides: 135
Validation Rate

In [61]:
## Get rates
if read_from_disk:
    compare = {'sample' : [], 
              'filter_' : [], 
              'pipeline': [], 
               'validation_rate':[]}
    
    for sample, experiments_ in samples_store_rates.items():
        for experiment, pipelines_ in experiments_.items():
            if ('OHSU' in pipelines_.keys()) and ('ETH' in pipelines_.keys()):
                for pipeline, rate in pipelines_.items():
                    compare['sample'].append(sample)
                    compare['filter_'].append(experiment)
                    compare['pipeline'].append( pipeline)
                    compare['validation_rate'].append(rate)
                

In [62]:
df1 = pd.DataFrame(compare)

In [63]:
## Compare peptides
if read_from_disk:
    compare = {'sample' : [], 
              'filter_' : [], 
              'pep_size_ohsu' : [], 
              'pep_size_eth' : [], 
              'pep_size_intersection' : [], 
              'pep_size_ohsu\eth' : [], 
              'pep_size_eth\ohsu' : []}

    for sample, experiments_ in samples_store_pep.items():
        for experiment, pipelines_ in experiments_.items():
            if ('OHSU' in pipelines_.keys()) and ('ETH' in pipelines_.keys()):
                compare['sample'].append(sample)
                compare['filter_'].append(experiment)
                compare['pep_size_ohsu'].append(len(pipelines_['OHSU']))
                compare['pep_size_eth'].append(len(pipelines_['ETH']))
                compare['pep_size_ohsu\eth'].append(len(pipelines_['OHSU'].difference(pipelines_['ETH'])))
                compare['pep_size_eth\ohsu'].append(len(pipelines_['ETH'].difference(pipelines_['OHSU'])))
                compare['pep_size_intersection'].append(len(pipelines_['ETH'].intersection(pipelines_['OHSU'])))

In [64]:
df2 = pd.DataFrame(compare)

In [65]:
## Get rates
if read_from_disk:
    compare = {'sample' : [], 
              'filter_' : [], 
              'pipeline': [], 
               'total':[]}
    
    for sample, experiments_ in samples_store_total.items():
        for experiment, pipelines_ in experiments_.items():
            if ('OHSU' in pipelines_.keys()) and ('ETH' in pipelines_.keys()):
                for pipeline, tot in pipelines_.items():
                    compare['sample'].append(sample)
                    compare['filter_'].append(experiment)
                    compare['pipeline'].append( pipeline)
                    compare['total'].append(tot)

In [66]:
df3 = pd.DataFrame(compare)

# Debug 

In [67]:
criteria = '02501GA'
print(MS_strategy)

display(df1.loc[df1['filter_'] == criteria])
display(df2.loc[df2['filter_'] == criteria])
display(df3.loc[df3['filter_'] == criteria])


pool


Unnamed: 0,sample,filter_,pipeline,validation_rate
50,TCGA-C8-A12P-01A-11R-A115-07,02501GA,OHSU,0.0
51,TCGA-C8-A12P-01A-11R-A115-07,02501GA,ETH,0.0
290,TCGA-A2-A0SX-01A-12R-A084-07,02501GA,OHSU,0.0
291,TCGA-A2-A0SX-01A-12R-A084-07,02501GA,ETH,0.0
360,TCGA-25-1319-01A-01R-1565-13,02501GA,OHSU,2.78
361,TCGA-25-1319-01A-01R-1565-13,02501GA,ETH,1.63
430,TCGA-25-1313-01A-01R-1565-13,02501GA,OHSU,2.29
431,TCGA-25-1313-01A-01R-1565-13,02501GA,ETH,1.13
500,TCGA-61-2008-01A-02R-1568-13,02501GA,OHSU,2.37
501,TCGA-61-2008-01A-02R-1568-13,02501GA,ETH,1.7


Unnamed: 0,sample,filter_,pep_size_ohsu,pep_size_eth,pep_size_intersection,pep_size_ohsu\eth,pep_size_eth\ohsu
25,TCGA-C8-A12P-01A-11R-A115-07,02501GA,0,0,0,0,0
145,TCGA-A2-A0SX-01A-12R-A084-07,02501GA,0,0,0,0,0
180,TCGA-25-1319-01A-01R-1565-13,02501GA,84,55,27,57,28
215,TCGA-25-1313-01A-01R-1565-13,02501GA,33,21,8,25,13
250,TCGA-61-2008-01A-02R-1568-13,02501GA,86,69,30,56,39
285,TCGA-24-1431-01A-01R-1566-13,02501GA,55,33,15,40,18
320,TCGA-24-2298-01A-01R-1569-13,02501GA,76,54,31,45,23


Unnamed: 0,sample,filter_,pipeline,total
50,TCGA-C8-A12P-01A-11R-A115-07,02501GA,OHSU,2.0
51,TCGA-C8-A12P-01A-11R-A115-07,02501GA,ETH,2.0
290,TCGA-A2-A0SX-01A-12R-A084-07,02501GA,OHSU,1.0
291,TCGA-A2-A0SX-01A-12R-A084-07,02501GA,ETH,
360,TCGA-25-1319-01A-01R-1565-13,02501GA,OHSU,3021.0
361,TCGA-25-1319-01A-01R-1565-13,02501GA,ETH,3374.0
430,TCGA-25-1313-01A-01R-1565-13,02501GA,OHSU,1444.0
431,TCGA-25-1313-01A-01R-1565-13,02501GA,ETH,1855.0
500,TCGA-61-2008-01A-02R-1568-13,02501GA,OHSU,3636.0
501,TCGA-61-2008-01A-02R-1568-13,02501GA,ETH,4058.0


In [68]:
78/1309

0.059587471352177235

In [69]:
50/1516

0.032981530343007916

In [None]:
WHY IS THE JOINT TOTAL DIFFERENT FOR ETH AND OHSU.. HA script to pick up 

# Plot: Results per experiment

In [10]:
# Choose plotting sample
sample = 'TCGA-25-1319-01A-01R-1565-13'
#sample =  'TCGA-AO-A0JM-01A-21R-A056-07'

In [11]:
if read_from_disk:
    df = pd.DataFrame(compare)
    display(df.head())
    print(df.shape)
else:
    df = None

Unnamed: 0,sample,filter_,pep_size_ohsu,pep_size_eth,pep_size_intersection,pep_size_ohsu\eth,pep_size_eth\ohsu
0,TCGA-C8-A12P-01A-11R-A115-07,0A53XGA,0,0,0,0,0
1,TCGA-C8-A12P-01A-11R-A115-07,0A13AGA,4,1,0,4,1
2,TCGA-C8-A12P-01A-11R-A115-07,0A51XGA,0,0,0,0,0
3,TCGA-C8-A12P-01A-11R-A115-07,0213XGA,1,1,0,1,1
4,TCGA-C8-A12P-01A-11R-A115-07,0AN01GA,6,2,2,4,0


(330, 7)


In [12]:
def run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save):
    order_file = os.path.join(sample_plot_dir[sample], f'{sample}_Barplot_sorting.tsv.gz')
    order_file = pd.read_csv(order_file, sep = '\t')
    display(order_file.head())
    print(order_file.shape)
    df_plot = order_file.merge(df, on = ['sample', 'filter_'], how = 'inner')
    df_plot = df_plot.fillna(0)
    print(df_plot.shape)
    df_plot = df_plot.sort_values('index')
    display(df_plot.head())
    # Plotting
    run_type = run_type_plot_dir[sample]

    back_ticks = df_plot['filter_background'] 
    front_ticks = df_plot['filter_foreground']

    serie_index = df_plot['index']
    serie_intersection = df_plot['pep_size_intersection']
    serie_eth = df_plot['pep_size_eth']
    print(serie_eth)
    serie_ohsu = df_plot['pep_size_ohsu']
    y_label = 'Number of MS-detected tryptic junction-peptides'

    name_plot = f'Barplot_ovelap_protein{MS_FDR}_{MS_strategy}'
    base_plot = sample
    plot_dir = sample_plot_dir[sample]

    plot_intersection_bars(back_ticks, front_ticks, ticks_fontsize, axislabels_fontsize, 
                              legend_fontsize, axes_fontsize, run_type, 
                               serie_index, serie_intersection, serie_eth, serie_ohsu,
                               y_label, save, plot_dir, base_plot, name_plot)
    return df_plot

In [13]:
# TMP CODE single sample
path_data = os.path.join(sample_plot_dir[sample], f'data{MS_FDR}_{MS_strategy}.tsv.gz')
print(path_data)
if df is not None:
    df.to_csv(path_data, sep = '\t', index = None)
else:
    df = pd.read_csv(path_data, sep = '\t')
df_plot = run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save)

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar/plots/data_crema_joint.tsv.gz


Unnamed: 0.1,Unnamed: 0,sample,filter_,size_ohsu,size_eth,size_intersection,size_ohsu\eth,size_eth\ohsu,filter_foreground_target,filter_foreground_reads,filter_foreground_samples,filter_background_reads,filter_background_samples,cohort,motif,index,filter_background,filter_foreground
0,7,TCGA-25-1319-01A-01R-1565-13,02501GA,23972,20052,8874,15098,11178,0,2,5,0,1,G,A,0,"(0, 1)","(0, 2, 5)"
1,30,TCGA-25-1319-01A-01R-1565-13,02512GA,24766,20638,9166,15600,11472,0,2,5,1,2,G,A,1,"(1, 2)","(0, 2, 5)"
2,14,TCGA-25-1319-01A-01R-1565-13,0251AGA,24913,20683,9193,15720,11490,0,2,5,1,A,G,A,2,"(1, A)","(0, 2, 5)"
3,8,TCGA-25-1319-01A-01R-1565-13,0251XGA,24913,20683,9193,15720,11490,0,2,5,1,X,G,A,3,"(1, X)","(0, 2, 5)"
4,15,TCGA-25-1319-01A-01R-1565-13,02532GA,29573,24940,11086,18487,13854,0,2,5,3,2,G,A,4,"(3, 2)","(0, 2, 5)"


(35, 18)
(35, 23)


Unnamed: 0.1,Unnamed: 0,sample,filter_,size_ohsu,size_eth,size_intersection,size_ohsu\eth,size_eth\ohsu,filter_foreground_target,filter_foreground_reads,...,cohort,motif,index,filter_background,filter_foreground,pep_size_ohsu,pep_size_eth,pep_size_intersection,pep_size_ohsu\eth,pep_size_eth\ohsu
0,7,TCGA-25-1319-01A-01R-1565-13,02501GA,23972,20052,8874,15098,11178,0,2,...,G,A,0,"(0, 1)","(0, 2, 5)",78,50,27,51,23
1,30,TCGA-25-1319-01A-01R-1565-13,02512GA,24766,20638,9166,15600,11472,0,2,...,G,A,1,"(1, 2)","(0, 2, 5)",81,50,27,54,23
2,14,TCGA-25-1319-01A-01R-1565-13,0251AGA,24913,20683,9193,15720,11490,0,2,...,G,A,2,"(1, A)","(0, 2, 5)",81,50,27,54,23
3,8,TCGA-25-1319-01A-01R-1565-13,0251XGA,24913,20683,9193,15720,11490,0,2,...,G,A,3,"(1, X)","(0, 2, 5)",81,50,27,54,23
4,15,TCGA-25-1319-01A-01R-1565-13,02532GA,29573,24940,11086,18487,13854,0,2,...,G,A,4,"(3, 2)","(0, 2, 5)",94,62,34,60,28


0      50
1      50
2      50
3      50
4      62
5      81
6      84
7      82
8      82
9      82
10     82
11    100
12     88
13     88
14     88
15     88
16    129
17    132
18    106
19     98
20     98
21     98
22     98
23     99
24     99
25     99
26     99
27    137
28    141
29    116
30    117
31    148
32    152
33    150
34    154
Name: pep_size_eth, dtype: int64


NameError: name 'plot_intersection_bars' is not defined

In [None]:
# for sample in all_samples:
#     path_data = os.path.join(sample_plot_dir[sample], f'data{MS_FDR}_{MS_strategy}.tsv.gz')
#     print(path_data)
#     if df is not None:
#         df.to_csv(path_data, sep = '\t', index = None)
#     else:
#         df = pd.read_csv(path_data, sep = '\t')
#     run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save)