In [4]:
import pandas as pd
import glob
import os 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from Bio import SeqIO
import tarfile
import gzip
%load_ext autoreload
%autoreload 2
from helpers_barplot_intersection import plot_text, plot_intersection_bars
from helpers_barplot_intersection_kmers import explode_immunopepper_coord, search_result_peptides_ids 
from helpers_barplot_intersection_kmers import get_pep_ids, get_pep_coord, tar_reader
from helpers_barplot_intersection_kmers import validated_filtered_kmers, reader_assign_conf_pep
from helpers_barplot_intersection_kmers import compare_OHSU_ETH, kmer_in_bi_exon_peptide


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# INPUTS 
read_from_disk = True # or reload
proteomicsdir = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024'
#proteomicsdir = '/cluster/work/grlab/projects/projects2020_OHSU/proteomics'

pipelines = ['OHSU', 'ETH']

FDR_limit = 0.05
MS_FDR = '_crux' #choices: '_crema' '_crux' or ''
MS_strategy = 'pool' #Choices: 'pool' 'joint', 'single' 

#pool: pool all experiments per pipeline
#joint: pool all experiments for both pipeline
#single: per experiment per pipeline

if MS_FDR == '_crema':
    FDR_file = 'crema.peptides.txt'
    col_seq = 'sequence'
    col_qvalue = 'crema q-value'
elif MS_FDR == '_crux' or  MS_FDR == '':
    FDR_file = 'assign-confidence.target.txt'
    col_seq = 'unmodified sequence'
    col_qvalue = 'tdc q-value'
else:
    print(f'ERROR: wrong input for {FDR_file}')

    

sample_plot_dir = {}
run_type_plot_dir = {}    
all_samples = []

# BRCA
samples = ['TCGA-C8-A12P-01A-11R-A115-07',
                  'TCGA-AO-A0JM-01A-21R-A056-07',
                  'TCGA-BH-A18V-01A-11R-A12D-07',
                  'TCGA-A2-A0D2-01A-21R-A034-07',
                  'TCGA-A2-A0SX-01A-12R-A084-07']
all_samples.extend(samples)
basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
filter_dir = os.path.join(basedir, 'filtering_samples', 'filters_19May_order_5ge_wAnnot_GPstar')

for sample in samples:
    sample_plot_dir[sample] = os.path.join(filter_dir, 'plots')
    run_type_plot_dir[sample] = 'BRCA'
    
# OV
samples = ['TCGA-25-1319-01A-01R-1565-13',
                  'TCGA-25-1313-01A-01R-1565-13',
                  'TCGA-61-2008-01A-02R-1568-13',
                  'TCGA-24-1431-01A-01R-1566-13',
                  'TCGA-24-2298-01A-01R-1569-13']
all_samples.extend(samples)
basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
filter_dir = os.path.join(basedir, 'filtering_samples', 'filters_19May_order_5ge_wAnnot_GPstar')

for sample in samples:
    sample_plot_dir[sample] = os.path.join(filter_dir, 'plots')
    run_type_plot_dir[sample] = 'OV'
    
fasta_base_OHSU = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/current'
fasta_base_ETH = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/*/filtering_samples/filters_19May_order_5ge_wAnnot_GPstar'
        #format fasta_file = f'{fasta_base_OHSU}/J_{sample}_pool_kmer.fa'
        #format fasta_file = f'{fasta_base_ETH}/G_{sample}_pool_kmer_25012024.fa.gz'
kmer_files_OHSU = '/cluster/work/grlab/projects/projects2020_OHSU/share_OHUS_PNLL/archive/June28_renamed_kmerfiles_OHSU.tar.gz'
  
# PLOTTING Parameters 
ticks_fontsize = 12
axislabels_fontsize = 12
legend_fontsize = 12
axes_fontsize = 15
save = False
sample = None

# Parse: results for all experiments

In [5]:
## Get kmers from result files
if read_from_disk:
    samples_store_kmers = {}

    for sample in all_samples:

        sample_short = '-'.join(sample.split('-')[0:3])
        samples_store_kmers[sample] = defaultdict(dict)
        for pipeline in pipelines:
            path_single = os.path.join(proteomicsdir, pipeline, sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')
            path_pool_pipeline = os.path.join(proteomicsdir, pipeline, sample_short, 
                                              f'assign_conf_pooled_FDR{MS_FDR}')
            path_pool_union = os.path.join(proteomicsdir, 
                                           f'assign_conf_joint_to_{pipeline}{MS_FDR}', sample_short)
            path_TEST_OHSU = os.path.join(proteomicsdir, 'OHSU', sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')
            path_TEST_ETH = os.path.join(proteomicsdir, 'ETH', sample_short, 
                                       f'assign_conf_per_experiment{MS_FDR}')


            experiment_list = [ i.split('/')[-1] for i in glob.glob(path_single + '/*')] #check

            for experiment in experiment_list:
                if pipeline == 'OHSU':
                    original_name = experiment
                    cut_name = experiment[1:]
                else:
                    original_name = experiment
                    cut_name = experiment          

                if os.path.isfile(os.path.join(path_TEST_OHSU, 'J' + cut_name, FDR_file)) and \
                     os.path.isfile(os.path.join(path_TEST_ETH, cut_name, FDR_file)): #Commun experiments               

                    # search 1 experiment, 1 pipeline  
                    if MS_strategy == 'single':
                        df = os.path.join(path_single, original_name, FDR_file)
                    # search all experiments, 1 pipeline
                    if MS_strategy == 'pool':
                        df = os.path.join(path_pool_pipeline, f'tsearch-{original_name}.txt')
                    # search all experiments, 1 union of pipelines
                    if MS_strategy == 'joint':
                        df = os.path.join(path_pool_union, f'tsearch-{original_name}.txt')

                    df_filtered = reader_assign_conf_pep(df, FDR_limit, col_seq, col_qvalue)
                    if df_filtered.shape[0]:
                        df_filtered = validated_filtered_kmers(df_filtered, fasta_base_OHSU, kmer_files_OHSU,
                                                               fasta_base_ETH, sample, experiment, 
                                                               pipeline)
                        samples_store_kmers[sample][cut_name][pipeline] = set(df_filtered['kmer'])
                    else:
                        samples_store_kmers[sample][cut_name][pipeline] = set()
                    print(f'{len( samples_store_kmers[sample][cut_name][pipeline])} validated kmers')

                    print('\n')


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A53XGA.txt
With Shape: 1469
With unique peptides: 31
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A13AGA.txt
With Shape: 9545
With unique peptides: 258
Number of validated psm: (12, 18)
26 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0A51XGA.txt
With Shape: 389
With unique peptides: 11
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
With Shape: 1511
With unique peptides: 38
Number of validated psm: (9, 18)
8 validated kmers


Reading /clust

8 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-J0AN1XGA.txt
With Shape: 28548
With unique peptides: 757
Number of validated psm: (16, 18)
43 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-0211AGA.txt
With Shape: 626
With unique peptides: 9
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-0211XGA.txt
With Shape: 626
With unique peptides: 9
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/ETH/TCGA-C8-A12P/assign_conf_pooled_FDR_crux/tsearch-02532GA.txt
With Shape: 362
With unique peptides: 6
Number of validated psm: (0, 18)
0 validated kmers


Read

19 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
With Shape: 1163
With unique peptides: 29
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-J0AN01GA.txt
With Shape: 17357
With unique peptides: 392
Number of validated psm: (144, 18)
44 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-J0A532GA.txt
With Shape: 108
With unique peptides: 4
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-AO-A0JM/assign_conf_pooled_FDR_crux/tsearch-J0A11XGA.txt
With Shape: 1194
With unique peptides: 20
Number of validated psm: (2, 18)
7 validated 

8 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0A13AGA.txt
With Shape: 8114
With unique peptides: 264
Number of validated psm: (34, 18)
47 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0A51XGA.txt
With Shape: 229
With unique peptides: 11
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
With Shape: 1292
With unique peptides: 40
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-BH-A18V/assign_conf_pooled_FDR_crux/tsearch-J0AN01GA.txt
With Shape: 12154
With unique peptides: 418
Number of validated psm: (32, 18)
50 validate

7 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0A13AGA.txt
With Shape: 8452
With unique peptides: 228
Number of validated psm: (19, 18)
64 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
With Shape: 1407
With unique peptides: 37
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0AN01GA.txt
With Shape: 14830
With unique peptides: 416
Number of validated psm: (11, 18)
28 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0D2/assign_conf_pooled_FDR_crux/tsearch-J0A532GA.txt
With Shape: 208
With unique peptides: 10
Number of validated psm: (0, 18)
0 validate

20 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J0A51XGA.txt
With Shape: 182
With unique peptides: 5
Number of validated psm: (0, 18)
0 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J0213XGA.txt
With Shape: 962
With unique peptides: 36
Number of validated psm: (1, 18)
8 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J0AN01GA.txt
With Shape: 10425
With unique peptides: 383
Number of validated psm: (20, 18)
37 validated kmers


Reading /cluster/work/grlab/projects/projects2020_OHSU/proteomics_fixMerge_25012024/OHSU/TCGA-A2-A0SX/assign_conf_pooled_FDR_crux/tsearch-J0A532GA.txt
With Shape: 288
With unique peptides: 10
Number of validated psm: (0, 18)
0 validated kme

With Shape: 40922
With unique peptides: 11482
Number of validated psm: (840, 18)


TypeError: 'in <string>' requires string as left operand, not float

In [None]:
# Performs sets comparisons
compare = compare_OHSU_ETH(samples_store_kmers, read_from_disk)

# Plot: Results per experiment

In [None]:
# Choose plotting sample
sample = 'TCGA-25-1319-01A-01R-1565-13'
#sample =  'TCGA-AO-A0JM-01A-21R-A056-07'

In [None]:
if read_from_disk:
    df = pd.DataFrame(compare)
    display(df.head())
    print(df.shape)
else:
    df = None

In [None]:
def run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save):
    order_file = os.path.join(sample_plot_dir[sample], f'{sample}_Barplot_sorting.tsv.gz')
    order_file = pd.read_csv(order_file, sep = '\t')
    display(order_file.head())
    print(order_file.shape)
    df_plot = order_file.merge(df, on = ['sample', 'filter_'], how = 'inner')
    df_plot = df_plot.fillna(0)
    print(df_plot.shape)
    df_plot = df_plot.sort_values('index')
    display(df_plot.head())
    # Plotting
    run_type = run_type_plot_dir[sample]

    back_ticks = df_plot['filter_background'] 
    front_ticks = df_plot['filter_foreground']

    serie_index = df_plot['index']
    serie_intersection = df_plot['pep_size_intersection']
    serie_eth = df_plot['pep_size_eth']
    print(serie_eth)
    serie_ohsu = df_plot['pep_size_ohsu']
    y_label = 'Number of MS-detected tryptic junction-peptides'

    name_plot = f'Barplot_ovelap_valKMERS{MS_FDR}_{MS_strategy}'
    base_plot = sample
    plot_dir = sample_plot_dir[sample]

    plot_intersection_bars(back_ticks, front_ticks, ticks_fontsize, axislabels_fontsize, 
                              legend_fontsize, axes_fontsize, run_type, 
                               serie_index, serie_intersection, serie_eth, serie_ohsu,
                               y_label, save, plot_dir, base_plot, name_plot)
    return df_plot

In [None]:
# TMP CODE single sample
path_data = os.path.join(sample_plot_dir[sample], f'data_kmers{MS_FDR}_{MS_strategy}.tsv.gz')
if df is not None:
    df.to_csv(path_data, sep = '\t', index = None)
    print(f'Saved data to {path_data}')
else:
    df = pd.read_csv(path_data, sep = '\t')
df_plot = run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save)

In [None]:
# for sample in all_samples:
#     path_data = os.path.join(sample_plot_dir[sample], f'data{MS_FDR}_{MS_strategy}.tsv.gz')
#     print(path_data)
#     if df is not None:
#         df.to_csv(path_data, sep = '\t', index = None)
#     else:
#         df = pd.read_csv(path_data, sep = '\t')
#     run_sample_plotting(sample_plot_dir, sample, df, MS_FDR, MS_strategy, save)