In [1]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt 

Matplotlib created a temporary config/cache directory at /scratch/slurm-job.828179/matplotlib-xquhxa72 because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def get_junction_coordinates(df, coordinates_col, sep=':'):
    """Add ``strand`` and ``junction_coordinate`` columns to ``df`` (in place).

    Each value of ``df[coordinates_col]`` is a ``sep``-separated string of
    integer coordinates; the placeholders ``'None'``, ``'nan'`` and empty
    tokens are ignored. After dropping placeholders:

    * the strand is ``'+'`` when the 2nd coordinate is smaller than the 3rd,
      ``'-'`` otherwise;
    * with 4 coordinates (2 exons) the junction is made of coordinates
      ``[1], [2]`` on ``'+'`` and ``[3], [0]`` on ``'-'``;
    * with 6 coordinates the junction is made of coordinates ``[1:5]`` on
      ``'+'`` and ``[3], [0], [2], [5]`` on ``'-'``;
    * with any other number of coordinates the junction is left as ``None``;
    * with fewer than 3 coordinates the strand is left as ``None`` as well,
      since the comparison above cannot be made.

    Junction coordinates are always joined with ``':'`` whatever ``sep`` is.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate. It is modified in place; callers holding another
        reference to the same object see the new columns too.
    coordinates_col : str
        Name of the column holding the coordinate strings.
    sep : str, default ':'
        Separator used inside the coordinate strings.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    missing_tokens = ('None', 'nan', '')
    strands = []
    junctions = []

    # Results are collected positionally and assigned once at the end, so a
    # non-unique index cannot make one row overwrite another.
    for raw_coordinates in df[coordinates_col]:
        coords = [int(tok) for tok in raw_coordinates.split(sep)
                  if tok not in missing_tokens]
        strand = None
        picked = None

        if len(coords) >= 3:
            if coords[1] < coords[2]:  # order strand +
                strand = '+'
                if len(coords) == 4:  # 2 exons
                    picked = coords[1:3]
                elif len(coords) == 6:
                    picked = coords[1:5]
            else:  # order strand -
                strand = '-'
                if len(coords) == 4:  # 2 exons
                    picked = [coords[3], coords[0]]
                elif len(coords) == 6:
                    picked = [coords[3], coords[0], coords[2], coords[5]]

        strands.append(strand)
        junctions.append(None if picked is None else ':'.join(str(x) for x in picked))

    # dtype=object keeps the None placeholders and matches an empty input.
    df['strand'] = pd.Series(strands, index=df.index, dtype=object)
    df['junction_coordinate'] = pd.Series(junctions, index=df.index, dtype=object)
    return df




### Get filtered data

In [3]:
# Inputs
# `run_type` selects the cohort configuration; only 'brca' is defined here.
run_type = 'brca'

if run_type == 'brca':
    # Sample identifiers of the cohort
    target_samples = ['TCGA-C8-A12P-01A-11R-A115-07.all',
                      'TCGA-AO-A0JM-01A-21R-A056-07.all',
                      'TCGA-BH-A18V-01A-11R-A12D-07.all',
                      'TCGA-A2-A0D2-01A-21R-A034-07.all',
                      'TCGA-A2-A0SX-01A-12R-A084-07.all']
    # Root directory of the run and the intermediate table read further down
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
    intermediate_output = os.path.join(basedir, 'filtering_intermediate/complete_cancer_candidates_order_r.tsv.gz')
else:
    # Fail here with a clear message instead of a NameError on `basedir` below.
    raise ValueError(f"Unsupported run_type {run_type!r}: only 'brca' is configured")

# Directory holding the per-sample filtering results for this filter set
filtering_id = 'chosen_filters_06March_order'
output_dir = os.path.join(basedir, 'filtering_samples', filtering_id)

In [4]:
# Read the tab-separated generation matrix
df_load = pd.read_csv(
    intermediate_output,
    sep='\t',
)

In [5]:
# Read the filtered kmers of the sample under study
sample_target = 'TCGA-AO-A0JM-01A-21R-A056-07'
path_interest = os.path.join(
    output_dir,
    f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz',
)
filt = pd.read_csv(path_interest, sep='\t')
print(filt.shape)

(1139, 4)


In [6]:
# Right join on every column of the filtered table: each filtered kmer row
# is kept and picks up the remaining generation-matrix columns.
shared_cols = filt.columns.tolist()
filt_meta = df_load.merge(filt, how='right', on=shared_cols)
print(filt_meta.shape)

(1139, 24)


In [7]:
# Define problematic kmer
# The restriction below is disabled: it would keep only kmers whose
# 'cancerCohortfilter >0.0' value exceeds `exp_lim` and whose value in the
# target sample column is > 0, and print the resulting counts.
# exp_lim = 600
# filt_meta_pb = filt_meta.loc[filt_meta['cancerCohortfilter >0.0'] > exp_lim]
# filt_meta_pb = filt_meta_pb[filt_meta_pb[sample_target.replace('-', '') + 'all'] > 0 ]
# print(f'filtered kmers-junctions {filt_meta.shape[0]}')
# print(f'filtered kmers unique {len(filt_meta.kmer.unique())}')
# print(f'filtered kmers-junctions problematic {filt_meta_pb.shape[0]}')
# print(f'filtered kmers problematic unique {len(filt_meta_pb.kmer.unique())}')
# With the restriction disabled, all filtered kmers are analysed.
# NOTE: this binds a second name to the same DataFrame (no copy), so columns
# added in place through `filt_meta_pb` are also visible through `filt_meta`.
filt_meta_pb = filt_meta

In [8]:
# Columns shown when previewing the merged table: kmer, the '>0.0' cohort
# filter columns, annotation flags and the target sample's own column
# (sample id without dashes, suffixed with 'all').
core_cols = [
    'kmer',
    'gtexCohortfilter >0.0',
    'coord',
    'junctionAnnotated',
    'readFrameAnnotated',
    'isCrossJunction',
    'batch',
    'cancerCohortfilter >0.0',
    f"{sample_target.replace('-', '')}all",
    'isAnnotated',
]

df_load.columns

Index(['kmer', 'gtexCohortfilter >0.0', 'gtexCohortfilter >=1.0',
       'gtexCohortfilter >=2.0', 'gtexCohortfilter >=3.0',
       'gtexCohortfilter >=5.0', 'gtexCohortfilter >=10.0', 'coord',
       'junctionAnnotated', 'readFrameAnnotated', 'isCrossJunction', 'batch',
       'cancerCohortfilter >0.0', 'cancerCohortfilter >=1.0',
       'cancerCohortfilter >=2.0', 'cancerCohortfilter >=3.0',
       'cancerCohortfilter >=5.0', 'cancerCohortfilter >=10.0',
       'TCGAC8A12P01A11RA11507all', 'TCGAAOA0JM01A21RA05607all',
       'TCGABHA18V01A11RA12D07all', 'TCGAA2A0D201A21RA03407all',
       'TCGAA2A0SX01A12RA08407all', 'isAnnotated'],
      dtype='object')

In [9]:
# Annotate the merged table with strand and junction coordinates
filt_meta_pb = get_junction_coordinates(filt_meta_pb, 'coord', sep=':')

# Preview the core columns next to the new junction coordinate
preview_cols = [*core_cols, 'junction_coordinate']
display(filt_meta_pb[preview_cols].head())

Unnamed: 0,kmer,gtexCohortfilter >0.0,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,isAnnotated,junction_coordinate
0,KSKTRPISD,,16801319:16801333:16795006:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
1,SKTRPISDS,,16801319:16801330:16795003:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
2,YKSKTRPIS,,16801319:16801336:16795009:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
3,LYKSKTRPI,,16801319:16801339:16795012:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
4,ALYKSKTRP,,16801319:16801342:16795015:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319


### Add the peptide metadata

In [10]:
# Define peptide file columns of interest
# Full header applied to the peptide metadata file (20 names)
cols_correct = [
    'peptide',
    'id',
    'readFrame',
    'readFrameAnnotated',
    'geneName',
    'geneChr',
    'geneStrand',
    'mutationMode',
    'hasStopCodon',
    'isInJunctionList',
    'isIsolated',
    'variantComb',
    'variantSegExpr',
    'modifiedExonsCoord',
    'originalExonsCoord',
    'vertexIdx',
    'kmerType',
    'dummy1',
    'dummy2',
    'dummy3',
]

# Mapping original name -> 'gtex_'-prefixed name for the columns that are kept
cols_pep_file = {
    name: 'gtex_' + name
    for name in (
        'peptide',
        'id',
        'readFrame',
        'geneName',
        'geneChr',
        'geneStrand',
        'mutationMode',
        'hasStopCodon',
        'isInJunctionList',
        'isIsolated',
        'variantSegExpr',
        'modifiedExonsCoord',
        'originalExonsCoord',
        'vertexIdx',
        'kmerType',
    )
}


In [11]:
# Number of distinct batches among the filtered kmers
len(pd.unique(filt_meta_pb['batch']))

143

### Make False positive hypothesis

In [12]:
gtex_dir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref'

# Per-batch results; every appended frame holds the columns
# ['kmer', 'junction_coordinate'] of a subset of the cancer kmers.
check_hypothesis = []    # junction also present among the GTEX peptides of the batch
kmers_not_in_gtex = []   # junction absent from the GTEX peptides of the batch
kmers_no_metadata = []   # batch has no GTEX peptide metadata file

for batch_i, batch in enumerate(filt_meta_pb['batch'].unique()):
    print(f'\n Iteration {batch_i} batch {batch}')
    # CANCER SIDE GET BATCH Junctions
    batch_kmers_init = filt_meta_pb.loc[filt_meta_pb['batch'] == batch]
    print(f'Size cancer kmers-junctions {len(batch_kmers_init)}')

    # Read GTEX peptide file
    pep_path = os.path.join(gtex_dir, 'cohort_mutNone', f'tmp_out_ref_batch_{batch}', 'ref_sample_peptides_meta.gz')
    if not os.path.exists(pep_path):
        # No metadata for this batch: record its kmers and move on.
        check_path = os.path.join(gtex_dir, 'cohort_mutNone', f'tmp_out_ref_batch_{batch}')
        kmers_no_metadata.append(batch_kmers_init[['kmer', 'junction_coordinate']])
        print(f'CHECK COMPLETION OF {check_path}')
        continue

    not_in_background = False

    print(pep_path)
    meta_pep = pd.read_csv(pep_path, sep='\t')
    print(meta_pep.shape)
    # The header read from the file is replaced by `cols_correct`.
    meta_pep.columns = cols_correct  # ISSUE WITH IMMUNOPEPPER

    # Only bi or tri-exons peptides: drop rows whose coordinate string contains 'nan'
    jx_ids = [idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord']) if 'nan' not in coord]
    meta_pep = meta_pep.iloc[jx_ids]

    # Quick Assess Cancer junction presence in GTEX.
    # This is a substring test on the raw coordinate string, so it can
    # over-select; the exact comparison is done on 'junction_coordinate' below.
    gtex_coords = list(meta_pep['modifiedExonsCoord'])  # materialised once, reused per junction
    coord_int = [i.split(':') for i in batch_kmers_init['junction_coordinate'].unique()]
    keep = set()
    for cd in coord_int:
        keep.update(idx for idx, coord in enumerate(gtex_coords)
                    if (cd[0] in coord) and (cd[1] in coord))
    # sorted(): deterministic row order; copy(): the frame is modified in place below.
    meta_pep = meta_pep.iloc[sorted(keep)].copy()

    if keep:  # Some target junctions are "potentially" found in gtex
        # Rename
        meta_pep = meta_pep.rename(cols_pep_file, axis=1)

        # Extract coordinates peptides
        meta_pep = get_junction_coordinates(meta_pep, 'gtex_modifiedExonsCoord', sep=';')

        # Add Peptide info
        meta_pep = meta_pep[list(cols_pep_file.values()) + ['junction_coordinate']].drop_duplicates()

        cancer_jx = set(batch_kmers_init['junction_coordinate'])
        gtex_jx = set(meta_pep['junction_coordinate'])
        exist_jx_not_in_gtex = cancer_jx.difference(gtex_jx)
        exist_jx_in_gtex = cancer_jx.intersection(gtex_jx)

        if exist_jx_in_gtex:  # Some target junctions are in gtex
            batch_kmers = batch_kmers_init.merge(meta_pep, on='junction_coordinate', how='inner')
            print(f'Size cancer kmers-junctions + all GTEX peptides {len(batch_kmers)}')

            # Calculate the number of aa fitting on each side of the junctions
            # (length of the first two coordinate pairs, divided by 3)
            batch_kmers['gtex_aa_E1'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['gtex_aa_E2'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            batch_kmers['cancer_aa_E1'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['cancer_aa_E2'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            # What about 3 exons?

            # Validate hypothesis:
            # H1: The second exon is not long enough to get the translation through in GTEX. BUT no exon was added on the right
            # The flag is False only where the GTEX value is strictly larger.
            new_col = 'gtexE2<cancE2'
            batch_kmers[new_col] = ~(batch_kmers['gtex_aa_E2'] > batch_kmers['cancer_aa_E2'])

            res = batch_kmers[['kmer', new_col, 'junction_coordinate']].drop_duplicates()
            display(batch_kmers[['kmer', 'cancerCohortfilter >0.0',
                                 sample_target.replace('-', '') + 'all',
                                 'readFrameAnnotated',
                                 'junctionAnnotated',
                                 new_col]].drop_duplicates())
            # NOTE(review): every matched kmer is recorded here, whatever the
            # value of `new_col` - confirm this is intended.
            check_hypothesis.append(res[['kmer', 'junction_coordinate']])

        if exist_jx_not_in_gtex:  # Some target junctions are NOT in gtex
            not_in_background = True
            # Left anti join. A boolean mask is used because .loc no longer
            # accepts a set as indexer in pandas >= 2.0.
            diff = batch_kmers_init.loc[
                batch_kmers_init['junction_coordinate'].isin(exist_jx_not_in_gtex)
            ].copy()

    else:  # No target junctions are in GTEX at all
        not_in_background = True
        diff = batch_kmers_init.copy()

    if not_in_background:
        kmers_not_in_gtex.append(diff[['kmer', 'junction_coordinate']])
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. Recurrence is:')
        print(diff['cancerCohortfilter >0.0'].unique())
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. Junction annotated is:')
        print(diff['junctionAnnotated'].unique())
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. RF annotated is:')
        print(diff['readFrameAnnotated'].unique())
           




 Iteration 0 batch 50040
Size cancer kmers-junctions 5
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_50040/ref_sample_peptides_meta.gz
(14647, 20)
5 Kmers - junctions not found in gtex. Recurrence is:
[2]
5 Kmers - junctions not found in gtex. Junction annotated is:
[False]
5 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 1 batch 24125
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24125/ref_sample_peptides_meta.gz
(26762, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[17]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[ True]

 Iteration 2 batch 40985
Size cancer kmers-junctions 6
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generat

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,AETLMLRNS,184,4.204772,True,False,True
57,ETLMLRNSV,184,4.204772,True,False,True
114,ANAETLMLR,184,4.204772,True,False,True
171,SANAETLML,184,4.204772,True,False,True
228,NAETLMLRN,184,4.204772,True,False,True
285,RSANAETLM,184,4.204772,True,False,True



 Iteration 16 batch 55186
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_55186/ref_sample_peptides_meta.gz
(2045, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[7]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[ True]

 Iteration 17 batch 42631
Size cancer kmers-junctions 6
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_42631/ref_sample_peptides_meta.gz
(17330, 20)
6 Kmers - junctions not found in gtex. Recurrence is:
[1]
6 Kmers - junctions not found in gtex. Junction annotated is:
[False]
6 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 18 batch 52627
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_genera

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,EEEASPHQV,1046,4.204772,True,True,True
10,EEEASPHQV,1046,4.204772,False,True,True



 Iteration 21 batch 47584
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_47584/ref_sample_peptides_meta.gz
(5606, 20)
7 Kmers - junctions not found in gtex. Recurrence is:
[19]
7 Kmers - junctions not found in gtex. Junction annotated is:
[False]
7 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 22 batch 46648
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_46648/ref_sample_peptides_meta.gz
(30569, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[1]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 23 batch 13412
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_gener

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,KKVDTLGKS,1017,25.228635,False,True,True
20,KVDTLGKST,1017,25.228635,False,True,True
40,AKKVDTLGK,1017,25.228635,False,True,True



 Iteration 32 batch 16522
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_16522/ref_sample_peptides_meta.gz
(5895, 20)
7 Kmers - junctions not found in gtex. Recurrence is:
[1]
7 Kmers - junctions not found in gtex. Junction annotated is:
[False]
7 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 33 batch 49556
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_49556/ref_sample_peptides_meta.gz
(4890, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[5]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 34 batch 6782
Size cancer kmers-junctions 3
CHECK COMPLETION OF /cluster/work/grlab/projects/projects2020_OH

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,TKKSLESIR,6,2.102386,True,True,True



 Iteration 37 batch 24752
Size cancer kmers-junctions 41
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24752/ref_sample_peptides_meta.gz
(55491, 20)
41 Kmers - junctions not found in gtex. Recurrence is:
[ 33  38 452   6 246  13]
41 Kmers - junctions not found in gtex. Junction annotated is:
[False  True]
41 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 38 batch 7529
Size cancer kmers-junctions 2
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_7529/ref_sample_peptides_meta.gz
(50121, 20)
2 Kmers - junctions not found in gtex. Recurrence is:
[2]
2 Kmers - junctions not found in gtex. Junction annotated is:
[False]
2 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 39 batch 21754
Size cancer kmers-junctions 14
/cluster/work/grlab/projects/p

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,TCQSLQEEL,414,2.102386,True,True,True
1,ATCQSLQEG,432,4.204772,True,True,True


18 Kmers - junctions not found in gtex. Recurrence is:
[ 2 15]
18 Kmers - junctions not found in gtex. Junction annotated is:
[ True False]
18 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 46 batch 24005
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24005/ref_sample_peptides_meta.gz
(30192, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[1]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 47 batch 993
Size cancer kmers-junctions 3
CHECK COMPLETION OF /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_993

 Iteration 48 batch 2542
Size cancer kmers-junctions 8
CHECK COMPLETION OF /cluster/work/grlab/projects/projects2020_OHSU/peptid

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,LRRPKMTPY,130,2.102386,False,False,True
7,RRPKMTPYV,130,2.102386,False,False,True



 Iteration 64 batch 44010
Size cancer kmers-junctions 4
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_44010/ref_sample_peptides_meta.gz
(21691, 20)
Size cancer kmers-junctions + all GTEX peptides 156


Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,VDYLLEKVS,1050,8.409545,False,True,True
39,DYLLEKVSK,1050,8.409545,False,True,True
78,IVDYLLEKV,1050,8.409545,False,True,True
117,YLLEKVSKR,1050,8.409545,False,True,True



 Iteration 65 batch 31801
Size cancer kmers-junctions 10
CHECK COMPLETION OF /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_31801

 Iteration 66 batch 31039
Size cancer kmers-junctions 5
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_31039/ref_sample_peptides_meta.gz
(27390, 20)
5 Kmers - junctions not found in gtex. Recurrence is:
[1]
5 Kmers - junctions not found in gtex. Junction annotated is:
[ True False]
5 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 67 batch 31425
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_31425/ref_sample_peptides_meta.gz
(10936, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[1]
8 Kmers - juncti

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,GVKWLKAKV,1076,33.638179,False,True,True
20,KWLKAKVRA,1076,33.638179,False,True,True
40,VKWLKAKVR,1076,33.638179,False,True,True



 Iteration 76 batch 32590
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_32590/ref_sample_peptides_meta.gz
(11374, 20)
7 Kmers - junctions not found in gtex. Recurrence is:
[1]
7 Kmers - junctions not found in gtex. Junction annotated is:
[False]
7 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 77 batch 33139
Size cancer kmers-junctions 35
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_33139/ref_sample_peptides_meta.gz
(64820, 20)
35 Kmers - junctions not found in gtex. Recurrence is:
[10 15  2]
35 Kmers - junctions not found in gtex. Junction annotated is:
[False]
35 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 78 batch 56249
Size cancer kmers-junctions 9
/cluster/work/grlab/projects/projects2020_OHSU/pep

(49892, 20)
14 Kmers - junctions not found in gtex. Recurrence is:
[1]
14 Kmers - junctions not found in gtex. Junction annotated is:
[False]
14 Kmers - junctions not found in gtex. RF annotated is:
[ True False]

 Iteration 98 batch 12384
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_12384/ref_sample_peptides_meta.gz
(946, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[2]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 99 batch 1493
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_1493/ref_sample_peptides_meta.gz
(42941, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[1]
8 Kmers - junctions not found in gtex. Junc

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_25118/ref_sample_peptides_meta.gz
(183354, 20)
6 Kmers - junctions not found in gtex. Recurrence is:
[1]
6 Kmers - junctions not found in gtex. Junction annotated is:
[False]
6 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 118 batch 10818
Size cancer kmers-junctions 14
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_10818/ref_sample_peptides_meta.gz
(48055, 20)
14 Kmers - junctions not found in gtex. Recurrence is:
[1]
14 Kmers - junctions not found in gtex. Junction annotated is:
[False  True]
14 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 119 batch 30030
Size cancer kmers-junctions 15
CHECK COMPLETION OF /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,RKNEKVRIN,982,10.511931,False,True,True
6,KNEKVRINP,982,10.511931,False,True,True



 Iteration 124 batch 8381
Size cancer kmers-junctions 6
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_8381/ref_sample_peptides_meta.gz
(21347, 20)
6 Kmers - junctions not found in gtex. Recurrence is:
[1]
6 Kmers - junctions not found in gtex. Junction annotated is:
[False]
6 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 125 batch 31811
Size cancer kmers-junctions 14
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_31811/ref_sample_peptides_meta.gz
(14999, 20)
14 Kmers - junctions not found in gtex. Recurrence is:
[1]
14 Kmers - junctions not found in gtex. Junction annotated is:
[False]
14 Kmers - junctions not found in gtex. RF annotated is:
[ True False]

 Iteration 126 batch 12613
Size cancer kmers-junctions 3
/cluster/work/grlab/projects/projects2020_OHSU/pep

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,HTEARLTGR,609,4.204772,True,True,True



 Iteration 133 batch 43479
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_43479/ref_sample_peptides_meta.gz
(37076, 20)
7 Kmers - junctions not found in gtex. Recurrence is:
[2]
7 Kmers - junctions not found in gtex. Junction annotated is:
[False]
7 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 134 batch 39714
Size cancer kmers-junctions 7
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_39714/ref_sample_peptides_meta.gz
(20301, 20)
7 Kmers - junctions not found in gtex. Recurrence is:
[4]
7 Kmers - junctions not found in gtex. Junction annotated is:
[False]
7 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 135 batch 7485
Size cancer kmers-junctions 2
/cluster/work/grlab/projects/projects2020_OHSU/peptides_gen

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,SDVFQQGKF,811,119.836014,False,True,True
9,DVFQQGKFT,811,119.836014,False,True,True



 Iteration 136 batch 19173
Size cancer kmers-junctions 8
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_19173/ref_sample_peptides_meta.gz
(45505, 20)
8 Kmers - junctions not found in gtex. Recurrence is:
[17]
8 Kmers - junctions not found in gtex. Junction annotated is:
[False]
8 Kmers - junctions not found in gtex. RF annotated is:
[ True]

 Iteration 137 batch 34141
Size cancer kmers-junctions 4
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_34141/ref_sample_peptides_meta.gz
(24681, 20)
4 Kmers - junctions not found in gtex. Recurrence is:
[8]
4 Kmers - junctions not found in gtex. Junction annotated is:
[False]
4 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 138 batch 13583
Size cancer kmers-junctions 1
/cluster/work/grlab/projects/projects2020_OHSU/peptides_g

Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,MAGHRKESP,1081,18.921476,True,True,True



 Iteration 139 batch 57118
Size cancer kmers-junctions 2
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_57118/ref_sample_peptides_meta.gz
(43947, 20)
Size cancer kmers-junctions + all GTEX peptides 42


Unnamed: 0,kmer,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,readFrameAnnotated,junctionAnnotated,gtexE2<cancE2
0,LKLLTRLRK,104,50.457269,False,False,True
21,DLKLLTRLR,104,50.457269,False,False,True



 Iteration 140 batch 24612
Size cancer kmers-junctions 9
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24612/ref_sample_peptides_meta.gz
(32065, 20)
9 Kmers - junctions not found in gtex. Recurrence is:
[1]
9 Kmers - junctions not found in gtex. Junction annotated is:
[False]
9 Kmers - junctions not found in gtex. RF annotated is:
[False]

 Iteration 141 batch 12422
Size cancer kmers-junctions 21
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_12422/ref_sample_peptides_meta.gz
(71591, 20)
21 Kmers - junctions not found in gtex. Recurrence is:
[6]
21 Kmers - junctions not found in gtex. Junction annotated is:
[False]
21 Kmers - junctions not found in gtex. RF annotated is:
[False  True]

 Iteration 142 batch 9435
Size cancer kmers-junctions 3
/cluster/work/grlab/projects/projects2020_OHSU/pe

In [13]:
# Size of the annotated table, then of its distinct (kmer, junction) pairs
kmer_junction_pairs = filt_meta_pb.loc[:, ['kmer', 'junction_coordinate']].drop_duplicates()
print(filt_meta_pb.shape)
print(kmer_junction_pairs.shape)

(1139, 26)
(1070, 2)


In [14]:
# Stack the per-batch frames of kmers whose batch had no GTEX metadata file
df_kmers_no_metadata = pd.concat(kmers_no_metadata)
print(f'{len(df_kmers_no_metadata)} kmers without metadata')
print(f'{len(df_kmers_no_metadata.drop_duplicates())} UNIQUE kmers without metadata')

68 kmers without metadata
54 UNIQUE kmers without metadata


In [15]:
# Stack the per-batch frames of kmers whose junction was not found in GTEX
df_kmers_not_in_gtex = pd.concat(kmers_not_in_gtex)
print(f'{len(df_kmers_not_in_gtex)} kmers completely absent in GTEX')
print(f'{len(df_kmers_not_in_gtex.drop_duplicates())} UNIQUE kmers completely absent in GTEX')

1040 kmers completely absent in GTEX
986 UNIQUE kmers completely absent in GTEX


In [16]:
# Stack the per-batch frames of kmers whose junction was found in GTEX
df_check_hypothesis = pd.concat(check_hypothesis)
print(f'{len(df_check_hypothesis)} kmers coming from novel exons')
print(f'{len(df_check_hypothesis.drop_duplicates())} UNIQUE kmers coming from novel exons')

30 kmers coming from novel exons
30 UNIQUE kmers coming from novel exons


In [17]:
# Create new labels based on the false positive check above
# The sets no_metadata, not_GTEX, check_hypothesis are expected to be
# disjoint: each of the three prints below should show an empty set.
print(set(df_kmers_no_metadata['kmer']).intersection( set(df_kmers_not_in_gtex['kmer'])))
print(set(df_kmers_no_metadata['kmer']).intersection( set(df_check_hypothesis['kmer'])))
print(set(df_kmers_not_in_gtex['kmer']).intersection( set(df_check_hypothesis['kmer'])))

key_cols = ['kmer', 'junction_coordinate']
label_cols = ['coordinate_in_GTEX', 'gtexExon2<cancExon2', 'info_not_available']


def _fill_missing_flag(flags, default):
    """Return ``flags`` as a bool Series, with missing entries set to ``default``."""
    return pd.Series(np.where(flags.isna(), default, flags), index=flags.index).astype(bool)


# Drop duplicates. Restricting to the key columns first discards any label
# column left by a previous execution, so the cell can be re-run safely.
df_kmers_no_metadata = df_kmers_no_metadata[key_cols].drop_duplicates()
df_kmers_not_in_gtex = df_kmers_not_in_gtex[key_cols].drop_duplicates()
df_check_hypothesis = df_check_hypothesis[key_cols].drop_duplicates()

df_no_info = df_kmers_no_metadata.assign(info_not_available=True)

# Augment df with information about GTEX junction presence
df_kmers_not_in_gtex = df_kmers_not_in_gtex.assign(coordinate_in_GTEX=False)
df_coord_in_gtex = pd.concat(
    [df_kmers_not_in_gtex,
     df_kmers_no_metadata.assign(coordinate_in_GTEX=False)],  # No info
    axis=0,
)

# Augment df with information about Novel exon hypothesis
df_kmers_no_metadata = df_kmers_no_metadata.assign(**{'gtexExon2<cancExon2': False})
df_check_hypothesis = df_check_hypothesis.assign(**{'gtexExon2<cancExon2': True})
df_exon_length = pd.concat([df_check_hypothesis, df_kmers_no_metadata], axis=0)

# Remove labels produced by a previous execution before merging again;
# otherwise the merges would create '_x'/'_y' suffixed duplicates.
filt_meta = filt_meta.drop(columns=label_cols, errors='ignore')

# Rows without a match in the label table get the default:
# coordinate_in_GTEX -> True, the two other flags -> False.
filt_meta = filt_meta.merge(df_coord_in_gtex, on=key_cols, how='left')
filt_meta['coordinate_in_GTEX'] = _fill_missing_flag(filt_meta['coordinate_in_GTEX'], True)

filt_meta = filt_meta.merge(df_exon_length, on=key_cols, how='left')
filt_meta['gtexExon2<cancExon2'] = _fill_missing_flag(filt_meta['gtexExon2<cancExon2'], False)

filt_meta = filt_meta.merge(df_no_info, on=key_cols, how='left')
filt_meta['info_not_available'] = _fill_missing_flag(filt_meta['info_not_available'], False)

display(filt_meta.groupby('gtexExon2<cancExon2').count())
display(filt_meta.groupby('coordinate_in_GTEX').count())

set()
set()
set()


Unnamed: 0_level_0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated,strand,junction_coordinate,coordinate_in_GTEX,info_not_available
gtexExon2<cancExon2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,1108,0,0,0,0,0,0,1108,1108,1108,...,1108,1108,1108,1108,1108,0,1108,1108,1108,1108
True,31,0,0,0,0,0,0,31,31,31,...,31,31,31,31,31,0,31,31,31,31


Unnamed: 0_level_0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated,strand,junction_coordinate,gtexExon2<cancExon2,info_not_available
coordinate_in_GTEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,1108,0,0,0,0,0,0,1108,1108,1108,...,1108,1108,1108,1108,1108,0,1108,1108,1108,1108
True,31,0,0,0,0,0,0,31,31,31,...,31,31,31,31,31,0,31,31,31,31


In [18]:
# Write the annotated kmer table into `output_dir` as a gzipped TSV whose name
# is the filtered-kmer file name with `metadata2` inserted before `.tsv.gz`.
path_interest = f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz'
path_interest_with_advanced_meta = os.path.join(output_dir, path_interest.replace('.tsv.gz', 'metadata2.tsv.gz'))
print(f'Saving kmers with advanced metadata to {path_interest_with_advanced_meta}')
# `index=False` is the documented way to omit the row index; the previous
# `index=None` produced the same file only because None is falsy.
filt_meta.to_csv(path_interest_with_advanced_meta, sep='\t', compression='gzip', index=False)

Saving kmers with advanced metadata to /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1metadata2.tsv.gz


## Exploratory

### Remove kmers found in the GTEX annotation

In [19]:
# NOTE(review): this cell is disabled (commented out) and does nothing when run.
# If re-enabled it would list the `ref_annot_kmer.gz` files matching the glob
# pattern below and print how many were found.
# path_annot = glob.glob('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_6920432_ANNOT_conf2_RFall_ref/cohort_mutNone/*/ref_annot_kmer.gz')

# print(len(path_annot))

In [20]:
# NOTE(review): this cell is disabled (commented out) and does nothing when run.
# If re-enabled, for each annotation file in `path_annot` it would read the file
# as a tab-separated table, take its `kmer` column as a set, remove those kmers
# from `kmer_post_filter`, and print the file path with the remaining count
# whenever that file removed at least one kmer.
# It depends on `path_annot` from the (also disabled) cell above and on an
# existing set `kmer_post_filter` — presumably defined earlier; TODO confirm.
# for idx, annot in enumerate(path_annot):
#     kmers_filter_pipeline = len(kmer_post_filter)
#     annot = pd.read_csv(annot, sep = '\t')
#     annot = set(annot['kmer'])
#     kmer_post_filter = kmer_post_filter.difference(annot)
#     if kmers_filter_pipeline != len(kmer_post_filter):
#         print(path_annot[idx], len(kmer_post_filter))