In [1]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt 

Matplotlib created a temporary config/cache directory at /scratch/slurm-job.816371/matplotlib-y4bcbqsk because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
def get_junction_coordinates(df, coordinates_col, sep=':'):
    df['strand'] = None
    df['junction_coordinate'] = None

    for idx, row in df.iterrows():
        kmer_coordinates = [int(x) for x in row[coordinates_col].split(sep) if (x !='None') ]

        if kmer_coordinates[1] < kmer_coordinates[2]: # order strand +

            df.loc[idx, 'strand'] = '+'
            if len(kmer_coordinates) == 4:  # 2 exons
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in kmer_coordinates[1:3]])
            elif len(kmer_coordinates) == 6:
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in kmer_coordinates[1:5]])
        else: # order strand +
            df.loc[idx, 'strand'] = '-'
            if len(kmer_coordinates) == 4:  # 2 exons
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in [kmer_coordinates[3],
                                                                                kmer_coordinates[0]]])
            elif len(kmer_coordinates) == 6:
                df.loc[idx, 'junction_coordinate'] = ':'.join([str(x) for x in [kmer_coordinates[3],
                                                                                kmer_coordinates[0],
                                                                                kmer_coordinates[2],
                                                                                kmer_coordinates[5]
                                                                               ]])
    return df




### Get filtered data

In [3]:
run_type = 'brca'

# Inputs

if run_type == 'brca':
    target_samples = ['TCGA-C8-A12P-01A-11R-A115-07.all',
                      'TCGA-AO-A0JM-01A-21R-A056-07.all',
                      'TCGA-BH-A18V-01A-11R-A12D-07.all',
                      'TCGA-A2-A0D2-01A-21R-A034-07.all',
                      'TCGA-A2-A0SX-01A-12R-A084-07.all']
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
    intermediate_output = os.path.join(basedir, 'filtering_intermediate/complete_cancer_candidates_.tsv.gz')

filtering_id = 'chosen_filters_06March'
output_dir = os.path.join(basedir, 'filtering_samples', filtering_id)

In [4]:
# Load generation matrix
df_load = pd.read_csv(intermediate_output, sep = '\t')

In [5]:
# Load filtered kmers
sample_target = 'TCGA-AO-A0JM-01A-21R-A056-07'
path_interest = f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz'
path_interest = os.path.join(output_dir, path_interest)
filt = pd.read_csv(path_interest, sep = '\t')
print(filt.shape)

(9364, 4)


In [6]:
# Columns magic
core_cols = ['kmer', 'gtexCohortfilter >0.0', 'coord',
       'junctionAnnotated', 'readFrameAnnotated', 'isCrossJunction', 'batch',
       'cancerCohortfilter >0.0', sample_target.replace('-', '') + 'all', 'isAnnotated']

df_load.columns

Index(['kmer', 'gtexCohortfilter >0.0', 'gtexCohortfilter >=1.0',
       'gtexCohortfilter >=2.0', 'gtexCohortfilter >=3.0',
       'gtexCohortfilter >=5.0', 'gtexCohortfilter >=10.0', 'coord',
       'junctionAnnotated', 'readFrameAnnotated', 'isCrossJunction', 'batch',
       'cancerCohortfilter >0.0', 'cancerCohortfilter >=1.0',
       'cancerCohortfilter >=2.0', 'cancerCohortfilter >=3.0',
       'cancerCohortfilter >=5.0', 'cancerCohortfilter >=10.0',
       'TCGAC8A12P01A11RA11507all', 'TCGAAOA0JM01A21RA05607all',
       'TCGABHA18V01A11RA12D07all', 'TCGAA2A0D201A21RA03407all',
       'TCGAA2A0SX01A12RA08407all', 'isAnnotated'],
      dtype='object')

In [7]:
# Merge generation matrix & filtered kmers
filt_meta = df_load.merge(filt, on = list(filt.columns), how = 'right')
print(filt_meta.shape)

(9364, 24)


In [8]:
# Define problematic kmer
# exp_lim = 600
# filt_meta_pb = filt_meta.loc[filt_meta['cancerCohortfilter >0.0'] > exp_lim]
# filt_meta_pb = filt_meta_pb[filt_meta_pb[sample_target.replace('-', '') + 'all'] > 0 ]
# print(f'filtered kmers-junctions {filt_meta.shape[0]}')
# print(f'filtered kmers unique {len(filt_meta.kmer.unique())}')
# print(f'filtered kmers-junctions problematic {filt_meta_pb.shape[0]}')
# print(f'filtered kmers problematic unique {len(filt_meta_pb.kmer.unique())}')
filt_meta_pb = filt_meta

In [9]:
# Extract coordinates of the merged table
filt_meta_pb = get_junction_coordinates(filt_meta_pb, 'coord', sep=':')

In [10]:
display(filt_meta_pb[core_cols + ['junction_coordinate']].head())

Unnamed: 0,kmer,gtexCohortfilter >0.0,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,isAnnotated,junction_coordinate
0,KSKTRPISD,,16801319:16801333:16795006:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
1,SKTRPISDS,,16801319:16801330:16795003:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
2,YKSKTRPIS,,16801319:16801336:16795009:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
3,LYKSKTRPI,,16801319:16801339:16795012:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
4,ALYKSKTRP,,16801319:16801342:16795015:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319


### Add the peptide metadata

In [11]:
# Define peptide file columns of interest 
cols_correct = ['peptide','id','readFrame','readFrameAnnotated','geneName','geneChr','geneStrand',
'mutationMode','hasStopCodon','isInJunctionList',
'isIsolated','variantComb','variantSegExpr','modifiedExonsCoord',
'originalExonsCoord',
'vertexIdx','kmerType', 'dummy1', 'dummy2', 'dummy3']

cols_pep_file = ['peptide', 'id', 'readFrame', 'geneName',
       'geneChr', 'geneStrand', 'mutationMode',
       'hasStopCodon', 'isInJunctionList', 'isIsolated',
       'variantSegExpr', 'modifiedExonsCoord', 'originalExonsCoord',
       'vertexIdx', 'kmerType']


cols_pep_file = {col:  'gtex_' + col for col in cols_pep_file}


In [27]:
check_hypothesis = []
kmers_not_in_gtex = []
for batch in filt_meta_pb['batch'].unique():
    no_gtex = False
    print('\n', batch)
    # Read GTEX peptide file
    gtex_dir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref'
    pep_path = os.path.join(gtex_dir, 'cohort_mutNone', f'tmp_out_ref_batch_{batch}', 'ref_sample_peptides_meta.gz')
    print(pep_path)
    meta_pep = pd.read_csv(pep_path, sep = '\t')
    print(meta_pep.shape)
    meta_pep.columns = cols_correct # ISSUE WITH IMMUNOPEPPER
    
    # CANCER SIDE GET BATCH Junctions
    batch_kmers = filt_meta_pb.loc[filt_meta_pb['batch'] == batch]
    print(f'Size cancer kmers-junctions {len(batch_kmers)}')

    # Only bi or tri-exons peptides
    jx_ids = [idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord']) if 'nan' not in coord]
    meta_pep = meta_pep.iloc[jx_ids]
    
    # Quick Cancer junction presence in GTEX
    coord_int = [i.split(':') for i in batch_kmers['junction_coordinate'].unique()]
    keep = set()
    for cd in coord_int:
        keep.update([idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord']) \
                     if (cd[0] in coord) and (cd[1] in coord)])
    meta_pep = meta_pep.iloc[list(keep)]
    
    if keep:
        # Rename
        meta_pep = meta_pep.rename(cols_pep_file, axis = 1)

        # Extract coordinates peptides
        meta_pep = get_junction_coordinates(meta_pep, 'gtex_modifiedExonsCoord', sep=';')

        # Add Peptide info 
        meta_pep = meta_pep[list(cols_pep_file.values()) + ['junction_coordinate']].drop_duplicates()
        
        if batch_kmers.merge(meta_pep, on = 'junction_coordinate', how = 'inner').shape[0]: 
            batch_kmers = batch_kmers.merge(meta_pep, on = 'junction_coordinate', how = 'left')
            print(f'Size cancer kmers-junctions + all GTEX peptides {len(batch_kmers)}')

            # Calculate the number of aa fitting on each side of the junctions
            batch_kmers['gtex_aa_E1'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['gtex_aa_E2'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            batch_kmers['cancer_aa_E1'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['cancer_aa_E2'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            # What about 3 exons?

            # Validate hypothesis: 
            # H1: The second exon is not long enough to get the translation through in GTEX. BUT no exon was added on the right
            new_col = 'gtexE2<cancE2'
            batch_kmers[new_col] = True
            batch_kmers.loc[batch_kmers['gtex_aa_E2'] > batch_kmers['cancer_aa_E2'], 'new_col'] = False

            res = batch_kmers[['kmer', new_col ]].drop_duplicates()
            display( batch_kmers[['kmer', 'cancerCohortfilter >0.0', new_col ]].drop_duplicates() )
            check_hypothesis.append(res)
        else:
            no_gtex = True
    else:
        no_gtex = True
    
    if no_gtex:
        kmers_not_in_gtex.append(batch_kmers[['kmer', 'junction_coordinate']])
        print(f'{batch_kmers.shape[0]} Kmers junction not found in gtex. Recurrence is:')
        print(batch_kmers['cancerCohortfilter >0.0'].unique())






 50040
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_50040/ref_sample_peptides_meta.gz
(14647, 20)
Size cancer kmers-junctions 5
5 Kmers junction not found in gtex. Recurrence is:
[2]

 24125
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24125/ref_sample_peptides_meta.gz
(26762, 20)
Size cancer kmers-junctions 8
8 Kmers junction not found in gtex. Recurrence is:
[17]

 40985
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_40985/ref_sample_peptides_meta.gz
(151764, 20)
Size cancer kmers-junctions 6
6 Kmers junction not found in gtex. Recurrence is:
[1]

 9192
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_

Unnamed: 0,kmer,cancerCohortfilter >0.0,gtexE2<cancE2
0,AETLMLRNS,184,True
57,ETLMLRNSV,184,True
114,ANAETLMLR,184,True
171,SANAETLML,184,True
228,NAETLMLRN,184,True
285,RSANAETLM,184,True



 55186
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_55186/ref_sample_peptides_meta.gz
(2045, 20)
Size cancer kmers-junctions 8
8 Kmers junction not found in gtex. Recurrence is:
[7]

 42631
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_42631/ref_sample_peptides_meta.gz
(17330, 20)
Size cancer kmers-junctions 6
6 Kmers junction not found in gtex. Recurrence is:
[1]

 52627
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_52627/ref_sample_peptides_meta.gz
(33402, 20)
Size cancer kmers-junctions 7
7 Kmers junction not found in gtex. Recurrence is:
[1]

 34250
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_re

Unnamed: 0,kmer,cancerCohortfilter >0.0,gtexE2<cancE2
0,EEEASPHQV,1046,True



 47584
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_47584/ref_sample_peptides_meta.gz
(5606, 20)
Size cancer kmers-junctions 7
7 Kmers junction not found in gtex. Recurrence is:
[19]

 46648
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_46648/ref_sample_peptides_meta.gz
(30569, 20)
Size cancer kmers-junctions 8
8 Kmers junction not found in gtex. Recurrence is:
[1]

 13412
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_13412/ref_sample_peptides_meta.gz
(16226, 20)
Size cancer kmers-junctions 7
7 Kmers junction not found in gtex. Recurrence is:
[3]

 13574
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_r

Unnamed: 0,kmer,cancerCohortfilter >0.0,gtexE2<cancE2
0,KKVDTLGKS,1017,True
20,KVDTLGKST,1017,True
40,AKKVDTLGK,1017,True



 16522
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_16522/ref_sample_peptides_meta.gz
(5895, 20)
Size cancer kmers-junctions 7
7 Kmers junction not found in gtex. Recurrence is:
[1]

 49556
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_49556/ref_sample_peptides_meta.gz
(4890, 20)
Size cancer kmers-junctions 8
8 Kmers junction not found in gtex. Recurrence is:
[5]

 6782
/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_6782/ref_sample_peptides_meta.gz


FileNotFoundError: [Errno 2] No such file or directory: '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_6782/ref_sample_peptides_meta.gz'

In [20]:
meta_pep

Unnamed: 0,gtex_peptide,gtex_id,gtex_readFrame,gtex_geneName,gtex_geneChr,gtex_geneStrand,gtex_mutationMode,gtex_hasStopCodon,gtex_isInJunctionList,gtex_isIsolated,gtex_variantSegExpr,gtex_modifiedExonsCoord,gtex_originalExonsCoord,gtex_vertexIdx,gtex_kmerType,junction_coordinate
31583,KHC,ENSG00000109452.12:83_76:1:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082070;142082185,142086143;142086256;142082068;142082185,83;76,2-exons,142082185:142086143
42324,KLCGWLQRFAAN,ENSG00000109452.12:82_73:3:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
2317,NKMI,ENSG00000109452.12:82_76:0:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082070;142082185,142086143;142086239;142082068;142082185,82;76,2-exons,142082185:142086143
33310,MVFVSPVVKVPKTGHRCQ,ENSG00000109452.12:82_73:4:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
36701,EMSTSYTRTSLSERWIA,ENSG00000109452.12:82_73:6:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36591,KHC,ENSG00000109452.12:83_74:1:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082052;142082185,142086143;142086256;142082052;142082185,83;74,2-exons,142082185:142086143
6147,EMSTSYTRTSLSERWIA,ENSG00000109452.12:83_73:6:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082031;142082185,142086143;142086256;142082030;142082185,83;73,2-exons,142082185:142086143
51324,KISFKISNPKKERM,ENSG00000109452.12:83_73:2:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082031;142082185,142086143;142086256;142082030;142082185,83;73,2-exons,142082185:142086143
13513,FKSIARKSPSKYPIQKKKECRNYVAGCNDLPQTEWYSFHLL,ENSG00000109452.12:82_73:1:142086237:2-exons,1,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086237;142082030;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143


In [26]:
coord_int = [i.split(':') for i in filt_meta_pb.loc[filt_meta_pb['batch'] == batch]['junction_coordinate'].unique()]
for cd in coord_int:
    print(cd)
    display(meta_pep.iloc[[idx for idx, coord in enumerate(meta_pep['gtex_modifiedExonsCoord']) \
                 if (cd[0] in coord) and (cd[1] in coord)]])
    

['142082148', '142082170']


Unnamed: 0,gtex_peptide,gtex_id,gtex_readFrame,gtex_geneName,gtex_geneChr,gtex_geneStrand,gtex_mutationMode,gtex_hasStopCodon,gtex_isInJunctionList,gtex_isIsolated,gtex_variantSegExpr,gtex_modifiedExonsCoord,gtex_originalExonsCoord,gtex_vertexIdx,gtex_kmerType,junction_coordinate


['142082185', '142086143', '142082170', '142082148']


Unnamed: 0,gtex_peptide,gtex_id,gtex_readFrame,gtex_geneName,gtex_geneChr,gtex_geneStrand,gtex_mutationMode,gtex_hasStopCodon,gtex_isInJunctionList,gtex_isIsolated,gtex_variantSegExpr,gtex_modifiedExonsCoord,gtex_originalExonsCoord,gtex_vertexIdx,gtex_kmerType,junction_coordinate
31583,KHC,ENSG00000109452.12:83_76:1:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082070;142082185,142086143;142086256;142082068;142082185,83;76,2-exons,142082185:142086143
42324,KLCGWLQRFAAN,ENSG00000109452.12:82_73:3:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
2317,NKMI,ENSG00000109452.12:82_76:0:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082070;142082185,142086143;142086239;142082068;142082185,82;76,2-exons,142082185:142086143
33310,MVFVSPVVKVPKTGHRCQ,ENSG00000109452.12:82_73:4:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
36701,EMSTSYTRTSLSERWIA,ENSG00000109452.12:82_73:6:142086238:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086238;142082031;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36591,KHC,ENSG00000109452.12:83_74:1:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082052;142082185,142086143;142086256;142082052;142082185,83;74,2-exons,142082185:142086143
6147,EMSTSYTRTSLSERWIA,ENSG00000109452.12:83_73:6:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082031;142082185,142086143;142086256;142082030;142082185,83;73,2-exons,142082185:142086143
51324,KISFKISNPKKERM,ENSG00000109452.12:83_73:2:142086256:2-exons,2,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086256;142082031;142082185,142086143;142086256;142082030;142082185,83;73,2-exons,142082185:142086143
13513,FKSIARKSPSKYPIQKKKECRNYVAGCNDLPQTEWYSFHLL,ENSG00000109452.12:82_73:1:142086237:2-exons,1,ENSG00000109452.12,chr4,-,ref,1,,1,,142086143;142086237;142082030;142082185,142086143;142086239;142082030;142082185,82;73,2-exons,142082185:142086143


Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated,strand,junction_coordinate
89,LVKVPKTGH,,,,,,,142082170:142082173:142082124:142082148:None:None,False,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082148:142082170
90,RKLVKVPKT,,,,,,,142082170:142082179:142082130:142082148:None:None,False,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082148:142082170
91,CRKLVKVPK,,,,,,,142082170:142082182:142082133:142082148:None:None,False,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082148:142082170
92,ATICRKLVK,,,,,,,142086143:142086149:142082170:142082185:142082...,True,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082185:142086143:142082170:142082148
93,ICRKLVKVP,,,,,,,142082170:142082185:142082136:142082148:None:None,False,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082148:142082170
94,TICRKLVKV,,,,,,,142086143:142086146:142082170:142082185:142082...,True,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082185:142086143:142082170:142082148
95,KLVKVPKTG,,,,,,,142082170:142082176:142082127:142082148:None:None,False,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082148:142082170
96,AATICRKLV,,,,,,,142086143:142086152:142082170:142082185:142082...,True,False,...,0,0,0.0,4.204772,0.0,0.0,0.0,,-,142082185:142086143:142082170:142082148


In [19]:
keep

{117,
 189,
 433,
 1100,
 1316,
 2095,
 2386,
 2459,
 2460,
 3537,
 3932,
 4295,
 5036,
 5062,
 5591,
 7800,
 9200,
 11097,
 11484,
 11699,
 12114,
 12269,
 12536,
 13398,
 13619,
 14546,
 14761,
 15543,
 17036,
 17620,
 18701,
 18834,
 20723,
 20768,
 21398,
 22259,
 22855,
 22877,
 23742,
 24764,
 27836,
 27943,
 28326,
 28552,
 28675,
 28861,
 29045,
 29105,
 29869,
 30259,
 31107,
 33060,
 33234,
 33335,
 35742,
 36984,
 37612,
 37781,
 38433,
 39110,
 39361,
 39857,
 40694,
 40770,
 40830,
 41047,
 41804,
 42339,
 42401,
 44444,
 44476,
 44675,
 45117,
 45337,
 45824,
 45961,
 46433,
 46569,
 47042,
 47392}

In [None]:
keep

### Remove GTEX annotation 

In [None]:
path_annot = glob.glob('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_6920432_ANNOT_conf2_RFall_ref/cohort_mutNone/*/ref_annot_kmer.gz')

print(len(path_annot))

In [None]:
for idx, annot in enumerate(path_annot):
    kmers_filter_pipeline = len(kmer_post_filter)
    annot = pd.read_csv(annot, sep = '\t')
    annot = set(annot['kmer'])
    kmer_post_filter = kmer_post_filter.difference(annot)
    if kmers_filter_pipeline != len(kmer_post_filter):
        print(path_annot[idx], len(kmer_post_filter))