In [2]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt 

Matplotlib created a temporary config/cache directory at /scratch/slurm-job.816371/matplotlib-znoh07a6 because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [78]:
def _junction_from_coordinates(coords):
    """Return ``(strand, junction_coordinate)`` for one parsed coordinate list.

    ``coords`` is the list of integer coordinates of a kmer/peptide with the
    'None' placeholders already removed. The strand is '+' when the second
    coordinate is smaller than the third one, '-' otherwise. A junction string
    is only produced for 4 coordinates (2 exons) or 6 coordinates (3 exons).
    """
    n_coords = len(coords)
    if n_coords < 3:
        # Not enough coordinates to compare coords[1] and coords[2]:
        # there is no junction to describe.
        return None, None

    if coords[1] < coords[2]:  # order strand +
        strand = '+'
        if n_coords == 4:  # 2 exons
            picked = coords[1:3]
        elif n_coords == 6:  # 3 exons
            picked = coords[1:5]
        else:
            picked = None
    else:  # order strand -
        strand = '-'
        if n_coords == 4:  # 2 exons
            picked = [coords[3], coords[0]]
        elif n_coords == 6:  # 3 exons
            # NOTE(review): the first pair is (coords[3], coords[0]) but the
            # second one is (coords[2], coords[5]); kept exactly as before,
            # confirm that this ordering of the second junction is intended.
            picked = [coords[3], coords[0], coords[2], coords[5]]
        else:
            picked = None

    junction = None if picked is None else ':'.join(str(c) for c in picked)
    return strand, junction


def get_junction_coordinates(df, coordinates_col, sep=':'):
    """Add 'strand' and 'junction_coordinate' columns derived from a coordinate column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one coordinate string per row. It is modified in place.
    coordinates_col : str
        Name of the column with the ``sep``-separated coordinates. Entries
        equal to the string 'None' are ignored.
    sep : str
        Separator used inside the coordinate strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two new columns: 'strand' ('+' or '-')
        and 'junction_coordinate' (':'-joined junction positions). Both are
        None for rows that are not strings or have fewer than 3 coordinates;
        'junction_coordinate' is also None when the row does not have exactly
        4 or 6 coordinates.
    """
    strands = []
    junctions = []
    for raw in df[coordinates_col]:
        if isinstance(raw, str):
            coords = [int(tok) for tok in raw.split(sep) if tok != 'None']
        else:
            # Missing value (e.g. NaN): nothing to parse.
            coords = []
        strand, junction = _junction_from_coordinates(coords)
        strands.append(strand)
        junctions.append(junction)

    # Assign positionally in one go: much faster than one df.loc write per row,
    # and still correct when the index contains duplicated labels.
    df['strand'] = strands
    df['junction_coordinate'] = junctions
    return df




### Get filtered data

In [3]:
# Cohort to analyse; selects the inputs below
run_type = 'brca'

# Inputs

if run_type == 'brca':
    target_samples = ['TCGA-C8-A12P-01A-11R-A115-07.all',
                      'TCGA-AO-A0JM-01A-21R-A056-07.all',
                      'TCGA-BH-A18V-01A-11R-A12D-07.all',
                      'TCGA-A2-A0D2-01A-21R-A034-07.all',
                      'TCGA-A2-A0SX-01A-12R-A084-07.all']
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
    intermediate_output = os.path.join(basedir, 'filtering_intermediate/complete_cancer_candidates_.tsv.gz')
else:
    # Fail here with a clear message instead of a NameError on `basedir` below.
    raise ValueError(f'Unsupported run_type: {run_type!r}')

# Sub-directory of `basedir` holding the outputs of the chosen filtering run
filtering_id = 'chosen_filters_06March'
output_dir = os.path.join(basedir, 'filtering_samples', filtering_id)

In [5]:
# Read the tab-separated matrix on which the filtering is applied
df_load = pd.read_csv(
    intermediate_output,
    sep='\t',
)

In [42]:
# Read the filtered kmers of one target sample from the filtering output directory
sample_target = 'TCGA-AO-A0JM-01A-21R-A056-07'
path_interest = os.path.join(
    output_dir,
    f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz',
)
filt = pd.read_csv(path_interest, sep='\t')
print(filt.shape)

(9364, 4)


In [48]:
# Columns kept when inspecting kmers; the sample column is the target sample
# name with the '-' removed and an 'all' suffix
core_cols = [
    'kmer',
    'gtexCohortfilter >0.0',
    'coord',
    'junctionAnnotated',
    'readFrameAnnotated',
    'isCrossJunction',
    'batch',
    'cancerCohortfilter >0.0',
    sample_target.replace('-', '') + 'all',
    'isAnnotated',
]

df_load.columns

Index(['kmer', 'gtexCohortfilter >0.0', 'gtexCohortfilter >=1.0',
       'gtexCohortfilter >=2.0', 'gtexCohortfilter >=3.0',
       'gtexCohortfilter >=5.0', 'gtexCohortfilter >=10.0', 'coord',
       'junctionAnnotated', 'readFrameAnnotated', 'isCrossJunction', 'batch',
       'cancerCohortfilter >0.0', 'cancerCohortfilter >=1.0',
       'cancerCohortfilter >=2.0', 'cancerCohortfilter >=3.0',
       'cancerCohortfilter >=5.0', 'cancerCohortfilter >=10.0',
       'TCGAC8A12P01A11RA11507all', 'TCGAAOA0JM01A21RA05607all',
       'TCGABHA18V01A11RA12D07all', 'TCGAA2A0D201A21RA03407all',
       'TCGAA2A0SX01A12RA08407all', 'isAnnotated'],
      dtype='object')

In [37]:
# Attach the metadata columns of the full matrix to every filtered row
# (right join on all the columns of the filtered table)
filt_meta = df_load.merge(
    filt,
    how='right',
    on=filt.columns.tolist(),
)
print(filt_meta.shape)

(9364, 24)


In [219]:
# Define problematic kmer
# The expression-based definition below is currently disabled:
# exp_lim = 600
# filt_meta_pb = filt_meta.loc[filt_meta['cancerCohortfilter >0.0'] > exp_lim]
# filt_meta_pb = filt_meta_pb[filt_meta_pb[sample_target.replace('-', '') + 'all'] > 0 ]
# print(f'filtered kmers-junctions {filt_meta.shape[0]}')
# print(f'filtered kmers unique {len(filt_meta.kmer.unique())}')
# print(f'filtered kmers-junctions problematic {filt_meta_pb.shape[0]}')
# print(f'filtered kmers problematic unique {len(filt_meta_pb.kmer.unique())}')

# All filtered kmers are kept. Work on a copy: get_junction_coordinates adds
# columns in place, and `filt_meta` should stay as it was displayed above.
filt_meta_pb = filt_meta.copy()

In [220]:
# Add the 'strand' and 'junction_coordinate' columns from the ':'-separated 'coord' column
filt_meta_pb = get_junction_coordinates(
    df=filt_meta_pb,
    coordinates_col='coord',
    sep=':',
)

In [221]:
# Preview the core columns together with the derived junction coordinate
filt_meta_pb[[*core_cols, 'junction_coordinate']].head()

Unnamed: 0,kmer,gtexCohortfilter >0.0,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,TCGAAOA0JM01A21RA05607all,isAnnotated,junction_coordinate
0,KSKTRPISD,,16801319:16801333:16795006:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
1,SKTRPISDS,,16801319:16801330:16795003:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
2,YKSKTRPIS,,16801319:16801336:16795009:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
3,LYKSKTRPI,,16801319:16801339:16795012:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319
4,ALYKSKTRP,,16801319:16801342:16795015:16795019:None:None,False,False,True,50040,2,18.921476,,16795019:16801319


### Add the peptide metadata

In [222]:
# Peptide-file columns of interest, mapped to their 'gtex_'-prefixed name
cols_pep_file = {
    col: 'gtex_' + col
    for col in (
        'peptide',
        'id',
        'readFrame',
        'geneName',
        'geneChr',
        'geneStrand',
        'mutationMode',
        'hasStopCodon',
        'isInJunctionList',
        'isIsolated',
        'variantSegExpr',
        'modifiedExonsCoord',
        'originalExonsCoord',
        'vertexIdx',
        'kmerType',
    )
}


In [223]:
# Batch identifier: used below both to build the GTEX peptide file path and to
# select the cancer kmers whose 'batch' column has this value
batch = 24167

In [224]:
# Load the GTEX peptide metadata of the selected batch (tab-separated)
gtex_dir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref'
pep_path = os.path.join(
    gtex_dir,
    'cohort_mutNone',
    'tmp_out_ref_batch_{}'.format(batch),
    'ref_sample_peptides_meta.gz',
)
print(pep_path)
meta_pep = pd.read_csv(pep_path, sep='\t')
print(meta_pep.shape)

/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/cohort_mutNone/tmp_out_ref_batch_24167/ref_sample_peptides_meta.gz
(15884, 20)


In [225]:
# Only bi or tri-exons peptides: keep the rows whose exon coordinates do not
# contain 'nan'. The values go through astype(str) because a field that is
# entirely missing is read by pandas as a float NaN, on which the `in` test
# would raise a TypeError; as a string it becomes 'nan' and is dropped too.
jx_ids = [idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord'].astype(str))
          if 'nan' not in coord]
meta_pep = meta_pep.iloc[jx_ids]

In [226]:
# Prefix the peptide-file columns of interest with 'gtex_'
meta_pep = meta_pep.rename(columns=cols_pep_file)

In [227]:
# Add 'strand' and 'junction_coordinate' to the peptides from their ';'-separated exon coordinates
meta_pep = get_junction_coordinates(
    df=meta_pep,
    coordinates_col='gtex_modifiedExonsCoord',
    sep=';',
)

In [228]:
# Cancer side: keep only the kmers-junctions belonging to the selected batch
batch_kmers = filt_meta_pb[filt_meta_pb['batch'] == batch]
print(f'Size cancer kmers-junctions {len(batch_kmers)}')

Size cancer kmers-junctions 2


In [229]:
# Keep the renamed peptide columns plus the junction key, drop duplicated rows,
# then attach every GTEX peptide sharing a junction with a cancer kmer
meta_pep = meta_pep[[*cols_pep_file.values(), 'junction_coordinate']].drop_duplicates()
batch_kmers = batch_kmers.merge(
    meta_pep,
    how='left',
    on='junction_coordinate',
)
print(f'Size cancer kmers-junctions + all GTEX peptides {len(batch_kmers)}')

Size cancer kmers-junctions + all GTEX peptides 20


In [230]:
def _exon_aa_length(coord_string, sep, exon_idx):
    """Return the length of one exon segment divided by 3 (number of amino acids).

    ``coord_string`` holds ``sep``-separated coordinates; the exon at position
    ``exon_idx`` (0-based) uses the fields ``2 * exon_idx`` and ``2 * exon_idx + 1``.
    Non-string input (e.g. the NaN left by an unmatched left merge) gives NaN.
    """
    if not isinstance(coord_string, str):
        return float('nan')
    fields = coord_string.split(sep)
    return abs(int(fields[2 * exon_idx]) - int(fields[2 * exon_idx + 1])) / 3


# Calculate the number of aa fitting on each side of the junctions
batch_kmers['gtex_aa_E1'] = batch_kmers['gtex_modifiedExonsCoord'].map(lambda c: _exon_aa_length(c, ';', 0))
batch_kmers['gtex_aa_E2'] = batch_kmers['gtex_modifiedExonsCoord'].map(lambda c: _exon_aa_length(c, ';', 1))
batch_kmers['cancer_aa_E1'] = batch_kmers['coord'].map(lambda c: _exon_aa_length(c, ':', 0))
batch_kmers['cancer_aa_E2'] = batch_kmers['coord'].map(lambda c: _exon_aa_length(c, ':', 1))
# TODO: only the first two exons are measured. What about 3 exons?

In [233]:
# Validate hypothesis:
# H1: The second exon is not long enough to get the translation through in GTEX. BUT no exon was added on the right
new_col = 'gtexE2<cancE2'
# Flag is True by default and set to False where the GTEX second exon holds
# more amino acids than the cancer one.
batch_kmers[new_col] = True
# Use the variable `new_col`, not the literal string 'new_col': the literal
# created an unrelated column and left the flag True on every row.
batch_kmers.loc[batch_kmers['gtex_aa_E2'] > batch_kmers['cancer_aa_E2'], new_col] = False

In [235]:
# One row per distinct (kmer, hypothesis flag) pair
res = batch_kmers.loc[:, ['kmer', new_col]].drop_duplicates()
display(res)

In [210]:
# The peptide columns were renamed with the 'gtex_' prefix above, so the
# un-prefixed `meta_pep.kmerType` no longer exists when cells run in order.
meta_pep['gtex_kmerType'].head()


0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: kmerType, dtype: float64

In [None]:
#### TODO: why is the peptide data not loaded? (kmerType is NaN on every displayed row)

In [216]:
# Print the first rows of every column, one column at a time, each followed by a tab line
for col in meta_pep.columns:
    print(meta_pep[[col]].head(), '\t', sep='\n')

                                             peptide
0                                           SPEGHQAV
1                                       ALLSLTCSLSAI
2                                              RSSGS
3  CPPLPVIPPFTQREVGCETAHIGKSRKPGTGICPSQFYSPDPLPWT...
4     VWGKAGDLPTEEGRLVVGGVWGPTEWGREGDSLEGVMGRPQRAQEA
	
                                              id
0  ENSG00000131771.14:192_240:5:39633883:2-exons
1  ENSG00000131771.14:202_236:0:39633891:2-exons
2  ENSG00000131771.14:230_nan:1:39634032:2-exons
3  ENSG00000131771.14:270_296:3:39635817:2-exons
4  ENSG00000131771.14:154_199:0:39632138:2-exons
	
   readFrame
0          1
1          2
2          1
3          2
4          0
	
   readFrameAnnotated
0               False
1               False
2               False
3               False
4               False
	
             geneName
0  ENSG00000131771.14
1  ENSG00000131771.14
2  ENSG00000131771.14
3  ENSG00000131771.14
4  ENSG00000131771.14
	
  geneChr
0   chr17
1   chr17
2   ch

In [200]:
# Inspect the GTEX kmer type attached to every merged row
batch_kmers['gtex_kmerType']

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
Name: gtex_kmerType, dtype: float64

### Remove the kmers found in the GTEX annotation

In [14]:
# List the annotation kmer file of every sub-directory of the GTEX annotation run
path_annot = glob.glob(
    '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_6920432_ANNOT_conf2_RFall_ref/cohort_mutNone/*/ref_annot_kmer.gz'
)

print(len(path_annot))

In [28]:
# Remove from `kmer_post_filter` every kmer present in a GTEX annotation file,
# reporting each file that actually removed something.
# NOTE(review): `kmer_post_filter` is not defined in the visible cells; it is
# presumably a set of kmers built in an earlier (deleted?) cell. Define it
# before this cell so that the notebook survives Restart & Run All.
for annot_path in path_annot:
    kmers_filter_pipeline = len(kmer_post_filter)
    # Only the 'kmer' column is needed: do not parse the other columns.
    annot_kmers = set(pd.read_csv(annot_path, sep='\t', usecols=['kmer'])['kmer'])
    kmer_post_filter = kmer_post_filter.difference(annot_kmers)
    if kmers_filter_pipeline != len(kmer_post_filter):
        print(annot_path, len(kmer_post_filter))

KeyboardInterrupt: 