In [None]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt 

Matplotlib created a temporary config/cache directory at /scratch/slurm-job.1088651/matplotlib-74873cmb because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [None]:
def get_junction_coordinates(df, coordinates_col, sep=':'):
    """Add ``strand`` and ``junction_coordinate`` columns parsed from ``coordinates_col``.

    Each value of ``coordinates_col`` is a ``sep``-separated string of integer
    coordinates in which the literal token ``'None'`` is ignored. The strand is
    '+' when the second coordinate is smaller than the third one, '-' otherwise.
    The junction coordinate is a ':'-joined string built from the coordinates:

    * 4 coordinates, strand '+': coordinates 1 and 2
    * 6 coordinates, strand '+': coordinates 1 to 4
    * 4 coordinates, strand '-': coordinates 3 and 0
    * 6 coordinates, strand '-': coordinates 3, 0, 2 and 5

    Any other number of coordinates leaves ``junction_coordinate`` as ``None``.

    Values are collected positionally and assigned once, so rows that share an
    index label no longer overwrite each other.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to annotate; it is modified in place.
    coordinates_col : str
        Name of the column holding the coordinate strings.
    sep : str, default ':'
        Separator used inside the coordinate strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added or overwritten.

    Raises
    ------
    IndexError
        If a row holds fewer than three coordinates.
    """
    strands = []
    junctions = []

    for raw_coordinates in df[coordinates_col]:
        kmer_coordinates = [int(x) for x in raw_coordinates.split(sep) if x != 'None']
        n_coordinates = len(kmer_coordinates)
        junction = None

        if kmer_coordinates[1] < kmer_coordinates[2]:  # ascending order: strand +
            strands.append('+')
            if n_coordinates == 4:  # 2 exons
                junction = kmer_coordinates[1:3]
            elif n_coordinates == 6:  # 3 exons
                junction = kmer_coordinates[1:5]
        else:  # descending order: strand -
            strands.append('-')
            if n_coordinates == 4:  # 2 exons
                junction = [kmer_coordinates[3], kmer_coordinates[0]]
            elif n_coordinates == 6:  # 3 exons
                junction = [kmer_coordinates[3], kmer_coordinates[0],
                            kmer_coordinates[2], kmer_coordinates[5]]

        junctions.append(None if junction is None else ':'.join(str(x) for x in junction))

    # Plain lists are assigned by position, independently of the index labels
    df['strand'] = strands
    df['junction_coordinate'] = junctions
    return df




### Get filtered data

In [None]:
# Inputs
# run_type selects the cohort: 'brca' (TCGA breast) or 'ov' (TCGA ovarian)
run_type = 'ov'

if run_type == 'brca':
    target_samples = ['TCGA-C8-A12P-01A-11R-A115-07.all',
                      'TCGA-AO-A0JM-01A-21R-A056-07.all',
                      'TCGA-BH-A18V-01A-11R-A12D-07.all',
                      'TCGA-A2-A0D2-01A-21R-A034-07.all',
                      'TCGA-A2-A0SX-01A-12R-A084-07.all']
    sample_target = 'TCGA-AO-A0JM-01A-21R-A056-07'
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
elif run_type == 'ov':
    target_samples = ['TCGA-25-1319-01A-01R-1565-13.all',
                      'TCGA-25-1313-01A-01R-1565-13.all',
                      'TCGA-61-2008-01A-02R-1568-13.all',
                      'TCGA-24-1431-01A-01R-1566-13.all',
                      'TCGA-24-2298-01A-01R-1569-13.all']
    sample_target = 'TCGA-25-1319-01A-01R-1565-13'
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
else:
    # Fail here rather than later with a NameError (or with stale kernel state)
    raise ValueError(f"Unsupported run_type {run_type!r}; expected 'brca' or 'ov'")

# Same relative location for both cohorts
intermediate_output = os.path.join(basedir, 'filtering_intermediate/complete_cancer_candidates_order_r.tsv.gz')


# Outputs
filtering_id = 'filters_22March_order_wany_wAnnot'
output_dir = os.path.join(basedir, 'filtering_samples', filtering_id)

In [None]:
# Read the tab-separated generation matrix written by the intermediate filtering step
df_load = pd.read_csv(
    intermediate_output,
    sep='\t',
)

In [None]:
# Read the filtered kmers of the target sample from the filtering output directory
path_interest = os.path.join(
    output_dir,
    f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz',
)
filt = pd.read_csv(path_interest, sep='\t')
print(filt.shape)

In [None]:
# Right-join the generation matrix onto the filtered kmers, keyed on every column of `filt`
filt_meta = pd.merge(df_load, filt, how='right', on=filt.columns.tolist())
print(filt_meta.shape)

In [None]:
# Define problematic kmer
# exp_lim = 600
# filt_meta_pb = filt_meta.loc[filt_meta['cancerCohortfilter >0.0'] > exp_lim]
# filt_meta_pb = filt_meta_pb[filt_meta_pb[sample_target.replace('-', '') + 'all'] > 0 ]
# print(f'filtered kmers-junctions {filt_meta.shape[0]}')
# print(f'filtered kmers unique {len(filt_meta.kmer.unique())}')
# print(f'filtered kmers-junctions problematic {filt_meta_pb.shape[0]}')
# print(f'filtered kmers problematic unique {len(filt_meta_pb.kmer.unique())}')
# The expression filter above is disabled: every filtered kmer is kept.
# NOTE: this is an alias, not a copy. get_junction_coordinates() adds its columns
# in place, so 'strand' and 'junction_coordinate' also appear on filt_meta; the
# label merges on ['kmer', 'junction_coordinate'] further down rely on that.
filt_meta_pb = filt_meta

In [None]:
# Core columns shown in the previews; the per-sample expression column is
# named after the target sample with its dashes removed, followed by 'all'
core_cols = [
    'kmer',
    'gtexCohortfilter >0.0',
    'coord',
    'junctionAnnotated',
    'readFrameAnnotated',
    'isCrossJunction',
    'batch',
    'cancerCohortfilter >0.0',
    f"{sample_target.replace('-', '')}all",
    'isAnnotated',
]

df_load.columns

In [None]:
# Derive strand and junction coordinates from the ':'-separated 'coord' column
filt_meta_pb = get_junction_coordinates(filt_meta_pb, coordinates_col='coord', sep=':')

display(filt_meta_pb.loc[:, [*core_cols, 'junction_coordinate']].head())

### Add the peptide metadata

In [None]:
# Column names assigned to the peptide metadata file when it is loaded
cols_correct = [
    'peptide', 'id', 'readFrame', 'readFrameAnnotated',
    'geneName', 'geneChr', 'geneStrand',
    'mutationMode', 'hasStopCodon', 'isInJunctionList',
    'isIsolated', 'variantComb', 'variantSegExpr',
    'modifiedExonsCoord', 'originalExonsCoord',
    'vertexIdx', 'kmerType',
    'dummy1', 'dummy2', 'dummy3',
]

# Peptide-file columns of interest, mapped to their 'gtex_'-prefixed names
cols_pep_file = {
    name: f'gtex_{name}'
    for name in (
        'peptide', 'id', 'readFrame', 'geneName',
        'geneChr', 'geneStrand', 'mutationMode',
        'hasStopCodon', 'isInJunctionList', 'isIsolated',
        'variantSegExpr', 'modifiedExonsCoord', 'originalExonsCoord',
        'vertexIdx', 'kmerType',
    )
}


In [None]:
# Number of distinct batches among the filtered kmers
filt_meta_pb['batch'].unique().size # Now 121 brca, # 2792 OV

In [None]:
# Preview the first rows of the annotated kmer table
filt_meta_pb.head(n=5)

### Test the false-positive hypothesis

In [None]:
# For every batch of filtered cancer kmers, look the kmer junctions up in the GTEX
# peptide metadata of the same batch and sort the kmers into three buckets:
#   kmers_no_metadata : the GTEX peptide file of the batch could not be read
#   kmers_not_in_gtex : the junction is absent from the GTEX peptide metadata
#   check_hypothesis  : the junction is present in the GTEX peptide metadata
gtex_dir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref'
check_hypothesis = []
check_hypothesis1 = []
kmers_not_in_gtex = []
kmers_no_metadata = []

for batch_i, batch in enumerate(filt_meta_pb['batch'].unique()):
    print(f'\n Iteration {batch_i} batch {batch}')
    # CANCER SIDE GET BATCH Junctions
    batch_kmers_init = filt_meta_pb.loc[filt_meta_pb['batch'] == batch]
    print(f'Size cancer kmers-junctions {len(batch_kmers_init)}')

    # Read GTEX peptide file
    pep_path = os.path.join(gtex_dir, 'cohort_mutNone', f'tmp_out_ref_batch_{batch}', 'ref_sample_peptides_meta.gz')
    not_in_background = False

    print(pep_path)
    try:
        meta_pep = pd.read_csv(pep_path, sep = '\t')
    except Exception as err:
        # Best effort: a missing or unreadable file sends the batch to the
        # "no metadata" bucket. Report the path that failed; meta_pep is unbound
        # (first batch) or stale (later batches) at this point.
        print(f'CHECK COMPLETION OF {pep_path} ({type(err).__name__}: {err})')
        kmers_no_metadata.append(batch_kmers_init[['kmer', 'junction_coordinate']])
        continue
    print(meta_pep.shape)

    meta_pep.columns = cols_correct # ISSUE WITH IMMUNOPEPPER

    # Only bi or tri-exons peptides
    jx_ids = [idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord']) if 'nan' not in coord]
    meta_pep = meta_pep.iloc[jx_ids]

    # Quick Assess Cancer junction presence in GTEX
    # (substring pre-filter only; the exact match is done on junction_coordinate below)
    coord_int = [i.split(':') for i in batch_kmers_init['junction_coordinate'].unique()]
    keep = set()
    for cd in coord_int:
        keep.update([idx for idx, coord in enumerate(meta_pep['modifiedExonsCoord']) \
                     if (cd[0] in coord) and (cd[1] in coord)])
    # sorted() gives a deterministic row order (set iteration order is arbitrary)
    meta_pep = meta_pep.iloc[sorted(keep)]

    if keep: # Some target junctions are in "potentially" found in gtex 
        # Rename
        meta_pep = meta_pep.rename(cols_pep_file, axis = 1)

        # Extract coordinates peptides
        meta_pep = get_junction_coordinates(meta_pep, 'gtex_modifiedExonsCoord', sep=';')

        # Add Peptide info 
        meta_pep = meta_pep[list(cols_pep_file.values()) + ['junction_coordinate']].drop_duplicates()

        exist_jx_not_in_gtex = set(batch_kmers_init['junction_coordinate']).difference(set(meta_pep['junction_coordinate']))
        exist_jx_in_gtex = set(batch_kmers_init['junction_coordinate']).intersection(set(meta_pep['junction_coordinate']))

        if exist_jx_in_gtex: # Some target junctions are in gtex
            batch_kmers = batch_kmers_init.merge(meta_pep, on = 'junction_coordinate', how = 'inner')
            print(f'Size cancer kmers-junctions + all GTEX peptides {len(batch_kmers)}')

            # Calculate the number of aa fitting on each side of the junctions
            batch_kmers['gtex_aa_E1'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['gtex_aa_E2'] = batch_kmers['gtex_modifiedExonsCoord'].str.split(';').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            batch_kmers['cancer_aa_E1'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[0]) - int(x[1]))/3 )
            batch_kmers['cancer_aa_E2'] = batch_kmers['coord'].str.split(':').map(lambda x: abs(int(x[2]) - int(x[3]))/3 )
            # What about 3 exons?

            # Validate hypothesis: 
            # H1: The second exon is not long enough to get the translation through in GTEX. BUT no exon was added on the right
            H_cols = ['gtexE2<cancE2', 'gtexE1<cancE1']
            for new_col in H_cols:
                batch_kmers[new_col] = True
            batch_kmers.loc[batch_kmers['gtex_aa_E2'] > batch_kmers['cancer_aa_E2'], 'gtexE2<cancE2'] = False
            batch_kmers.loc[batch_kmers['gtex_aa_E1'] > batch_kmers['cancer_aa_E1'], 'gtexE1<cancE1'] = False

            res = batch_kmers[['kmer', 'junction_coordinate' ] + H_cols].drop_duplicates()

            display( batch_kmers[['kmer', 'cancerCohortfilter >0.0',\
                                  sample_target.replace('-', '') + 'all',
                                  'readFrameAnnotated', \
                                  'junctionAnnotated', \
                                 ] + H_cols].drop_duplicates() )

            # NOTE(review): every kmer whose junction is found in GTEX is recorded
            # here, whatever the values of the H_cols flags. Confirm this is intended.
            check_hypothesis.append(res[['kmer', 'junction_coordinate']])

        if exist_jx_not_in_gtex: # Some target junctions are NOT in gtex (Immunopepper Metadata)
            not_in_background = True
            # left anti join; .loc needs a list because set indexers are rejected by pandas >= 2.0
            diff = batch_kmers_init.set_index('junction_coordinate').loc[list(exist_jx_not_in_gtex)].reset_index()

    else: # No target junctions are in GTEX (Immunopepper Metadata) at all [CAREFUL - could be in graph]
        not_in_background = True
        diff = batch_kmers_init.copy()

    if not_in_background:
        kmers_not_in_gtex.append(diff[['kmer', 'junction_coordinate']])
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. Recurrence is:')
        print(diff['cancerCohortfilter >0.0'].unique())
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. Junction annotated is:')
        print(diff['junctionAnnotated'].unique())
        print(f'{diff.shape[0]} Kmers - junctions not found in gtex. RF annotated is:')
        print(diff['readFrameAnnotated'].unique())



In [None]:
# Total kmer-junction rows, then the number of distinct (kmer, junction_coordinate) pairs
print(
    filt_meta_pb.shape,
    filt_meta_pb.loc[:, ['kmer', 'junction_coordinate']].drop_duplicates().shape,
    sep='\n',
)

In [19]:
# Stack the per-batch kmers whose GTEX peptide file could not be read.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when every batch had metadata.
df_kmers_no_metadata = (
    pd.concat(kmers_no_metadata, axis = 0)
    if kmers_no_metadata
    else pd.DataFrame(columns = ['kmer', 'junction_coordinate'])
)
print(f'{df_kmers_no_metadata.shape[0]} kmers without metadata')
print(f'{df_kmers_no_metadata.drop_duplicates().shape[0]} UNIQUE kmers without metadata')

506 kmers without metadata
434 UNIQUE kmers without metadata


In [20]:
# Stack the per-batch kmers whose junction is absent from the GTEX peptide metadata.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no batch contributed.
df_kmers_not_in_gtex = (
    pd.concat(kmers_not_in_gtex, axis = 0)
    if kmers_not_in_gtex
    else pd.DataFrame(columns = ['kmer', 'junction_coordinate'])
)
print(f'{df_kmers_not_in_gtex.shape[0]} kmers completely absent in GTEX')
print(f'{df_kmers_not_in_gtex.drop_duplicates().shape[0]} UNIQUE kmers completely absent in GTEX')

16724 kmers completely absent in GTEX
14990 UNIQUE kmers completely absent in GTEX


In [21]:
# Stack the per-batch kmers whose junction was found in the GTEX peptide metadata.
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no batch contributed.
df_check_hypothesis = (
    pd.concat(check_hypothesis, axis = 0)
    if check_hypothesis
    else pd.DataFrame(columns = ['kmer', 'junction_coordinate'])
)
print(f'{df_check_hypothesis.shape[0]} kmers coming from novel exons')
print(f'{df_check_hypothesis.drop_duplicates().shape[0]} UNIQUE kmers coming from novel exons')

126 kmers coming from novel exons
125 UNIQUE kmers coming from novel exons


In [22]:
# Create new labels based on the false positive check above 
# The sets no_metadata, not_GTEX, check_hypothesis are expected to be disjoint.
# The three prints below show the kmer-level overlaps as a sanity check.
print(set(df_kmers_no_metadata['kmer']).intersection( set(df_kmers_not_in_gtex['kmer'])))
print(set(df_kmers_no_metadata['kmer']).intersection( set(df_check_hypothesis['kmer'])))
print(set(df_kmers_not_in_gtex['kmer']).intersection( set(df_check_hypothesis['kmer'])))

key_cols = ['kmer', 'junction_coordinate']
label_cols = ['coordinate_in_GTEX', 'gtexExon2<cancExon2', 'info_not_available']

# Drop duplicates
df_kmers_no_metadata = df_kmers_no_metadata.drop_duplicates()
df_kmers_not_in_gtex = df_kmers_not_in_gtex.drop_duplicates()
df_check_hypothesis = df_check_hypothesis.drop_duplicates()

# The label tables are built with .assign() so the three input frames are left
# untouched and this cell can be re-run safely.
df_no_info = df_kmers_no_metadata.assign(info_not_available = True)

# Augment df with information about GTEX junction presence
# (kmers without metadata are labelled False as well: no info)
df_coord_in_gtex = (
    pd.concat([df_kmers_not_in_gtex, df_kmers_no_metadata], axis = 0)
    .assign(coordinate_in_GTEX = False)
    .drop_duplicates()  # a pair present in both inputs would duplicate rows in the left merge
)

# Augment df with information about Novel exon hypothesis
df_exon_length = pd.concat(
    [df_check_hypothesis.assign(**{'gtexExon2<cancExon2': True}),
     df_kmers_no_metadata.assign(**{'gtexExon2<cancExon2': False})],
    axis = 0,
)

# Remove labels left by a previous run of this cell; otherwise the merges would
# produce suffixed '_x'/'_y' columns and the fillna lines would raise KeyError.
filt_meta = filt_meta.drop(columns = label_cols, errors = 'ignore')

# Left merges keep every row of filt_meta; unmatched rows get the default label.
# astype(bool) makes the result dtype explicit instead of relying on fillna downcasting.
filt_meta = filt_meta.merge(df_coord_in_gtex, on = key_cols, how = 'left')
filt_meta['coordinate_in_GTEX'] = filt_meta['coordinate_in_GTEX'].fillna(True).astype(bool)

filt_meta = filt_meta.merge(df_exon_length, on = key_cols, how = 'left')
filt_meta['gtexExon2<cancExon2'] = filt_meta['gtexExon2<cancExon2'].fillna(False).astype(bool)

filt_meta = filt_meta.merge(df_no_info, on = key_cols, how = 'left')
filt_meta['info_not_available'] = filt_meta['info_not_available'].fillna(False).astype(bool)

display(filt_meta.groupby('gtexExon2<cancExon2').count())
display(filt_meta.groupby('coordinate_in_GTEX').count())

set()
set()
{'GKHEERKYT', 'SQQAASKWT', 'SQQAASKAV'}


Unnamed: 0_level_0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,TCGA25131901A01R156513all,TCGA25131301A01R156513all,TCGA61200801A02R156813all,TCGA24143101A01R156613all,TCGA24229801A01R156913all,isAnnotated,strand,junction_coordinate,coordinate_in_GTEX,info_not_available
gtexExon2<cancExon2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,131835,676,676,676,676,676,676,131835,131835,131835,...,131835,131835,131835,131835,131835,21,131835,131835,131835,131835
True,127,7,7,7,7,7,7,127,127,127,...,127,127,127,127,127,1,127,127,127,127


Unnamed: 0_level_0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,TCGA25131901A01R156513all,TCGA25131301A01R156513all,TCGA61200801A02R156813all,TCGA24143101A01R156613all,TCGA24229801A01R156913all,isAnnotated,strand,junction_coordinate,gtexExon2<cancExon2,info_not_available
coordinate_in_GTEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,17230,14,14,14,14,14,14,17230,17230,17230,...,17230,17230,17230,17230,17230,0,17230,17230,17230,17230
True,114732,669,669,669,669,669,669,114732,114732,114732,...,114732,114732,114732,114732,114732,22,114732,114732,114732,114732


In [23]:
# path_interest = f'G_{sample_target}_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz'
# path_interest_with_advanced_meta = os.path.join(output_dir, path_interest.replace('.tsv.gz', 'metadata2.tsv.gz'))
# print(f'Saving kmers with advanced metadata to {path_interest_with_advanced_meta}')
# filt_meta.to_csv(path_interest_with_advanced_meta, sep = '\t', compression = 'gzip', index = None)

Saving kmers with advanced metadata to /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374/filtering_samples/filters_22March_order_wany_wAnnot/G_TCGA-25-1319-01A-01R-1565-13_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1metadata2.tsv.gz


## Exploratory

In [None]:
# TODO: unfinished exploratory cell. The bare `pd.read` attribute access was removed
# because pandas has no `read` attribute: it raised AttributeError and stopped "Run All".

### Remove GTEX annotation 

In [19]:
# path_annot = glob.glob('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_6920432_ANNOT_conf2_RFall_ref/cohort_mutNone/*/ref_annot_kmer.gz')

# print(len(path_annot))

In [20]:
# for idx, annot in enumerate(path_annot):
#     kmers_filter_pipeline = len(kmer_post_filter)
#     annot = pd.read_csv(annot, sep = '\t')
#     annot = set(annot['kmer'])
#     kmer_post_filter = kmer_post_filter.difference(annot)
#     if kmers_filter_pipeline != len(kmer_post_filter):
#         print(path_annot[idx], len(kmer_post_filter))