In [14]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
import pathlib
from pathlib import Path
import matplotlib.pyplot as plt 

# Order 
- Uniprot 
- Sample init
- Sample expression 
- Sample cohort 
- annotation 
- GTEX?

In [15]:
def filter_cancer_cohort(df, n_samples, threshold_column):
    '''
    Keeps the rows whose value in `threshold_column` is at least `n_samples`
    :param df: dataframe holding the column `threshold_column`
    :param n_samples: minimal value required in `threshold_column` (inclusive)
    :param threshold_column: str name of the column compared against `n_samples`
    :return: dataframe restricted to the rows passing the comparison
    '''
    enough_samples = df[threshold_column] >= n_samples
    return df.loc[enough_samples]
        

In [16]:
def get_threshold_colname(threshold, tag):
    '''
    Builds the name of the column matching a threshold
    :param threshold: threshold value; a strictly positive value gives a '>=' column name,
    anything else (0, negative values, None) gives a '>' column name
    :param tag: str prefix placed in front of 'filter'
    :return: str column name of the form '<tag>filter >=<threshold>' or '<tag>filter ><threshold>'
    '''
    strictly_positive = (threshold is not None) and (threshold > 0)
    comparator = '>=' if strictly_positive else '>'
    return f'{tag}filter {comparator}{threshold}'

In [17]:
def filter_single_col(df, threshold, colname):
    '''
    Filters the rows of a dataframe on a single column
    :param df: dataframe holding the column `colname`
    :param threshold: a truthy threshold is applied inclusively (>=),
    a falsy one (e.g. 0) is applied strictly (>)
    :param colname: str name of the column to compare with the threshold
    :return: dataframe restricted to the rows passing the comparison
    '''
    values = df[colname]
    keep = (values >= threshold) if threshold else (values > threshold)
    return df.loc[keep, :]

In [18]:
def max_recurrence_over_kmer(df, threshold_column, new_maxcol):
    '''
    Computes, for every kmer, the maximum of `threshold_column` over all rows sharing that kmer
    :param df: dataframe with a 'kmer' column and the column `threshold_column`
    :param threshold_column: str name of the column to maximise per kmer
    :param new_maxcol: str name given to the column holding the per-kmer maximum
    :return: dataframe with one row per kmer and the columns 'kmer' and `new_maxcol`
    '''
    per_kmer_max = (df[['kmer', threshold_column]]
                    .groupby('kmer')
                    .max()
                    .reset_index())
    return per_kmer_max.rename(columns={threshold_column: new_maxcol})

In [19]:
def output_count(df, report_count, report_step, step_string, perform_count=True):
    '''
    Counts the distinct kmers left in the dataframe after a filtering step, records the result
    in the two report lists and prints it
    Note: the count is optional so that it can be switched off when intermediate numbers are not needed
    :param df: dataframe with a 'kmer' column
    :param report_count: list extended in place with the number of distinct kmers
    :param report_step: list extended in place with the name of the step
    :param step_string: str name of the filtering step being counted
    :param perform_count: bool whether to perform the count; nothing happens when False
    '''
    if not perform_count:
        return
    n_unique_kmers = len(df['kmer'].unique())
    report_count.append(n_unique_kmers)
    report_step.append(step_string)
    print(f'# {step_string} n = {n_unique_kmers} kmers')

In [20]:
def save_output_count(output_count, report_count, report_steps, prefix, cancer_sample_ori, mutation_mode,
                      sample_expr_support_cancer, cohort_expr_support_cancer, n_samples_lim_cancer,
                          cohort_expr_support_normal, n_samples_lim_normal, id_normals):
    '''
    Appends one tab-separated line with the filtering parameters and the kmer counts of the
    successive filtering steps to a tabular file. The header line is written only when the
    file does not exist yet. Nothing is done when `output_count` is empty.
    :param output_count: str path for count file of intermediate filtering steps
    :param report_count: list with the results of the successive counting operations
    :param report_steps: list with the names of the successive counting operations
    (must provide a name for every entry of report_count)
    :param prefix: str information to be added to the result line in an 'info' column;
    the column is omitted when empty
    :param cancer_sample_ori: str id of target cancer sample which was filtered
    :param mutation_mode: str information about whether mutations were applied or not
    :param sample_expr_support_cancer: float normalized expression threshold for the cancer target sample
    :param cohort_expr_support_cancer: float normalized expression threshold for the cancer cohort
    excluding the target sample
    :param n_samples_lim_cancer: int number of cancer samples in which the cancer cohort expression threshold
    should be met
    :param cohort_expr_support_normal: float normalized expression threshold for the normal cohort
    required in any sample (>=1)
    :param n_samples_lim_normal: int number of normal samples in which any number of reads is required (>0)
    :param id_normals: str id of the normal cohort (example gtex)
    '''
    if not output_count:
        return

    pipeline = 'peptide-based'
    header_fields = ['sample', 'mutation_mode', 'pipeline', 'min_sample_reads', '#_of_cohort_samples',
                     'reads_per_cohort_sample', '#_normal_samples_allowed', 'normal_cohort_id',
                     'reads_per_normal_sample']
    line_values = [cancer_sample_ori, mutation_mode, pipeline, sample_expr_support_cancer,
                   n_samples_lim_cancer, cohort_expr_support_cancer, n_samples_lim_normal,
                   id_normals, cohort_expr_support_normal]
    line_fields = [f'{value}' for value in line_values]

    # One extra column per counted step, named after the step
    for position, step_count in enumerate(report_count):
        header_fields.append(f'{report_steps[position]}')
        line_fields.append(f'{step_count}')
    if prefix:
        header_fields.append('info')
        line_fields.append(f'{prefix}')

    header = '\t'.join(header_fields) + '\n'
    line = '\t'.join(line_fields) + '\n'

    # The header is only needed when the file is created by this call
    needs_header = not os.path.exists(output_count)
    with open(output_count, 'a') as handle:
        if needs_header:
            handle.write(header)
        handle.write(line)
    logging.info(f'Save intermediate info to {output_count}')

In [21]:
# Julianne sample  mutation_mode   pipeline        min_sample_reads        #_of_cohort_samples     reads_per_cohort_sample #_normal_samples_allowed        normal_cohort_id        reads_per_normal_sample motif_filter    Init_cancer     Filter_Sample   Filter_Sample_Cohort    Filter_Sample_Cohort_CohortBackground   Filter_Motif    Filter_Sample_Cohort_CohortBackground_Uniprot


# MAIN

Note: Foreground joined with GTEX table \ 79 genes 
The list of excluded genes currently contains 215 genes; this should be fine.

In [22]:
# Cohort to process; the inputs cell below handles 'brca' and 'ov'
run_type = 'brca'

In [49]:
# Inputs
# Each supported cohort defines the target samples to filter and the base directory of its run.
if run_type == 'brca':
    target_samples = ['TCGA-C8-A12P-01A-11R-A115-07.all',
                      'TCGA-AO-A0JM-01A-21R-A056-07.all',
                      'TCGA-BH-A18V-01A-11R-A12D-07.all',
                      'TCGA-A2-A0D2-01A-21R-A034-07.all',
                      'TCGA-A2-A0SX-01A-12R-A084-07.all']
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
elif run_type == 'ov':
    target_samples = ['TCGA-25-1319-01A-01R-1565-13.all',
                      'TCGA-25-1313-01A-01R-1565-13.all',
                      'TCGA-61-2008-01A-02R-1568-13.all',
                      'TCGA-24-1431-01A-01R-1566-13.all',
                      'TCGA-24-2298-01A-01R-1569-13.all']
    basedir = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Ovarian_374'
else:
    # Fail here with a clear message instead of a NameError on `basedir` further down
    raise ValueError(f"Unsupported run_type {run_type!r}: expected 'brca' or 'ov'")

# The matrix to filter sits at the same relative location for every cohort
intermediate_output = os.path.join(basedir, 'filtering_intermediate/complete_cancer_candidates_order_r.tsv.gz')


# Outputs
filtering_id = 'chosen_filters_06March_order_wAnnot'

# All per-sample results and summaries of this filtering run are written below this directory
output_dir = os.path.join(basedir, 'filtering_samples', filtering_id)
pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

In [24]:
# Discussion 02/22 Choices
# BACKGROUND cohorts we do (cohort_reads, sample_number)- KEEP pipeline as such
# cohort_reads=[0,1,3]
# sample_number=[1,2,10]
# FOREGROUND  (cohort_reads, sample_number) means
# cohort_reads=[0,2]
# sample_number(rest of cohort) =[1, 5]

In [54]:
# Parameters
# The main loop runs every combination of the five lists below.
# Expression threshold applied to the target sample itself (0.0 means "expressed", i.e. > 0)
Threshold_target = [0.0]
# Cancer cohort: expression threshold and number of cohort samples that must reach it.
# None entries switch the cancer cohort filter off; the main loop skips combinations where
# only one of the two values is None.
Threshold_cancer_cohort = [0.0, 2.0, None] # choices = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
N_samples_cancer = [1, 5, None] # choices 1 to 1102 for BRCA and 374 for OV

# Normal cohort: expression threshold and number of samples used for the background filter
Threshold_normal_cohort = [0.0, 1.0, 3.0]   # choices = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
N_samples_normal = [1, 2, 10] #choices 1 to max number of samples in Normal whitelist

# Column-name prefixes passed to get_threshold_colname for the cancer and normal cohorts
tag_cancer = 'cancerCohort'
tag_normal = 'gtexCohort'


# Prefix of every output file name
tag_prefix = 'G_'
# Reported in the 'mutation_mode' column of the summary file
mutation_mode = 'ref'
# Normal cohort id: part of the output file names and reported in the summary file
save_tag = 'GtexCohort'

# Columns written to the per-combination output files
metadata_save = ['kmer', 'coord', 'junctionAnnotated', 'readFrameAnnotated']

# When True, the main loop additionally keeps only rows whose 'isAnnotated' value is missing
filter_annot = False


In [55]:
# Load matrix to be filtered (tab-separated file at `intermediate_output`)
# NOTE(review): the 'Unnamed: 0' column in the head() output below suggests the file was
# saved with its index; confirm whether it should be read with index_col=0
df_load = pd.read_csv(intermediate_output, sep = '\t')
print(f'Loaded {intermediate_output}')
# Make the batch column cohort-specific, e.g. 'batch_brca'
df_load = df_load.rename({'batch': f'batch_{run_type}'}, axis = 1)
df_load.shape

Loaded /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_intermediate/complete_cancer_candidates_order_r.tsv.gz


(22348747, 24)

In [56]:
df_load.head()

Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated
0,WYITRSGIA,3290.0,1969.0,861.0,394.0,89.0,4.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
1,WYITRSGIA,572.0,545.0,243.0,130.0,27.0,2.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
2,ISSQSRVEK,278.0,261.0,82.0,29.0,7.0,0.0,92379851:92379859:92493866:92493885:None:None,False,False,...,10,2,0,0,0.0,0.0,2.474321,0.0,0.0,
3,RSGDEEKYP,7350.0,4622.0,2653.0,1745.0,1126.0,734.0,92600493:92600508:92611313:92611325:None:None,True,True,...,641,520,348,170,2.922641,2.102386,1.237161,0.0,0.0,1.0
4,HLKMKMFQI,146.0,136.0,42.0,20.0,3.0,1.0,92379850:92379859:92496416:92496434:None:None,False,False,...,16,2,0,0,2.922641,0.0,0.0,0.0,0.0,


In [57]:
df_load.shape

(22348747, 24)

In [58]:
for cancer_sample_ori in target_samples: # TODO update
    # Sample naming: the expression column of the sample has '-' and '.' removed,
    # the output files use the original id without the '.all' suffix
    target_sample = cancer_sample_ori.replace('-', '').replace('.', '')
    cancer_sample_ori = cancer_sample_ori.replace('.all', '')
    print(f'-------- processing {target_sample} -------- \n')

    # Summary file for sample: one line per parameter combination.
    # Note: save_output_count appends, so re-running this cell adds the same lines again.
    summary_file = f'{tag_prefix}filtered_df_{cancer_sample_ori}_samp_chrt_norm_mot.tsv'
    summary_path = os.path.join(output_dir, summary_file)
    print(f'Saving to summary file {summary_path}')

    for threshold_target in Threshold_target:
        for threshold_cancer_cohort in Threshold_cancer_cohort:
            for n_samples_cancer in N_samples_cancer:
                for threshold_normal_cohort in Threshold_normal_cohort:
                    for n_samples_normal in N_samples_normal:
                        # The cancer cohort filter needs both a threshold and a number of samples,
                        # or neither (filter switched off)
                        if (n_samples_cancer is None) ^ (threshold_cancer_cohort is None):
                            continue
                        use_cancer_cohort = (n_samples_cancer is not None) and (threshold_cancer_cohort is not None)

                        # Counts are collected per parameter combination so that each summary line
                        # reports the steps of exactly the combination written in its first columns
                        report_count = []
                        report_steps = []

                        adjusted_threshold_col = 'tmp_cancer_cohort'
                        max_threshold_col = 'tmp_normal_Nmax_sup{}'.format(threshold_normal_cohort)
                        max_threshold_col_base = 'tmp_normal_Nmax_sup{}'.format(0)

                        # Number of kmers expressed in sample.
                        # Filtering before copying keeps the working copy to the expressed rows only
                        # instead of duplicating the full matrix for every combination.
                        df = filter_single_col(df_load, 0, target_sample).copy()
                        output_count(df, report_count, report_steps, 'Init_Sample')

                        # Make correction for number samples passing threshold in cohort. We want to exclude the target sample in counting
                        if use_cancer_cohort:
                            df[adjusted_threshold_col] = df[get_threshold_colname(threshold_cancer_cohort, tag_cancer)]
                            df.loc[df[target_sample] >= threshold_cancer_cohort, adjusted_threshold_col] -= 1

                        # Number of kmers >= threshold in sample
                        df = filter_single_col(df, threshold_target, target_sample)
                        output_count(df, report_count, report_steps, 'Filter_Sample')

                        # Filter for cancer cohort
                        if use_cancer_cohort:
                            df = filter_cancer_cohort(df, n_samples_cancer, adjusted_threshold_col)
                        output_count(df, report_count, report_steps, 'Filter_Sample_Cohort')

                        # Expression in gtex cohort >= threshold (maximum over all rows of a kmer)
                        recurrence_custom = max_recurrence_over_kmer(df,
                                                                     get_threshold_colname(threshold_normal_cohort, tag_normal),
                                                                     max_threshold_col)

                        # Expression in gtex cohort > 0 (maximum over all rows of a kmer)
                        recurrence_custom_base = max_recurrence_over_kmer(df,
                                                                          get_threshold_colname(0.0, tag_normal),
                                                                          max_threshold_col_base)

                        # Perform Background filtering
                        df = df.merge(recurrence_custom, on = 'kmer', how = 'left')
                        df = df.merge(recurrence_custom_base, on = 'kmer', how = 'left')

                        # A kmer is removed when both per-kmer maxima reach their limit.
                        # NOTE(review): presumably the gtex columns hold numbers of normal samples
                        # passing each threshold; confirm that combining the two conditions with
                        # AND (rather than OR) is the intended background definition.
                        df = df.loc[ ~ ((df[max_threshold_col] >= 1) & (df[max_threshold_col_base] >= n_samples_normal)), :]
                        output_count(df, report_count, report_steps, 'Filter_Sample_Cohort_CohortNormal')

                        #Perform Annotated junctions filtering
                        if filter_annot:
                            df = df[df['isAnnotated'].isna()]
                            output_count(df, report_count, report_steps, 'Filter_Sample_Cohort_CohortNormal_pepAnnot')

#                         # DEV: Exclude genes where GTEX is missing
#                         df = df.loc[df['exclude'].isna()]
#                         output_count(df, report_count, report_steps, 'Filter_Sample_Cohort_CohortNormal_pepAnnot_EXPGTEX')

                        # Save outputs
                        # outpaths
                        base_path_final = os.path.join(output_dir,
                                                       (f'{tag_prefix}{cancer_sample_ori}_'
                                                        f'SampleLim{threshold_target}'
                                                        f'CohortLim{threshold_cancer_cohort}'
                                                        f'Across{n_samples_cancer}_'
                                                        f'FiltNormals{save_tag}'
                                                        f'Cohortlim{threshold_normal_cohort}'
                                                        f'Across{n_samples_normal}.tsv.gz'))
                        print(f'Saving outputs to: {base_path_final} \n')
                        df.loc[:, metadata_save].to_csv(base_path_final, compression = 'gzip', index = None, sep = '\t')

                        # Summary line for this combination. Writing it here, and not once after
                        # the loops, keeps the parameter columns and the step counts in agreement.
                        save_output_count(summary_path, report_count, report_steps, '', cancer_sample_ori, mutation_mode,
                                          threshold_target, threshold_cancer_cohort, n_samples_cancer,
                                          threshold_normal_cohort, n_samples_normal, save_tag)

-------- processing TCGAC8A12P01A11RA11507all -------- 

Saving to summary file /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_filtered_df_TCGA-C8-A12P-01A-11R-A115-07_samp_chrt_norm_mot.tsv
# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 594740 kmers
# Filter_Sample_Cohort_CohortNormal n = 758 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-C8-A12P-01A-11R-A115-07_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz 

# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 594740 kmers
# Filter_Sample_Cohort_CohortNormal n = 765 kmers
Saving outputs to: /cluster/work/grlab/pro

# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 592771 kmers
# Filter_Sample_Cohort_CohortNormal n = 620 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-C8-A12P-01A-11R-A115-07_SampleLim0.0CohortLim0.0Across5_FiltNormalsGtexCohortCohortlim3.0Across10.tsv.gz 

# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 594185 kmers
# Filter_Sample_Cohort_CohortNormal n = 687 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-C8-A12P-01A-11R-A115-07_SampleLim0.0CohortLim2.0Across1_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz 

# Init_Sample n = 594980 kmers
# Filter_Sample n = 5949

# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 585082 kmers
# Filter_Sample_Cohort_CohortNormal n = 415 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-C8-A12P-01A-11R-A115-07_SampleLim0.0CohortLim2.0Across5_FiltNormalsGtexCohortCohortlim3.0Across10.tsv.gz 

# Init_Sample n = 594980 kmers
# Filter_Sample n = 594980 kmers
# Filter_Sample_Cohort n = 594980 kmers
# Filter_Sample_Cohort_CohortNormal n = 895 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-C8-A12P-01A-11R-A115-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim0.0Across1.tsv.gz 

# Init_Sample n = 594980 kmers
# Filter_Sample n = 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 475282 kmers
# Filter_Sample_Cohort_CohortNormal n = 607 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexCohortCohortlim3.0Across2.tsv.gz 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 475282 kmers
# Filter_Sample_Cohort_CohortNormal n = 770 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexCohortCohortlim3.0Across10.tsv.gz 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 4761

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 474548 kmers
# Filter_Sample_Cohort_CohortNormal n = 487 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLim2.0Across1_FiltNormalsGtexCohortCohortlim3.0Across2.tsv.gz 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 474548 kmers
# Filter_Sample_Cohort_CohortNormal n = 586 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLim2.0Across1_FiltNormalsGtexCohortCohortlim3.0Across10.tsv.gz 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 4761

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 476112 kmers
# Filter_Sample_Cohort_CohortNormal n = 1189 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim3.0Across2.tsv.gz 

# Init_Sample n = 476112 kmers
# Filter_Sample n = 476112 kmers
# Filter_Sample_Cohort n = 476112 kmers
# Filter_Sample_Cohort_CohortNormal n = 1412 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-AO-A0JM-01A-21R-A056-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim3.0Across10.tsv.gz 

-------- processing TCGABHA18V01A11RA12D07all

# Init_Sample n = 703627 kmers
# Filter_Sample n = 703627 kmers
# Filter_Sample_Cohort n = 699618 kmers
# Filter_Sample_Cohort_CohortNormal n = 664 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-BH-A18V-01A-11R-A12D-07_SampleLim0.0CohortLim0.0Across5_FiltNormalsGtexCohortCohortlim3.0Across1.tsv.gz 

# Init_Sample n = 703627 kmers
# Filter_Sample n = 703627 kmers
# Filter_Sample_Cohort n = 699618 kmers
# Filter_Sample_Cohort_CohortNormal n = 665 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-BH-A18V-01A-11R-A12D-07_SampleLim0.0CohortLim0.0Across5_FiltNormalsGtexCohortCohortlim3.0Across2.tsv.gz 

# Init_Sample n = 703627 kmers
# Filter_Sample n = 70362

# Init_Sample n = 703627 kmers
# Filter_Sample n = 703627 kmers
# Filter_Sample_Cohort n = 684842 kmers
# Filter_Sample_Cohort_CohortNormal n = 356 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-BH-A18V-01A-11R-A12D-07_SampleLim0.0CohortLim2.0Across5_FiltNormalsGtexCohortCohortlim3.0Across1.tsv.gz 

# Init_Sample n = 703627 kmers
# Filter_Sample n = 703627 kmers
# Filter_Sample_Cohort n = 684842 kmers
# Filter_Sample_Cohort_CohortNormal n = 356 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-BH-A18V-01A-11R-A12D-07_SampleLim0.0CohortLim2.0Across5_FiltNormalsGtexCohortCohortlim3.0Across2.tsv.gz 

# Init_Sample n = 703627 kmers
# Filter_Sample n = 70362

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 633052 kmers
# Filter_Sample_Cohort_CohortNormal n = 1141 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexCohortCohortlim1.0Across10.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 633052 kmers
# Filter_Sample_Cohort_CohortNormal n = 934 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLim0.0Across1_FiltNormalsGtexCohortCohortlim3.0Across1.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 632423 kmers
# Filter_Sample_Cohort_CohortNormal n = 1035 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLim2.0Across1_FiltNormalsGtexCohortCohortlim1.0Across10.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 632423 kmers
# Filter_Sample_Cohort_CohortNormal n = 848 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLim2.0Across1_FiltNormalsGtexCohortCohortlim3.0Across1.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 633276 kmers
# Filter_Sample_Cohort_CohortNormal n = 1301 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim1.0Across10.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sample n = 633276 kmers
# Filter_Sample_Cohort n = 633276 kmers
# Filter_Sample_Cohort_CohortNormal n = 1052 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0D2-01A-21R-A034-07_SampleLim0.0CohortLimNoneAcrossNone_FiltNormalsGtexCohortCohortlim3.0Across1.tsv.gz 

# Init_Sample n = 633276 kmers
# Filter_Sampl

# Init_Sample n = 708211 kmers
# Filter_Sample n = 708211 kmers
# Filter_Sample_Cohort n = 704701 kmers
# Filter_Sample_Cohort_CohortNormal n = 481 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0SX-01A-12R-A084-07_SampleLim0.0CohortLim0.0Across5_FiltNormalsGtexCohortCohortlim1.0Across2.tsv.gz 

# Init_Sample n = 708211 kmers
# Filter_Sample n = 708211 kmers
# Filter_Sample_Cohort n = 704701 kmers
# Filter_Sample_Cohort_CohortNormal n = 628 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0SX-01A-12R-A084-07_SampleLim0.0CohortLim0.0Across5_FiltNormalsGtexCohortCohortlim1.0Across10.tsv.gz 

# Init_Sample n = 708211 kmers
# Filter_Sample n = 7082

# Init_Sample n = 708211 kmers
# Filter_Sample n = 708211 kmers
# Filter_Sample_Cohort n = 691861 kmers
# Filter_Sample_Cohort_CohortNormal n = 295 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0SX-01A-12R-A084-07_SampleLim0.0CohortLim2.0Across5_FiltNormalsGtexCohortCohortlim1.0Across2.tsv.gz 

# Init_Sample n = 708211 kmers
# Filter_Sample n = 708211 kmers
# Filter_Sample_Cohort n = 691861 kmers
# Filter_Sample_Cohort_CohortNormal n = 380 kmers
Saving outputs to: /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_samples/chosen_filters_06March_order_wAnnot/G_TCGA-A2-A0SX-01A-12R-A084-07_SampleLim0.0CohortLim2.0Across5_FiltNormalsGtexCohortCohortlim1.0Across10.tsv.gz 

# Init_Sample n = 708211 kmers
# Filter_Sample n = 7082

In [None]:
df_load.shape

In [None]:
kmer = 'KVDTLGKST'

In [None]:
df.loc[df['kmer'] == kmer]

In [None]:
# Scratch: inspect the rows of one kmer in the unfiltered matrix, using the variables left
# over from the last iteration of the main loop.
# NOTE(review): 'batch_brca' is hard-coded although the load cell names the column
# f'batch_{run_type}', and no visible cell adds an 'exclude' column to df_load - verify
# that both exist before running.
df_load.loc[df_load['kmer'] == kmer,\
            ['kmer', 'coord', 'exclude', 'batch_brca', 'junctionAnnotated', 'readFrameAnnotated', target_sample,  get_threshold_colname(threshold_cancer_cohort, tag_cancer), \
            get_threshold_colname(threshold_normal_cohort, tag_normal), get_threshold_colname(0.0, tag_normal)]]

In [None]:
df_load.loc[df_load['kmer'] == kmer, 'batch_brca']

In [None]:
# NOTE(review): `gene_to_batch` is not defined in any visible cell of this notebook, so this
# fails on a fresh kernel; 54877 is a hard-coded batch id (presumably the batch of `kmer` - confirm).
gene_to_batch.loc[(gene_to_batch['batch_brca'] == 54877)]

In [None]:
# Scratch: attach the 'exclude' column of gene_to_batch to the matrix via the batch column.
# NOTE(review): depends on `gene_to_batch`, which no visible cell defines.
foo = df_load.merge(gene_to_batch[['exclude', f'batch_{run_type}']], on = f'batch_{run_type}', how = 'left')

In [None]:
foo.loc[(foo['kmer'] == kmer)]

In [None]:
# Scratch: re-applies the background condition of the main loop to the `df` left over from
# its last iteration (which the loop has already filtered with the same condition) and keeps
# only the kmer and the cancer cohort column.
final  =df.loc[ ~ ((df[max_threshold_col] >= 1) & (df[max_threshold_col_base] >= n_samples_normal)), 
       ['kmer', get_threshold_colname(threshold_cancer_cohort, tag_cancer)]]

In [None]:
# Scratch: `final` now refers to the `df` left over from the last iteration of the main loop
# (this rebinds `final` if a previous cell assigned it).
final = df

In [None]:
# Scratch: kmers expressed (> 0) in more than 403 cancer cohort samples (403 is a hard-coded cut-off).
# NOTE(review): 'tmp_cancer_cohort' is only created when the cancer cohort filter is active; with
# the current parameter lists the last loop iteration uses None/None, so the column is
# likely missing from `final` - verify.
final.loc[final['cancerCohortfilter >0.0']> 403 , ['kmer', 'tmp_cancer_cohort',f'batch_{run_type}'] ] 

In [None]:
final['cancerCohortfilter >0.0'].describe()

In [None]:
final

In [None]:
df_load.loc[df_load['kmer'] == kmer,\
            [target_sample,  get_threshold_colname(threshold_cancer_cohort, tag_cancer), \
            get_threshold_colname(threshold_normal_cohort, tag_normal), get_threshold_colname(0.0, tag_normal)]]