In [1]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
from pathlib import Path
import matplotlib.pyplot as plt 

Matplotlib created a temporary config/cache directory at /scratch/slurm-job.725485/matplotlib-wgff5f6y because the default path (/cluster/customapps/biomed/grlab/users/prelotla/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


# Order 
- Uniprot 
- Sample init
- Sample expression 
- Sample cohort 
- annotation 
- GTEX?

In [2]:
def filter_cancer_cohort(df, n_samples, threshold_column ):
    df = df.loc[df[threshold_column] >= n_samples]    
    return df
        

In [3]:
def get_threshold_colname(threshold, tag):
    if (threshold is not None) and (threshold > 0 ):
        threshold_column = f'{tag}filter >={threshold}'
    else:
        threshold_column = f'{tag}filter >{threshold}'
    return threshold_column

In [4]:
def filter_single_col(df, threshold, colname):
    if threshold:
        df = df.loc[df[colname] >= threshold, :]
    else:
        df = df.loc[df[colname] >threshold, :]
    return df

In [5]:
def max_recurrence_over_kmer(df, threshold_column, new_maxcol):
    df = df[['kmer', threshold_column]].groupby('kmer').max()
    df = df.reset_index()
    df = df.rename({threshold_column: new_maxcol}, axis = 1)
    return df 

In [6]:
def output_count(df, report_count, report_step, step_string, perform_count=True):
    '''
    Performs a count operation on the number of kmers present in spark dataframe after a given filtering step
    Note: This operation is expensive but useful if the user is interested in intermediate filtering steps
    :param perform_count: bool whether to perform a count operation
    :param df: dataframe with kmer expression counts
    :param report_count: list to store result of successive counting operations
    :param report_step: list to store name of successive counting operations
    :param step_string: str name of the counting operation
    '''
    if perform_count:
        mycount = df['kmer'].unique().shape[0]
        report_count.append(mycount)
        report_step.append(step_string)
        logging.info(f'# {step_string} n = {mycount} kmers')

In [7]:
intermediate_output = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_intermediate/complete_cancer_candidates_missing_162_45.tsv.gz'

In [9]:
target_sample = 'TCGAC8A12P01A11RA11507all'

Threshold_target = [1.0, 3.0]
Threshold_cancer_cohort = [3.0, 5.0] # choices = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
N_samples_cancer = [10, 520] # choices 1 to 1102 for BRCA and 379 for OV   

Threshold_normal_cohort = [0.0, 10.0]   # choices = [0.0, 1.0, 2.0, 3.0, 5.0, 10.0]
N_samples_normal = [2, 15] #choices 1 to max number of samples in Normal whitelist

tag_cancer = 'cancerCohort'
tag_normal = 'gtexCohort'



In [10]:
df_load = pd.read_csv(intermediate_output, sep = '\t')

# TODO
- Think about the order of the loops -- GTEX MERGE BEFORE???
- Think about the target samples 
- Add the annotation 
- Save the filtered files

In [None]:
for threshold_target in Threshold_target:
    for threshold_cancer_cohort in Threshold_cancer_cohort:
        for n_samples_cancer in N_samples_cancer:
            for threshold_normal_cohort in Threshold_normal_cohort:
                for n_samples_normal in N_samples_normal:
    
                    print(threshold_target)
                    print(threshold_cancer_cohort)
                    print(n_samples_cancer)
                    print(threshold_normal_cohort)
                    print(n_samples_normal)


                    adjusted_threshold_col = 'tmp_cancer_cohort'
                    max_threshold_col = 'tmp_normal_Nmax_sup{}'.format(threshold_normal_cohort)
                    max_threshold_col_base = 'tmp_normal_Nmax_sup{}'.format(0)

                    df_expr = []
                    report_count = [] 
                    report_steps = []

                    df = df_load.copy()
                    # Make correction for number samples passing theshold in cohort. We want to exclude the target sample in counting
                    df[adjusted_threshold_col] = df[get_threshold_colname(threshold_cancer_cohort, tag_cancer)]
                    df.loc[df[target_sample] >= threshold_cancer_cohort, adjusted_threshold_col] -=1 

                    # Number of kmers expressed in sample 
                    df = filter_single_col(df, 0, target_sample)
                    output_count(df, report_count, report_steps, 'Init_Sample')
                    print(f'filtering Init_Sample {df.shape}')

                    # Number of kmers >= threshold in sample 
                    df = filter_single_col(df, threshold_target, target_sample)
                    output_count(df, report_count, report_steps, 'Filter_Sample')
                    print(f'filtering Filter_Sample {df.shape}')

                    # Filter for cancer cancer cohort 
                    df = filter_cancer_cohort(df, n_samples_cancer, adjusted_threshold_col)
                    output_count(df, report_count, report_steps, 'Filter_Sample_Cohort')
                    print(f'filtering Filter_Sample_Cohort {df.shape}')

                    # Expression in gtex cohort >= threshold 
                    recurrence_custom =  max_recurrence_over_kmer(df, 
                                                                  get_threshold_colname(threshold_normal_cohort, tag_normal), 
                                                                  max_threshold_col)

                    # Expression in gtex cohort > 0  
                    recurrence_custom_base = max_recurrence_over_kmer(df, 
                                                                      get_threshold_colname(0.0, tag_normal),
                                                                      max_threshold_col_base) 




                    # Perform Background filtering 
                    df = df.merge(recurrence_custom, on = 'kmer', how = 'left')
                    df = df.merge(recurrence_custom_base, on = 'kmer', how = 'left')

                    df = df.loc[ ~ (df[max_threshold_col] >= 1) & (df[max_threshold_col_base] >= n_samples_normal), :]

                    output_count(df, report_count, report_steps, 'Filter_Sample_Cohort_CohortNormal')
                    print(f'filtering Filter_Sample_Cohort_CohortNormal {df.shape}')


1.0
3.0
10
0.0
2
filtering Init_Sample (7796235, 25)
filtering Filter_Sample (7796235, 25)
filtering Filter_Sample_Cohort (7490602, 25)
filtering Filter_Sample_Cohort_CohortNormal (0, 27)
1.0
3.0
10
0.0
15
filtering Init_Sample (7796235, 25)
filtering Filter_Sample (7796235, 25)
filtering Filter_Sample_Cohort (7490602, 25)
filtering Filter_Sample_Cohort_CohortNormal (0, 27)
1.0
3.0
10
10.0
2
filtering Init_Sample (7796235, 25)
filtering Filter_Sample (7796235, 25)
filtering Filter_Sample_Cohort (7490602, 25)
filtering Filter_Sample_Cohort_CohortNormal (2526, 27)
1.0
3.0
10
10.0
15
filtering Init_Sample (7796235, 25)
filtering Filter_Sample (7796235, 25)
filtering Filter_Sample_Cohort (7490602, 25)
filtering Filter_Sample_Cohort_CohortNormal (2432, 27)
1.0
3.0
520
0.0
2
filtering Init_Sample (7796235, 25)
filtering Filter_Sample (7796235, 25)
filtering Filter_Sample_Cohort (4196105, 25)
filtering Filter_Sample_Cohort_CohortNormal (0, 27)
1.0
3.0
520
0.0
15
filtering Init_Sample (7796235

In [None]:
df.head()