In [1]:
import numpy as np
import os
import timeit
import glob 
import pandas as pd
import time
import multiprocessing as mp 
import logging
import sys 
from pathlib import Path

In [2]:
sys.path.append('/cluster/home/prelotla/github/projects2020_ohsu/eth/peptides_filtering/python_pipeline/')
from helpers_filter import * 
from background_filter import process_on_cohort

%load_ext autoreload
%autoreload 2

In [3]:
cohort_expr_support_cancer = 2
n_samples_lim_cancer = 5
sample_expr_support_cancer = 1 

cohort_expr_support_normal = 2 #TODO introduce None filtering
n_samples_lim_cancer = 5


In [4]:
base_cancer = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102'
base_gtex = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref'
target_samples = ['TCGAC8A12P01A11RA11507all',
        'TCGAAOA0JM01A21RA05607all',
        'TCGABHA18V01A11RA12D07all',
        'TCGAA2A0D201A21RA03407all',
        'TCGAA2A0SX01A12RA08407all' ]
                 
expr_matrix = 'ref_graph_kmer_JuncExpr'
metadata = ['kmer', 'coord', 'junctionAnnotated', 'readFrameAnnotated', 'isCrossJunction'] # These are the non-sample columns
normalizer_libsize = 400000
path_libsize = os.path.join(base_cancer, 'expression_counts.libsize.tsv')
sample_lim = 1
interm_cancer_cohort = 'ref_graph_kmer_normalized_filtered__.gz'
interm_gtex_cohort = 'ref_graph_kmer_normalized_filtered_10-21overlap_.gz'

In [5]:
# Cancer cohort files - 3 min 
start_time  = timeit.default_timer()
cohort_cancer = glob.glob(os.path.join(base_cancer, 'cohort_mutNone/tmp_out_ref_batch_*', interm_cancer_cohort))
time_res = timeit.default_timer() - start_time 
print(time_res)

0.5131635121069849


In [6]:
# Cancer all raw files 
start_time  = timeit.default_timer()
path_cohort = glob.glob(os.path.join(base_cancer, 'cohort_mutNone/tmp_out_ref_batch_*'))
time_res = timeit.default_timer() - start_time 
print(time_res)

0.1211572210304439


In [7]:
# GTEX cohort files 4.5 min
start_time  = timeit.default_timer()
cohort_gtex = glob.glob(os.path.join(base_gtex,'cohort_mutNone/tmp_out_ref_batch_*', interm_gtex_cohort)) #path_gtex_cohort
time_res = timeit.default_timer() - start_time 
print(time_res)

0.9797909460030496


In [8]:
# Annot
start_time  = timeit.default_timer()
annot_cancer = glob.glob(os.path.join(base_cancer, 'cohort_mutNone/tmp_out_ref_batch_*/ref_annot_kmer.gz'))
time_res = timeit.default_timer() - start_time 
print(time_res)

0.4782306698616594


In [9]:
# Out directory
outdir = os.path.join(base_cancer, 'filtered_cancer') 
Path(outdir).mkdir(parents=True, exist_ok=True)
print(f'Creating directory {outdir}')

Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer


In [10]:
# Sample output dir
for target_sample in target_samples:
    outdir_sample = os.path.join(outdir, target_sample)
    Path(outdir).mkdir(parents=True, exist_ok=True)
    print(f'Creating directory {outdir_sample}')

Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer/TCGAC8A12P01A11RA11507all
Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer/TCGAAOA0JM01A21RA05607all
Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer/TCGABHA18V01A11RA12D07all
Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer/TCGAA2A0D201A21RA03407all
Creating directory /cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtered_cancer/TCGAA2A0SX01A12RA08407all


# Strategy 
- DO the cohort filtering once for cancer (AND HIDE SAMPLES.... Create 4 files)
    - OTHER STRATEGY: Check the cohort filter in the target cancer sample AND adjust the number of samples passing the threshold depending on the targetsample!!!!
    - ISSUE: THERE IS BOTH JUNCTION AND SEGMENT KMERS IN cancer cohort files ===> NOT NEEDED --- 
- Do the cohort filtering once for GTEX
- Load annot + 2 cohort + sample cancer (5) => Mega table - Save TODAY MONDAY !!!!!
- Then do the code for the filtering !!!!

# Create large intermediate table

## Load

In [11]:
df_expr = []
report_count = [] 
report_steps = []

In [12]:
# Load libsize - OK
libsize = process_libsize(path_libsize, normalizer_libsize) 

In [13]:
# Load cancer sample - 20 min #TODO Add batch of origin?

start_time  = timeit.default_timer()
for idx, batch_gene in enumerate(path_cohort):
    if os.path.exists(os.path.join(batch_gene , expr_matrix)) and \
       os.path.exists(os.path.join(batch_gene , 'output_sample_IS_SUCCESS')):
        
        partitions = glob.glob(os.path.join(batch_gene , expr_matrix) + '/*')
        #print(os.path.join(batch_gene , expr_matrix))
        #print(f'Number of partitions is {len(partitions)}')
        for part in partitions:
            kmers_sample = pd.read_csv(part, sep = '\t', usecols = target_samples + metadata)
            kmers_sample['batch'] = os.path.basename(batch_gene).split('_')[-1]
            df_expr.append(kmers_sample)
        #print(f'processed gene {idx}')


cancer_targets = pd.concat(df_expr, axis = 0)

time_res = timeit.default_timer() - start_time 
print(time_res)

267.6224669930525


In [14]:
metadata = metadata + ['batch']

In [15]:
# Normalize sample of interest 
sample_cols = target_samples 
cancer_targets = normalization(cancer_targets, sample_cols, libsize, metadata )

In [16]:
# Set of target kmers
target_kmers = set(cancer_targets['kmer'])

In [17]:
len(target_kmers)

4177614

In [18]:
cancer_targets.shape

(4713769, 11)

In [19]:
cancer_targets.head()

Unnamed: 0,kmer,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all
0,FKHLKRWSL,92379844:92379859:92387782:92387794:None:None,False,False,True,56543,0.0,0.0,0.0,0.0,0.0
1,WYITRSGIA,92347505:92347506:92349915:92349941:None:None,False,False,True,56543,0.0,0.0,0.0,4.102634,0.0
2,SEFVSRHLR,92345770:92345787:92586717:92586727:None:None,False,False,True,56543,0.0,0.0,0.0,0.0,0.0
3,RWSFALVVQ,92379857:92379859:92555247:92555272:None:None,False,False,True,56543,0.0,0.0,0.0,0.0,0.0
4,VSRREKRSV,92345779:92345787:92379767:92379786:None:None,False,False,True,56543,0.0,0.0,0.0,0.0,0.0


In [20]:
# restrict expressed in target samples - hardcoded number of targets

cancer_targets = cancer_targets.loc[(cancer_targets[target_samples[0]] > 0) | \
                  (cancer_targets[target_samples[1]] > 0) | \
                  (cancer_targets[target_samples[2]] > 0) |\
                  (cancer_targets[target_samples[3]] > 0) | \
                  (cancer_targets[target_samples[4]] > 0) ]

In [21]:
cancer_targets.shape

(1253921, 11)

In [22]:
cancer_targets = cancer_targets.drop_duplicates() # drop duplicates

In [23]:
cancer_targets.shape

(1253544, 11)

In [24]:
# Load annotation - OK 12 min 
# Create a set of annotated kmers
print('Processing annotation')
start_time  = timeit.default_timer()
cancer_targets_annotated = set()
for idx, path in enumerate(annot_cancer):
    annot = pd.read_csv(path, sep = ',') 
    annot = set(annot['kmer'])
    cancer_targets_annotated.update(target_kmers.intersection(annot))
time_res = timeit.default_timer() - start_time 
print(time_res)

Processing annotation
30.96976092015393


In [25]:
len(cancer_targets_annotated)

281109

In [26]:
# Load cancer cohort - 11 minutes
start_time  = timeit.default_timer()
df_cancer_cohort = []
print(f'Reading cohort cancer')
for path in cohort_cancer:
    tmp_cancer = pd.read_csv(path, sep = ',') 
    tmp_cancer = tmp_cancer.loc[tmp_cancer['isCrossJunction'] == True]
    tmp_cancer['batch'] = path.split('/')[-2].split('_')[-1]
    df_cancer_cohort.append(tmp_cancer)
time_res = timeit.default_timer() - start_time 
print(time_res)


Reading cohort cancer
59.40803129598498


In [27]:
len(df_cancer_cohort)

17314

In [28]:
# Load GTEX cohort - OK 1h50
start_time  = timeit.default_timer()
df_gtex_cohort = []
print(f'Reading cohort gtex')
for idx, path in enumerate(cohort_gtex):
    tmp_gtex = pd.read_csv(path, sep = ',')
    tmp_kmer = set(tmp_gtex['kmer'])
    tmp_kmer = tmp_kmer.intersection(target_kmers)
    tmp_gtex = tmp_gtex.set_index('kmer').loc[tmp_kmer].reset_index()
    df_gtex_cohort.append(tmp_gtex)
time_res = timeit.default_timer() - start_time 
print(time_res)

Reading cohort gtex
6544.86534468201


In [29]:
len(df_gtex_cohort)

19186

## Merge

In [30]:
cancer_targets.shape

(1253544, 11)

In [31]:
len(cancer_targets_annotated)

281109

In [32]:
len(df_gtex_cohort)

19186

In [33]:
len(df_cancer_cohort)

17314

In [34]:
df_gtex_cohort = pd.concat(df_gtex_cohort, axis = 0)

df_cancer_cohort = pd.concat(df_cancer_cohort, axis = 0)

df_gtex_cohort = df_gtex_cohort.drop_duplicates()

df_cancer_cohort = df_cancer_cohort.drop_duplicates()

In [35]:
len(df_gtex_cohort)

19676087

In [36]:
len(df_cancer_cohort)

4708347

In [37]:
# cohorts
df_gtex_cohort = df_gtex_cohort.rename({col: 'gtexCohort' + col for col in df_gtex_cohort.columns if 'filter' in col}, axis = 1)

df_cancer_cohort = df_cancer_cohort.rename({col: 'cancerCohort' + col for col in df_cancer_cohort.columns if 'filter' in col}, axis = 1)

In [38]:
# annotation
cancer_targets_annotated = pd.DataFrame(cancer_targets_annotated, columns = ['kmer'])

cancer_targets_annotated['isAnnotated'] = 1
cancer_targets_annotated = cancer_targets_annotated.drop_duplicates()

In [39]:
# Merge annotation 
cancer_targets = cancer_targets.merge(cancer_targets_annotated, on = 'kmer', how = 'left')

In [40]:
cancer_targets.shape

(1253544, 12)

In [41]:
# Merge cancer on coord and kmer col
cancer_targets = df_cancer_cohort.loc[:, metadata + \
                     [col for col in df_cancer_cohort.columns \
                      if 'filter' in col]].merge(cancer_targets , on = metadata, how = 'right')

In [42]:
cancer_targets.shape

(1253544, 18)

In [43]:
cancer_targets.head()

Unnamed: 0,kmer,coord,junctionAnnotated,readFrameAnnotated,isCrossJunction,batch,cancerCohortfilter >0.0,cancerCohortfilter >=1.0,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated
0,WYITRSGIA,92347505:92347506:92349915:92349941:None:None,False,False,True,56543,287,263,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
1,ISSQSRVEK,92379851:92379859:92493866:92493885:None:None,False,False,True,56543,49,42,10,2,0,0,0.0,0.0,2.474321,0.0,0.0,
2,RSGDEEKYP,92600493:92600508:92611313:92611325:None:None,True,True,True,56543,846,820,641,520,348,170,2.922641,2.102386,1.237161,0.0,0.0,1.0
3,HLKMKMFQI,92379850:92379859:92496416:92496434:None:None,False,False,True,56543,62,53,16,2,0,0,2.922641,0.0,0.0,0.0,0.0,
4,ELSEFVSRL,92345764:92345787:92347411:92347415:None:None,False,False,True,56543,304,296,246,185,99,31,0.0,0.0,0.0,2.735089,2.280385,


In [44]:
#TODO!!!!!! POTENTIAL BLOW UP !!!!!

In [45]:
# Merge normals on kmer col
cancer_targets = df_gtex_cohort.loc[:, ['kmer'] + \
                     [col for col in df_gtex_cohort.columns \
                      if 'filter' in col]].merge(cancer_targets , on = ['kmer'] , how = 'right')

In [46]:
cancer_targets.head()

Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated
0,WYITRSGIA,3290.0,1969.0,861.0,394.0,89.0,4.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
1,WYITRSGIA,572.0,545.0,243.0,130.0,27.0,2.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
2,ISSQSRVEK,278.0,261.0,82.0,29.0,7.0,0.0,92379851:92379859:92493866:92493885:None:None,False,False,...,10,2,0,0,0.0,0.0,2.474321,0.0,0.0,
3,RSGDEEKYP,7350.0,4622.0,2653.0,1745.0,1126.0,734.0,92600493:92600508:92611313:92611325:None:None,True,True,...,641,520,348,170,2.922641,2.102386,1.237161,0.0,0.0,1.0
4,HLKMKMFQI,146.0,136.0,42.0,20.0,3.0,1.0,92379850:92379859:92496416:92496434:None:None,False,False,...,16,2,0,0,2.922641,0.0,0.0,0.0,0.0,


In [47]:
cancer_targets.shape

(14469694, 24)

In [50]:
cancer_targets.head()

Unnamed: 0,kmer,gtexCohortfilter >0.0,gtexCohortfilter >=1.0,gtexCohortfilter >=2.0,gtexCohortfilter >=3.0,gtexCohortfilter >=5.0,gtexCohortfilter >=10.0,coord,junctionAnnotated,readFrameAnnotated,...,cancerCohortfilter >=2.0,cancerCohortfilter >=3.0,cancerCohortfilter >=5.0,cancerCohortfilter >=10.0,TCGAC8A12P01A11RA11507all,TCGAAOA0JM01A21RA05607all,TCGABHA18V01A11RA12D07all,TCGAA2A0D201A21RA03407all,TCGAA2A0SX01A12RA08407all,isAnnotated
0,WYITRSGIA,3290.0,1969.0,861.0,394.0,89.0,4.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
1,WYITRSGIA,572.0,545.0,243.0,130.0,27.0,2.0,92347505:92347506:92349915:92349941:None:None,False,False,...,138,82,26,5,0.0,0.0,0.0,4.102634,0.0,
2,ISSQSRVEK,278.0,261.0,82.0,29.0,7.0,0.0,92379851:92379859:92493866:92493885:None:None,False,False,...,10,2,0,0,0.0,0.0,2.474321,0.0,0.0,
3,RSGDEEKYP,7350.0,4622.0,2653.0,1745.0,1126.0,734.0,92600493:92600508:92611313:92611325:None:None,True,True,...,641,520,348,170,2.922641,2.102386,1.237161,0.0,0.0,1.0
4,HLKMKMFQI,146.0,136.0,42.0,20.0,3.0,1.0,92379850:92379859:92496416:92496434:None:None,False,False,...,16,2,0,0,2.922641,0.0,0.0,0.0,0.0,


In [51]:
intermediate_output = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_intermediate/complete_cancer_candidates_missing_162_45.tsv.gz'

In [52]:
cancer_targets.to_csv(intermediate_output, compression='gzip', sep = '\t', index=None)

# Perform the filtering

In [None]:
cancer_targets2.shape

In [None]:
cancer_targets3 = cancer_targets_back[['kmer', 'coord', 'batch']].drop_duplicates().merge(cancer_targets2, how = 'right', on = ['coord','kmer'])

In [None]:
cancer_targets3.shape

In [None]:
cancer_targets3.loc[cancer_targets3['isAnnotated'] != 1, :].head()

In [None]:
filter_threshold = 'gtexCohortfilter >0.0'

In [None]:
max_threshold_gtex = cancer_targets3[['kmer',filter_threshold]].groupby('kmer').max().reset_index()\
.rename({filter_threshold: 'max_' + filter_threshold}, axis = 1)

In [None]:
not_annot_not_gtex = max_threshold_gtex.merge(cancer_targets3, how = 'right', on = 'kmer')

In [None]:
not_annot_not_gtex.shape

In [None]:
not_annot_not_gtex = not_annot_not_gtex.loc[(not_annot_not_gtex['max_' + filter_threshold] == 0) \
                    & (not_annot_not_gtex['isAnnotated'] != 1), :]

In [None]:
not_annot_not_gtex.shape


In [None]:
not_annot_not_gtex

In [None]:
not_annot_not_gtex.head()

In [None]:
import matplotlib.pyplot as plt 

In [None]:
not_annot_not_gtex.columns

In [None]:
not_annot_not_gtex['junctionAnnotated'].unique()

In [None]:
plt.hist(not_annot_not_gtex['cancerCohortfilter >0.0'], bins = 100)

In [None]:
not_completed = '/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/GTEX2019_eth/GTEX2019_c4dd02c_conf2_RFall_ref/missing_20230214.txt'

In [None]:
not_completed = list(pd.read_csv(not_completed, header = None)[0])

In [None]:
not_completed

In [None]:
df_not = []
start_time  = timeit.default_timer()
for idx, batch_gene in enumerate(path_cohort):
    if os.path.basename(batch_gene) in not_completed:
        if os.path.exists(os.path.join(batch_gene , expr_matrix)) and \
           os.path.exists(os.path.join(batch_gene , 'output_sample_IS_SUCCESS')):

            partitions = glob.glob(os.path.join(batch_gene , expr_matrix) + '/*')
            #print(os.path.join(batch_gene , expr_matrix))
            #print(f'Number of partitions is {len(partitions)}')
            for part in partitions:
                kmers_sample = pd.read_csv(part, sep = '\t', usecols = target_samples + metadata)
                df_not.append(kmers_sample)
            #print(f'processed gene {idx}')


cancer_not = pd.concat(df_not, axis = 0)

time_res = timeit.default_timer() - start_time 
print(time_res)

In [None]:
cancer_not.shape

In [None]:
cancer_not['gtex_not_processed'] = 1


In [None]:
cancer_not = cancer_not.drop_duplicates()

In [None]:
cancer_not[['gtex_not_processed', 'kmer']].head()

In [None]:
not_annot_not_gtex = not_annot_not_gtex.merge(cancer_not[['gtex_not_processed', 'kmer']], on='kmer', how='left')

In [None]:
plt.hist(not_annot_not_gtex.loc[not_annot_not_gtex['junctionAnnotated'] == False,\
                                'cancerCohortfilter >0.0'], bins = 100)

In [None]:
df_plot = not_annot_not_gtex.loc[not_annot_not_gtex['junctionAnnotated'] == False,\
                                :]

plt.scatter(df_plot['cancerCohortfilter >0.0'], df_plot['TCGAC8A12P01A11RA11507all'])

In [None]:
df_plot.head()

In [None]:
df_plot = not_annot_not_gtex.loc[not_annot_not_gtex['junctionAnnotated'] == True,\
                                :]


plt.scatter(df_plot['cancerCohortfilter >0.0'], df_plot['TCGAC8A12P01A11RA11507all'])

In [None]:
df_plot = not_annot_not_gtex[not_annot_not_gtex['cancerCohortfilter >0.0'] > 800]


In [None]:
len(df_plot['batch'].unique())


In [None]:
df_plot.to_csv('/cluster/work/grlab/projects/projects2020_OHSU/peptides_generation/CANCER_eth/commit_c4dd02c_conf2_Frame_cap0_runs/TCGA_Breast_1102/filtering_intermediate/complete_cancer_candidates_false_positives.tsv.gz', 
              compression = 'gzip', index = None)

In [None]:
df_plot.head()

# Checks

In [None]:
kmer_test = 'GRRWSRPRW'

In [None]:
df_gtex_cohort.loc[df_gtex_cohort['kmer'] == kmer_test]

In [None]:
cancer_targets3.loc[cancer_targets3['kmer'] == kmer_test]

In [None]:
# Look for the 

In [None]:
cancer_targets = filter_single_col(cancer_targets, 0, target_sample)
output_count(cancer_targets, report_count, report_steps, 'Init_Sample')
cancer_targets = filter_single_col(cancer_targets, 10, target_sample)
output_count(cancer_targets, report_count, report_steps, 'Filter_Sample')

In [None]:
def filter_normal_cohort(df, n_samples_lim_normal, cohort_expr_support_normal, tag = '' ):
    base_reads = 0 
    base_filter_column = f'{tag}filter > {base_reads}'
    base_sample = 1
    if (cohort_expr_support_normal is not None) or (cohort_expr_support_normal > 0 ):
        filter_column = f'{tag}filter >= {cohort_expr_support_normal}'
    else:
        filter_column = base_filter_column
    
      
    res_column = f'{tag}filter >= {cohort_expr_support_normal} in any sample and any read in {n_samples_lim_normal} '
    df[res_column] = df.loc[(df[filter_column] >= base_sample) \
                            and (df[base_filter_column] >= n_samples_lim_normal)]

        
    return df
        

In [None]:
def filter_single_col(df, threshold, col):
    print(df.shape)
    if threshold:
        filter_col = f'filter >={threshold}'
        df = df.loc[df[col] >= threshold, :]
    else:
        filter_col = f'filter >{threshold}'
        df = df.loc[df[col] >threshold, :]
    print(df.shape)
    return df

In [None]:
def output_count(df, report_count, report_step, step_string, perform_count=True):
    '''
    Performs a count operation on the number of kmers present in spark dataframe after a given filtering step
    Note: This operation is expensive but useful if the user is interested in intermediate filtering steps
    :param perform_count: bool whether to perform a count operation
    :param df: dataframe with kmer expression counts
    :param report_count: list to store result of successive counting operations
    :param report_step: list to store name of successive counting operations
    :param step_string: str name of the counting operation
    '''
    if perform_count:
        mycount = df['kmer'].unique().shape[0]
        report_count.append(mycount)
        report_step.append(step_string)
        logging.info(f'# {step_string} n = {mycount} kmers')

In [None]:
def save_output_count(output_count, report_count, report_steps, prefix, cancer_sample_ori, mutation_mode,
                      sample_expr_support_cancer, cohort_expr_support_cancer, n_samples_lim_cancer,
                          cohort_expr_support_normal, n_samples_lim_normal, id_normals):
    '''
    Saves the number of kmers present in spark dataframe after each filtering step in a tabular file
    :param output_count: str path for count file of intermediate filtering steps
    :param report_count: list to store result of successive counting operations
    :param report_step: list to store name of successive counting operations
    :param prefix: str information to be added to the result line in an info column
    :param cancer_sample_ori: str id of target cancer sample which was filtered
    :param mutation_mode: str information about whether mutations where applied or not
    :param sample_expr_support_cancer: float normalized expression threshold for the cancer target sample
    :param cohort_expr_support_cancer: float normalized expression threshold for the cancer cohort
    excluding the target sample
    hich should be met in n samples
    :param n_samples_lim_cancer: int number of cancer samples in which the cancer cohort expression threshold
    should be met
    :param cohort_expr_support_normal: float normalized expression threshold for the normal cohort
    required in any sample (>=1)
    :param n_samples_lim_normal: int number of normal samples in which any number of reads is required (>0)
    :param id_normals: str id of the normal cohort (example gtex)
    '''
    if output_count:
        header = (f'{"sample"}\t{"mutation_mode"}\t{"min_sample_reads"}\t{"#_of_cohort_samples"}\t'
                  f'{"reads_per_cohort_sample"}\t{"#_normal_samples_allowed"}\t{"normal_cohort_id"}'
                  f'\t{"reads_per_normal_sample"}')
        line =   (f'{cancer_sample_ori}\t{mutation_mode}\t{sample_expr_support_cancer}\t{n_samples_lim_cancer}'
                  f'\t{cohort_expr_support_cancer}\t{n_samples_lim_normal}\t{id_normals}'
                  f'\t{cohort_expr_support_normal}')

        for idx in np.arange(len(report_count)):
            header += f'\t{report_steps[idx]}'
            line += f'\t{report_count[idx]}'
        if prefix:
            header += f'\t{"info"}'
            line += f'\t{prefix}'
        header += "\n"
        line += "\n"
        if not os.path.exists(output_count):
            with open(output_count,"w") as f:
                f.write(header)
        with open(output_count, "a") as f:
            f.write(line)
        logging.info(f'Save intermediate info to {output_count}')