In [1]:
import numpy as np
import pandas as pd 
import glob
import os
from multiprocessing import Pool

#### First, load all compactors that were submitted to Pfam. 

In [3]:
## Collect every FASTA file that was submitted to Pfam.
pfam_files = glob.glob('pfam_with_position/Pfam_alignment/*fasta')

## Read each file as a single column of raw lines. Every file, including the first,
## is read inside the try block so that one empty file cannot abort the load.
fasta_frames = []
for fasta_path in pfam_files:
    try:
        fasta_frames.append(pd.read_csv(fasta_path,engine='python',sep='\t',header=None))
    except pd.errors.EmptyDataError:
        ## Empty files contribute no records; skip them.
        continue

if not fasta_frames:
    raise FileNotFoundError('No non-empty files matched pfam_with_position/Pfam_alignment/*fasta')

## Concatenate once. The list is reversed so that later files come first, which is the
## row order produced when each file was prepended to the running table. Row position
## matters because later cells select rows of this table by line number.
pfams = pd.concat(fasta_frames[::-1])

## Split the raw lines into header lines (starting with '>') and sequence lines.
## Headers and sequences are paired by position, so this relies on every record having
## exactly one sequence line; mismatched counts make the DataFrame constructor raise.
fasta_lines = list(pfams[0])
pfams = pd.DataFrame({
    'header':[line[1:] for line in fasta_lines if line[0]=='>'],  ## drop the leading '>'
    'sequence':[line for line in fasta_lines if line[0]!='>'],
})

pfams.to_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/compactors_input_to_Pfam.tsv',sep='\t',index=None)


In [2]:
## Checkpoint: reload the header/sequence table from the TSV that the previous cell writes,
## so later cells can start from disk instead of re-reading every FASTA file.
pfams = pd.read_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/compactors_input_to_Pfam.tsv',sep='\t')


#### Now, load all relevant SPLASH statistics for these anchors. 

In [4]:
## Columns read from the SPLASH statistics tables and from the compactor support tables.
splash_stat_columns = ['anchor','ds','effect_size_bin','number_nonzero_samples','target_entropy','avg_edit_distance_max_target']
compactor_support_columns = ['compactor','exact_support','ds']

## SPLASH statistics: two tables, each deduplicated, stacked with the second table first.
## NOTE(review): the second path is relative to the working directory, unlike the others - confirm.
stats = pd.read_csv('/oak/stanford/groups/horence/george/protein_domain_project/mining_10x_ss2_Pfam_compactors_SPLASH_stats.tsv',sep='\t',usecols=splash_stat_columns)
stats2 = pd.read_csv('mining_botryllus_ss2_Pfam_compactors_SPLASH_stats.tsv',sep='\t',usecols=splash_stat_columns)
stats = pd.concat([stats2.drop_duplicates(),stats.drop_duplicates()]).reset_index(drop=True)

## Compactor support tables, deduplicated as they are loaded.
botryllus = pd.read_csv(
    '/oak/stanford/groups/horence/george/botryllus_december_2023/all_compactors_and_SPLASH_statistics_and_Pfam.tsv',
    sep='\t',
    usecols=compactor_support_columns,
).drop_duplicates()
m_test = pd.read_csv(
    '/oak/stanford/groups/horence/george/mass_pfam_01192024/mining_10x_ss2_mkokot.tsv',
    sep='\t',
    usecols=compactor_support_columns,
).drop_duplicates()

## Stack both support tables and take the first 27 characters of each compactor as its anchor.
compactor_support_ds = pd.concat([m_test,botryllus]).reset_index(drop=True)
compactor_support_ds['anchor'] = compactor_support_ds['compactor'].map(lambda compactor: compactor[:27])

## Left-join the statistics on the columns the two tables share, then checkpoint to disk.
compactors_stats = compactor_support_ds.merge(stats,how='left')
compactors_stats.to_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/compactors_input_to_Pfam_with_SPLASH_stats.tsv',sep='\t',index=None)


In [3]:
## Checkpoint: reload the compactor + SPLASH statistics table from the TSV that the
## previous cell writes.
compactors_stats = pd.read_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/compactors_input_to_Pfam_with_SPLASH_stats.tsv',sep='\t')


#### Derive each compactor's anchor so the SPLASH statistics can be left-joined onto the compactors with headers.

In [4]:
## The anchor is the first 27 characters of each submitted sequence.
pfams['anchor'] = pfams['sequence'].map(lambda sequence: sequence[:27])

#### Load compactor Pfam results. 

In [7]:
## Collect every Pfam tabular output file.
pfam_files = glob.glob('pfam_with_position/Pfam_alignment/*PFAM.tblout')

## Read each file as a single column of raw lines. Every file, including the first,
## is read inside the try block so that one empty file cannot abort the load.
tblout_frames = []
for tblout_path in pfam_files:
    try:
        tblout_frames.append(pd.read_csv(tblout_path,engine='python',sep='\t',header=None))
    except pd.errors.EmptyDataError:
        ## Empty files contribute no hits; skip them.
        continue

if not tblout_frames:
    raise FileNotFoundError('No non-empty files matched pfam_with_position/Pfam_alignment/*PFAM.tblout')

## Concatenate once; reversed so later files come first, matching the order obtained
## when each file was prepended to the running table.
pfams1 = pd.concat(tblout_frames[::-1])

## Define the fields in the Pfam output. 
fields = ['header','accession','tlen','query_name','accession2','qlen','full_seq_evalue','full_seq_score','full_seq_bias','this_domain_number','this_domain_of','this_domain_c_evalue','this_domain_i_evalue','this_domain_score','this_domain_bias','hmm_coord_from','hmm_coord_to','ali_coord_from','ali_coord_to','env_coord_from','env_coord_to','acc','description_of_target']

## Every field except the last is a single whitespace-free token; the last field is free
## text, so all remaining tokens are re-joined into it.
n_fixed_fields = len(fields) - 1

## Parse the entries in the Pfam table which are not related to formatting. 
## Only lines that START with '#' are dropped, so a data row that merely contains a '#'
## is kept.
pfam_list_compactor = [line for line in pfams1.iloc[:,0] if not line.lstrip().startswith('#')]
compactor_ok = [line.split() for line in pfam_list_compactor]
compactor_okok = []
for tokens in compactor_ok:
    ## The description starts right after the fixed fields. Slicing from one position
    ## later would drop the first word of the description.
    compactor_okok.append(tokens[:n_fixed_fields] + [' '.join(tokens[n_fixed_fields:])])

compactor_pfam_structured = pd.DataFrame(compactor_okok, columns = fields).add_prefix('Pfam_')

## As the table has been loaded as a string, convert the numerical field (e-value) to a float to ensure 
## integrity of operations using this value. 
compactor_pfam_structured['Pfam_full_seq_evalue'] = compactor_pfam_structured['Pfam_full_seq_evalue'].astype(float)

## Filter so that we retain e-values < 0.05. 
compactor_pfam_structured = compactor_pfam_structured[compactor_pfam_structured['Pfam_full_seq_evalue']<0.05].reset_index(drop=True)

## Restore the un-prefixed 'header' name so the hits can be joined to the submitted sequences.
compactor_pfam_structured = compactor_pfam_structured.rename(columns={'Pfam_header':'header'})
compactor_pfam_structured = compactor_pfam_structured.merge(pfams,how='left')

## The integer after '=' in the header is taken as the frame; a positive frame is
## reported as strand '+', anything else as '-'.
compactor_pfam_structured['Pfam_frame'] = [int(header.split('=')[1]) for header in compactor_pfam_structured['header']]
compactor_pfam_structured['Pfam_strand'] = ['+' if frame > 0 else '-' for frame in compactor_pfam_structured['Pfam_frame']]

## Convert the remaining numeric fields that later cells sort and compare on.
compactor_pfam_structured[['Pfam_env_coord_from','Pfam_env_coord_to']] = compactor_pfam_structured[['Pfam_env_coord_from','Pfam_env_coord_to']].astype(int)
compactor_pfam_structured[['Pfam_this_domain_i_evalue']] = compactor_pfam_structured[['Pfam_this_domain_i_evalue']].astype(float)

## Write the Pfam results file. 

compactor_pfam_structured.to_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/Pfam_hits.tsv',sep='\t',index=None)


In [6]:
## Checkpoint: reload the filtered Pfam hits from the TSV that the previous cell writes.
compactor_pfam_structured = pd.read_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/Pfam_hits.tsv',sep='\t')


#### Use lookup table to perform contaminant filtering. 

In [None]:
"""file = open('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/lookup_input.fasta','a')

for i in pfams.index:
    
    file.write('>'+pfams['header'][i]+'\n'+pfams['compactor'][i]+'\n')
    
file.close()

os.system('sbatch /oak/stanford/groups/horence/george/splash_utils/lookup_242_fasta.sh /oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/lookup_input.fasta /oak/stanford/groups/horence/george/lookup_table_index_02082024/mining_built_index /oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/lookup_out.tsv')
"""

In [None]:
## Shell pipelines for splitting the lookup output by reference name. This cell only
## builds the command strings; it does not run them.
## Raw strings are used because '\|' is not a valid Python escape sequence; the resulting
## text is unchanged.
## `command` lists the line numbers that match none of the contaminant names;
## `command2` lists the line numbers that match the Botryllus references.
command = r'grep -vn "Carp_GCA_019924925\|Ralstonia\|SRA_synthetic_barcodes\|illumina_adapters\|GCA_004000535\|UniVec" lookup_out.tsv | cut -f1 -d: > internal_files/non_contaminant_indices.txt '
command2 = r'grep -n "final_purged_primary\|final_purged_haplotigs\|botznik-chr" lookup_out.tsv | cut -f1 -d: > internal_files/botryllus_indices.txt'
    

In [45]:
## Get the compactors that do not show up in any contaminant file. 

## One line number per row. These are presumably the 1-based `grep -n` line numbers
## written by the contaminant command - confirm that the file has no header line.
indic = pd.read_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/internal_files/non_contaminant_indices.txt',header=None)

## Shift the 1-based line numbers to the 0-based row labels of `pfams`.
indic[0] = indic[0] - 1

## `pfams` stores the sequence under 'sequence' until a later cell renames it to
## 'compactor'. Renaming here (a no-op if it was already renamed) lets this cell run in
## top-to-bottom order as well as after that rename.
## Duplicates are dropped because this table is only used as a membership filter in
## later merges; repeated compactors would multiply the merged rows.
acceptable_compactors = pfams.rename(columns={'sequence':'compactor'}).loc[indic[0],['compactor']].drop_duplicates()


#### Reduce this set of compactor hits to that set representing nonoverlapping best hits. 

In [8]:
def inds(data):
    """
    Select the index labels of non-overlapping hits, best e-value first.

    Hits are visited in ascending order of 'Pfam_this_domain_i_evalue'. The first hit is
    always kept; each later hit is kept only if its inclusive coordinate range
    ['Pfam_env_coord_from', 'Pfam_env_coord_to'] shares no position with any hit that
    has already been kept.

    data: DataFrame with the three columns named above.
    Returns the kept index labels as a list, in the order they were accepted.
    """
    ## Sorting returns a new frame, so the caller's data is left untouched.
    ranked = data.sort_values('Pfam_this_domain_i_evalue')
    range_starts = ranked['Pfam_env_coord_from']
    range_ends = ranked['Pfam_env_coord_to']

    def positions_of(label):
        ## Every sequence position covered by this hit, both ends included.
        return set(range(int(range_starts[label]), int(range_ends[label]) + 1))

    labels = list(ranked.index)

    ## The best-scoring hit seeds both the result and the set of occupied positions.
    kept_labels = [int(labels[0])]
    occupied = positions_of(kept_labels[0])

    for label in labels[1:]:
        candidate = positions_of(label)

        ## Accept the hit only when it claims no position that is already occupied,
        ## then mark its positions as occupied too.
        if occupied.isdisjoint(candidate):
            kept_labels.append(label)
            occupied = occupied | candidate

    return kept_labels

def applyParallel(dfGrouped, func):
    """
    Apply ``func`` to every group of ``dfGrouped`` and return the results as a list.

    dfGrouped: iterable of (name, group) pairs, e.g. a pandas GroupBy object.
    func: callable taking one group; it must be picklable when more than one worker
        process is used.
    Returns the per-group results in the order the groups were iterated.

    The worker count is taken from the leading integer of SLURM_JOB_CPUS_PER_NODE
    (which may look like '16', '16(x2)' or '16(x2),8'). If the variable is unset or
    unparsable, os.cpu_count() is used instead. The count is capped at the number of
    groups; with a single worker the groups are processed in this process and no
    pool is created.
    """
    groups = [group for name, group in dfGrouped]
    if not groups:
        return []

    ## Keep only the leading integer of the SLURM value.
    slurm_value = os.environ.get('SLURM_JOB_CPUS_PER_NODE', '')
    leading_count = slurm_value.split(',')[0].split('(')[0].strip()
    if leading_count.isdigit() and int(leading_count) > 0:
        n_workers = int(leading_count)
    else:
        n_workers = os.cpu_count() or 1

    ## More workers than groups would only start idle processes.
    n_workers = min(n_workers, len(groups))
    if n_workers == 1:
        return [func(group) for group in groups]

    with Pool(n_workers) as p:
        ret_list = p.map(func, groups)
    return ret_list

def flatten_extend(matrix):
    """
    Flatten one level of nesting: return a single list holding the elements of every
    row of ``matrix``, in order.

    Approach taken from https://realpython.com/python-flatten-list/
    """
    return [element for row in matrix for element in row]

In [9]:
## Group the hits by translated sequence header and strand, keeping only the three
## columns that the selection function reads.
gpby = compactor_pfam_structured.groupby(['header','Pfam_strand'])[['Pfam_this_domain_i_evalue','Pfam_env_coord_from','Pfam_env_coord_to']]
## Run the selection on every group; each result is a list of index labels to keep.
outs = applyParallel(gpby, inds)
## Flatten the per-group label lists and pull those rows out of the full hit table.
best_spots = compactor_pfam_structured.loc[flatten_extend(outs)]


In [10]:
## From here on the submitted sequence is called 'compactor', matching compactors_stats.
pfams = pfams.rename(columns={'sequence':'compactor'})
## Left-join on the columns the two tables share, so every statistics row is kept.
compactors_stats = compactors_stats.merge(pfams,how='left')
## Sanity check pinned to the exact row and column count of one specific input data set;
## it fails if the inputs change.
assert compactors_stats.shape == (5857829, 9)

(5857829, 9)

In [17]:
#### Merge these selected Pfam hits with the anchors and SPLASH statistics. 
## Drop the columns that the statistics table supplies itself (under its own names).
best_spots = best_spots.drop(columns=['sequence','anchor'], errors='ignore')

## Keep the full translated-sequence header under 'Pfam_header'; the part before
## '_frame' becomes the header used for joining.
best_spots['Pfam_header'] = best_spots['header']
best_spots['header'] = best_spots['Pfam_header'].map(lambda full_header: full_header.split('_frame')[0])

## Inner join on the shared columns: only compactors with a selected hit remain.
pfam_with_stats = compactors_stats.merge(best_spots)

#### Introduce length. 
pfam_with_stats['length'] = pfam_with_stats['compactor'].map(len)


In [11]:
best_hits_path = '/oak/stanford/groups/horence/george/protein_domain_project/workflow/best_nonoverlapping_Pfam_hits.tsv'

## Reload the best-hit table from disk when it exists. No visible cell writes it before
## this point, so on a fresh run the table built in the previous cell is used instead
## of failing on a missing file.
## NOTE(review): assumes the file on disk has the same content as that in-memory
## table - confirm.
if os.path.exists(best_hits_path):
    pfam_with_stats = pd.read_csv(best_hits_path,sep='\t')

## Per-experiment organism labels from the run table.
marek_metadata = pd.read_csv('/scratch/groups/horence/mkokot/2023-11-16/SraRunTable.csv',sep=',')[['Experiment','Organism']]

## The experiment name is the last path component of the dataset identifier.
pfam_with_stats['Experiment'] = [ds.split('/')[-1] for ds in pfam_with_stats['ds']]

## Add the two Botryllus experiments by hand, then deduplicate.
metadata = pd.concat([marek_metadata, pd.DataFrame({'Experiment':['cell_island_brain','brain'],'Organism':['Botryllus schlosseri','Botryllus schlosseri']})]).drop_duplicates().reset_index(drop=True)

## Write the table with organism labels attached. The merged result is only written,
## not assigned, so `pfam_with_stats` itself is unchanged by this line.
pfam_with_stats.merge(metadata,how='left').to_csv(best_hits_path,sep='\t',index=None)


#### Define a function to check for co-occurrence of domains. 


In [12]:

def _count_compactors_with_all_domains(top_three, ranks, label):
    """Per anchor, count the compactors that carry every domain whose rank is in ``ranks``."""
    selected = top_three[top_three['domain_rank'].isin(ranks)]
    domains_per_compactor = selected.groupby(['anchor','compactor'])['Pfam_query_name'].nunique().reset_index()
    ## Each rank is a single domain within an anchor, so a compactor carries all of the
    ## requested domains exactly when it has len(ranks) distinct ones.
    carries_all = domains_per_compactor[domains_per_compactor['Pfam_query_name'] >= len(ranks)]
    return carries_all.groupby('anchor')['compactor'].nunique().reset_index().rename(columns={'compactor':label})


def make_table(data, length):
    """
    Summarise, per anchor, the three domains with the most compactors and how often
    they occur together on the same compactor.

    data: DataFrame with columns 'length', 'anchor', 'compactor' and 'Pfam_query_name'.
    length: only rows whose 'length' equals this value are used.

    Returns one row per anchor with:
      - 'Domain_k' (name of the k-th ranked domain; present only for ranks that occur),
      - 'Domain_1_Count', 'Domain_2_Count', 'Domain_3_Count' (unique compactors per domain),
      - '1_2', '1_3', '2_3', '1_2_3' (unique compactors carrying all of those domains).
    Missing values are filled with 0. If no rows match, an empty table with the count
    columns is returned.
    """
    ## Take only sequences of a particular length.
    subset = data[data['length'] == length].dropna(subset=['Pfam_query_name'])

    ## Nothing to rank: return an empty table instead of failing further down on the
    ## missing count columns.
    if subset.empty:
        return pd.DataFrame(columns=['anchor','Domain_1','Domain_1_Count','Domain_2_Count','Domain_3_Count','1_2','1_3','2_3','1_2_3'])

    ## Get the number of unique compactors per anchor, Pfam domain pair.
    test = subset.groupby(['anchor','Pfam_query_name'])['compactor'].nunique().reset_index()

    ## Rank the domains of each anchor by compactor count (ties broken by row order) and
    ## keep the top three. The explicit copy avoids assigning into a filtered view.
    test['domain_rank'] = test.groupby(['anchor'])['compactor'].rank(method='first',ascending=False)
    test = test.rename(columns={'compactor':'compactor_count'})
    test = test[test['domain_rank'] < 4].copy()
    test['domain_rank'] = test['domain_rank'].astype(int)
    top_three = subset[['anchor','compactor','Pfam_query_name']].drop_duplicates().reset_index(drop=True)\
    .merge(test[['anchor','Pfam_query_name','domain_rank','compactor_count']])

    ## Get counts for compactors having each pair of top domains, and all three.
    one_two = _count_compactors_with_all_domains(top_three, [1,2], '1_2')
    one_three = _count_compactors_with_all_domains(top_three, [1,3], '1_3')
    two_three = _count_compactors_with_all_domains(top_three, [2,3], '2_3')
    one_two_three = _count_compactors_with_all_domains(top_three, [1,2,3], '1_2_3')

    ## Get counts (stratified by intersect) into one table.
    summarize_intersect = one_two.merge(one_three,how='outer').fillna(0)\
    .merge(two_three,how='outer').fillna(0)\
    .merge(one_two_three,how='outer').fillna(0)

    ## Pivot to one row per anchor: first the domain names, then the compactor counts.
    ranked_domains = top_three[['anchor','Pfam_query_name','domain_rank','compactor_count']].drop_duplicates()
    ranked_domains['domain_rank'] = ['Domain_'+str(rank) for rank in ranked_domains['domain_rank']]
    name_top_3 = ranked_domains.pivot(columns='domain_rank',values='Pfam_query_name',index='anchor').reset_index()
    ranked_domains['domain_rank'] = [rank_label + '_Count' for rank_label in ranked_domains['domain_rank']]
    count_top_3 = ranked_domains.pivot(columns='domain_rank',values='compactor_count',index='anchor').reset_index()

    domains_counts_per_anchor = name_top_3.merge(count_top_3)

    ## A count column is absent when no anchor has a domain of that rank; add it as 0 so
    ## all three count columns always exist, then make them integers.
    for count_column in ('Domain_1_Count','Domain_2_Count','Domain_3_Count'):
        if count_column not in domains_counts_per_anchor.columns:
            domains_counts_per_anchor[count_column] = 0
        domains_counts_per_anchor[count_column] = domains_counts_per_anchor[count_column].fillna(0).astype(int)

    ## Anchors without any co-occurrence get 0 in the intersect columns.
    domains_counts_per_anchor = domains_counts_per_anchor.merge(summarize_intersect,how='left').fillna(int(0))

    return domains_counts_per_anchor

In [34]:
def write_concurrence_func(data,outdir):
    """
    Build the domain co-occurrence table for every compactor length in ``data`` and
    write the stacked result to ``outdir``/domain_concurrence_counts.tsv.

    data: DataFrame accepted by make_table; must contain a 'length' column.
    outdir: existing directory to write into.
    Returns None. Raises ValueError if ``data`` has no rows.
    """
    lengths = list(data['length'].unique())

    ## The table for the smallest length goes first, followed by the other lengths in
    ## order of first appearance.
    shortest = min(lengths)
    ordered_lengths = [shortest] + [value for value in lengths if value != shortest]

    ## Build one table per length, tag it with that length, and concatenate once at the
    ## end rather than growing a frame inside the loop.
    tables = []
    for compactor_length in ordered_lengths:
        table = make_table(data,compactor_length)
        table['compactor_length'] = compactor_length
        tables.append(table)
    floa = pd.concat(tables)

    ## Columns that are missing for some lengths come out of the concat as NaN; write
    ## them as 0.
    floa.fillna(0).to_csv(outdir+'/domain_concurrence_counts.tsv',sep='\t',index=None)
    return


In [19]:
"""loa = dict()
for i in pfam_with_stats['length'].unique(): 
    loa[str(i)] = make_table(pfam_with_stats,i)
floa = loa['81']
floa['compactor_length'] = 81
for i in loa.keys():
    if int(i) != 81:
        loa[i]['compactor_length'] = int(i)
        floa = pd.concat([floa,loa[i]])
floa.fillna(0).to_csv('/oak/stanford/groups/horence/george/protein_domain_project/workflow/domain_concurrence_counts.tsv',sep='\t',index=None)
"""

#### Define a function to add domain-level statistics. 

In [14]:
def write_stats(input_df, outpath):
    """
    Summarise ``input_df`` per Pfam domain, write the summary to ``outpath`` as a
    tab-separated file, and return it.

    input_df: DataFrame with columns 'Pfam_query_name', 'compactor', 'anchor', 'ds',
        'effect_size_bin', 'target_entropy', 'number_nonzero_samples' and
        'avg_edit_distance_max_target'.
    outpath: path of the TSV file to write.

    Returns one row per 'Pfam_query_name' with:
      - 'compactor', 'anchor', 'ds': number of unique values over all rows;
      - for each statistic, its mean, 80th and 20th percentile, computed after
        deduplicating on (anchor, ds, Pfam_query_name) so each such combination
        counts once.
    """
    ## Given an input dataframe, deduplicate by anchor, dataset, and Pfam query name.
    input_df1 = input_df.drop_duplicates(subset=['anchor','ds','Pfam_query_name'])

    all_rows_by_domain = input_df.groupby('Pfam_query_name')
    dedup_by_domain = input_df1.groupby('Pfam_query_name')

    ## Get the number of unique compactors, anchors, and samplesheets (datasets).
    summary_columns = [all_rows_by_domain[column].nunique() for column in ('compactor','anchor','ds')]

    ## (source column, suffix used in the output column names)
    statistics = (
        ('effect_size_bin','effect_size'),
        ('target_entropy','entropy'),
        ('number_nonzero_samples','nonzero_samples'),
        ('avg_edit_distance_max_target','lev_max_target'),
    )

    ## Get the mean, 80th and 20th percentile of effect size, entropy, # nonzero samples, and edit distance.
    for source_column, suffix in statistics:
        values = dedup_by_domain[source_column]
        summary_columns.append(values.mean().rename('mean_'+suffix))
        summary_columns.append(values.quantile(0.8).rename('80th_percentile_'+suffix))
        summary_columns.append(values.quantile(0.2).rename('20th_percentile_'+suffix))

    ## Every series is indexed by Pfam_query_name, so they line up side by side.
    df_summary = pd.concat(summary_columns, axis=1).reset_index()

    df_summary.to_csv(outpath,sep='\t',index=None)

    return df_summary

In [21]:
## Per-domain summary statistics over the full (unsplit, unfiltered) table; also written to disk.
domain_stats = write_stats(pfam_with_stats,'/oak/stanford/groups/horence/george/protein_domain_project/workflow/domain_summary_statistics.tsv')



#### Stratify these results by organism and collection type. 

In [21]:
## Inner join on the columns shared with the metadata table: rows without a metadata
## match are dropped.
data_with_organism = pfam_with_stats.merge(metadata)

In [22]:
## Work on a copy so that the new column does not appear on pfam_with_stats.
data_with_technology = pfam_with_stats.copy()
## Label a row '10x' when its dataset identifier contains '10x', otherwise 'SS2'.
data_with_technology['technology'] = data_with_technology['ds'].str.contains('10x').replace({True:'10x',False:'SS2'})


In [27]:
split_by_technology_dir = '/oak/stanford/groups/horence/george/protein_domain_project/workflow/split_by_technology'

## Create the output directories. exist_ok makes the cell safe to re-run, and the
## parent directory is created along the way.
for technology in ('10x','SS2'):
    os.makedirs(split_by_technology_dir+'/'+technology, exist_ok=True)

## Per-domain summary statistics, one file per technology.
for technology in ('SS2','10x'):
    write_stats(data_with_technology[data_with_technology['technology']==technology],split_by_technology_dir+'/'+technology+'/domain_summary_statistics.tsv')

## Domain co-occurrence counts, one file per technology.
for technology in ('SS2','10x'):
    write_concurrence_func(data_with_technology[data_with_technology['technology']==technology],split_by_technology_dir+'/'+technology)


In [29]:
## Inspect which organisms are present after the metadata join.
data_with_organism['Organism'].unique()

array(['Pogona vitticeps', 'Danio rerio', 'Gallus gallus',
       'Anolis sagrei', 'Lytechinus variegatus', 'Nicotiana tabacum',
       'Taeniopygia guttata', 'Botryllus schlosseri',
       'Drosophila melanogaster', 'Saccharomyces cerevisiae',
       'Arabidopsis thaliana', 'Ciona intestinalis', 'Plasmodium vivax',
       'Nematostella vectensis', 'Harpegnathos saltator',
       'Plasmodium berghei', 'Eisenia andrei', 'Populus trichocarpa',
       'Schistosoma mansoni', 'Isodiametra pulchra', 'Xenia sp.',
       'Caenorhabditis elegans', 'Chlamydomonas reinhardtii',
       'Plasmodium falciparum', 'Astyanax mexicanus',
       'Nicotiana attenuata', 'Oryzias latipes', 'Zea mays',
       'Loligo vulgaris', 'Xenopus laevis',
       'Populus tremula x Populus alba', 'Dreissena rostriformis'],
      dtype=object)

In [33]:
## Inspect which compactor lengths occur for one organism.
data_with_organism[data_with_organism['Organism']=='Ciona intestinalis']['length'].unique()

array([108])

In [35]:
split_by_organism_dir = '/oak/stanford/groups/horence/george/protein_domain_project/workflow/split_by_organism'

## exist_ok makes the cell safe to re-run after the directories have been created.
os.makedirs(split_by_organism_dir, exist_ok=True)

for organism in data_with_organism['Organism'].unique():

    ## Directory name: spaces become underscores. Any organism whose name contains
    ## 'Xenia' is written to 'Xenia_sp' (no trailing dot).
    if 'Xenia' in organism:
        organism_under = 'Xenia_sp'
    else:
        organism_under = organism.replace(' ','_')

    organism_dir = split_by_organism_dir+'/'+organism_under
    os.makedirs(organism_dir, exist_ok=True)

    ## Select this organism's rows once and use them for both outputs.
    organism_rows = data_with_organism[data_with_organism['Organism']==organism]

    write_concurrence_func(organism_rows,organism_dir)

    write_stats(organism_rows,organism_dir+'/domain_summary_statistics.tsv')


In [47]:
## Row and column count before contaminant filtering.
data_with_organism.shape

(1122328, 37)

In [49]:
## Inner join on the shared column(s): keeps only rows whose compactor is in the
## acceptable (non-contaminant) set.
data_with_organism2 = data_with_organism.merge(acceptable_compactors)

split_by_organism_filtered_dir = '/oak/stanford/groups/horence/george/protein_domain_project/workflow/split_by_organism_contaminant_filtered'

## exist_ok makes the cell safe to re-run after the directories have been created.
os.makedirs(split_by_organism_filtered_dir, exist_ok=True)

for organism in data_with_organism2['Organism'].unique():

    ## Directory name: spaces become underscores. Any organism whose name contains
    ## 'Xenia' is written to 'Xenia_sp' (no trailing dot).
    if 'Xenia' in organism:
        organism_under = 'Xenia_sp'
    else:
        organism_under = organism.replace(' ','_')

    organism_dir = split_by_organism_filtered_dir+'/'+organism_under
    os.makedirs(organism_dir, exist_ok=True)

    ## Select this organism's rows once and use them for both outputs.
    organism_rows = data_with_organism2[data_with_organism2['Organism']==organism]

    write_concurrence_func(organism_rows,organism_dir)

    write_stats(organism_rows,organism_dir+'/domain_summary_statistics.tsv')


In [50]:
## Inner join on the shared column(s): keeps only rows whose compactor is in the
## acceptable (non-contaminant) set.
data_with_technology2 = data_with_technology.merge(acceptable_compactors)

split_by_technology_filtered_dir = '/oak/stanford/groups/horence/george/protein_domain_project/workflow/split_by_technology_contaminant_filtered'

## Create the output directories. exist_ok makes the cell safe to re-run, and the
## parent directory is created along the way.
for technology in ('10x','SS2'):
    os.makedirs(split_by_technology_filtered_dir+'/'+technology, exist_ok=True)

## Per-domain summary statistics, one file per technology.
for technology in ('SS2','10x'):
    write_stats(data_with_technology2[data_with_technology2['technology']==technology],split_by_technology_filtered_dir+'/'+technology+'/domain_summary_statistics.tsv')

## Domain co-occurrence counts, one file per technology.
for technology in ('SS2','10x'):
    write_concurrence_func(data_with_technology2[data_with_technology2['technology']==technology],split_by_technology_filtered_dir+'/'+technology)


In [56]:
## Inner join on the shared column(s): keeps only rows whose compactor is in the
## acceptable (non-contaminant) set.
pws_acc = pfam_with_stats.merge(acceptable_compactors)

## Contaminant-filtered per-domain summary over all organisms and technologies.
write_stats(pws_acc,'/oak/stanford/groups/horence/george/protein_domain_project/workflow/contaminant_filtered_domain_summary_statistics.tsv')
## NOTE(review): this writes domain_concurrence_counts.tsv into the project root, not the
## workflow/ directory used by the line above - confirm that this location is intended.
write_concurrence_func(pws_acc,'/oak/stanford/groups/horence/george/protein_domain_project')
