In [6]:
import glob
import os
import re
import subprocess
from math import log

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn

In [7]:
from library_functions import count_net_charge
from library_functions import name_to_scaffold
from library_functions import name_to_surface
from library_functions import get_seq_param

In [None]:
# Show the kernel's current working directory (IPython automagic); the relative
# paths used further down in this notebook are resolved against it.
pwd

In [14]:
''' Functions to load and manage dataframes '''
def load_count_df_and_consensus_CDS(count_df_filename, subpool, tag):
    """Read one tab-separated count table and attach each design's consensus CDS.

    The consensus FASTA files are looked up in a directory named
    ``SP{subpool}_{tag}`` located next to the count table.

    Args:
        count_df_filename: Path to the tab-separated count table.
        subpool: Subpool identifier interpolated into the directory name.
        tag: Condition tag interpolated into the directory name.

    Returns:
        The count DataFrame as returned by ``add_consensus_CDS`` (i.e. with
        a ``'CDS'`` column added).
    """
    # Build the directory with os.path instead of splitting on '/': for a bare
    # filename the split produced an empty directory and therefore the
    # root-anchored path '/SP.../'. The trailing '' keeps the trailing separator
    # the original path carried.
    consensus_dir = os.path.join(
        os.path.dirname(count_df_filename), f'SP{subpool}_{tag}', ''
    )
    count_df = pd.read_csv(count_df_filename, sep='\t')
    return add_consensus_CDS(count_df, consensus_dir)

def add_consensus_CDS(count_df, path):
    """Add a 'CDS' column holding the consensus sequence of every design.

    For each value in ``count_df['design']`` the file
    ``{path}/{design}_reads_CONSENSUS.fas`` is opened and its second line
    (newline removed) is used as the sequence. Designs whose file does not
    exist, or whose file has fewer than two lines, get the placeholder '--'.

    Args:
        count_df: DataFrame with a 'design' column. It is modified in place.
        path: Directory containing the ``*_reads_CONSENSUS.fas`` files.

    Returns:
        The same ``count_df`` object, now carrying the 'CDS' column.
    """
    no_consensus = '--'
    consensus_sequences = []
    for design in count_df['design']:
        try:
            with open(f'{path}/{design}_reads_CONSENSUS.fas', 'r') as consensus_fasta:
                fasta_lines = consensus_fasta.readlines()
        except FileNotFoundError:
            # No consensus file was written for this design. The previous
            # version also evaluated int(<Series>) here for an unused debug
            # value, which raised TypeError whenever the design occurred in
            # more than one row.
            consensus_sequences.append(no_consensus)
            continue

        if len(fasta_lines) < 2:
            # Header-only / empty file: there is no sequence line to read.
            consensus_sequences.append(no_consensus)
        else:
            consensus_sequences.append(fasta_lines[1].replace('\n', ''))

    count_df['CDS'] = consensus_sequences
    return count_df

In [16]:
# LOADING INDIVIDUAL READ COUNT DATAFRAMES AND ASSOCIATED CONSENSUS PROTEIN CDSs
# path_to_counts = '/home/pylesh/designs/repeats/library_analysis/matched_reads/'
path_to_counts = './identify_enriched_sequences/matched_reads/'

if not path_to_counts.endswith('/'):
    path_to_counts = path_to_counts + '/'

## Uncomment and run individually
conditions = ['exp', 'In2O3'] # must be len == 2, like: [presort, postsort]; this is just for the string tags
# conditions = ['exp', 'ZnO'] # must be len == 2, like: [presort, postsort]; this is just for the string tags
# conditions = ['exp', 'Fe2O3'] # must be len == 2, like: [presort, postsort]; this is just for the string tags
assert len(conditions) == 2, 'conditions must be [presort_tag, postsort_tag]'


def _label_condition_counts(count_df, label):
    """Prepare one condition's count table for merging.

    Rows whose 'CDS' is one of the no-consensus placeholders ('', '-', '--')
    are dropped, 'count' and 'CDS' are renamed to '{label}_count' and
    '{label}_CDS', and '{label}_frac' is added as count / total, where the
    total is the sum of 'count' over ALL rows (before dropping).

    Returns:
        (labelled_df, assigned_reads, total_reads); ``count_df`` itself is
        left unmodified.
    """
    total_reads = count_df['count'].sum()
    ### Remove rows without CDS, i.e. things not observed in sequencing reads
    observed = count_df[~count_df['CDS'].isin(['', '-', '--'])]
    assigned_reads = observed['count'].sum()
    ### Prior to merging, rename counts to associate with condition
    # rename() without inplace returns a new frame, so the column added below
    # is never written onto a slice of count_df.
    labelled = observed.rename(
        columns={'count': f'{label}_count', 'CDS': f'{label}_CDS'}
    )
    labelled[f'{label}_frac'] = labelled[f'{label}_count'] / total_reads
    return labelled, assigned_reads, total_reads


# count_dfs is filled in the order [presort, postsort, presort, postsort, ...],
# one pair per subpool; the pairing loop below relies on that order.
count_dfs = []
for SP in range(1, 17):
    ## Uncomment for missing subpools as needed
    # if SP in [8, 12]:
    #     continue
    for tag in conditions:
        count_df_file = f'{path_to_counts}chip_assembled_SP{SP}_{tag}_counts.tsv'
        print (f'Loading: {count_df_file}')
        count_df = load_count_df_and_consensus_CDS(count_df_file, SP, tag)
        count_df['subpool'] = f'SP{SP}'
        count_dfs.append(count_df)

all_subpools = []
for expression_df, cellsorted_df in zip(count_dfs[0::2], count_dfs[1::2]):
    SP = expression_df['subpool'].iloc[0]
    print ()
    print (f'Subpool: {SP}')

    expression_df, exp_assigned, exp_total = _label_condition_counts(expression_df, conditions[0])
    print (f'{int(exp_assigned)} of {int(exp_total)} ({round(exp_assigned/exp_total,2)}) assigned in expression pool')

    cellsorted_df, sort_assigned, sort_total = _label_condition_counts(cellsorted_df, conditions[1])
    print (f'{int(sort_assigned)} of {int(sort_total)} ({round(sort_assigned/sort_total,2)}) assigned in sorted pool')

    # Name the join keys explicitly so that no other column the two tables
    # might share can silently become part of the key.
    conditions_df = expression_df.merge(cellsorted_df, how='outer', on=['design', 'subpool'])
    # Designs seen in only one condition get a count of 0 in the other.
    # DataFrame.fillna with a dict replaces the chained
    # `df[col].fillna(0, inplace=True)`, which acts on a temporary object and
    # does not update the frame under pandas Copy-on-Write.
    conditions_df = conditions_df.fillna(
        {f'{conditions[0]}_count': 0, f'{conditions[1]}_count': 0}
    )

    all_subpools.append(conditions_df)

combined_subpools = pd.concat(all_subpools)
# 'X' marks "no consensus CDS for this condition" from here on.
combined_subpools = combined_subpools.fillna(
    {f'{conditions[0]}_CDS': 'X', f'{conditions[1]}_CDS': 'X'}
)


Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP1_exp_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP1_InO_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP2_exp_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP2_InO_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP3_exp_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP3_InO_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP4_exp_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matched_reads/chip_assembled_SP4_InO_counts.tsv
Loading: /home/pylesh/designs/repeats/library_analysis_may_2023_InO/matc

In [22]:
subpools = sorted(set(combined_subpools['subpool']))
d = 'observed_cds'
# open(..., 'w') does not create missing directories.
os.makedirs(d, exist_ok=True)

# Values that mean "no consensus CDS for this condition". combined_subpools
# uses 'X' (set when the per-subpool tables are combined); the previous check
# against '-' never matched, which left both *_only.fas files empty and wrote
# every row, placeholders included, to *_both.fas.
missing_cds = ('X', '-', '--', '')


def _has_cds(seq):
    """Return True when seq is a real sequence string, not a placeholder/NaN."""
    return isinstance(seq, str) and seq not in missing_cds


exp_cds_col = f'{conditions[0]}_CDS'
exp_count_col = f'{conditions[0]}_count'
sort_cds_col = f'{conditions[1]}_CDS'
sort_count_col = f'{conditions[1]}_count'

for subpool in subpools:
    ## Uncomment for missing subpools
    # if subpool in ['SP15', 'SP8']: continue
    subpool_df = combined_subpools[ combined_subpools['subpool'] == subpool ]
    rows = zip(
        subpool_df['design'],
        subpool_df[exp_cds_col],
        subpool_df[sort_cds_col],
        subpool_df[exp_count_col],
        subpool_df[sort_count_col],
    )
    ## Print CDS to fasta files: one pass over the rows, three output files
    with open(f'{d}/{subpool}_{conditions[0]}_only.fas', 'w') as e_only, \
         open(f'{d}/{subpool}_{conditions[1]}_only.fas', 'w') as s_only, \
         open(f'{d}/{subpool}_both.fas', 'w') as both:
        for design, e_seq, s_seq, e_count, s_count in rows:
            in_exp = _has_cds(e_seq)
            in_sort = _has_cds(s_seq)
            if in_exp and in_sort:
                print (f'>{design}_{conditions[0]}_count{round(e_count)}\n{e_seq}', file=both)
                print (f'>{design}_{conditions[1]}_count{round(s_count)}\n{s_seq}', file=both)
            elif in_exp:
                print (f'>{design}_{conditions[0]}_count{round(e_count)}\n{e_seq}', file=e_only)
            elif in_sort:
                print (f'>{design}_{conditions[1]}_count{round(s_count)}\n{s_seq}', file=s_only)


In [23]:
# Choose one representative CDS per row. In combined_subpools the value 'X' in
# a condition's CDS column means that no consensus CDS exists for that
# condition.
exp_cds_col = f'{conditions[0]}_CDS'
sort_cds_col = f'{conditions[1]}_CDS'

# Prefer the SORTED consensus whenever it exists (present in both pools, or in
# the sorted pool only); otherwise fall back to the expression consensus.
# Every row yields exactly one value: a row with 'X' in both columns keeps 'X'
# instead of being skipped, which previously left the list shorter than the
# frame and made the column assignment fail.
representative_cds = [
    s_seq if s_seq != 'X' else e_seq
    for e_seq, s_seq in zip(combined_subpools[exp_cds_col], combined_subpools[sort_cds_col])
]

combined_subpools['rep_CDS'] = representative_cds
print((combined_subpools[sort_cds_col] == 'X').sum(), f'sequences in {conditions[0]} but not in {conditions[1]} ')
print((combined_subpools[exp_cds_col] == 'X').sum(), f'sequences in {conditions[1]} but not in {conditions[0]} ')

5370 sequences in exp but not in InO 
40 sequences in InO but not in exp 


In [24]:
## Pseudocount + enrichment.
## If EITHER count column contains a zero, 1 is added to every row of BOTH
## columns, so the ratio and its logarithm are defined for all designs.
## (Checking only the expression counts let a zero sorted count through,
## giving enrichment 0 and a math domain error in log.)
## After the shift no zero remains, so re-running this cell does not add
## the pseudocount a second time.
exp_count_col = f'{conditions[0]}_count'
sort_count_col = f'{conditions[1]}_count'

if (combined_subpools[[exp_count_col, sort_count_col]] == 0).any().any():
    combined_subpools[exp_count_col] = combined_subpools[exp_count_col] + 1
    combined_subpools[sort_count_col] = combined_subpools[sort_count_col] + 1

# Work on plain arrays: the values are combined positionally, independent of
# the frame's index labels.
sorted_counts = combined_subpools[sort_count_col].to_numpy(dtype=float)
expression_counts = combined_subpools[exp_count_col].to_numpy(dtype=float)
combined_subpools['enrichment'] = sorted_counts / expression_counts
# Natural logarithm, as with math.log.
combined_subpools['log(enrichment)'] = np.log(combined_subpools['enrichment'].to_numpy(dtype=float))

In [25]:
# Pass the table and the column name 'rep_CDS' to get_seq_param and rebind
# combined_subpools to whatever it returns.
# NOTE(review): get_seq_param is not defined in this notebook; presumably it
# adds per-sequence property columns computed from 'rep_CDS' — confirm, and
# check whether re-running this cell is safe.
combined_subpools = get_seq_param(combined_subpools, 'rep_CDS')

In [26]:
# Call name_to_surface on every design name; the return value is discarded.
# NOTE(review): presumably a smoke test that every design name can be parsed
# before the 'surface' column is built — confirm, or remove the cell if
# name_to_surface has no side effects.
for design in combined_subpools['design']:
    name_to_surface(design) 

In [27]:
# Derive a 'scaffold' and a 'surface' label from every design name and list
# the distinct values of each new column.
design_names = list(combined_subpools['design'])

combined_subpools['scaffold'] = list(map(name_to_scaffold, design_names))
unique_scaffolds = set(combined_subpools['scaffold'])
print (sorted(unique_scaffolds))

combined_subpools['surface'] = list(map(name_to_surface, design_names))
unique_surfaces = set(combined_subpools['surface'])
print (sorted(unique_surfaces))

['2H_15_cap', '2H_26_cap', '2H_28_cap', '2H_58', '2H_58_0001_CHIMERA_2H_58', 'DHR10_5CWG_XtalFit', 'DHR14_5CWH_XtalFit', 'DHR14_5CWH_XtalFit_0001_CHIMERA_DHR14_5CWH_XtalFit', 'DHR49_5CWJ_XtalFit', 'DHR53_5CWK_XtalFit', 'EIQAQFQGDTQVQNG', 'EVQNVNKF', 'FD023_sol_CAETN', 'FI2000161', 'FI998143', 'FI998252', 'FI_AP_6MRR', 'FI_AP_6MRS', 'FQIGSSGQ', 'KVSSNQVQQV', 'PDL_0_4', 'PPR_c3a145_N2SLT', 'PPR_c3a145_OG', 'QAEGGQLQVQAQGNSQIEVGSNG', 'QAQAQLQLQAQGGGDT', 'QAQLQIQASGT', 'QAQLQIQSSGSS', 'QAQLQVQGSSV', 'QFQVQLQAGSGEIQLSNSQLQIQAQIGTG', 'QIQQGT', 'QIQVQAQGSNT', 'QIQVQIQSSGGS', 'QNQVQLQGGS', 'QVQAQLQVQSTG', 'QVQIQVQAQAQG', 'QVQVQIQSSGAS', 'RiAFP_4DT5', 'THR_8_NSR_XtalFit', 'THR_8_NSR_XtalFit_0001_CHIMERA_THR_8_NSR_XtalFit', 'THR_8_NSR_XtalFit_0001_CHIMERA_THR_DN_T6_XtalFit', 'THR_DN_T6_XtalFit', 'abr_10', 'abr_3']
['DEAroMidGr', 'H40E20D20', 'H40LIM40', 'His20Cys20', 'Neg60', 'Neg90', 'NegDEqQN', 'NegGre45', 'PolyAs', 'PolyLys', 'Pos60', 'Pos90', 'PosGre45', 'PosMidGr', 'PosMidGrDv', 'Q65N25', '

In [28]:
# Overview of the labels in the table: the first five design names
# (alphabetical), then every distinct scaffold and surface, tab-separated.
first_designs = sorted(set(combined_subpools['design']))[:5]
print ('Designs:')
print ('\t'.join(first_designs), '...')

for heading, column in (('\nScaffolds:', 'scaffold'), ('\nSurfaces:', 'surface')):
    distinct_values = sorted(set(combined_subpools[column]))
    print (heading)
    print ('\t'.join(distinct_values))

Designs:
2H_15_cap_DEAroMidGr_surfA_antirep_thread7_0009	2H_15_cap_DEAroMidGr_surfA_antirep_thread9_0020	2H_15_cap_DEAroMidGr_surfA_repeat_thread8_0003	2H_15_cap_DEAroMidGr_surfB_antirep_thread2_0041	2H_15_cap_DEAroMidGr_surfB_antirep_thread4_0030_CHIMERA_2H_15_cap_PosGre45_surfB_repeat_thread6_0009 ...

Scaffolds:
2H_15_cap	2H_26_cap	2H_28_cap	2H_58	2H_58_0001_CHIMERA_2H_58	DHR10_5CWG_XtalFit	DHR14_5CWH_XtalFit	DHR14_5CWH_XtalFit_0001_CHIMERA_DHR14_5CWH_XtalFit	DHR49_5CWJ_XtalFit	DHR53_5CWK_XtalFit	EIQAQFQGDTQVQNG	EVQNVNKF	FD023_sol_CAETN	FI2000161	FI998143	FI998252	FI_AP_6MRR	FI_AP_6MRS	FQIGSSGQ	KVSSNQVQQV	PDL_0_4	PPR_c3a145_N2SLT	PPR_c3a145_OG	QAEGGQLQVQAQGNSQIEVGSNG	QAQAQLQLQAQGGGDT	QAQLQIQASGT	QAQLQIQSSGSS	QAQLQVQGSSV	QFQVQLQAGSGEIQLSNSQLQIQAQIGTG	QIQQGT	QIQVQAQGSNT	QIQVQIQSSGGS	QNQVQLQGGS	QVQAQLQVQSTG	QVQIQVQAQAQG	QVQVQIQSSGAS	RiAFP_4DT5	THR_8_NSR_XtalFit	THR_8_NSR_XtalFit_0001_CHIMERA_THR_8_NSR_XtalFit	THR_8_NSR_XtalFit_0001_CHIMERA_THR_DN_T6_XtalFit	THR_DN_T6_XtalFit	abr_10	abr

In [29]:
# Order the table from most to least enriched, then write it (index included)
# as a tab-separated file named after the sorted condition.
combined_subpools = combined_subpools.sort_values(by='log(enrichment)', ascending=False)
merged_counts_file = f'merged_{conditions[1]}_counts.tsv'
combined_subpools.to_csv(merged_counts_file, sep='\t')

In [30]:
# Display the final table as the cell output.
combined_subpools

Unnamed: 0,design,exp_count,exp_CDS,subpool,exp_frac,InO_count,InO_CDS,InO_frac,rep_CDS,enrichment,...,Isoelectric point,length,GRAVY,% His,% Thr,% Arg,% Asp,% Cys,scaffold,surface
484,THR_DN_T6_XtalFit_H40E20D20_surfB_antirep_thre...,1.0,X,SP15,,73003.0,RIPDEPAPDPKSSEEIVEEAETALKALLEEAEKGGKHDAHHIAEKL...,0.243934,RIPDEPAPDPKSSEEIVEEAETALKALLEEAEKGGKHDAHHIAEKL...,73003.000000,...,8.985362,164,-0.416774,2.439024,2.439024,5.487805,2.439024,0.000000,THR_DN_T6_XtalFit,H40E20D20
512,QAQAQLQLQAQGGGDT_seq5_0001_PolyAs_surfB_repeat...,1.0,X,SP6,,41160.0,RNENQQNDTKGSTSESEVNVERSTTSDAQADSDSDVDEDTTSDAQA...,0.126248,RNENQQNDTKGSTSESEVNVERSTTSDAQADSDSDVDEDTTSDAQA...,41160.000000,...,4.050028,156,-0.896667,0.641026,8.333333,1.923077,13.461538,0.000000,QAQAQLQLQAQGGGDT,PolyAs
513,QAQAQLQLQAQGGGDT_seq5_0001_greNegDEY_surfA_ant...,1.0,X,SP6,,39881.0,RIQDSPDIVPKGSTSVSDVEVVRETTSDAEARSSSKVEESTTSDAQ...,0.122325,RIQDSPDIVPKGSTSVSDVEVVRETTSDAEARSSSKVEESTTSDAQ...,39881.000000,...,4.446480,141,-0.535766,0.000000,9.219858,2.127660,7.092199,0.000000,QAQAQLQLQAQGGGDT,greNegDEY
398,FQIGSSGQ_seq86_0004_Neg60_surfA_repeat_thread2...,1.0,X,SP9,,28107.0,RDQIIPGVNPKGSDERDETDTDQADEHQTRQADETQEVCYW,0.121046,RDQIIPGVNPKGSDERDETDTDQADEHQTRQADETQEVCYW,28107.000000,...,4.094420,41,-1.704878,2.439024,9.756098,7.317073,17.073171,2.439024,FQIGSSGQ,Neg60
116,RiAFP_4DT5_PosMidGr_surfB_antirep_thread2_0021...,1.0,X,SP2,,11726.0,XSGSSGGRAPPPPGVINVKGSGVARAYGAGAMASGTSEPGSRSRAV...,0.026559,XSGSSGGRAPPPPGVINVKGSGVARAYGAGAMASGTSEPGSRSRAV...,11726.000000,...,9.802821,139,-0.640146,0.719424,10.071942,7.194245,2.877698,0.000000,RiAFP_4DT5,PosMidGr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,QAEGGQLQVQAQGNSQIEVGSNG_seq58_0004_NegGre45_su...,3644.0,NDXTXRGGDLDVEAVGPSEINVGNDAKAEGGTLFVIADGPSKITVG...,SP5,0.017785,1.0,X,,NDXTXRGGDLDVEAVGPSEINVGNDAKAEGGTLFVIADGPSKITVG...,0.000274,...,4.050028,118,-0.041379,0.847458,5.084746,0.847458,6.779661,0.000000,QAEGGQLQVQAQGNSQIEVGSNG,NegGre45
2,FI998143_Thr90_surfB_repeat_thread4_0016,3702.0,RNXXDXXXXXXXKGSGPITVDVTKPGEGTTELTKGLTETKKSDTTV...,SP16,0.017388,1.0,X,,RNXXDXXXXXXXKGSGPITVDVTKPGEGTTELTKGLTETKKSDTTV...,0.000270,...,9.695094,81,-0.965278,2.469136,17.283951,6.172840,4.938272,0.000000,FI998143,Thr90
0,QAEGGQLQVQAQGNSQIEVGSNG_seq58_0004_Thr60_surfA...,3724.0,VENDXXXRGGTLIVTAYGPSEINVGNDAKAEGGTLIVTAYGPSKIT...,SP5,0.018175,1.0,X,,VENDXXXRGGTLIVTAYGPSEINVGNDAKAEGGTLIVTAYGPSKIT...,0.000269,...,4.382480,120,0.043590,0.833333,11.666667,0.833333,4.166667,0.000000,QAEGGQLQVQAQGNSQIEVGSNG,Thr60
0,RiAFP_4DT5_His20Cys20_surfA_repeat_thread8_0006,3735.0,KGSGHAKATGAGAMASGTSEPGSHSVAVACGRGATARSTSTGRGHA...,SP2,0.025213,1.0,X,,KGSGHAKATGAGAMASGTSEPGSHSVAVACGRGATARSTSTGRGHA...,0.000268,...,9.967731,121,-0.016529,5.785124,11.570248,4.132231,0.000000,4.132231,RiAFP_4DT5,His20Cys20
