In [14]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [15]:
#### USER-DEFINED SETTINGS ####
# indir:  folder scanned for files ending in 'all_metrics.csv'.
# outdir: folder the summary CSVs produced by this notebook are written to.
indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/ssd'
outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/ssd'

# Upper bounds applied to the min0_all_rmsd_no_loop column (presumably angstroms,
# going by the pre-filter notes in the loading cell -- confirm).
# The loading cell uses: strictest for min0 '52' files, strict for min0 '53' files,
# lenient for min2 files.
strictest_rmsd_cutoff, strict_rmsd_cutoff, lenient_rmsd_cutoff = 0.5, 1.0, 1.5

# Rows are kept when avg_pae_no_loop is below pae_cutoff and
# avg_plddt_no_loop is above plddt_cutoff.
pae_cutoff = 5
plddt_cutoff = 90

# Values compared against num_motif_res_in_heptad:
# min0 files require >= min0_motif_res_in_heptad, min2 files require == min2_motif_res_in_heptad.
min0_motif_res_in_heptad, min2_motif_res_in_heptad = 7, 0

In [16]:
#import data for all ssd design conditions
#mpnn dfs already filtered:
    #min0 < 1.0 angstroms to rosetta
    #min2 < 1.5 angstroms to rosetta
    #socket_call == 1
    #min0 <= 7 ALFA res in heptad
    #min2 ==0 ALFA res in heptad

def _filter_rule_for_file(file_name):
    """Choose the filter thresholds for one metrics file from its file name.

    Returns a tuple ``(rmsd_cutoff, motif_res_count, exact_motif_count)`` or
    ``None`` when the name matches no known condition:

    * contains 'min0' and '52' -> strictest RMSD cutoff, motif count >= min0 value
    * contains 'min0' and '53' -> strict RMSD cutoff, motif count >= min0 value
    * contains 'min2'          -> lenient RMSD cutoff, motif count == min2 value

    'min0' is tested before 'min2' and '52' before '53', so the first match wins.
    """
    if 'min0' in file_name:
        if '52' in file_name:
            return strictest_rmsd_cutoff, min0_motif_res_in_heptad, False
        if '53' in file_name:
            return strict_rmsd_cutoff, min0_motif_res_in_heptad, False
        return None
    if 'min2' in file_name:
        return lenient_rmsd_cutoff, min2_motif_res_in_heptad, True
    return None


def _select_passing_designs(metrics_df, rmsd_cutoff, motif_res_count, exact_motif_count):
    """Return the rows of ``metrics_df`` that pass all filters, one row per sequence.

    A row passes when min0_all_rmsd_no_loop < rmsd_cutoff, avg_plddt_no_loop >
    plddt_cutoff, avg_pae_no_loop < pae_cutoff, and num_motif_res_in_heptad is
    == motif_res_count (``exact_motif_count`` True) or >= motif_res_count (False).
    When a sequence occurs more than once, the row with the lowest
    min0_all_rmsd_no_loop is the one kept.
    """
    motif_res = metrics_df['num_motif_res_in_heptad']
    if exact_motif_count:
        motif_ok = motif_res == motif_res_count
    else:
        motif_ok = motif_res >= motif_res_count

    keep = (
        (metrics_df['min0_all_rmsd_no_loop'] < rmsd_cutoff)
        & (metrics_df['avg_plddt_no_loop'] > plddt_cutoff)
        & (metrics_df['avg_pae_no_loop'] < pae_cutoff)
        & motif_ok
    )

    # Sorting ascending first means drop_duplicates keeps the best-RMSD copy of each sequence.
    return metrics_df[keep].sort_values('min0_all_rmsd_no_loop').drop_duplicates('sequence').copy()


passing_designs_to_concat = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated table (and of the CSV written below) the same on every run.
for file in sorted(os.listdir(indir)):
    if not file.endswith('all_metrics.csv'):
        continue

    rule = _filter_rule_for_file(file)
    if rule is None:
        # Report files that would otherwise be dropped without a trace.
        print(f"Skipping {file}: file name matches no known filter condition")
        continue

    # Read in the data and apply the condition-specific filters
    df = pd.read_csv(os.path.join(indir, file), index_col=0)
    df = _select_passing_designs(df, *rule)

    print(file)
    print(f"Unique sequences passing filters: {df.shape[0]}")
    passing_designs_to_concat.append(df)

if not passing_designs_to_concat:
    # pd.concat([]) would raise a much less helpful "No objects to concatenate".
    raise ValueError(f"No '*all_metrics.csv' files with a known condition found in {indir}")

passing_designs_df = pd.concat(passing_designs_to_concat)
print(passing_designs_df.shape)
print(passing_designs_df.columns)
passing_designs_df.to_csv(f'{outdir}/ssd_designs_passing.csv')
        

SSD_min2_52_f2s_g4s_all_metrics.csv
Unique sequences passing filters: 9
SSD_min2_53_mpnn_bm01_all_metrics.csv
Unique sequences passing filters: 81
SSD_min2_53_mpnn_g4s_all_metrics.csv
Unique sequences passing filters: 63
SSD_min2_53_f2s_bm01_all_metrics.csv
Unique sequences passing filters: 43
SSD_min0_52_mpnn_bm01_all_metrics.csv
Unique sequences passing filters: 37
SSD_min2_52_mpnn_g4s_all_metrics.csv
Unique sequences passing filters: 22
SSD_min2_52_f2s_bm01_all_metrics.csv
Unique sequences passing filters: 1
SSD_min0_53_f2s_g4s_all_metrics.csv
Unique sequences passing filters: 1
SSD_min0_53_mpnn_g4s_all_metrics.csv
Unique sequences passing filters: 34
SSD_min0_53_f2s_bm01_all_metrics.csv
Unique sequences passing filters: 0
SSD_min2_53_f2s_g4s_all_metrics.csv
Unique sequences passing filters: 52
SSD_min0_53_mpnn_bm01_all_metrics.csv
Unique sequences passing filters: 4
SSD_min0_52_f2s_g4s_all_metrics.csv
Unique sequences passing filters: 20
SSD_min0_52_mpnn_g4s_all_metrics.csv
Unique 

In [17]:
# Number of passing designs for each min_condition value.
min_condition_counts = passing_designs_df['min_condition'].value_counts()
print(min_condition_counts)

min_condition
min2    281
min0    214
Name: count, dtype: int64


In [18]:


# Distinct condition values present in the passing designs; these drive the
# per-group loops in this cell and the next one.
min_conditions = passing_designs_df['min_condition'].unique()
thread_positions = passing_designs_df['thread_position'].unique()
loops = ['bm01', 'g4s']

vals_list = []
vals_df_list = []

for min_condition in min_conditions:
    for thread_position in thread_positions:

        # Passing designs belonging to this (min_condition, thread_position) group.
        group_df = passing_designs_df.query('min_condition == @min_condition & thread_position == @thread_position', engine='python').copy()

        # Count how many passing sequences each backbone has within the group.
        seqs_per_bb = group_df['bb_id'].value_counts()
        seqs_per_bb = seqs_per_bb.rename_axis('bb_id')
        counts_df = seqs_per_bb.reset_index(name='num_passing_seqs')

        # A backbone with at least 3 passing sequences is treated as "designable".
        designable_df = counts_df.query('num_passing_seqs >= 3', engine='python').copy()

        # Tag the rows with the group they were computed for.
        designable_df = designable_df.assign(
            min_condition=min_condition,
            thread_position=thread_position,
        )

        vals_df_list.append(designable_df)

# One table of designable backbones across all groups, saved next to the other outputs.
vals_df = pd.concat(vals_df_list)
vals_df.to_csv(f'{outdir}/ssd_bbs_designable.csv')

In [19]:
#get the lowest-rmsd passing design for each designable bb_id, per (min_condition, thread_position)

# Designability in vals_df was determined separately for every
# (min_condition, thread_position) group, so rows are matched on the full
# (bb_id, min_condition, thread_position) key. Matching on bb_id alone would
# also pull in a backbone for groups where it has fewer than 3 passing sequences.
designable_keys = set(zip(vals_df['bb_id'], vals_df['min_condition'], vals_df['thread_position']))
is_designable = np.array(
    [
        key in designable_keys
        for key in zip(
            passing_designs_df['bb_id'],
            passing_designs_df['min_condition'],
            passing_designs_df['thread_position'],
        )
    ],
    dtype=bool,
)
# A positional boolean mask keeps the original row index intact for the CSV below.
designable_bbs_df = passing_designs_df[is_designable].copy()

dfs_to_concat = []

for min_condition in min_conditions:
    for thread_position in thread_positions:
        test_df = designable_bbs_df.query('min_condition == @min_condition & thread_position == @thread_position', engine='python').copy()

        #sort by min0_all_rmsd_no_loop and keep only the best (lowest rmsd) row for each bb_id
        test_df = test_df.sort_values('min0_all_rmsd_no_loop').drop_duplicates('bb_id').copy()

        dfs_to_concat.append(test_df)

all_df = pd.concat(dfs_to_concat)
# Combined label such as '<min_condition>_<thread_position>' for each row.
all_df['min_thread_info'] = all_df['min_condition'] + '_' + all_df['thread_position'].astype(str)
all_df.to_csv(f'{outdir}/ssd_lowest_rmsd_predicted_bbs_designable.csv')