In [1]:
import os
import pandas as pd

In [2]:
####USER DEFINED VARIABLES####
indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd_f2s_os_scaled_up/outputs'
outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd_f2s_os_scaled_up/outputs'

strictest_rmsd_cutoff = 0.5
strict_rmsd_cutoff = 1.0
lenient_rmsd_cutoff = 1.5
plddt_cutoff = 80
pae_cutoff = 5
min0_motif_res_in_heptad = 7
min2_motif_res_in_heptad = 0

In [4]:
#import data for all msd design conditions
#mpnn dfs already filtered:
    #min0 < 1.0 angstroms to rosetta
    #min2 < 1.5 angstroms to rosetta
    #socket_call == 1
    #min0 <= 7 ALFA res in heptad
    #min2 ==0 ALFA res in heptad

passing_designs_to_concat = []
min0_passing_designs_to_concat = []
min2_passing_designs_to_concat = []

for file in os.listdir(indir):
    if file.endswith('all_metrics.csv'):
        print(f"Processing file: {file}")

        # Read in the data
        df = pd.read_csv(os.path.join(indir, file), index_col=0)
        print(df.shape)
        print(len(df['sequence'].unique()))

        #filter for plddt
        df = df.query('avg_plddt_no_loop > @plddt_cutoff', engine='python').copy()
        print(f"plddt filtered: {len(df['sequence'].unique())}") 
        
        #filter for pae
        df = df.query('avg_pae_no_loop < @pae_cutoff', engine='python').copy()
        print(f"pae filtered: {len(df['sequence'].unique())}")

        #get prediction with lowest rmsd to each state
        lowest_rmsd_min0_df = df.sort_values('min0_all_rmsd_no_loop', ascending=True).drop_duplicates('sequence').sort_index() 
        lowest_rmsd_min0_df = lowest_rmsd_min0_df.query('min0_all_rmsd_no_loop < @lenient_rmsd_cutoff').copy()
        print(f"rmsd filtered (min0 only): {len(lowest_rmsd_min0_df['seq_id'].unique())}") #TODO: change back to sequence

        lowest_rmsd_min2_df = df.sort_values('min2_all_rmsd_no_loop', ascending=True).drop_duplicates('sequence').sort_index()
        lowest_rmsd_min2_df = lowest_rmsd_min2_df.query('min2_all_rmsd_no_loop < @lenient_rmsd_cutoff').copy()
        print(f"rmsd filtered (min2 only): {len(lowest_rmsd_min2_df['seq_id'].unique())}") #TODO: change back to sequence

        intersect_df = pd.merge(lowest_rmsd_min0_df, lowest_rmsd_min2_df, how='inner', on='sequence', suffixes=('_min0', '_min2'))
        print(f"rmsd filtered: {len(intersect_df['sequence'].unique())}")
        passing_designs_to_concat.append(intersect_df)

        #get sequences that pass filters for only one state
        min0_only_df = lowest_rmsd_min0_df[~lowest_rmsd_min0_df['seq_id'].isin(intersect_df['sequence'])]
        print(f"min0 only: {len(min0_only_df['sequence'].unique())}")
        min0_passing_designs_to_concat.append(min0_only_df)

        min2_only_df = lowest_rmsd_min2_df[~lowest_rmsd_min2_df['seq_id'].isin(intersect_df['sequence'])]
        print(f"min2 only: {len(min2_only_df['sequence'].unique())}")
        min2_passing_designs_to_concat.append(min2_only_df)

all_passing_designs_df = pd.concat(passing_designs_to_concat)
min0_passing_designs_df = pd.concat(min0_passing_designs_to_concat)
min2_passing_designs_df = pd.concat(min2_passing_designs_to_concat)
print(all_passing_designs_df.shape)
print(len(all_passing_designs_df['sequence'].unique()))
all_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing.csv')
min0_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing_min0_only.csv')
min2_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing_min2_only.csv')

Processing file: 1_all_metrics.csv
(12381, 37)
3118
plddt filtered: 3024
pae filtered: 2437
rmsd filtered (min0 only): 2149
rmsd filtered (min2 only): 239
rmsd filtered: 38
min0 only: 2149
min2 only: 239
Processing file: 6_all_metrics.csv
(12405, 37)
3090
plddt filtered: 3002
pae filtered: 2458
rmsd filtered (min0 only): 2197
rmsd filtered (min2 only): 199
rmsd filtered: 35
min0 only: 2197
min2 only: 199
Processing file: 9_all_metrics.csv
(12249, 37)
3063
plddt filtered: 2977
pae filtered: 2422
rmsd filtered (min0 only): 2125
rmsd filtered (min2 only): 236
rmsd filtered: 33
min0 only: 2125
min2 only: 236
Processing file: 7_all_metrics.csv
(12362, 37)
3095
plddt filtered: 3009
pae filtered: 2414
rmsd filtered (min0 only): 2123
rmsd filtered (min2 only): 237
rmsd filtered: 27
min0 only: 2123
min2 only: 237
Processing file: 8_all_metrics.csv
(12510, 37)
3108
plddt filtered: 3036
pae filtered: 2449
rmsd filtered (min0 only): 2171
rmsd filtered (min2 only): 232
rmsd filtered: 39
min0 only: 

In [5]:
print(all_passing_designs_df['min2_bb_id_min0'].value_counts())
print(all_passing_designs_df['seq_design_method_min0'].value_counts(normalize=True))
print(len(all_passing_designs_df['min0_bb_id_min2'].unique())) #note bb naming differs between f2s and mpnn so may be some duplicates
print(len(all_passing_designs_df['min2_bb_id_min2'].unique()))
print(len(all_passing_designs_df['msd_bbs_id_min2'].unique()))

#all_passing_designs_df.to_csv(f'{indir}/MSD_r2_SECrilly_af2_rosetta_bbs_passing_designs.csv')

min2_bb_id_min0
7141    181
8351     77
7154     35
7439     33
Name: count, dtype: int64
seq_design_method_min0
f2s_os    1.0
Name: proportion, dtype: float64
29
4
58
