In [20]:
import os
import pandas as pd

In [21]:
####USER DEFINED VARIABLES####
indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd'
outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd'

strictest_rmsd_cutoff = 0.5
strict_rmsd_cutoff = 1.0
lenient_rmsd_cutoff = 1.5
plddt_cutoff = 80
pae_cutoff = 5
min0_motif_res_in_heptad = 7
min2_motif_res_in_heptad = 0

In [22]:
#import data for all msd design conditions
#mpnn dfs already filtered:
    #min0 < 1.0 angstroms to rosetta
    #min2 < 1.5 angstroms to rosetta
    #socket_call == 1
    #min0 <= 7 ALFA res in heptad
    #min2 ==0 ALFA res in heptad

passing_designs_to_concat = []
min0_passing_designs_to_concat = []
min2_passing_designs_to_concat = []

for file in os.listdir(indir):
    if file.endswith('all_metrics.csv'):
        print(f"Processing file: {file}")

        # Read in the data
        df = pd.read_csv(os.path.join(indir, file), index_col=0)
        print(df.shape)
        print(len(df['sequence'].unique()))

        #filter for plddt
        df = df.query('avg_plddt_no_loop > @plddt_cutoff', engine='python').copy()
        print(f"plddt filtered: {len(df['sequence'].unique())}")
        
        #filter for pae
        df = df.query('avg_pae_no_loop < @pae_cutoff', engine='python').copy()
        print(f"pae filtered: {len(df['sequence'].unique())}")

        #get prediction with lowest rmsd to each state
        lowest_rmsd_min0_df = df.sort_values('min0_all_rmsd_no_loop', ascending=True).drop_duplicates('sequence').sort_index()
        lowest_rmsd_min0_df = lowest_rmsd_min0_df.query('min0_all_rmsd_no_loop < @lenient_rmsd_cutoff').copy()
        print(f"rmsd filtered (min0 only): {len(lowest_rmsd_min0_df['sequence'].unique())}")

        lowest_rmsd_min2_df = df.sort_values('min2_all_rmsd_no_loop', ascending=True).drop_duplicates('sequence').sort_index()
        lowest_rmsd_min2_df = lowest_rmsd_min2_df.query('min2_all_rmsd_no_loop < @lenient_rmsd_cutoff').copy()
        print(f"rmsd filtered (min2 only): {len(lowest_rmsd_min2_df['sequence'].unique())}")

        intersect_df = pd.merge(lowest_rmsd_min0_df, lowest_rmsd_min2_df, how='inner', on='sequence', suffixes=('_min0', '_min2'))
        print(f"rmsd filtered: {len(intersect_df['sequence'].unique())}")
        passing_designs_to_concat.append(intersect_df)

        #get sequences that pass filters for only one state
        min0_only_df = lowest_rmsd_min0_df[~lowest_rmsd_min0_df['sequence'].isin(intersect_df['sequence'])]
        print(f"min0 only: {len(min0_only_df['sequence'].unique())}")
        min0_passing_designs_to_concat.append(min0_only_df)

        min2_only_df = lowest_rmsd_min2_df[~lowest_rmsd_min2_df['sequence'].isin(intersect_df['sequence'])]
        print(f"min2 only: {len(min2_only_df['sequence'].unique())}")
        min2_passing_designs_to_concat.append(min2_only_df)

all_passing_designs_df = pd.concat(passing_designs_to_concat)
min0_passing_designs_df = pd.concat(min0_passing_designs_to_concat)
min2_passing_designs_df = pd.concat(min2_passing_designs_to_concat)
print(all_passing_designs_df.shape)
print(len(all_passing_designs_df['sequence'].unique()))
all_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing.csv')
min0_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing_min0_only.csv')
min2_passing_designs_df.to_csv(f'{outdir}/msd_designs_passing_min2_only.csv')

Processing file: MSD_MPNN_53_g4s_af2_all_metrics.csv
(1247, 39)
377
plddt filtered: 331
pae filtered: 220
rmsd filtered (min0 only): 173
rmsd filtered (min2 only): 14
rmsd filtered: 2
min0 only: 171
min2 only: 12
Processing file: MSD_MPNN_52_bm01_rosetta_all_metrics.csv
(7951, 38)
2347
plddt filtered: 2126
pae filtered: 1081
rmsd filtered (min0 only): 956
rmsd filtered (min2 only): 85
rmsd filtered: 9
min0 only: 947
min2 only: 76
Processing file: MSD_f2s_os_52_bm01_af2_all_metrics.csv
(1050, 39)
276
plddt filtered: 273
pae filtered: 191
rmsd filtered (min0 only): 92
rmsd filtered (min2 only): 96
rmsd filtered: 0
min0 only: 92
min2 only: 96
Processing file: MSD_f2s_as_52_bm01_af2_all_metrics.csv
(1127, 39)
278
plddt filtered: 274
pae filtered: 178
rmsd filtered (min0 only): 125
rmsd filtered (min2 only): 54
rmsd filtered: 2
min0 only: 123
min2 only: 52
Processing file: MSD_MPNN_53_g4s_rosetta_all_metrics.csv
(1611, 38)
535
plddt filtered: 457
pae filtered: 264
rmsd filtered (min0 only):

In [23]:
print(all_passing_designs_df['min2_bb_id_min0'].value_counts())
print(all_passing_designs_df['seq_design_method_min0'].value_counts(normalize=True))
print(len(all_passing_designs_df['min0_bb_id_min2'].unique())) #note bb naming differs between f2s and mpnn so may be some duplicates
print(len(all_passing_designs_df['min2_bb_id_min2'].unique()))
print(len(all_passing_designs_df['msd_bbs_id_min2'].unique()))

#all_passing_designs_df.to_csv(f'{indir}/MSD_r2_SECrilly_af2_rosetta_bbs_passing_designs.csv')

min2_bb_id_min0
07141_3_rank_001          49
7141                      25
8351                      11
20230321_08351_ALFA_52    10
7154                       5
07154_3_rank_001           4
20230321_07154_ALFA_52     3
20230321_07439_ALFA_52     3
20230321_07141_ALFA_52     3
08351_7_rank_001           3
08939_8_rank_002           2
7439                       2
07439_6_rank_003           1
Name: count, dtype: int64
seq_design_method_min0
MPNN      0.644628
f2s_os    0.239669
f2s_as    0.115702
Name: proportion, dtype: float64
51
13
50
