In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
####USER DEFINED VARIABLES####
mpnn_indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/ssd/mpnn_ssd'
f2s_indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/ssd/f2s_ssd'
seq_design_condition= 'SSD'

outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/ssd'

In [5]:
#MPNN
#make mpnn_log.txt file in outdir
mpnn_log_file = os.path.join(outdir, f'{seq_design_condition}_mpnn_log.txt')
with open(mpnn_log_file, 'w') as log_file:
    log_file.write(f'Processing MPNN metrics for {seq_design_condition} condition\n\n')
    log_file.write(f'Input directory: {mpnn_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(mpnn_indir):
    if file.endswith('af2_metrics.csv'):
        condition_id = file.split('af2_metrics.csv')[0]
        print(condition_id)

        min_condition = file.split('_')[0]
        thread_position = file.split('_')[1]
        seq_method = file.split('_')[3]
        loop = file.split('_')[4]

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{mpnn_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{mpnn_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{mpnn_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_seqs.csv')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['min_condition'] = min_condition
        af2_df['thread_position'] = thread_position
        af2_df['seq_method'] = seq_method
        af2_df['loop'] = loop

        af2_df['bb_id'] = af2_df['design_id'].str.split('_\d+_unrelaxed', regex=True, expand=False).str[0]
        af2_df['seq_id'] = af2_df['design_id'].str.split('_unrelaxed_', regex=True, expand=False).str[0]
        af2_df['af_model_info'] = af2_df['design_id'].str.split('_unrelaxed_', regex=True, expand=False).str[1]

        #add in relevant loop
        if loop == 'bm01':
            seqs_df['sequence'] = seqs_df['sequence'].str.replace('/', 'SDPRKK')
        elif loop == 'g4s':
            seqs_df['sequence'] = seqs_df['sequence'].str.replace('/', 'GGGGS')

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{mpnn_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_all_socket_outputs.csv')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

        with open(mpnn_log_file, 'a') as log_file:
            log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_all_metrics.csv')

#save master mpnn df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_all_mpnn_metrics.csv')        

min0_52_ssd_mpnn_g4s_
min0_52_ssd_mpnn_g4s_af2_metrics.csv
seqs_df shape: 3070
af2_df shape: 15350
socket_fil_df shape: 15350
socket_df shape: 11742
min2_52_ssd_mpnn_bm01_
min2_52_ssd_mpnn_bm01_af2_metrics.csv
seqs_df shape: 1230
af2_df shape: 6150
socket_fil_df shape: 6150
socket_df shape: 918
min2_53_ssd_mpnn_bm01_
min2_53_ssd_mpnn_bm01_af2_metrics.csv
seqs_df shape: 1900
af2_df shape: 9500
socket_fil_df shape: 9500
socket_df shape: 4595
min2_53_ssd_mpnn_g4s_
min2_53_ssd_mpnn_g4s_af2_metrics.csv
seqs_df shape: 1900
af2_df shape: 9500
socket_fil_df shape: 9500
socket_df shape: 6306
min2_52_ssd_mpnn_g4s_
min2_52_ssd_mpnn_g4s_af2_metrics.csv
seqs_df shape: 1230
af2_df shape: 6150
socket_fil_df shape: 6150
socket_df shape: 1890
min0_53_ssd_mpnn_bm01_
min0_53_ssd_mpnn_bm01_af2_metrics.csv
seqs_df shape: 6000
af2_df shape: 30000
socket_fil_df shape: 30000
socket_df shape: 15641
min0_53_ssd_mpnn_g4s_
min0_53_ssd_mpnn_g4s_af2_metrics.csv
seqs_df shape: 6000
af2_df shape: 30000
socket_fil_df 

In [6]:
#F2s
#make f2s_log.txt file in outdir
f2s_log_file = os.path.join(outdir, f'{seq_design_condition}_f2s_log.txt')
with open(f2s_log_file, 'w') as log_file:
    log_file.write(f'Processing F2s metrics for {seq_design_condition} condition\n\n')
    log_file.write(f'Input directory: {f2s_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(f2s_indir):
    if file.endswith('af2_metrics.csv'):
        condition_id = file.split('af2_metrics.csv')[0]
        condition_id = condition_id.replace('SSD_', '')
        print(condition_id)

        min_condition = file.split('_')[1]
        thread_position = file.split('_')[2]
        seq_method = file.split('_')[3]
        loop = file.split('_')[4]

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{f2s_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{f2s_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{f2s_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_seqs.csv')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['min_condition'] = min_condition
        af2_df['thread_position'] = thread_position
        af2_df['seq_method'] = seq_method
        af2_df['loop'] = loop

        af2_df['bb_id'] = af2_df['design_id'].str[:22]
        af2_df['seq_id'] = af2_df['design_id'].str[:24]
        af2_df['af_model_info'] = af2_df['design_id'].str[25:]

        #add in relevant loop
        if loop == 'bm01':
            seqs_df['sequence'] = seqs_df['sequence'].str.slice(0, 28) + 'SDPRKK' + seqs_df['sequence'].str.slice(28)
        elif loop == 'g4s':
            seqs_df['sequence'] = seqs_df['sequence'].str.slice(0, 28) + 'GGGGS' + seqs_df['sequence'].str.slice(28)

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{f2s_indir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_all_socket_outputs.csv')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc
        
        with open(f2s_log_file, 'a') as log_file:
            log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')
            
        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{min_condition}_{thread_position}_{seq_method}_{loop}_all_metrics.csv')

#save master f2s df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_all_f2s_metrics.csv')        

min2_53_f2s_bm01_
SSD_min2_53_f2s_bm01_af2_metrics.csv
seqs_df shape: 1900
af2_df shape: 9500
socket_fil_df shape: 9499
socket_df shape: 1784
min2_52_f2s_g4s_
SSD_min2_52_f2s_g4s_af2_metrics.csv
seqs_df shape: 1230
af2_df shape: 6150
socket_fil_df shape: 6150
socket_df shape: 1276
min0_53_f2s_g4s_
SSD_min0_53_f2s_g4s_af2_metrics.csv
seqs_df shape: 6000
af2_df shape: 30000
socket_fil_df shape: 29985
socket_df shape: 9209
min2_52_f2s_bm01_
SSD_min2_52_f2s_bm01_af2_metrics.csv
seqs_df shape: 1230
af2_df shape: 6150
socket_fil_df shape: 6149
socket_df shape: 1125
min2_53_f2s_g4s_
SSD_min2_53_f2s_g4s_af2_metrics.csv
seqs_df shape: 1900
af2_df shape: 9500
socket_fil_df shape: 9498
socket_df shape: 2180
min0_53_f2s_bm01_
SSD_min0_53_f2s_bm01_af2_metrics.csv
seqs_df shape: 6000
af2_df shape: 30000
socket_fil_df shape: 29989
socket_df shape: 8868
min0_52_f2s_bm01_
SSD_min0_52_f2s_bm01_af2_metrics.csv
seqs_df shape: 3070
af2_df shape: 15350
socket_fil_df shape: 15349
socket_df shape: 2357
min0_5