In [1]:
import os
import json
import re
import pandas as pd

In [2]:
####USER DEFINED VARIABLES####
#Root folder that holds every MSD metric sub-directory. All input directories
#below are derived from it, so relocating the dataset is a one-line change.
msd_metrics_dir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd'

#MPNN inputs for rosetta backbones and for af2 backbones
rosetta_mpnn_indir = f'{msd_metrics_dir}/mpnn_rosetta_msd'
af2_mpnn_indir = f'{msd_metrics_dir}/mpnn_af2_msd'

#F2S inputs for af2 backbones: original sampling (os) and autoregressive sampling (as)
af2_f2s_os_indir = f'{msd_metrics_dir}/f2s_os_af2_msd'
af2_f2s_as_indir = f'{msd_metrics_dir}/f2s_as_af2_msd'

#Used as the file-name prefix of the input/output files and as the value of the 'state_design' column
seq_design_condition = 'MSD'

#Logs, per-condition csvs and compiled csvs are written directly into the root folder
outdir = msd_metrics_dir

In [3]:
#MPNN for rosetta backbones
#For every '*af2_metrics.csv' file in rosetta_mpnn_indir: load the af2 metrics, the
#filtered socket table, the designed sequences and the file-name mapping json, merge
#them, and write one '<...>_all_metrics.csv' per condition plus one compiled csv.
bb_model_type = 'rosetta'
seq_design_method = 'MPNN'

#loop name used in the metrics file names -> loop sequence used in the seqs csv file names
loop_seq_by_loop = {'bm01': 'SDPRKK', 'g4s': 'GGGGS'}

#key columns used to join the per-structure table onto the full socket output table
socket_merge_cols = ['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res']

#make mpnn_log.txt file in outdir (mode 'w' starts a fresh log each time this cell runs)
mpnn_log_file = os.path.join(outdir, f'{seq_design_condition}_mpnn_log.txt')
with open(mpnn_log_file, 'w') as log_file:
    log_file.write(f'Processing MPNN metrics for {seq_design_condition}, {bb_model_type} backbones condition\n\n')
    log_file.write(f'Input directory: {rosetta_mpnn_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

#os.listdir returns entries in arbitrary order; sorting keeps the row order of the
#compiled csv (and the log) reproducible between runs and machines
for file in sorted(os.listdir(rosetta_mpnn_indir)):
    if not file.endswith('af2_metrics.csv'):
        continue

    condition_id = file.split('af2_metrics.csv')[0]
    print(condition_id)

    #fields 1-3 of the underscore-separated file name (field 0 is not used here)
    name_parts = file.split('_')
    thread_position = name_parts[1]
    loop = name_parts[2]
    input_bb_type = name_parts[3]

    #fail loudly on an unknown loop instead of reusing loop_seq from a previous file
    if loop not in loop_seq_by_loop:
        raise ValueError(f'Unknown loop {loop!r} in {file}; expected one of {sorted(loop_seq_by_loop)}')
    loop_seq = loop_seq_by_loop[loop]

    #common path prefix of the socket / mapping files that belong to this metrics file
    condition_prefix = f'{rosetta_mpnn_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb'

    #load corresponding set of dfs to merge
    af2_df = pd.read_csv(f'{rosetta_mpnn_indir}/{file}')
    socket_fil_df = pd.read_csv(f'{condition_prefix}_socket_filtered.csv')
    seqs_df = pd.read_csv(f'{rosetta_mpnn_indir}/{thread_position}_threads_MPNN_{loop_seq}_all_seqs_for_colabfold.csv')

    #print out number of seqs
    print(file)
    print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
    print(f'af2_df shape: {af2_df.shape[0]}')
    print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

    #add additional info
    af2_df['state_design'] = seq_design_condition
    af2_df['thread_position'] = thread_position
    af2_df['loop'] = loop
    af2_df['input_bb_type'] = input_bb_type
    af2_df['seq_design_method'] = seq_design_method

    #raw strings keep '\d' a regex token instead of an invalid string escape;
    #n=1 guarantees exactly two columns for the two-column assignment
    af2_df[['seq_id', 'af2_model_info']] = af2_df['design_id'].str.split('_unrelaxed_', regex=True, n=1, expand=True)
    af2_df['min0_bb_id'] = af2_df['seq_id'].str.split(r'_\d{8}_', regex=True, expand=False).str[0]
    af2_df['min2_bb_id'] = af2_df['seq_id'].str.split(r'\d{8}_\d{5}_ALFA_\d{2}_', regex=True, n=1, expand=False).str[1]
    af2_df['min2_bb_id'] = af2_df['min2_bb_id'].str[:22] #hardcoded fix later

    #load json
    with open(f'{condition_prefix}_file_mapping.json') as f:
        json_data = json.load(f)

    #invert the mapping so the json values become the lookup keys
    #(presumably original file name -> renamed pdb file; TODO confirm against the script that wrote the json)
    inverted_json_dict = {v: k for k, v in json_data.items()}

    #add og design name to socket df: look up '<design_id>.pdb' in the inverted mapping,
    #then strip the literal '.pdb' (regex=False: '.' must not act as a wildcard, and the
    #default of `regex` differs between pandas versions)
    socket_fil_df['design_id'] = (
        (socket_fil_df['design_id'].astype(str) + '.pdb')
        .map(inverted_json_dict)
        .str.replace('.pdb', '', regex=False)
    )

    #merge af2 and socket
    af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

    #merge af2, socket, and seqs
    full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
    full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
    full_dfs_to_concat.append(full_df)

    #merge with all socket metrics for structure with cc
    socket_df = pd.read_csv(f'{condition_prefix}_all_socket_outputs.csv')

    #add og design name to socket df (same lookup as for socket_fil_df)
    socket_df['design_id'] = (
        (socket_df['design_id'].astype(str) + '.pdb')
        .map(inverted_json_dict)
        .str.replace('.pdb', '', regex=False)
    )
    socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
    print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

    with open(mpnn_log_file, 'a') as log_file:
        log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
        log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
        log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
        log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

    #how='right' keeps every row of socket_df, including rows without a match in full_df
    cc_design_metrics_df = full_df.merge(socket_df, on=socket_merge_cols, how='right')
    cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{thread_position}_{loop}_{input_bb_type}_all_metrics.csv')

#save master mpnn df
if not full_dfs_to_concat:
    raise ValueError(f'No *af2_metrics.csv files found in {rosetta_mpnn_indir}')
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{bb_model_type}_compiled_metrics.csv')

MSD_53_g4s_rosetta_bb_
MSD_53_g4s_rosetta_bb_af2_metrics.csv
seqs_df shape: 800
af2_df shape: 4000
socket_fil_df shape: 4000
socket_df shape: 1611
MSD_52_g4s_rosetta_bb_
MSD_52_g4s_rosetta_bb_af2_metrics.csv
seqs_df shape: 2900
af2_df shape: 14500
socket_fil_df shape: 14500
socket_df shape: 12098
MSD_52_bm01_rosetta_bb_
MSD_52_bm01_rosetta_bb_af2_metrics.csv
seqs_df shape: 2900
af2_df shape: 14500
socket_fil_df shape: 14500
socket_df shape: 7951
MSD_53_bm01_rosetta_bb_
MSD_53_bm01_rosetta_bb_af2_metrics.csv
seqs_df shape: 800
af2_df shape: 4000
socket_fil_df shape: 4000
socket_df shape: 972


In [4]:
#MPNN for af2 backbones
bb_model_type = 'af2'
seq_design_method = 'MPNN'

#make mpnn_log.txt file in outdir
with open(mpnn_log_file, 'a') as log_file:
    log_file.write(f'Processing MPNN metrics for {seq_design_condition}, {bb_model_type} backbones condition\n\n')
    log_file.write(f'Input directory: {af2_mpnn_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(af2_mpnn_indir):
    if file.endswith('af2_metrics.csv'):
        condition_id = file.split('af2_metrics.csv')[0]
        print(condition_id)

        thread_position = file.split('_')[1]
        loop = file.split('_')[2]
        input_bb_type = file.split('_')[3]

        if loop == 'bm01':
            loop_seq = 'SDPRKK'
        elif loop == 'g4s':
            loop_seq = 'GGGGS'

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{af2_mpnn_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{af2_mpnn_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{af2_mpnn_indir}/{thread_position}_threads_{loop}_MPNN_all_seqs_for_colabfold.csv')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['thread_position'] = thread_position
        af2_df['loop'] = loop
        af2_df['input_bb_type'] = input_bb_type
        af2_df['seq_design_method'] = seq_design_method

        af2_df['min0_bb_id'] = af2_df['design_id'].str.split('_\d\d\d\d\d_\d+_rank', regex=True, expand=False).str[0]
        af2_df['min2_bb_id'] = af2_df['design_id'].str.split('ALFA_\d\d_\d+_rank_\d\d\d_|_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['af2_model_info'] = af2_df['design_id'].str.split('ALFA_\d\d_\d+_rank_\d\d\d_|_\d+_unrelaxed', regex=True, expand=False).str[2]
        af2_df['msd_bbs_id'] = af2_df['min0_bb_id'] + '_' + af2_df['min2_bb_id']
        af2_df['seq_id'] = af2_df['design_id'].str.split('_unrelaxed_rank', regex=True, expand=False).str[0]
        
        #load json
        with open(f'{af2_mpnn_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_file_mapping.json') as f:
            json_data = json.load(f)
        
        #make json into df
        inverted_json_dict = dict((v,k) for k,v in json_data.items())

        #add og design name to socket df
        socket_fil_df['design_id_pdb'] = socket_fil_df['design_id'].astype(str) + '.pdb'
        socket_fil_df['design_id'] = socket_fil_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_fil_df = socket_fil_df.drop(columns=['design_id_pdb'])
        socket_fil_df['design_id'] = socket_fil_df['design_id'].str.replace('.pdb', '')

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{af2_mpnn_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_all_socket_outputs.csv')
        
        #add og design name to socket df
        socket_df['design_id_pdb'] = socket_df['design_id'].astype(str) + '.pdb'
        socket_df['design_id'] = socket_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_df = socket_df.drop(columns=['design_id_pdb'])
        socket_df['design_id'] = socket_df['design_id'].str.replace('.pdb', '')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

        with open(mpnn_log_file, 'a') as log_file:
            log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{thread_position}_{loop}_{input_bb_type}_all_metrics.csv')

#save master mpnn df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{bb_model_type}_compiled_metrics.csv')        

MSD_52_bm01_af2_bb_
MSD_52_bm01_af2_bb_af2_metrics.csv
seqs_df shape: 300
af2_df shape: 1500
socket_fil_df shape: 1500
socket_df shape: 1233
MSD_53_g4s_af2_bb_
MSD_53_g4s_af2_bb_af2_metrics.csv
seqs_df shape: 650
af2_df shape: 3250
socket_fil_df shape: 3250
socket_df shape: 1247
MSD_52_g4s_af2_bb_
MSD_52_g4s_af2_bb_af2_metrics.csv
seqs_df shape: 3450
af2_df shape: 17250
socket_fil_df shape: 17250
socket_df shape: 13091


In [5]:
#F2s original sampling (os) for AF2 backbones
bb_model_type = 'af2'
seq_design_method= 'f2s_os'

#make f2s_log.txt file in outdir
f2s_log_file = os.path.join(outdir, f'{seq_design_condition}_f2s_os_log.txt')
with open(f2s_log_file, 'w') as log_file:
    log_file.write(f'Processing F2s os metrics for {seq_design_condition}, {bb_model_type} backbones condition\n\n')
    log_file.write(f'Input directory: {af2_f2s_os_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(af2_f2s_os_indir):
    if file.endswith('af2_metrics.csv'):
        condition_id = file.split('af2_metrics.csv')[0]
        print(condition_id)

        thread_position = file.split('_')[1]
        loop = file.split('_')[2]
        input_bb_type = file.split('_')[3]

        #load json
        with open(f'{af2_f2s_os_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_os_file_mapping.json') as f:
            json_data = json.load(f)
        
        #make json into df
        inverted_json_dict = dict((v,k) for k,v in json_data.items())

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{af2_f2s_os_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{af2_f2s_os_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_os_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{af2_f2s_os_indir}/F2S_all_seqs_for_colabfold.csv')

        #get relevant seqs for design condition
        seqs_df = seqs_df[seqs_df['seq_id'].str.contains(f'seqs{thread_position}_{loop}')].copy()

        #add og design name to socket df
        socket_fil_df['design_id_pdb'] = socket_fil_df['design_id'].astype(str) + '.pdb'
        socket_fil_df['design_id'] = socket_fil_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_fil_df = socket_fil_df.drop(columns=['design_id_pdb'])
        socket_fil_df['design_id'] = socket_fil_df['design_id'].str.replace('.pdb', '')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['thread_position'] = thread_position
        af2_df['loop'] = loop
        af2_df['input_bb_type'] = input_bb_type
        af2_df['seq_design_method'] = seq_design_method

        #seqs52_g4s_min0_13931_min2_07141_49_unrelaxed_rank_003_alphafold2_multimer_v3_model_5_seed_000.pdb
        af2_df['min0_bb_id'] = af2_df['design_id'].str.split('_min\d_', regex=True, expand=False).str[1]
        af2_df['min2_bb_id'] = af2_df['design_id'].str.split('min2_|_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['af2_model_info'] = af2_df['design_id'].str.split('_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['msd_bbs_id'] = af2_df['min0_bb_id'] + '_' + af2_df['min2_bb_id']
        af2_df['seq_id'] = af2_df['design_id'].str.split('_unrelaxed_rank', regex=True, expand=False).str[0]

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{af2_f2s_os_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_os_all_socket_outputs.csv')
        
        #add og design name to socket df
        socket_df['design_id_pdb'] = socket_df['design_id'].astype(str) + '.pdb'
        socket_df['design_id'] = socket_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_df = socket_df.drop(columns=['design_id_pdb'])
        socket_df['design_id'] = socket_df['design_id'].str.replace('.pdb', '')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

        with open(f2s_log_file, 'a') as log_file:
            log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{thread_position}_{loop}_{input_bb_type}_all_metrics.csv')

#save master df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{bb_model_type}_compiled_metrics.csv')        

MSD_52_g4s_af2_bb_f2s_os_
MSD_52_g4s_af2_bb_f2s_os_af2_metrics.csv
seqs_df shape: 3450
af2_df shape: 17250
socket_fil_df shape: 17249
socket_df shape: 11461
MSD_52_bm01_af2_bb_f2s_os_
MSD_52_bm01_af2_bb_f2s_os_af2_metrics.csv
seqs_df shape: 300
af2_df shape: 1500
socket_fil_df shape: 1500
socket_df shape: 1050
MSD_53_g4s_af2_bb_f2s_os_
MSD_53_g4s_af2_bb_f2s_os_af2_metrics.csv
seqs_df shape: 650
af2_df shape: 3250
socket_fil_df shape: 3250
socket_df shape: 1114


In [6]:
#F2s autoregressive sampling (as) for AF2 backbones
bb_model_type = 'af2'
seq_design_method= 'f2s_as'

#make f2s_log.txt file in outdir
f2s_log_file = os.path.join(outdir, f'{seq_design_condition}_f2s_as_log.txt')
with open(f2s_log_file, 'w') as log_file:
    log_file.write(f'Processing F2s as metrics for {seq_design_condition}, {bb_model_type} backbones condition\n\n')
    log_file.write(f'Input directory: {af2_f2s_as_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(af2_f2s_as_indir):
    if file.endswith('af2_metrics.csv'):
        condition_id = file.split('af2_metrics.csv')[0]
        print(condition_id)

        thread_position = file.split('_')[1]
        loop = file.split('_')[2]
        input_bb_type = file.split('_')[3]

        #load json
        with open(f'{af2_f2s_as_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_as_file_mapping.json') as f:
            json_data = json.load(f)
        
        #make json into df
        inverted_json_dict = dict((v,k) for k,v in json_data.items())

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{af2_f2s_as_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{af2_f2s_as_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_as_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{af2_f2s_as_indir}/F2S_all_seqs_for_colabfold.csv')

        #get relevant seqs for design condition
        seqs_df = seqs_df[seqs_df['seq_id'].str.contains(f'seqs{thread_position}_{loop}')].copy()

        #add og design name to socket df
        socket_fil_df['design_id_pdb'] = socket_fil_df['design_id'].astype(str) + '.pdb'
        socket_fil_df['design_id'] = socket_fil_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_fil_df = socket_fil_df.drop(columns=['design_id_pdb'])
        socket_fil_df['design_id'] = socket_fil_df['design_id'].str.replace('.pdb', '')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['thread_position'] = thread_position
        af2_df['loop'] = loop
        af2_df['input_bb_type'] = input_bb_type
        af2_df['seq_design_method'] = seq_design_method

        #seqs52_g4s_min0_13931_min2_07141_49_unrelaxed_rank_003_alphafold2_multimer_v3_model_5_seed_000.pdb
        af2_df['min0_bb_id'] = af2_df['design_id'].str.split('_min\d_', regex=True, expand=False).str[1]
        af2_df['min2_bb_id'] = af2_df['design_id'].str.split('min2_|_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['af2_model_info'] = af2_df['design_id'].str.split('_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['msd_bbs_id'] = af2_df['min0_bb_id'] + '_' + af2_df['min2_bb_id']
        af2_df['seq_id'] = af2_df['design_id'].str.split('_unrelaxed_rank', regex=True, expand=False).str[0]

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{af2_f2s_as_indir}/{seq_design_condition}_{thread_position}_{loop}_{input_bb_type}_bb_f2s_as_all_socket_outputs.csv')
        
        #add og design name to socket df
        socket_df['design_id_pdb'] = socket_df['design_id'].astype(str) + '.pdb'
        socket_df['design_id'] = socket_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_df = socket_df.drop(columns=['design_id_pdb'])
        socket_df['design_id'] = socket_df['design_id'].str.replace('.pdb', '')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

        with open(f2s_log_file, 'a') as log_file:
            log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{thread_position}_{loop}_{input_bb_type}_all_metrics.csv')

#save master df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{bb_model_type}_compiled_metrics.csv')        

MSD_53_g4s_af2_bb_f2s_as_
MSD_53_g4s_af2_bb_f2s_as_af2_metrics.csv
seqs_df shape: 650
af2_df shape: 3250
socket_fil_df shape: 3250
socket_df shape: 1388
MSD_52_bm01_af2_bb_f2s_as_
MSD_52_bm01_af2_bb_f2s_as_af2_metrics.csv
seqs_df shape: 300
af2_df shape: 1500
socket_fil_df shape: 1500
socket_df shape: 1127
MSD_52_g4s_af2_bb_f2s_as_
MSD_52_g4s_af2_bb_f2s_as_af2_metrics.csv
seqs_df shape: 3450
af2_df shape: 17250
socket_fil_df shape: 17250
socket_df shape: 11681
