In [4]:
import os
import json
import re
import pandas as pd

In [5]:
####USER DEFINED VARIABLES####
af2_f2s_os_indir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd_f2s_os_scaled_up'

seq_design_condition= 'MSD'

outdir = '/Users/stephaniecrilly/Library/CloudStorage/Box-Box/kortemmelab/home/scrilly/helix_sliding/20250604_r2_hs_lib/metric_files/msd_f2s_os_scaled_up/outputs'

In [6]:
#F2s original sampling (os) for AF2 backbones
bb_model_type = 'af2'
seq_design_method= 'f2s_os'

#make f2s_log.txt file in outdir
f2s_log_file = os.path.join(outdir, f'{seq_design_condition}_f2s_os_log.txt')
with open(f2s_log_file, 'w') as log_file:
    log_file.write(f'Processing F2s os (scaled up) metrics for {seq_design_condition}, {bb_model_type} backbones condition\n\n')
    log_file.write(f'Input directory: {af2_f2s_os_indir}\n\n')
    log_file.write(f'Output directory: {outdir}\n\n')

full_dfs_to_concat = []

for file in os.listdir(af2_f2s_os_indir):
    if file.startswith('af2_metrics'):
        condition_id = file.split('_')[2]
        print(condition_id)
        condition_id = condition_id.split('.')[0]
        print(condition_id)

        input_bb_type = bb_model_type

        #load json
        with open(f'{af2_f2s_os_indir}/file_mapping_{condition_id}.json') as f:
            json_data = json.load(f)
        
        #make json into df
        inverted_json_dict = dict((v,k) for k,v in json_data.items())

        #load corresponding set of dfs to merge
        af2_df = pd.read_csv(f'{af2_f2s_os_indir}/{file}')
        socket_fil_df = pd.read_csv(f'{af2_f2s_os_indir}/{seq_design_condition}_f2s_os_scale_up_{str(condition_id)}_socket_filtered.csv')
        seqs_df = pd.read_csv(f'{af2_f2s_os_indir}/F2S_all_seqs_for_colabfold.csv')

        #add og design name to socket df
        socket_fil_df['design_id_pdb'] = socket_fil_df['design_id'].astype(str) + '.pdb'
        socket_fil_df['design_id'] = socket_fil_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_fil_df = socket_fil_df.drop(columns=['design_id_pdb'])
        socket_fil_df['design_id'] = socket_fil_df['design_id'].str.replace('.pdb', '')

        #print out number of seqs
        print(file)
        print(f'seqs_df shape: {seqs_df.shape[0]}') #should be 1/5 of af2_df
        print(f'af2_df shape: {af2_df.shape[0]}')
        print(f'socket_fil_df shape: {socket_fil_df.shape[0]}') #should be same as af2_df, unless some sockets didn't run

        #add additional info
        af2_df['state_design'] = seq_design_condition
        af2_df['input_bb_type'] = input_bb_type
        af2_df['seq_design_method'] = seq_design_method

        #seqs52_g4s_min0_13931_min2_07141_49_unrelaxed_rank_003_alphafold2_multimer_v3_model_5_seed_000.pdb
        af2_df['min0_bb_id'] = af2_df['design_id'].str.split('_min\d_', regex=True, expand=False).str[1]
        af2_df['min2_bb_id'] = af2_df['design_id'].str.split('min2_|_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['af2_model_info'] = af2_df['design_id'].str.split('_\d+_unrelaxed', regex=True, expand=False).str[1]
        af2_df['msd_bbs_id'] = af2_df['min0_bb_id'] + '_' + af2_df['min2_bb_id']
        af2_df['seq_id'] = af2_df['design_id'].str.split('_unrelaxed_rank', regex=True, expand=False).str[0]

        #merge af2 and socket
        af2_socket_fil_df = af2_df.merge(socket_fil_df, on='design_id', how='left')

        #merge af2, socket, and seqs
        full_df = af2_socket_fil_df.merge(seqs_df, on='seq_id', how='left')
        full_df = full_df.loc[:, ~full_df.columns.str.contains('^Unnamed')]
        full_dfs_to_concat.append(full_df)

        #merge with all socket metrics for structure with cc
        socket_df = pd.read_csv(f'{af2_f2s_os_indir}/{seq_design_condition}_f2s_os_scale_up_{str(condition_id)}_all_socket_outputs.csv')
        
        #add og design name to socket df
        socket_df['design_id_pdb'] = socket_df['design_id'].astype(str) + '.pdb'
        socket_df['design_id'] = socket_df['design_id_pdb'].map(inverted_json_dict)

        #cleanup
        socket_df = socket_df.drop(columns=['design_id_pdb'])
        socket_df['design_id'] = socket_df['design_id'].str.replace('.pdb', '')
        socket_df = socket_df.loc[:, ~socket_df.columns.str.contains('^Unnamed')]
        print(f'socket_df shape: {socket_df.shape[0]}') #should reflect number of pdbs detected as cc

        with open(f2s_log_file, 'a') as log_file:
            # log_file.write(f'{condition_id} total seqs: {seqs_df.shape[0]}\n')
            log_file.write(f'{condition_id} af2 structures: {af2_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket filtered structures: {socket_fil_df.shape[0]}\n')
            log_file.write(f'{condition_id} socket metrics: {socket_df.shape[0]} total\n\n')

        cc_design_metrics_df = full_df.merge(socket_df, on=['design_id', 'socket_call', 'h1_seq', 'h1_reg', 'h2_seq', 'h2_reg', 'h1_non_canon_num_res', 'h2_non_canon_num_res'], how='right')
        cc_design_metrics_df.to_csv(f'{outdir}/{str(condition_id)}_all_metrics.csv')

#save master df
full_df = pd.concat(full_dfs_to_concat)
full_df.to_csv(f'{outdir}/{seq_design_condition}_{seq_design_method}_{bb_model_type}_compiled_metrics.csv')    

#need to update with sequences when get file from wynton

1.csv
1
af2_metrics_1.csv
seqs_df shape: 39600
af2_df shape: 19805
socket_fil_df shape: 19805
socket_df shape: 12381
3.csv
3
af2_metrics_3.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19800
socket_df shape: 12310
2.csv
2
af2_metrics_2.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19800
socket_df shape: 12434
6.csv
6
af2_metrics_6.csv
seqs_df shape: 39600
af2_df shape: 19795
socket_fil_df shape: 19792
socket_df shape: 12405
7.csv
7
af2_metrics_7.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19800
socket_df shape: 12362
5.csv
5
af2_metrics_5.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19799
socket_df shape: 12200
4.csv
4
af2_metrics_4.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19800
socket_df shape: 12603
10.csv
10
af2_metrics_10.csv
seqs_df shape: 39600
af2_df shape: 19800
socket_fil_df shape: 19800
socket_df shape: 12212
9.csv
9
af2_metrics_9.csv
seqs_df shape: 39600
af2_df shape: 