In [1]:
import re
import os
import pandas as pd

In [2]:
def parse_fa_file(file_path):
    """Parse the sample header lines of a ``.fa`` file into a DataFrame.

    Only lines starting with ``>`` are inspected, and only those carrying the
    ``sample=``, ``score=``, ``seq_recovery=`` and ``pLDDT=`` fields (in that
    order) produce a row; every other line is ignored.

    Parameters
    ----------
    file_path : str
        Path of the file to read. The path text also decides the row label:
        if the substring ``"wm"`` occurs anywhere in it the rows are tagged
        ``"wm"``, otherwise ``"original"``.
        NOTE(review): this is a plain substring test on the whole path, so an
        unrelated ``wm`` in a directory or file name also yields ``"wm"``.

    Returns
    -------
    pandas.DataFrame
        One row per matching header with columns ``sample``, ``T``, ``score``,
        ``seq_recovery``, ``pLDDT`` and ``type``. ``T`` is a float; ``score``,
        ``seq_recovery`` and ``pLDDT`` are the strings captured from the
        header. The frame is empty (no columns) when nothing matches.

    Raises
    ------
    ValueError
        If the first header field, minus its first two characters, is not a
        number (the code assumes a two-character prefix such as ``T=``).
    """
    # Every key is reached with a lazy `.*?` and anchored with `\b`, so
    # `score=` cannot match the tail of a longer key such as `global_score=`.
    # The previous greedy, unanchored pattern picked the *last* "...score="
    # on the line, which is the wrong field when such a key is present.
    header_pattern = re.compile(
        r'^>([^,]+),.*?\bsample=(\d+),.*?\bscore=([^,]+),'
        r'.*?\bseq_recovery=([^,]+),.*?\bpLDDT=(\S+)'
    )

    # The label depends only on the path, so work it out once per file.
    sample_type = "wm" if "wm" in file_path else "original"

    data = []  # One dict per matching header line
    with open(file_path, 'r') as file:
        for line in file:
            if not line.startswith('>'):
                continue
            info = header_pattern.search(line)
            if info is None:
                # Header without the expected fields: skip it.
                continue
            first_field, sample_id, score, seq_recovery, plddt = info.groups()
            data.append({
                'sample': f"{first_field}_{sample_id}_{sample_type}",
                # Drop the two-character prefix (e.g. "T=") before converting.
                'T': float(first_field[2:]),
                'score': score,
                'seq_recovery': seq_recovery,
                'pLDDT': plddt,
                'type': sample_type,
            })
    return pd.DataFrame(data)

In [3]:
def extract_info(temperature):
    """Write one combined original + ``wm`` CSV per ``.fa`` file for a temperature.

    For every file with a ``.fa`` extension in
    ``./outputs/monomer_original_<temperature>/seqs/`` the same-named file is
    read from ``./wm_outputs/monomer_wm_<temperature>/seqs/``. Both are parsed
    with ``parse_fa_file`` and their rows are written together to
    ``./results/<stem>_<temperature>.csv`` (DataFrame index included).

    Parameters
    ----------
    temperature
        Value interpolated into the directory and output file names via
        ``str(temperature)``.

    Raises
    ------
    FileNotFoundError
        If the original directory does not exist, or a ``.fa`` file found
        there has no counterpart in the ``wm`` directory.
    """
    original_path = "./outputs/monomer_original_" + str(temperature) + '/seqs/'
    wm_path = './wm_outputs/monomer_wm_' + str(temperature) + '/seqs/'
    results_path = './results/'

    # List the inputs first so a missing input directory fails before any
    # output directory is created.
    filenames = os.listdir(original_path)

    # to_csv does not create missing parent directories.
    os.makedirs(results_path, exist_ok=True)

    for filename in filenames:
        # Compare the real extension: a substring test would also accept
        # names like "x.fasta" or "x.fa.bak" and then mis-trim the stem.
        stem, extension = os.path.splitext(filename)
        if extension != '.fa':
            continue
        original_df = parse_fa_file(original_path + filename)
        wm_df = parse_fa_file(wm_path + filename)
        df = pd.concat([original_df, wm_df])
        df.to_csv(results_path + stem + '_' + str(temperature) + '.csv')

In [4]:
# Produce the per-temperature result CSVs for every temperature setting,
# in ascending order.
for temperature in (0.1, 0.3, 0.5, 0.7, 0.9):
    extract_info(temperature)

In [5]:
# Collect the CSV files currently in ./results. Match on the extension rather
# than on a ".csv" substring so names such as "x.csv.bak" are left out, and
# sort because os.listdir returns entries in arbitrary order.
file_names = sorted(
    name for name in os.listdir("./results") if name.endswith('.csv')
)

In [6]:
# Directory where the files are located and where the new files will be saved
dir_path = './results/'


def _result_prefix(file_name):
    """Return the prefix of a '<prefix>_<temperature>.csv' name, else None.

    Only the final '_'-separated token is treated as the temperature, so
    prefixes that contain underscores stay intact. Names without a numeric
    trailing token (for example an already combined '<prefix>.csv') give None.
    NOTE(review): a prefix that itself ends in '_<number>' cannot be told
    apart from a temperature suffix.
    """
    stem, extension = os.path.splitext(file_name)
    prefix, separator, temperature_text = stem.rpartition('_')
    if extension != '.csv' or not separator or not prefix:
        return None
    try:
        float(temperature_text)
    except ValueError:
        return None
    return prefix


# Group the per-temperature files by prefix. Files that do not end in
# '_<temperature>.csv' are skipped, so re-running this cell never re-ingests
# (and then deletes) combined files written by an earlier run. Iterating in
# sorted order keeps the row order of each combined file deterministic.
files_by_prefix = {}
for file_name in sorted(file_names):
    prefix = _result_prefix(file_name)
    if prefix is None:
        continue
    files_by_prefix.setdefault(prefix, []).append(file_name)

prefixes = set(files_by_prefix)

# Concatenate and save files by prefix
for prefix, files in files_by_prefix.items():
    # index_col=0 drops the index column that the per-temperature CSVs carry.
    dfs = [
        pd.read_csv(os.path.join(dir_path, file_name), index_col=0)
        for file_name in files
    ]
    concatenated_df = pd.concat(dfs, ignore_index=True)
    combined_file_name = f"{prefix}.csv"
    concatenated_df.to_csv(os.path.join(dir_path, combined_file_name), index=False)

    # Delete the sources only after the combined file has been written.
    for file_name in files:
        os.remove(os.path.join(dir_path, file_name))