
# Analysis of ColabFold Output
Author: Lilli Schuckert \
Date: Last updated 28.06.2024 10:25 CET \
Availability: This code is available on GitHub https://github.com/slilli/Protein-Sequence-Space under the MIT license.

This script analyses three-dimensional structures that were predicted by ColabFold. It assumes PDB files as input.

Install and import the following dependencies:

In [None]:
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import csv
import argparse
import math
from pathlib import Path

from scipy.ndimage import gaussian_filter1d

from Bio.PDB import PDBParser
from Bio.PDB.Superimposer import Superimposer
from Bio import PDB
from Bio.PDB import Superimposer
from Bio.PDB import DSSP
from Bio.PDB.DSSP import dssp_dict_from_pdb_file

from tmtools.io import get_structure, get_residue_data
from tmtools import tm_align

import freesasa

## Count the number of three dimensional structures that were successfully predicted by ColabFold

In [None]:
def count_files_in_folders(folder_paths):
    """Count the regular files located directly inside each folder.

    Sub-directories are neither descended into nor counted.

    Args:
        folder_paths: Iterable of directory paths to inspect.

    Returns:
        A dict mapping every given folder path to its number of files.
    """
    return {
        folder: sum(
            os.path.isfile(os.path.join(folder, entry))
            for entry in os.listdir(folder)
        )
        for folder in folder_paths
    }

# Folders whose files are counted.
folder_paths = ["colabfold/metrics/pdb_1/", "colabfold/metrics/pdb_2/"]
file_counts = count_files_in_folders(folder_paths)

# Print one summary line per folder.
for folder_path, num_files in file_counts.items():
    print(f"Number of files in '{folder_path}': {num_files}")

## Extract the pLDDT values from the B-factor column in the PDB files.
The average smoothed pLDDT values spanning the trajectories are plotted, as well as the standard deviation.
The plots are saved to the specified output folder.

In [None]:
def extract_plddt(pdb_file):
    """Read the per-residue pLDDT values from a PDB file.

    Only ``ATOM`` records whose atom-name characters ``[13:15]`` equal ``CA``
    are considered; the value is parsed from characters ``[60:66]`` of the
    record (the B-factor column, where the pLDDT is stored).

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        A list of floats, one per matching record, in file order.
    """
    with open(pdb_file, 'r') as handle:
        return [
            float(record[60:66].strip())
            for record in handle
            if record.startswith('ATOM') and record[13:15] == 'CA'
        ]

def calculate_average_plddt(pdb_file):
    """Return the mean of all pLDDT values extracted from ``pdb_file``.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        The ``np.mean`` of the values returned by ``extract_plddt``.
    """
    return np.mean(extract_plddt(pdb_file))

def analyze_trajectories(parent_directories, smoothing_sigma=0.5):
    """Plot the smoothed mean pLDDT (+/- standard deviation) per amino acid.

    For every one-letter amino-acid code, the ``.pdb`` files whose names start
    with that letter are collected from each directory, ordered by the first
    integer in the file name and reduced to one average pLDDT per file. The
    per-directory series are truncated to the shortest one, averaged across
    directories, Gaussian-smoothed and saved as
    ``colabfold/PLOTS/pLDDT/pLDDT_Trajectory_<amino acid>.png``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/pLDDT/'

    for amino_acid in amino_acids:
        all_plddts = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            all_plddts.append([
                calculate_average_plddt(os.path.join(parent_directory, pdb_file))
                for pdb_file in pdb_files
            ])

        # Trim all lists to the minimum length because some structures did not
        # get predicted and we want to calculate the average across folders.
        min_length = min(len(plddts) for plddts in all_plddts)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No pLDDT values found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_plddts = np.array([plddts[:min_length] for plddts in all_plddts])

        mean_plddts = np.mean(trimmed_plddts, axis=0)
        std_plddts = np.std(trimmed_plddts, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean = gaussian_filter1d(mean_plddts, sigma=smoothing_sigma)
        smoothed_std = gaussian_filter1d(std_plddts, sigma=smoothing_sigma)

        fig = plt.figure()
        plt.title(f'pLDDT Trajectory for Amino Acid {amino_acid}')
        plt.xlabel('k-step')
        plt.ylabel('Average pLDDT')
        plt.xlim(0, 195)  # Set x-axis limit
        plt.ylim(25, 95)  # Set y-axis limit

        x_range = range(len(smoothed_mean))
        plt.plot(x_range, smoothed_mean, label='Average pLDDT', linewidth=1)
        plt.fill_between(x_range, smoothed_mean - smoothed_std, smoothed_mean + smoothed_std, alpha=0.3)
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'pLDDT_Trajectory_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Run the pLDDT trajectory analysis on both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
analyze_trajectories(parent_directories)

### Calculate RMSD, TM-score and SASA for one PDB file.
Specify the input PDB file and the reference PDB file.

In [None]:
def load_structures(input_path, ref_path):
    """Parse the reference and the input PDB file.

    Args:
        input_path: Path of the structure to evaluate (parsed as ``'moving'``).
        ref_path: Path of the reference structure (parsed as ``'fixed'``).

    Returns:
        A tuple ``(fixed_struct, moving_struct, key_name)`` where ``key_name``
        is the name of the directory that contains ``input_path``.
    """
    moving_path = Path(input_path)
    parser = PDBParser(QUIET=True)
    # The reference is parsed first, then the structure under evaluation.
    fixed_struct = parser.get_structure('fixed', Path(ref_path))
    moving_struct = parser.get_structure('moving', moving_path)
    return fixed_struct, moving_struct, moving_path.parent.name

def calculate_rmsd(fixed_atoms, moving_atoms, structure):
    """Superimpose two equally long atom lists and return the resulting RMSD.

    Args:
        fixed_atoms: Atoms of the reference, used as the fixed set.
        moving_atoms: Atoms that are fitted onto ``fixed_atoms``.
        structure: Object whose ``get_atoms()`` atoms receive the fitted
            transformation (they are modified in place).

    Returns:
        The ``rms`` attribute of the ``Superimposer`` after fitting.
    """
    aligner = Superimposer()
    aligner.set_atoms(fixed_atoms, moving_atoms)
    aligner.apply(structure.get_atoms())
    return aligner.rms

def rmsd(input_path, ref_path, chain_id='E'):
    """Compute the RMSD between a predicted structure and one reference chain.

    All atoms of the reference chain ``chain_id`` (first model only) and all
    atoms of the input structure are collected; the longer list is truncated
    to the length of the shorter one before superimposing.

    NOTE(review): atoms are paired purely by their position in the two lists.
    This presumably assumes both structures list their atoms in corresponding
    order - confirm that this is intended for sequences that differ.

    Args:
        input_path: Path of the predicted PDB file.
        ref_path: Path of the reference PDB file.
        chain_id: ID of the reference chain to compare against. Defaults to
            ``'E'``, the value that was previously hard-coded.

    Returns:
        The RMSD after superposition, or ``None`` when the first model of the
        reference has no chain with ID ``chain_id``.
    """
    fixed_struct, moving_struct, _ = load_structures(input_path, ref_path)

    for chain in fixed_struct[0]:
        if chain.id != chain_id:
            continue

        fixed_atoms = [atom for residue in chain.get_residues() for atom in residue]
        moving_atoms = [atom for residue in moving_struct.get_residues() for atom in residue]

        # The superimposer needs equally long lists: cut both to the shorter one.
        n_common = min(len(fixed_atoms), len(moving_atoms))

        # The fitted transformation maps the moving atoms onto the fixed ones,
        # so it is applied to the moving structure (not to the reference chain).
        return calculate_rmsd(fixed_atoms[:n_common], moving_atoms[:n_common], moving_struct)

    return None

def tm_score(input_path, ref_path):
    """Align a predicted structure to the reference with ``tm_align``.

    The first chain of each structure is used.
    NOTE(review): ``rmsd`` compares against reference chain 'E' while this
    function takes the first reference chain - confirm they are the same chain.

    Args:
        input_path: Path of the predicted PDB file.
        ref_path: Path of the reference PDB file.

    Returns:
        The ``tm_norm_chain1`` attribute of the alignment result; the
        reference is passed as the first chain.
    """
    ref_struct, query_struct, _ = load_structures(input_path, ref_path)
    ref_coords, ref_seq = get_residue_data(next(ref_struct.get_chains()))
    query_coords, query_seq = get_residue_data(next(query_struct.get_chains()))
    alignment = tm_align(ref_coords, query_coords, ref_seq, query_seq)
    return alignment.tm_norm_chain1

def sasa(input_path):
    """Return the total area reported by freesasa for a structure file.

    Args:
        input_path: Path of the PDB file; converted to ``str`` for freesasa.

    Returns:
        The value of ``totalArea()`` of the freesasa calculation result.
    """
    return freesasa.calc(freesasa.Structure(str(input_path))).totalArea()


# Example: compute the three metrics for a single predicted structure.
input_path = 'colabfold/metrics/pdb_1/A_trajectory_1_unrelaxed_rank_001_alphafold2_ptm_model_3_seed_000.pdb'
ref_path = 'ACE2(1).pdb'

# rmsd() returns None when the reference has no chain 'E'.
rmsd_value = rmsd(input_path, ref_path)
print("RMSD Value:", rmsd_value)

tm_score_value = tm_score(input_path, ref_path)
print("TM Score:", tm_score_value)

sasa_value = sasa(input_path)
print("Total SASA:", sasa_value)

## Plot average RMSD on trajectories
The average smoothed RMSD values spanning the trajectories are plotted, as well as the standard deviation.
The plots are saved to the specified output folder.

In [None]:
def analyze_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Plot the smoothed mean RMSD (+/- standard deviation) per amino acid.

    For every one-letter amino-acid code, the ``.pdb`` files whose names start
    with that letter are collected from each directory, ordered by the first
    integer in the file name and compared to ``reference_pdb`` with ``rmsd``.
    The per-directory series are truncated to the shortest one, averaged
    across directories, Gaussian-smoothed and saved as
    ``colabfold/PLOTS/RMSD/RMSD_Trajectory_<amino acid>.png``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/RMSD/'

    for amino_acid in amino_acids:
        all_rmsd_values = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            rmsd_values = []
            for pdb_file in pdb_files:
                rmsd_value = rmsd(os.path.join(parent_directory, pdb_file), reference_pdb)
                # rmsd() returns None when the reference chain is missing;
                # a None in the array would make np.mean fail.
                if rmsd_value is not None:
                    rmsd_values.append(rmsd_value)
            all_rmsd_values.append(rmsd_values)

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(rmsd_values) for rmsd_values in all_rmsd_values)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No RMSD values found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_rmsd_values = np.array([rmsd_values[:min_length] for rmsd_values in all_rmsd_values])

        mean_rmsd = np.mean(trimmed_rmsd_values, axis=0)
        std_rmsd = np.std(trimmed_rmsd_values, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean = gaussian_filter1d(mean_rmsd, sigma=smoothing_sigma)
        smoothed_std = gaussian_filter1d(std_rmsd, sigma=smoothing_sigma)

        fig = plt.figure()
        plt.title(f'RMSD Trajectory for Amino Acid {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('RMSD')
        plt.xlim(0, 195)  # Set x-axis limit
        plt.ylim(0, 80)   # Adjust y-axis limit based on expected RMSD range

        x_range = range(len(smoothed_mean))
        plt.plot(x_range, smoothed_mean, label='Average RMSD', linewidth=1)
        plt.fill_between(x_range, smoothed_mean - smoothed_std, smoothed_mean + smoothed_std, alpha=0.3)
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'RMSD_Trajectory_{amino_acid}.png'))

        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Run the RMSD trajectory analysis on both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_trajectories(parent_directories, reference_pdb)


## Plot average TMscore on trajectories
The average smoothed TMscore values spanning the trajectories are plotted, as well as the standard deviation.
The plots are saved to the specified output folder.

In [None]:
def analyze_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Plot the smoothed mean TMscore (+/- standard deviation) per amino acid.

    For every one-letter amino-acid code, the ``.pdb`` files whose names start
    with that letter are collected from each directory, ordered by the first
    integer in the file name and scored against ``reference_pdb`` with
    ``tm_score``. The per-directory series are truncated to the shortest one,
    averaged across directories, Gaussian-smoothed and saved as
    ``colabfold/PLOTS/TMscore/TMscore_Trajectory_<amino acid>.png``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/TMscore/'

    for amino_acid in amino_acids:
        all_tmscores = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            all_tmscores.append([
                tm_score(os.path.join(parent_directory, pdb_file), reference_pdb)
                for pdb_file in pdb_files
            ])

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(tmscores) for tmscores in all_tmscores)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No TMscore values found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_tmscores = np.array([tmscores[:min_length] for tmscores in all_tmscores])

        mean_tmscores = np.mean(trimmed_tmscores, axis=0)
        std_tmscores = np.std(trimmed_tmscores, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean = gaussian_filter1d(mean_tmscores, sigma=smoothing_sigma)
        smoothed_std = gaussian_filter1d(std_tmscores, sigma=smoothing_sigma)

        fig = plt.figure()
        plt.title(f'TMscore Trajectory for Amino Acid {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('TMscore')
        plt.xlim(0, 195)  # Set x-axis limit
        plt.ylim(0, 1)    # Set y-axis limit based on TMscore range

        x_range = range(len(smoothed_mean))
        plt.plot(x_range, smoothed_mean, label='Average TMscore', linewidth=1)
        plt.fill_between(x_range, smoothed_mean - smoothed_std, smoothed_mean + smoothed_std, alpha=0.3)

        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'TMscore_Trajectory_{amino_acid}.png'))

        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: run the TMscore trajectory analysis on both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_trajectories(parent_directories, reference_pdb)


## Plot average SASA on trajectories
The average smoothed SASA values spanning the trajectories are plotted, as well as the standard deviation.
The plots are saved to the specified output folder.

In [None]:
def analyze_trajectories(parent_directories, smoothing_sigma=0.5):
    """Plot the smoothed mean SASA (+/- standard deviation) per amino acid.

    For every one-letter amino-acid code, the ``.pdb`` files whose names start
    with that letter are collected from each directory, ordered by the first
    integer in the file name and evaluated with ``sasa``. The per-directory
    series are truncated to the shortest one, averaged across directories,
    Gaussian-smoothed and saved as
    ``colabfold/PLOTS/SASA/SASA_Trajectory_<amino acid>.png``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/SASA/'

    for amino_acid in amino_acids:
        all_sasa_values = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            all_sasa_values.append([
                sasa(os.path.join(parent_directory, pdb_file))
                for pdb_file in pdb_files
            ])

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(sasa_values) for sasa_values in all_sasa_values)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No SASA values found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_sasa_values = np.array([sasa_values[:min_length] for sasa_values in all_sasa_values])

        mean_sasa = np.mean(trimmed_sasa_values, axis=0)
        std_sasa = np.std(trimmed_sasa_values, axis=0)

        # Apply gaussian smoothing
        smoothed_mean = gaussian_filter1d(mean_sasa, sigma=smoothing_sigma)
        smoothed_std = gaussian_filter1d(std_sasa, sigma=smoothing_sigma)

        fig = plt.figure()
        plt.title(f'SASA Trajectory for Amino Acid {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('SASA')
        plt.xlim(0, 195)  # Set x-axis limit
        plt.ylim(9000, 37000)  # Set y-axis limit based on expected SASA range

        x_range = range(len(smoothed_mean))
        plt.plot(x_range, smoothed_mean, label='Average SASA', linewidth=1)
        plt.fill_between(x_range, smoothed_mean - smoothed_std, smoothed_mean + smoothed_std, alpha=0.3)
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'SASA_Trajectory_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Run the SASA trajectory analysis on both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
analyze_trajectories(parent_directories)

## Absolute Change in pLDDT along trajectories

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from numpy.polynomial.polynomial import Polynomial

def extract_plddt(pdb_file):
    """Read the per-residue pLDDT values from a PDB file.

    Only ``ATOM`` records whose atom-name characters ``[13:15]`` equal ``CA``
    are considered; the value is parsed from characters ``[60:66]`` of the
    record (the B-factor column, where the pLDDT is stored).

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        A list of floats, one per matching record, in file order.
    """
    with open(pdb_file, 'r') as handle:
        return [
            float(record[60:66].strip())
            for record in handle
            if record.startswith('ATOM') and record[13:15] == 'CA'
        ]

def calculate_average_plddt(pdb_file):
    """Return the mean of all pLDDT values extracted from ``pdb_file``.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        The ``np.mean`` of the values returned by ``extract_plddt``.
    """
    return np.mean(extract_plddt(pdb_file))

def analyze_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Plot the absolute step-to-step change in mean pLDDT per amino acid.

    The average pLDDT of every matching ``.pdb`` file (name starts with the
    amino-acid letter, ordered by the first integer in the name) is collected
    per directory. The series are truncated to the shortest one; the plotted
    line is ``|diff|`` of the across-directory mean, the shaded band is the
    across-directory standard deviation of the per-directory ``|diff|``. Both
    are Gaussian-smoothed. Plots are saved below
    ``colabfold/PLOTS/Abs_Delta_pLDDT_along_trajectory/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Not used by this function; kept so existing calls work.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/Abs_Delta_pLDDT_along_trajectory/'

    for amino_acid in amino_acids:
        all_plddts = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            all_plddts.append([
                calculate_average_plddt(os.path.join(parent_directory, pdb_file))
                for pdb_file in pdb_files
            ])

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(plddts) for plddts in all_plddts)
        if min_length < 2:
            # A difference needs at least two points; skip the empty plot.
            print(f'Fewer than two pLDDT values for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_plddts = np.array([plddts[:min_length] for plddts in all_plddts])

        mean_plddts = np.mean(trimmed_plddts, axis=0)

        # Calculate the absolute change in pLDDT along the trajectory
        abs_delta_plddts = np.abs(np.diff(mean_plddts))

        # Calculate the standard deviation of the absolute change in pLDDT
        std_abs_delta_plddts = np.std(np.abs(np.diff(trimmed_plddts, axis=1)), axis=0)

        # Apply Gaussian smoothing to the absolute change and standard deviation
        smoothed_abs_delta_plddts = gaussian_filter1d(abs_delta_plddts, sigma=smoothing_sigma)
        smoothed_std_abs_delta_plddts = gaussian_filter1d(std_abs_delta_plddts, sigma=smoothing_sigma)

        # Line plot with a standard-deviation band for the current amino acid
        fig = plt.figure(figsize=(10, 6))
        x_range = range(len(smoothed_abs_delta_plddts))
        plt.plot(x_range, smoothed_abs_delta_plddts, c='blue', alpha=0.5, label='|ΔpLDDT|')
        plt.fill_between(
            x_range,
            smoothed_abs_delta_plddts - smoothed_std_abs_delta_plddts,
            smoothed_abs_delta_plddts + smoothed_std_abs_delta_plddts,
            alpha=0.3,
            label='Standard Deviation',
        )

        plt.title(f'Absolute Change in pLDDT Along the Trajectory for {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('Absolute Change in pLDDT (|ΔpLDDT|)')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Abs_Delta_pLDDT_along_trajectory_{amino_acid}_smoothed.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: plot |ΔpLDDT| for both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_trajectories(parent_directories, reference_pdb)


## Absolute Change in TM Score along trajectories

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from numpy.polynomial.polynomial import Polynomial

def analyze_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Plot the absolute step-to-step change in mean TM score per amino acid.

    Every matching ``.pdb`` file (name starts with the amino-acid letter,
    ordered by the first integer in the name) is scored against
    ``reference_pdb`` with ``tm_score``. The per-directory series are
    truncated to the shortest one; the plotted line is ``|diff|`` of the
    across-directory mean, the shaded band is the across-directory standard
    deviation of the per-directory ``|diff|``. Both are Gaussian-smoothed.
    Plots are saved below
    ``colabfold/PLOTS/Abs_Delta_TM_score_along_trajectory/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/Abs_Delta_TM_score_along_trajectory/'

    for amino_acid in amino_acids:
        all_tmscores = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            all_tmscores.append([
                tm_score(os.path.join(parent_directory, pdb_file), reference_pdb)
                for pdb_file in pdb_files
            ])

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(tmscores) for tmscores in all_tmscores)
        if min_length < 2:
            # A difference needs at least two points; skip the empty plot.
            print(f'Fewer than two TM scores for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_tmscores = np.array([tmscores[:min_length] for tmscores in all_tmscores])

        mean_tmscores = np.mean(trimmed_tmscores, axis=0)

        # Calculate the absolute change in TM score along the trajectory
        abs_delta_tmscores = np.abs(np.diff(mean_tmscores))
        smoothed_abs_delta_tmscores = gaussian_filter1d(abs_delta_tmscores, sigma=smoothing_sigma)

        # Calculate the standard deviation of the absolute change in TM score
        std_abs_delta_tmscores = np.std(np.abs(np.diff(trimmed_tmscores, axis=1)), axis=0)
        smoothed_std_abs_delta_tmscores = gaussian_filter1d(std_abs_delta_tmscores, sigma=smoothing_sigma)

        # Line plot with a standard-deviation band for the current amino acid
        fig = plt.figure(figsize=(10, 6))
        x_range = range(len(smoothed_abs_delta_tmscores))
        plt.plot(x_range, smoothed_abs_delta_tmscores, c='blue', alpha=0.5, label='|ΔTM score|')
        plt.fill_between(
            x_range,
            smoothed_abs_delta_tmscores - smoothed_std_abs_delta_tmscores,
            smoothed_abs_delta_tmscores + smoothed_std_abs_delta_tmscores,
            alpha=0.3,
            label='Standard Deviation',
        )

        plt.title(f'Absolute Change in TM Score Along the Trajectory for {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('Absolute Change in TM score (|ΔTM score|)')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Abs_Delta_TM_score_along_trajectory_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: plot |ΔTM score| for both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_trajectories(parent_directories, reference_pdb)



## Absolute Change in RMSD along trajectories

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

def analyze_rmsd_trajectories(parent_directories, ref_path, smoothing_sigma=0.5):
    """Plot the absolute step-to-step change in mean RMSD per amino acid.

    Every matching ``.pdb`` file (name starts with the amino-acid letter,
    ordered by the first integer in the name) is compared to ``ref_path``
    with ``rmsd``; ``None`` results are dropped. The per-directory series
    are truncated to the shortest one; the plotted line is ``|diff|`` of the
    across-directory mean, the shaded band is the across-directory standard
    deviation of the per-directory ``|diff|``. Both are Gaussian-smoothed.
    Plots are saved below ``colabfold/PLOTS/Abs_Delta_RMSD_along_trajectory/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        ref_path: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/Abs_Delta_RMSD_along_trajectory/'

    for amino_acid in amino_acids:
        all_rmsd_values = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            rmsd_values = []
            for pdb_file in pdb_files:
                rmsd_value = rmsd(os.path.join(parent_directory, pdb_file), ref_path)
                # rmsd() returns None when the reference chain is missing.
                if rmsd_value is not None:
                    rmsd_values.append(rmsd_value)

            all_rmsd_values.append(rmsd_values)

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(rmsd_values) for rmsd_values in all_rmsd_values)
        if min_length < 2:
            # A difference needs at least two points; skip the empty plot.
            print(f'Fewer than two RMSD values for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_rmsd_values = np.array([rmsd_values[:min_length] for rmsd_values in all_rmsd_values])

        mean_rmsd_values = np.mean(trimmed_rmsd_values, axis=0)
        abs_delta_rmsd_values = np.abs(np.diff(mean_rmsd_values))

        # Calculate the standard deviation of the absolute change in RMSD
        std_abs_delta_rmsd_values = np.std(np.abs(np.diff(trimmed_rmsd_values, axis=1)), axis=0)

        # Apply Gaussian smoothing
        smoothed_abs_delta_rmsd_values = gaussian_filter1d(abs_delta_rmsd_values, sigma=smoothing_sigma)
        smoothed_std_abs_delta_rmsd_values = gaussian_filter1d(std_abs_delta_rmsd_values, sigma=smoothing_sigma)

        fig = plt.figure(figsize=(10, 6))
        x_range = range(len(smoothed_abs_delta_rmsd_values))
        plt.plot(x_range, smoothed_abs_delta_rmsd_values, c='blue', alpha=0.5, label='|ΔRMSD|')
        plt.fill_between(
            x_range,
            smoothed_abs_delta_rmsd_values - smoothed_std_abs_delta_rmsd_values,
            smoothed_abs_delta_rmsd_values + smoothed_std_abs_delta_rmsd_values,
            alpha=0.3,
            label='Standard Deviation',
        )

        plt.title(f'Absolute Change in RMSD Along the Trajectory for {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('Absolute Change in RMSD (|ΔRMSD|)')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Abs_Delta_RMSD_along_trajectory_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: plot |ΔRMSD| for both prediction folders (sigma raised to 0.8).
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_rmsd_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.8)



## Absolute Change in SASA along trajectories

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import freesasa

def analyze_sasa_trajectories(parent_directories, ref_path):
    """Plot the absolute step-to-step change in mean SASA per amino acid.

    The total SASA of every matching ``.pdb`` file (name starts with the
    amino-acid letter, ordered by the first integer in the name) is computed
    with freesasa. The per-directory series are truncated to the shortest
    one; the plotted line is ``|diff|`` of the across-directory mean, the
    shaded band is the across-directory standard deviation of the
    per-directory ``|diff|``. No smoothing is applied here. Plots are saved
    below ``colabfold/PLOTS/Abs_Delta_SASA_along_trajectory/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        ref_path: Not used by this function; kept so existing calls work.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/Abs_Delta_SASA_along_trajectory/'

    def sasa(input_path):
        # Local helper so this cell does not depend on an earlier definition.
        structure = freesasa.Structure(str(input_path))
        result = freesasa.calc(structure)
        return result.totalArea()

    for amino_acid in amino_acids:
        all_sasa_values = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            sasa_values = []
            for pdb_file in pdb_files:
                sasa_value = sasa(os.path.join(parent_directory, pdb_file))
                if sasa_value is not None:
                    sasa_values.append(sasa_value)

            all_sasa_values.append(sasa_values)

        # Trim to the shortest series so the folders can be averaged.
        min_length = min(len(sasa_values) for sasa_values in all_sasa_values)
        if min_length < 2:
            # A difference needs at least two points; skip the empty plot.
            print(f'Fewer than two SASA values for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_sasa_values = [sasa_values[:min_length] for sasa_values in all_sasa_values]

        mean_sasa_values = np.mean(trimmed_sasa_values, axis=0)
        abs_delta_sasa_values = np.abs(np.diff(mean_sasa_values))
        std_delta_sasa_values = np.std(np.abs(np.diff(trimmed_sasa_values, axis=1)), axis=0)

        fig = plt.figure(figsize=(10, 6))
        x_range = range(len(abs_delta_sasa_values))
        plt.plot(x_range, abs_delta_sasa_values, c='blue', alpha=0.5, label='|ΔSASA|')
        plt.fill_between(
            x_range,
            abs_delta_sasa_values - std_delta_sasa_values,
            abs_delta_sasa_values + std_delta_sasa_values,
            alpha=0.3,
            label='Standard Deviation',
        )

        plt.title(f'Absolute Change in SASA Along the Trajectory for {amino_acid}')
        plt.xlabel('k-steps')
        plt.ylabel('Absolute Change in SASA (|ΔSASA|)')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Abs_Delta_SASA_along_trajectory_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: plot |ΔSASA| for both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_sasa_trajectories(parent_directories, reference_pdb)


## Correlation Analysis pLDDT vs TM Score

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

def extract_plddt(pdb_file):
    """Read the per-residue pLDDT values from a PDB file.

    Only ``ATOM`` records whose atom-name characters ``[13:15]`` equal ``CA``
    are considered; the value is parsed from characters ``[60:66]`` of the
    record (the B-factor column, where the pLDDT is stored).

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        A list of floats, one per matching record, in file order.
    """
    with open(pdb_file, 'r') as handle:
        return [
            float(record[60:66].strip())
            for record in handle
            if record.startswith('ATOM') and record[13:15] == 'CA'
        ]

def calculate_average_plddt(pdb_file):
    """Return the mean of all pLDDT values extracted from ``pdb_file``.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        The ``np.mean`` of the values returned by ``extract_plddt``.
    """
    return np.mean(extract_plddt(pdb_file))

def analyze_trajectories(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Scatter the smoothed mean TM score against the smoothed mean pLDDT.

    For every one-letter amino-acid code, each matching ``.pdb`` file (name
    starts with that letter, ordered by the first integer in the name) yields
    one average pLDDT and one TM score against ``reference_pdb``. The
    per-directory series are truncated to the shortest one, averaged across
    directories and Gaussian-smoothed. Points are coloured by their index
    along the series. Plots are saved below
    ``colabfold/PLOTS/TEST/colorcode_Scatter_TMvsPLDDT/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_TMvsPLDDT/'

    for amino_acid in amino_acids:
        all_plddts = []
        all_tmscores = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))
            pdb_paths = [os.path.join(parent_directory, pdb_file) for pdb_file in pdb_files]

            all_plddts.append([calculate_average_plddt(pdb_path) for pdb_path in pdb_paths])
            all_tmscores.append([tm_score(pdb_path, reference_pdb) for pdb_path in pdb_paths])

        # Both metrics come from the same files, so one length fits both.
        min_length = min(len(plddts) for plddts in all_plddts)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No structures found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_plddts = np.array([plddts[:min_length] for plddts in all_plddts])
        trimmed_tmscores = np.array([tmscores[:min_length] for tmscores in all_tmscores])

        mean_plddts = np.mean(trimmed_plddts, axis=0)
        mean_tmscores = np.mean(trimmed_tmscores, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_plddts = gaussian_filter1d(mean_plddts, sigma=smoothing_sigma)
        smoothed_mean_tmscores = gaussian_filter1d(mean_tmscores, sigma=smoothing_sigma)

        # Scatter plot for the current amino acid
        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: low indices (early steps) are
        # dark, high indices (late steps) are light.
        cmap = plt.get_cmap('Blues_r')

        # Scatter plot with color coding based on position
        plt.scatter(
            smoothed_mean_plddts,
            smoothed_mean_tmscores,
            c=np.arange(len(smoothed_mean_plddts)),
            cmap=cmap,
            marker='o',
            alpha=0.5,
            label='TM score vs pLDDT',
        )

        plt.title(f'Scatter plot between pLDDT and TM score for {amino_acid}')
        plt.xlabel('Average pLDDT')
        plt.ylabel('Average TM score')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Scatter_pLDDT_TMscore_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: scatter TM score against pLDDT for both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_trajectories(parent_directories, reference_pdb)


## Correlation Analysis pLDDT vs RMSD

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

def extract_plddt(pdb_file):
    """Read the per-residue pLDDT values from a PDB file.

    Only ``ATOM`` records whose atom-name characters ``[13:15]`` equal ``CA``
    are considered; the value is parsed from characters ``[60:66]`` of the
    record (the B-factor column, where the pLDDT is stored).

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        A list of floats, one per matching record, in file order.
    """
    with open(pdb_file, 'r') as handle:
        return [
            float(record[60:66].strip())
            for record in handle
            if record.startswith('ATOM') and record[13:15] == 'CA'
        ]

def calculate_average_plddt(pdb_file):
    """Return the mean of all pLDDT values extracted from ``pdb_file``.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        The ``np.mean`` of the values returned by ``extract_plddt``.
    """
    return np.mean(extract_plddt(pdb_file))

def analyze_rmsd_vs_plddt(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Scatter the smoothed mean RMSD against the smoothed mean pLDDT.

    For every one-letter amino-acid code, each matching ``.pdb`` file (name
    starts with that letter, ordered by the first integer in the name) yields
    one average pLDDT and one RMSD against ``reference_pdb``. Files for which
    ``rmsd`` returns ``None`` are dropped from both series so the values stay
    paired. The per-directory series are truncated to the shortest one,
    averaged across directories and Gaussian-smoothed. Points are coloured by
    their index along the series. Plots are saved below
    ``colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsPLDDT/``.

    Args:
        parent_directories: Directories that contain the predicted PDB files.
        reference_pdb: Path of the reference PDB file.
        smoothing_sigma: ``sigma`` handed to ``gaussian_filter1d``.
    """
    amino_acids = list('ACDEFGHIKLMNPQRSTVWY')
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsPLDDT/'

    for amino_acid in amino_acids:
        all_plddts = []
        all_rmsd_values = []

        for parent_directory in parent_directories:
            pdb_files = [
                f for f in os.listdir(parent_directory)
                if f.startswith(amino_acid) and f.endswith('.pdb')
            ]
            # Order the files by the first integer that appears in their name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))

            plddts = []
            rmsd_values = []
            for pdb_file in pdb_files:
                pdb_path = os.path.join(parent_directory, pdb_file)
                rmsd_value = rmsd(pdb_path, reference_pdb)
                # rmsd() returns None when the reference chain is missing; a
                # None in the array would make np.mean fail, so drop the pair.
                if rmsd_value is None:
                    continue
                plddts.append(calculate_average_plddt(pdb_path))
                rmsd_values.append(rmsd_value)

            all_plddts.append(plddts)
            all_rmsd_values.append(rmsd_values)

        # Both metrics come from the same files, so one length fits both.
        min_length = min(len(plddts) for plddts in all_plddts)
        if min_length == 0:
            # Nothing to average; skip instead of saving an empty plot.
            print(f'No structures found for amino acid {amino_acid}; skipping plot.')
            continue
        trimmed_plddts = np.array([plddts[:min_length] for plddts in all_plddts])
        trimmed_rmsd_values = np.array([rmsd_values[:min_length] for rmsd_values in all_rmsd_values])

        mean_plddts = np.mean(trimmed_plddts, axis=0)
        mean_rmsd_values = np.mean(trimmed_rmsd_values, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_plddts = gaussian_filter1d(mean_plddts, sigma=smoothing_sigma)
        smoothed_mean_rmsd_values = gaussian_filter1d(mean_rmsd_values, sigma=smoothing_sigma)

        # Scatter plot for the current amino acid
        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: low indices (early steps) are
        # dark, high indices (late steps) are light.
        cmap = plt.get_cmap('Blues_r')

        # Scatter plot with color coding based on position
        plt.scatter(
            smoothed_mean_plddts,
            smoothed_mean_rmsd_values,
            c=np.arange(len(smoothed_mean_plddts)),
            cmap=cmap,
            marker='o',
            alpha=0.5,
            label='RMSD vs pLDDT',
        )

        plt.title(f'Scatter plot between pLDDT and RMSD for {amino_acid}')
        plt.xlabel('Average pLDDT')
        plt.ylabel('Average RMSD')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the race between an existence check and the creation.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, f'Scatter_RMSD_pLDDT_{amino_acid}.png'))
        plt.show()
        # Release the figure so 20 figures per call do not pile up in memory.
        plt.close(fig)

# Example usage: scatter RMSD against pLDDT for both prediction folders.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_rmsd_vs_plddt(parent_directories, reference_pdb)


## Correlation Analysis RMSD vs TM Score

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d
from scipy.stats import linregress

def analyze_tm_vs_rmsd(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Scatter-plot the smoothed mean TM score against the smoothed mean RMSD per amino acid.

    For each of the 20 one-letter amino-acid codes, the ``.pdb`` files whose
    names start with that letter are collected from every directory in
    ``parent_directories`` and ordered by the first integer in their file name.
    ``rmsd`` and ``tm_score`` are evaluated for every file against
    ``reference_pdb``. The per-directory series are cut to the length of the
    shortest one, averaged position-wise across directories and smoothed with
    a Gaussian filter. One figure per amino acid is saved to
    ``colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsTM/`` and then shown.

    Amino acids for which at least one directory holds no matching file (or
    when ``parent_directories`` is empty) are skipped with a printed notice,
    because there is no common position to average.

    Args:
        parent_directories: Directory paths that contain the PDB files.
        reference_pdb: Path of the reference structure handed to ``rmsd``
            and ``tm_score``.
        smoothing_sigma: ``sigma`` passed to ``gaussian_filter1d``.

    Raises:
        AttributeError: If a matching file name contains no digit, because
            the numeric sort key cannot be built.
    """
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsTM/'

    for amino_acid in amino_acids:
        all_rmsd_values = []
        all_tmscores = []

        for parent_directory in parent_directories:
            pdb_files = [f for f in os.listdir(parent_directory) if f.startswith(amino_acid) and f.endswith('.pdb')]
            # Order the files by the first integer found in the file name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))
            pdb_paths = [os.path.join(parent_directory, pdb_file) for pdb_file in pdb_files]

            all_rmsd_values.append([rmsd(pdb_path, reference_pdb) for pdb_path in pdb_paths])
            all_tmscores.append([tm_score(pdb_path, reference_pdb) for pdb_path in pdb_paths])

        # Both metrics are computed on the same files, so one length check covers both.
        min_length = min((len(values) for values in all_rmsd_values), default=0)
        if min_length == 0:
            print(f"Skipping amino acid '{amino_acid}': no PDB files available in every directory.")
            continue

        # Trim every series to the common length so they stack into a 2-D array.
        rmsd_matrix = np.array([values[:min_length] for values in all_rmsd_values])
        tm_matrix = np.array([values[:min_length] for values in all_tmscores])

        # Average across directories (axis 0), keeping one value per position.
        mean_rmsd_values = np.mean(rmsd_matrix, axis=0)
        mean_tmscores = np.mean(tm_matrix, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_rmsd_values = gaussian_filter1d(mean_rmsd_values, sigma=smoothing_sigma)
        smoothed_mean_tmscores = gaussian_filter1d(mean_tmscores, sigma=smoothing_sigma)

        # Scatter plot for the current amino acid
        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: the first positions are drawn
        # dark and later positions become progressively lighter.
        cmap = plt.get_cmap('Blues_r')

        # Scatter plot with color coding based on position
        plt.scatter(smoothed_mean_rmsd_values, smoothed_mean_tmscores, c=np.arange(len(smoothed_mean_rmsd_values)), cmap=cmap, marker='o', alpha=0.5, label='TM score vs RMSD')

        plt.title(f'Scatter plot between RMSD and TM score for {amino_acid}')
        plt.xlabel('Average RMSD')
        plt.ylabel('Average TM score')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(plot_dir, exist_ok=True)

        plot_filename = os.path.join(plot_dir, f'Scatter_RMSD_TMscore_{amino_acid}.png')
        plt.savefig(plot_filename)
        plt.show()
        # Release the figure; otherwise one figure per amino acid stays open.
        plt.close(fig)

# Example usage: call analyze_tm_vs_rmsd on the two PDB folders.
# NOTE(review): all paths are relative — presumably to the project root; confirm the working directory.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_tm_vs_rmsd(parent_directories, reference_pdb)


## Correlation Analysis SASA vs RMSD

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

def analyze_rmsd_vs_sasa(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Scatter-plot the smoothed mean RMSD against the smoothed mean SASA per amino acid.

    For each of the 20 one-letter amino-acid codes, the ``.pdb`` files whose
    names start with that letter are collected from every directory in
    ``parent_directories`` and ordered by the first integer in their file name.
    ``sasa`` is evaluated for every file and ``rmsd`` for every file against
    ``reference_pdb``. The per-directory series are cut to the length of the
    shortest one, averaged position-wise across directories and smoothed with
    a Gaussian filter. One figure per amino acid is saved to
    ``colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsSASA/`` and then shown.

    Amino acids for which at least one directory holds no matching file (or
    when ``parent_directories`` is empty) are skipped with a printed notice,
    because there is no common position to average.

    Args:
        parent_directories: Directory paths that contain the PDB files.
        reference_pdb: Path of the reference structure handed to ``rmsd``.
        smoothing_sigma: ``sigma`` passed to ``gaussian_filter1d``.

    Raises:
        AttributeError: If a matching file name contains no digit, because
            the numeric sort key cannot be built.
    """
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_RMSDvsSASA/'

    for amino_acid in amino_acids:
        all_sasa_values = []
        all_rmsd_values = []

        for parent_directory in parent_directories:
            pdb_files = [f for f in os.listdir(parent_directory) if f.startswith(amino_acid) and f.endswith('.pdb')]
            # Order the files by the first integer found in the file name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))
            pdb_paths = [os.path.join(parent_directory, pdb_file) for pdb_file in pdb_files]

            all_sasa_values.append([sasa(pdb_path) for pdb_path in pdb_paths])
            all_rmsd_values.append([rmsd(pdb_path, reference_pdb) for pdb_path in pdb_paths])

        # Both metrics are computed on the same files, so one length check covers both.
        min_length = min((len(values) for values in all_sasa_values), default=0)
        if min_length == 0:
            print(f"Skipping amino acid '{amino_acid}': no PDB files available in every directory.")
            continue

        # Trim every series to the common length so they stack into a 2-D array.
        sasa_matrix = np.array([values[:min_length] for values in all_sasa_values])
        rmsd_matrix = np.array([values[:min_length] for values in all_rmsd_values])

        # Average across directories (axis 0), keeping one value per position.
        mean_sasa_values = np.mean(sasa_matrix, axis=0)
        mean_rmsd_values = np.mean(rmsd_matrix, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_sasa_values = gaussian_filter1d(mean_sasa_values, sigma=smoothing_sigma)
        smoothed_mean_rmsd_values = gaussian_filter1d(mean_rmsd_values, sigma=smoothing_sigma)

        # Scatter plot for the current amino acid
        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: the first positions are drawn
        # dark and later positions become progressively lighter.
        cmap = plt.get_cmap('Blues_r')

        # Scatter plot with color coding based on position
        plt.scatter(smoothed_mean_sasa_values, smoothed_mean_rmsd_values, c=np.arange(len(smoothed_mean_sasa_values)), cmap=cmap, marker='o', alpha=0.5, label='RMSD vs SASA')

        plt.title(f'Scatter plot between SASA and RMSD for {amino_acid}')
        plt.xlabel('Average SASA')
        plt.ylabel('Average RMSD')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(plot_dir, exist_ok=True)

        plot_filename = os.path.join(plot_dir, f'Scatter_RMSD_SASA_{amino_acid}.png')
        plt.savefig(plot_filename)
        plt.show()
        # Release the figure; otherwise one figure per amino acid stays open.
        plt.close(fig)

# Example usage: call analyze_rmsd_vs_sasa on the two PDB folders.
# NOTE(review): all paths are relative — presumably to the project root; confirm the working directory.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_rmsd_vs_sasa(parent_directories, reference_pdb)


## Correlation Analysis SASA vs pLDDT

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d


def extract_plddt(pdb_file):
    """Read the per-residue pLDDT values from the B-factor column of a PDB file.

    Only ``ATOM`` records whose atom-name field holds ``CA`` are read, which
    matches the ``extract_plddt`` defined at the top of this notebook. Taking
    every atom instead would let residues with more atoms contribute more
    values to any average built from the result.

    Args:
        pdb_file: Path of the PDB file to read.

    Returns:
        list[float]: One value per C-alpha ``ATOM`` record, in file order,
        parsed from columns 61-66 of the record.

    Raises:
        ValueError: If the B-factor field of a selected record is not a number.
    """
    plddts = []
    with open(pdb_file, 'r') as file:
        for line in file:
            # Same selection as the notebook's first extract_plddt: C-alpha atoms only.
            if line.startswith('ATOM') and line[13:15] == 'CA':
                plddts.append(float(line[60:66].strip()))
    return plddts

def calculate_average_plddt(pdb_file):
    """Return the arithmetic mean of the pLDDT values read from ``pdb_file``.

    The values come from ``extract_plddt``; their mean is computed with
    ``np.mean``.

    Args:
        pdb_file: Path of the PDB file to summarise.

    Returns:
        The mean of the values returned by ``extract_plddt(pdb_file)``.
    """
    return np.mean(extract_plddt(pdb_file))

def analyze_plddt_vs_sasa(parent_directories, smoothing_sigma=0.5):
    """Scatter-plot the smoothed mean pLDDT against the smoothed mean SASA per amino acid.

    For each of the 20 one-letter amino-acid codes, the ``.pdb`` files whose
    names start with that letter are collected from every directory in
    ``parent_directories`` and ordered by the first integer in their file name.
    ``sasa`` and ``calculate_average_plddt`` are evaluated for every file. The
    per-directory series are cut to the length of the shortest one, averaged
    position-wise across directories and smoothed with a Gaussian filter. One
    figure per amino acid is saved to
    ``colabfold/PLOTS/TEST/colorcode_Scatter_pLDDTvsSASA/`` and then shown.

    Amino acids for which at least one directory holds no matching file (or
    when ``parent_directories`` is empty) are skipped with a printed notice,
    because there is no common position to average.

    Args:
        parent_directories: Directory paths that contain the PDB files.
        smoothing_sigma: ``sigma`` passed to ``gaussian_filter1d``.

    Raises:
        AttributeError: If a matching file name contains no digit, because
            the numeric sort key cannot be built.
    """
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_pLDDTvsSASA/'

    for amino_acid in amino_acids:
        all_sasa_values = []
        all_plddt_values = []

        for parent_directory in parent_directories:
            pdb_files = [f for f in os.listdir(parent_directory) if f.startswith(amino_acid) and f.endswith('.pdb')]
            # Order the files by the first integer found in the file name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))
            pdb_paths = [os.path.join(parent_directory, pdb_file) for pdb_file in pdb_files]

            all_sasa_values.append([sasa(pdb_path) for pdb_path in pdb_paths])
            all_plddt_values.append([calculate_average_plddt(pdb_path) for pdb_path in pdb_paths])

        # Both metrics are computed on the same files, so one length check covers both.
        min_length = min((len(values) for values in all_sasa_values), default=0)
        if min_length == 0:
            print(f"Skipping amino acid '{amino_acid}': no PDB files available in every directory.")
            continue

        # Trim every series to the common length so they stack into a 2-D array.
        sasa_matrix = np.array([values[:min_length] for values in all_sasa_values])
        plddt_matrix = np.array([values[:min_length] for values in all_plddt_values])

        # Average across directories (axis 0), keeping one value per position.
        mean_sasa_values = np.mean(sasa_matrix, axis=0)
        mean_plddt_values = np.mean(plddt_matrix, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_sasa_values = gaussian_filter1d(mean_sasa_values, sigma=smoothing_sigma)
        smoothed_mean_plddt_values = gaussian_filter1d(mean_plddt_values, sigma=smoothing_sigma)

        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: the first positions are drawn
        # dark and later positions become progressively lighter.
        cmap = plt.get_cmap('Blues_r')
        plt.scatter(smoothed_mean_sasa_values, smoothed_mean_plddt_values, c=np.arange(len(smoothed_mean_sasa_values)), cmap=cmap, marker='o', alpha=0.5, label='pLDDT vs SASA')

        plt.title(f'Scatter plot between SASA and pLDDT for {amino_acid}')
        plt.xlabel('Average SASA')
        plt.ylabel('Average pLDDT')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(plot_dir, exist_ok=True)

        plot_filename = os.path.join(plot_dir, f'Scatter_pLDDT_SASA_{amino_acid}.png')
        plt.savefig(plot_filename)
        plt.show()
        # Release the figure; otherwise one figure per amino acid stays open.
        plt.close(fig)

# Example usage: call analyze_plddt_vs_sasa on the two PDB folders.
# NOTE(review): all paths are relative — presumably to the project root; confirm the working directory.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
# NOTE(review): reference_pdb is assigned here but is not passed to the call below.
reference_pdb = 'ACE2(1).pdb'
analyze_plddt_vs_sasa(parent_directories)


## Correlation Analysis SASA vs TM Score

In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d


def analyze_tm_vs_sasa(parent_directories, reference_pdb, smoothing_sigma=0.5):
    """Scatter-plot the smoothed mean TM score against the smoothed mean SASA per amino acid.

    For each of the 20 one-letter amino-acid codes, the ``.pdb`` files whose
    names start with that letter are collected from every directory in
    ``parent_directories`` and ordered by the first integer in their file name.
    ``sasa`` is evaluated for every file and ``tm_score`` for every file
    against ``reference_pdb``. The per-directory series are cut to the length
    of the shortest one, averaged position-wise across directories and
    smoothed with a Gaussian filter. One figure per amino acid is saved to
    ``colabfold/PLOTS/TEST/colorcode_Scatter_TMvsSASA/`` and then shown.

    Amino acids for which at least one directory holds no matching file (or
    when ``parent_directories`` is empty) are skipped with a printed notice,
    because there is no common position to average.

    Args:
        parent_directories: Directory paths that contain the PDB files.
        reference_pdb: Path of the reference structure handed to ``tm_score``.
        smoothing_sigma: ``sigma`` passed to ``gaussian_filter1d``.

    Raises:
        AttributeError: If a matching file name contains no digit, because
            the numeric sort key cannot be built.
    """
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    plot_dir = 'colabfold/PLOTS/TEST/colorcode_Scatter_TMvsSASA/'

    for amino_acid in amino_acids:
        all_sasa_values = []
        all_tm_values = []

        for parent_directory in parent_directories:
            pdb_files = [f for f in os.listdir(parent_directory) if f.startswith(amino_acid) and f.endswith('.pdb')]
            # Order the files by the first integer found in the file name.
            pdb_files.sort(key=lambda x: int(re.search(r'\d+', x).group()))
            pdb_paths = [os.path.join(parent_directory, pdb_file) for pdb_file in pdb_files]

            all_sasa_values.append([sasa(pdb_path) for pdb_path in pdb_paths])
            all_tm_values.append([tm_score(pdb_path, reference_pdb) for pdb_path in pdb_paths])

        # Both metrics are computed on the same files, so one length check covers both.
        min_length = min((len(values) for values in all_sasa_values), default=0)
        if min_length == 0:
            print(f"Skipping amino acid '{amino_acid}': no PDB files available in every directory.")
            continue

        # Trim every series to the common length so they stack into a 2-D array.
        sasa_matrix = np.array([values[:min_length] for values in all_sasa_values])
        tm_matrix = np.array([values[:min_length] for values in all_tm_values])

        # Average across directories (axis 0), keeping one value per position.
        mean_sasa_values = np.mean(sasa_matrix, axis=0)
        mean_tm_values = np.mean(tm_matrix, axis=0)

        # Apply Gaussian smoothing
        smoothed_mean_sasa_values = gaussian_filter1d(mean_sasa_values, sigma=smoothing_sigma)
        smoothed_mean_tm_values = gaussian_filter1d(mean_tm_values, sigma=smoothing_sigma)

        fig = plt.figure(figsize=(10, 6))

        # 'Blues_r' is the reversed 'Blues' map: the first positions are drawn
        # dark and later positions become progressively lighter.
        cmap = plt.get_cmap('Blues_r')
        plt.scatter(smoothed_mean_sasa_values, smoothed_mean_tm_values, c=np.arange(len(smoothed_mean_sasa_values)), cmap=cmap, marker='o', alpha=0.5, label='TM Score vs SASA')

        plt.title(f'Scatter plot between SASA and TM Score for {amino_acid}')
        plt.xlabel('Average SASA')
        plt.ylabel('Average TM Score')
        plt.colorbar(label='Position along trajectory')
        plt.legend()

        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(plot_dir, exist_ok=True)

        plot_filename = os.path.join(plot_dir, f'Scatter_TM_SASA_{amino_acid}.png')
        plt.savefig(plot_filename)
        plt.show()
        # Release the figure; otherwise one figure per amino acid stays open.
        plt.close(fig)

# Example usage: call analyze_tm_vs_sasa on the two PDB folders.
# NOTE(review): all paths are relative — presumably to the project root; confirm the working directory.
parent_directories = ['colabfold/metrics/pdb_1/', 'colabfold/metrics/pdb_2/']
reference_pdb = 'ACE2(1).pdb'
analyze_tm_vs_sasa(parent_directories, reference_pdb)
