
# Contact Matrix
Author: Lilli Schuckert \
Date: Last updated 28.06.2024 10:40 CET \
Availability: This code is available on GitHub https://github.com/slilli/Protein-Sequence-Space under the MIT license.

This script calculates contact matrices from PDB files and analyzes the number of contact points along the trajectories.

Install and import the following dependencies:

In [None]:
import os
import re
import pandas as pd
from Bio.PDB import *
import numpy as np
import matplotlib.pyplot as plt
import itertools

## Calculate Contact Matrix

* Calculate contact matrices for all PDB files in the input folder and save them as CSV files in the output folder.

* Set cutoff to the desired cutoff distance (4–8 Ångström)

In [None]:
def get_contact_matrix(structure, cutoff=6.0):
    """Build a residue-residue contact matrix for one PDB file.

    Two residues count as being in contact when at least one atom of the
    first lies within ``cutoff`` of at least one atom of the second. The
    distance is the value returned by Bio.PDB's ``atom - atom`` operator.

    Args:
        structure: PDB file to parse; forwarded unchanged to
            ``PDBParser.get_structure`` (callers in this file pass a path).
        cutoff: Largest inter-atomic distance that still counts as a
            contact, in the coordinate units of the file.

    Returns:
        A symmetric boolean NumPy array of shape (n_residues, n_residues).
        The diagonal is left False because a residue is never compared
        with itself.
    """
    parser = PDBParser()  # Instantiate the PDB parser
    model = parser.get_structure('protein', structure)

    # Flatten the parsed structure to residue level.
    # NOTE(review): this takes every residue the parser returns; if the
    # files can contain several models or hetero residues, confirm that
    # including them is intended.
    residues = Selection.unfold_entities(model, 'R')
    n_residues = len(residues)

    # Unpack the atoms of each residue once instead of once per residue pair.
    atoms_per_residue = [residue.get_unpacked_list() for residue in residues]

    contact_matrix = np.zeros((n_residues, n_residues), dtype=bool)

    # Only the upper triangle is evaluated; each hit is mirrored below.
    for i, atoms_i in enumerate(atoms_per_residue):
        for j in range(i + 1, n_residues):
            atoms_j = atoms_per_residue[j]
            # any() stops at the first atom pair within the cutoff, so no
            # further atom pairs of this residue pair are examined.
            in_contact = any(
                atom_i - atom_j <= cutoff
                for atom_i in atoms_i
                for atom_j in atoms_j
            )
            if in_contact:
                contact_matrix[i, j] = True
                contact_matrix[j, i] = True

    return contact_matrix

def calculate_contact_matrices(input_folder, output_folder, cutoff=8.0):
    """Compute a contact matrix for every ``.pdb`` file in a folder and save it as CSV.

    Each ``<name>.pdb`` in ``input_folder`` produces
    ``<name>_contact_matrix.csv`` in ``output_folder``. The CSV is written
    without a row index (``index=False``), so it holds a header row of
    column positions followed by the boolean matrix rows.

    Args:
        input_folder: Directory that is scanned (non-recursively) for
            files whose name ends in ``.pdb``.
        output_folder: Directory receiving the CSV files; created,
            including missing parents, when it does not exist.
        cutoff: Contact distance forwarded to ``get_contact_matrix``.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so that files are processed in a reproducible order;
    # os.listdir returns entries in arbitrary order.
    pdb_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.pdb'))
    for pdb_file in pdb_files:
        pdb_path = os.path.join(input_folder, pdb_file)
        contact_matrix = get_contact_matrix(pdb_path, cutoff=cutoff)

        # Save contact matrix as CSV
        output_file = os.path.splitext(pdb_file)[0] + '_contact_matrix.csv'
        output_path = os.path.join(output_folder, output_file)
        pd.DataFrame(contact_matrix).to_csv(output_path, index=False)

# Example usage:
# Folder with the input PDB files, and folder that receives one CSV per structure
input_folder = "control_seqs/pdb_control_alphafold/"
output_folder = "variant_analysis/contact_matrix_permutated_seqs/alphafold/"

# Compute and save the contact matrices (the call creates the output folder if it doesn't exist)
calculate_contact_matrices(input_folder, output_folder, cutoff=8.0)

## ColabFold Contact Matrix along trajectories

In [None]:
# Directories holding the contact-matrix CSV files of the two ColabFold repetitions
contact_matrix_dir_1 = 'colabfold/contact_matrix/'
contact_matrix_dir_2 = 'colabfold/contact_matrix_2/'

# Function to extract the trajectory number and amino acid from the file name
def extract_trajectory_info(file_name):
    """Split a contact-matrix file name into (amino acid, trajectory number).

    The name is searched for ``<amino_acid>_trajectory_<number>_``. When
    that pattern is absent, the text before the first underscore is
    returned together with -1 as the trajectory number.
    """
    found = re.search(r"(.+?)_trajectory_(\d+)_", file_name)
    if not found:
        prefix = file_name.split("_")[0]
        return prefix, -1
    label, step = found.groups()
    return label, int(step)

# Get a list of all files in the contact matrix directories
file_names_1 = [f for f in os.listdir(contact_matrix_dir_1) if f.endswith('.csv')]
file_names_2 = [f for f in os.listdir(contact_matrix_dir_2) if f.endswith('.csv')]

# Sort by (amino acid, trajectory number): itertools.groupby only merges
# adjacent items, and the alignment below needs ascending trajectory
# numbers inside each group.
sorted_file_names_1 = sorted(file_names_1, key=extract_trajectory_info)
sorted_file_names_2 = sorted(file_names_2, key=extract_trajectory_info)
amino_acid_groups_1 = {k: list(g) for k, g in itertools.groupby(sorted_file_names_1, key=lambda x: extract_trajectory_info(x)[0])}
amino_acid_groups_2 = {k: list(g) for k, g in itertools.groupby(sorted_file_names_2, key=lambda x: extract_trajectory_info(x)[0])}


def _align_by_trajectory(*file_groups):
    """Yield (trajectory_num, files) for every step present in all groups.

    Each group must be ordered by ascending trajectory number. The groups
    are walked in parallel and only the lagging ones are advanced on a
    mismatch, so a step missing from one group drops just that step
    instead of shifting (and thereby discarding) every later pairing.
    """
    positions = [0] * len(file_groups)
    while all(pos < len(group) for pos, group in zip(positions, file_groups)):
        current_files = [group[pos] for pos, group in zip(positions, file_groups)]
        current_nums = [extract_trajectory_info(f)[1] for f in current_files]
        highest = max(current_nums)
        if all(num == highest for num in current_nums):
            yield highest, tuple(current_files)
            positions = [pos + 1 for pos in positions]
        else:
            positions = [pos + 1 if num < highest else pos
                         for pos, num in zip(positions, current_nums)]


plot_dir = 'colabfold/PLOTS/contact_matrix_2repetitions/'

# Loop through each amino acid group and create the plot
# (sorted so that plots are produced in a reproducible order)
for amino_acid in sorted(set(amino_acid_groups_1) & set(amino_acid_groups_2)):
    group_file_names_1 = amino_acid_groups_1[amino_acid]
    group_file_names_2 = amino_acid_groups_2[amino_acid]
    true_counts_1 = []
    true_counts_2 = []
    trajectory_nums = []
    for trajectory_num, (file_name_1, file_name_2) in _align_by_trajectory(group_file_names_1, group_file_names_2):
        # NOTE(review): calculate_contact_matrices writes its CSVs with
        # index=False; with such files index_col=0 turns the first matrix
        # column into the index and leaves it out of the sum. Confirm the
        # format of the files in these directories.
        contact_matrix_1 = pd.read_csv(os.path.join(contact_matrix_dir_1, file_name_1), index_col=0)
        contact_matrix_2 = pd.read_csv(os.path.join(contact_matrix_dir_2, file_name_2), index_col=0)
        true_counts_1.append(contact_matrix_1.to_numpy().sum())
        true_counts_2.append(contact_matrix_2.to_numpy().sum())
        trajectory_nums.append(trajectory_num)

    # Without any shared step there is nothing to plot and max() below would fail.
    if not trajectory_nums:
        print(f'No matching trajectory steps for amino acid {amino_acid}; skipping plot.')
        continue

    # Mean and standard deviation across the two repetitions, per step
    mean_true_counts = np.mean([true_counts_1, true_counts_2], axis=0)
    std_true_counts = np.std([true_counts_1, true_counts_2], axis=0)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(trajectory_nums, mean_true_counts, linestyle='-', label='Contact Points')
    plt.fill_between(trajectory_nums, mean_true_counts - std_true_counts, mean_true_counts + std_true_counts, alpha=0.2, label='Standard Deviation')
    plt.xlabel('k-steps')
    plt.ylabel('Number of contact points')
    plt.title(f'Contact Points for Amino Acid {amino_acid}')
    plt.legend()
    plt.tight_layout()
    plt.xlim(0, max(trajectory_nums))

    os.makedirs(plot_dir, exist_ok=True)

    title = f'contact_matrix_{amino_acid}'
    plot_filename = os.path.join(plot_dir, f'{title}.png')
    plt.savefig(plot_filename)

    plt.show()
    # Release the figure so that figures do not accumulate across amino acids.
    plt.close(fig)


## ESMFold Contact Matrix along trajectories

In [None]:
# One directory of contact-matrix CSV files per ESMFold trajectory repetition
contact_matrix_dirs = [
    'ESM/contact_matrix/trajectories_1/',
    'ESM/contact_matrix/trajectories_2/',
    'ESM/contact_matrix/trajectories_3/'
]

# Function to extract the trajectory number and amino acid from the file name
def extract_trajectory_info(file_name):
    """Return (amino acid, trajectory number) parsed from an ESM contact-matrix file name.

    The name is searched for
    ``<amino_acid>_trajectory_<number>.fasta_0_contact_matrix.csv``.
    Names without that pattern yield the text before the first underscore
    and -1 as the trajectory number.
    """
    parsed = re.search(r"(.+?)_trajectory_(\d+)\.fasta_0_contact_matrix\.csv", file_name)
    if parsed is not None:
        return parsed.group(1), int(parsed.group(2))
    # Fallback: everything up to the first underscore, with -1 as the number
    head, _, _ = file_name.partition("_")
    return head, -1

# Get a list of all files in the contact matrix directories
# (one list per directory, in the order of contact_matrix_dirs)
file_names = []
for matrix_dir in contact_matrix_dirs:
    file_names.append([f for f in os.listdir(matrix_dir) if f.endswith('.csv')])

# Sort the file names based on the amino acid and trajectory number
sorted_file_names = [sorted(files, key=extract_trajectory_info) for files in file_names]

# Group the file names by amino acid
# (groupby only merges adjacent items, hence the sort above)
amino_acid_groups = []
for files in sorted_file_names:
    amino_acid_groups.append({k: list(g) for k, g in itertools.groupby(files, key=lambda x: extract_trajectory_info(x)[0])})


def _align_by_trajectory(*file_groups):
    """Yield (trajectory_num, files) for every step present in all groups.

    Each group must be ordered by ascending trajectory number. The groups
    are walked in parallel and only the lagging ones are advanced on a
    mismatch, so files of different steps are never combined and a step
    missing from one group drops just that step.
    """
    positions = [0] * len(file_groups)
    while all(pos < len(group) for pos, group in zip(positions, file_groups)):
        current_files = [group[pos] for pos, group in zip(positions, file_groups)]
        current_nums = [extract_trajectory_info(f)[1] for f in current_files]
        highest = max(current_nums)
        if all(num == highest for num in current_nums):
            yield highest, tuple(current_files)
            positions = [pos + 1 for pos in positions]
        else:
            positions = [pos + 1 if num < highest else pos
                         for pos, num in zip(positions, current_nums)]


# Only amino acids that occur in every repetition can be averaged
common_amino_acids = set(amino_acid_groups[0].keys())
for group in amino_acid_groups[1:]:
    common_amino_acids &= set(group.keys())

plot_dir = 'ESM/PLOTS_NEW/contact_matrix_3repetitions/'

# Loop through each amino acid group and create the plot
# (sorted so that plots are produced in a reproducible order)
for amino_acid in sorted(common_amino_acids):
    group_file_names = [group[amino_acid] for group in amino_acid_groups]
    true_counts = [[] for _ in contact_matrix_dirs]
    trajectory_nums = []

    for trajectory_num, step_files in _align_by_trajectory(*group_file_names):
        trajectory_nums.append(trajectory_num)
        for i, file_name in enumerate(step_files):
            # NOTE(review): calculate_contact_matrices writes its CSVs with
            # index=False; with such files index_col=0 turns the first
            # matrix column into the index and leaves it out of the sum.
            # Confirm the format of the files in these directories.
            contact_matrix = pd.read_csv(os.path.join(contact_matrix_dirs[i], file_name), index_col=0)
            true_counts[i].append(contact_matrix.to_numpy().sum())

    # Without any shared step there is nothing to plot and max() below would fail.
    if not trajectory_nums:
        print(f'No matching trajectory steps for amino acid {amino_acid}; skipping plot.')
        continue

    # Mean and standard deviation across the repetitions, per step
    mean_true_counts = np.mean(true_counts, axis=0)
    std_true_counts = np.std(true_counts, axis=0)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(trajectory_nums, mean_true_counts, linestyle='-', label='Contact Points')
    plt.fill_between(trajectory_nums, mean_true_counts - std_true_counts, mean_true_counts + std_true_counts, alpha=0.2, label='Standard Deviation')
    plt.xlabel('k-steps')
    plt.ylabel('Number of contact points')
    plt.title(f'Contact Points for Amino Acid {amino_acid}')
    plt.legend()
    plt.tight_layout()
    plt.xlim(0, max(trajectory_nums))

    os.makedirs(plot_dir, exist_ok=True)

    title = f'contact_matrix_{amino_acid}'
    plot_filename = os.path.join(plot_dir, f'{title}.png')
    plt.savefig(plot_filename)

    plt.show()
    # Release the figure so that figures do not accumulate across amino acids.
    plt.close(fig)


## Contact Matrix along genetic trajectories

In [None]:
import itertools
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Directories containing contact matrix files:
# the two ColabFold directories whose counts are averaged in the plot,
# and the variant-analysis directory plotted as its own curve
contact_matrix_dir_1 = 'colabfold/contact_matrix/'
contact_matrix_dir_2 = 'colabfold/contact_matrix_2/'
contact_matrix_dir_variant = 'variant_analysis/contact_matrix/'

# Function to extract the trajectory number and amino acid from the file name
def extract_trajectory_info(file_name):
    """Parse ``<amino_acid>_trajectory_<number>_...`` out of a file name.

    Returns the tuple (amino_acid, trajectory_number). If the pattern is
    not found, the part of the name before the first underscore is
    returned with -1 in place of the trajectory number.
    """
    hit = re.search(r"(.+?)_trajectory_(\d+)_", file_name)
    if hit is None:
        return file_name.split("_")[0], -1
    return hit.group(1), int(hit.group(2))

# Get a list of all files in the contact matrix directories
file_names_1 = [f for f in os.listdir(contact_matrix_dir_1) if f.endswith('.csv')]
file_names_2 = [f for f in os.listdir(contact_matrix_dir_2) if f.endswith('.csv')]
file_names_variant = [f for f in os.listdir(contact_matrix_dir_variant) if f.endswith('.csv')]

# Sort the file names based on the amino acid and trajectory number
sorted_file_names_1 = sorted(file_names_1, key=extract_trajectory_info)
sorted_file_names_2 = sorted(file_names_2, key=extract_trajectory_info)
sorted_file_names_variant = sorted(file_names_variant, key=extract_trajectory_info)

# Group the file names by amino acid
# (groupby only merges adjacent items, hence the sort above)
amino_acid_groups_1 = {k: list(g) for k, g in itertools.groupby(sorted_file_names_1, key=lambda x: extract_trajectory_info(x)[0])}
amino_acid_groups_2 = {k: list(g) for k, g in itertools.groupby(sorted_file_names_2, key=lambda x: extract_trajectory_info(x)[0])}
amino_acid_groups_variant = {k: list(g) for k, g in itertools.groupby(sorted_file_names_variant, key=lambda x: extract_trajectory_info(x)[0])}


def _align_by_trajectory(*file_groups):
    """Yield (trajectory_num, files) for every step present in all groups.

    Each group must be ordered by ascending trajectory number. The groups
    are walked in parallel and only the lagging ones are advanced on a
    mismatch, so a step missing from one group drops just that step
    instead of shifting (and thereby discarding) every later pairing.
    """
    positions = [0] * len(file_groups)
    while all(pos < len(group) for pos, group in zip(positions, file_groups)):
        current_files = [group[pos] for pos, group in zip(positions, file_groups)]
        current_nums = [extract_trajectory_info(f)[1] for f in current_files]
        highest = max(current_nums)
        if all(num == highest for num in current_nums):
            yield highest, tuple(current_files)
            positions = [pos + 1 for pos in positions]
        else:
            positions = [pos + 1 if num < highest else pos
                         for pos, num in zip(positions, current_nums)]


plot_dir = 'variant_analysis/PLOTS/combined_theory_genetic/contact_matrix/'

# Loop through each amino acid group and create the plot
# (sorted so that plots are produced in a reproducible order)
for amino_acid in sorted(set(amino_acid_groups_1) & set(amino_acid_groups_2) & set(amino_acid_groups_variant)):
    group_file_names_1 = amino_acid_groups_1[amino_acid]
    group_file_names_2 = amino_acid_groups_2[amino_acid]
    group_file_names_variant = amino_acid_groups_variant[amino_acid]

    true_counts_1 = []
    true_counts_2 = []
    true_counts_variant = []
    trajectory_nums = []

    aligned_steps = _align_by_trajectory(group_file_names_1, group_file_names_2, group_file_names_variant)
    for trajectory_num, (file_name_1, file_name_2, file_name_variant) in aligned_steps:
        # NOTE(review): calculate_contact_matrices writes its CSVs with
        # index=False; with such files index_col=0 turns the first matrix
        # column into the index and leaves it out of the sum. Confirm the
        # format of the files in these directories.
        contact_matrix_1 = pd.read_csv(os.path.join(contact_matrix_dir_1, file_name_1), index_col=0)
        contact_matrix_2 = pd.read_csv(os.path.join(contact_matrix_dir_2, file_name_2), index_col=0)
        contact_matrix_variant = pd.read_csv(os.path.join(contact_matrix_dir_variant, file_name_variant), index_col=0)

        # All four lists grow together, so they always share one length.
        true_counts_1.append(contact_matrix_1.to_numpy().sum())
        true_counts_2.append(contact_matrix_2.to_numpy().sum())
        true_counts_variant.append(contact_matrix_variant.to_numpy().sum())
        trajectory_nums.append(trajectory_num)

    # Without any shared step there is nothing to plot.
    if not trajectory_nums:
        print(f'No matching trajectory steps for amino acid {amino_acid}; skipping plot.')
        continue

    # Mean and standard deviation of the two ColabFold directories, per step
    mean_true_counts = np.mean([true_counts_1, true_counts_2], axis=0)
    std_true_counts = np.std([true_counts_1, true_counts_2], axis=0)

    fig = plt.figure(figsize=(8, 6))
    plt.plot(trajectory_nums, true_counts_variant, label='Contact Points (Genetic Sequences)', linewidth=1)
    plt.plot(trajectory_nums, mean_true_counts, label='Contact Points (Theoretical Sequences)', linestyle='dashed', linewidth=1)
    plt.fill_between(trajectory_nums, mean_true_counts - std_true_counts, mean_true_counts + std_true_counts, color='orange', alpha=0.3, label='Standard Deviation (Theoretical)')

    plt.xlabel('k-steps')
    plt.ylabel('Number of contact points')
    plt.title(f'Contact Points for Amino Acid {amino_acid}')
    plt.legend()
    plt.tight_layout()
    # Fixed x-range, identical for every amino acid
    plt.xlim(0, 195)

    os.makedirs(plot_dir, exist_ok=True)

    title = f'contact_matrix_{amino_acid}'
    plot_filename = os.path.join(plot_dir, f'{title}.png')
    plt.savefig(plot_filename)

    plt.show()
    # Release the figure so that figures do not accumulate across amino acids.
    plt.close(fig)
