In [1]:
# proteusPy Disulfide Database Analysis
# Author: Eric G. Suchanek, PhD.
# Last revision: 2025-01-18 16:41:53
# Cα N, Cα, Sγ, Cβ, C',  Å ° ρ
#
# This notebook takes about 2 minutes 48 seconds to run on an
# M3 Max MacbookPro 2024 with 38GB of RAM

# RCSB Disulfide Bond Structural Geometry Analysis
Eric G. Suchanek, PhD, Flux-Frontiers, Cincinnati OH

## Summary
In this notebook I present an analysis of the geometry of Disulfide Bonds extracted from the RCSB, using the ``proteusPy`` package. This analysis was performed against the entire unfiltered database in order to clean it for more rigorous statistical analysis. The fundamental issues being addressed revolve around the quality of the Disulfide Bond structures themselves. Since the structures studied are the result of X-ray crystallography, the atomic coordinates are subject to positional uncertainties. These uncertainties can lead to unusual and/or non-physical structures that shouldn't be included in subsequent structural analyses.


In [None]:
from pathlib import Path
import os
import logging

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

# pio.renderers.default = "png"  # or 'svg'

import proteusPy as pp
from proteusPy import CA_CUTOFF, SG_CUTOFF

# Location of the PDB files: the PDB environment variable wins when it is set,
# otherwise fall back to ~/pdb.
HOME = Path.home()
PDB = Path(os.environ.get("PDB", HOME / "pdb"))

# Configure proteusPy logging (log file name: database_analysis.log) and set
# the level for the "proteusPy" module to WARNING.
pp.configure_master_logger("database_analysis.log")
pp.set_logger_level_for_module("proteusPy", logging.WARNING)

# Pick the Plotly template that matches the theme reported by proteusPy;
# anything other than "light" is treated as dark.
theme = pp.get_theme()
pio.templates.default = "plotly_white" if theme == "light" else "plotly_dark"
print("Light Theme" if theme == "light" else "Dark Theme")

In [2]:
def highlight_worst_structures(df, top_n=10):
    """
    Plot bond length deviation against angle deviation and mark the outliers.

    The ``top_n`` rows with the largest ``Bondlength_Deviation``,
    ``Angle_Deviation`` and ``Ca_Distance`` are combined (duplicate rows
    removed), drawn as red crosses annotated with their ``SS_Name``, and then
    counted per ``PDB_ID`` in a separate bar chart. Both figures are shown;
    nothing is returned.

    :param df: DataFrame containing the disulfide information.
    :type df: pd.DataFrame
    :param top_n: Number of worst structures to take per metric.
    :type top_n: int
    """
    # Gather the top_n outliers for each metric, then merge them into one frame
    metric_columns = ("Bondlength_Deviation", "Angle_Deviation", "Ca_Distance")
    outliers = pd.concat(
        [df.nlargest(top_n, column) for column in metric_columns]
    ).drop_duplicates()

    # Number of outlier disulfides contributed by each PDB entry
    per_pdb_counts = outliers.groupby("PDB_ID").size().reset_index(name="Count")

    # Figure 1: every structure, with the outliers overlaid and labelled
    scatter_fig = px.scatter(
        df,
        x="Bondlength_Deviation",
        y="Angle_Deviation",
        title="Bond Length Deviation vs. Angle Deviation",
    )
    scatter_fig.add_scatter(
        x=outliers["Bondlength_Deviation"],
        y=outliers["Angle_Deviation"],
        mode="markers",
        marker={"color": "red", "size": 10, "symbol": "x"},
        name="Worst Structures",
    )
    for _, outlier in outliers.iterrows():
        scatter_fig.add_annotation(
            x=outlier["Bondlength_Deviation"],
            y=outlier["Angle_Deviation"],
            text=outlier["SS_Name"],
            showarrow=True,
            arrowhead=1,
        )
    scatter_fig.show()

    # Figure 2: outlier counts aggregated by PDB_ID
    bar_fig = px.bar(
        per_pdb_counts,
        x="PDB_ID",
        y="Count",
        title="Worst Structures Aggregated by PDB_ID",
    )
    bar_fig.update_layout(xaxis_title="PDB_ID", yaxis_title="Count")
    bar_fig.show()


def get_unique_pdb_ids(data_frame):
    """
    Collect the distinct ``PDB_ID`` values found in the DataFrame.

    Values are returned in order of first appearance.

    :param data_frame: DataFrame containing the disulfide information.
    :type data_frame: pd.DataFrame
    :return: List of unique pdb_id values.
    :rtype: list
    """
    pdb_id_column = data_frame["PDB_ID"]
    return pdb_id_column.unique().tolist()

Global cutoff values for the Cα-Cα and Sγ-Sγ distances, in Å. We will initially set both to -1, indicating no filtering. This will extract the entire database.

In [None]:
# Load the complete database with distance filtering switched off:
# cutoff (Cα-Cα) and sg_cutoff (Sγ-Sγ) are both -1.0, i.e. no filtering.
pdb_full = pp.DisulfideLoader(
    cutoff=-1.0,
    sg_cutoff=-1.0,
    subset=False,
    quiet=False,
    verbose=True,
)

As shown above, the default database has no Cα or Sγ cutoffs. This enables one to filter by Cα-Cα and Sγ-Sγ distances to prune the database of unreasonable Disulfide bonds. Let's start by creating the bond length and bond angle deviation dataframe, which compares the bond length and bond angle geometry of each Disulfide against idealized bond lengths and bond angles.


In [None]:
# Build the deviation DataFrame for the full database and keep it as dev_df;
# the cutoff statistics computed later in the notebook read it by that name.
dev_df = pdb_full.SSList.create_deviation_dataframe(verbose=True)

# Histograms of the deviations for the full database
pdb_full.SSList.plot_deviation_histograms(verbose=True, log=True)

In [None]:
# Deviation scatter plots for the full, unfiltered database
pdb_full.SSList.plot_deviation_scatterplots(verbose=True)

Let's start by looking at the Cα-Cα distance distributions for the full dataset:

In [None]:
# Cα distance distribution for the full database; cutoff=-1 means no filtering
pdb_full.plot_distances("ca", cutoff=-1)

In [None]:
# Cα-Cα distances of the full database, with no cutoff applied (cutoff=-1).
# The disulfide list's own plot_distances method takes the distance type,
# comparison and cutoff directly, so no separate extraction step is needed.
pdb_full.SSList.plot_distances(distance_type="ca", comparison="less", cutoff=-1)

The Cα distances also show anomalous near (close contact) and far (> 8 Å) distances, which would be structurally impossible.

In [None]:
# Sγ-Sγ distances of the full database that are below SG_CUTOFF.
# The disulfide list's own plot_distances method takes the distance type,
# comparison and cutoff directly, so no separate extraction step is needed.
pdb_full.SSList.plot_distances(
    distance_type="sg", comparison="less", cutoff=SG_CUTOFF
)

## Structural Statistical Cutoff Considerations

We can calculate overall distribution statistics for the Cα and Sγ distances, as well as the bond lengths and bond angles, in an effort to rationally filter the Disulfide database. The goal is to produce a high-quality database with geometrically reasonable/realistic structures. Since model quality varies greatly across the entire database, as is shown by the distance distributions above, this seems like a prudent step.

In [None]:
from scipy.stats import norm

# Z-score of the chosen percentile: inverse CDF of the standard normal
percentile = 95
z_score = norm.ppf(percentile / 100.0)

print(f"The Z-score for the {percentile}th percentile is approximately {z_score:.3f}")

# dev_df is the deviation DataFrame; this cell reads its 'Bondlength_Deviation',
# 'Angle_Deviation', 'Ca_Distance' and 'Sg_Distance' columns.

# The Z-score doubles as the number of standard deviations for the std method
std = z_score

# Standard Deviation Method
distance_cutoff_std = pp.calculate_std_cutoff(
    dev_df, "Bondlength_Deviation", num_std=std
)
angle_cutoff_std = pp.calculate_std_cutoff(dev_df, "Angle_Deviation", num_std=std)
ca_cutoff_std = pp.calculate_std_cutoff(dev_df, "Ca_Distance", num_std=std)
sg_cutoff_std = pp.calculate_std_cutoff(dev_df, "Sg_Distance", num_std=std)

# Percentile Method
distance_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Bondlength_Deviation", percentile=percentile
)
angle_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Angle_Deviation", percentile=percentile
)
ca_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Ca_Distance", percentile=percentile
)
sg_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Sg_Distance", percentile=percentile
)

print(f"Bond Length Deviation Cutoff ({std:.2f} Std Dev): {distance_cutoff_std:.2f}")
print(f"Angle Deviation Cutoff ({std:.2f} Std Dev): {angle_cutoff_std:.2f}")
print(f"Ca Distance Cutoff ({std:.2f} Std Dev): {ca_cutoff_std:.2f}")
print(f"Sg Distance Cutoff ({std:.2f} Std Dev): {sg_cutoff_std:.2f}")

# percentile is an integer, so it is printed without decimals ("95th", not
# "95.00th"), matching the Z-score message above.
print(
    f"\nBond Length Deviation Cutoff ({percentile}th Percentile): {distance_cutoff_percentile:.2f}"
)
print(
    f"Angle Deviation Cutoff ({percentile}th Percentile): {angle_cutoff_percentile:.2f}"
)
print(f"Ca Distance Cutoff ({percentile}th Percentile): {ca_cutoff_percentile:.2f}")
print(f"Sg Distance Cutoff ({percentile}th Percentile): {sg_cutoff_percentile:.2f}")

In [30]:
# These assignments replace the CA_CUTOFF / SG_CUTOFF values imported from
# proteusPy at the top of the notebook; every later cell sees these numbers.
# NOTE(review): presumably chosen from the cutoff statistics printed by the
# preceding cell — confirm against its output.
CA_CUTOFF: float = 6.71  # Cα-Cα distance cutoff, Å
SG_CUTOFF: float = 2.12  # Sγ-Sγ distance cutoff, Å


We can create our own custom ``DisulfideLoader`` object on the fly by instantiating it with our desired Cα and Sγ cutoffs. We first make one with a Cα-Cα cutoff and no Sγ-Sγ cutoff (setting a cutoff to -1.0 means no filtering):

In [None]:
# Loader filtered on the Cα-Cα distance only; sg_cutoff=-1.0 leaves the
# Sγ-Sγ distance unfiltered.
pdb_ca_filtered = pp.DisulfideLoader(
    cutoff=CA_CUTOFF,
    sg_cutoff=-1.0,
    subset=False,
    verbose=True,
)

# Disulfides dropped relative to the unfiltered database
ca_delta = pdb_full.TotalDisulfides - pdb_ca_filtered.TotalDisulfides
print(f"\nTotal Disulfides Removed by Cα filtering: {ca_delta}")

Now let's try filtering against the Sγ cutoff:

In [None]:
# Sγ-Sγ distances of the full database that are below SG_CUTOFF.
# The disulfide list's own plot_distances method takes the distance type,
# comparison and cutoff directly, so no separate extraction step is needed.
pdb_full.SSList.plot_distances(
    distance_type="sg", comparison="less", cutoff=SG_CUTOFF
)

In [None]:
# Sγ-Sγ distances of the full database that are above SG_CUTOFF, i.e. the
# bonds the Sγ filter would remove.
# The disulfide list's own plot_distances method takes the distance type,
# comparison and cutoff directly, so no separate extraction step is needed.
pdb_full.SSList.plot_distances(
    distance_type="sg", comparison="greater", cutoff=SG_CUTOFF
)

Now filter by Sγ cutoff alone:

In [None]:
# Loader filtered on the Sγ-Sγ distance only; cutoff=-1.0 leaves the
# Cα-Cα distance unfiltered.
pdb_sg_filtered = pp.DisulfideLoader(
    cutoff=-1.0,
    sg_cutoff=SG_CUTOFF,
    subset=False,
    verbose=True,
)

In [None]:
# Disulfides dropped by the Sγ filter, relative to the unfiltered database
sg_delta = pdb_full.TotalDisulfides - pdb_sg_filtered.TotalDisulfides
print(f"Total Disulfides Removed by Sγ filtering: {sg_delta}")

Finally, fully filtered:

In [None]:
# Loader with both distance filters active: Cα-Cα (cutoff) and Sγ-Sγ (sg_cutoff)
pdb_fully_filtered = pp.DisulfideLoader(
    cutoff=CA_CUTOFF,
    sg_cutoff=SG_CUTOFF,
    subset=False,
    quiet=False,
    verbose=True,
)

# Disulfides dropped by the combined filter, relative to the unfiltered database
total_delta = pdb_full.TotalDisulfides - pdb_fully_filtered.TotalDisulfides
print(f"\nTotal Disulfides Removed by Cα and Sγ filtering: {total_delta}")

This database therefore represents Disulfides that, at 95% confidence, have geometrically reasonable Cα-Cα and Sγ-Sγ distances. Let's have a look at the distance distributions for the filtered database. The utility functions enable us to extract Disulfides with the desired distance constraints:

In [None]:
# Cα-Cα distances below CA_CUTOFF in the fully filtered database
pdb_fully_filtered.SSList.plot_distances(
    cutoff=CA_CUTOFF,
    comparison="less",
    distance_type="ca",
)


In [None]:
# Sγ-Sγ distances below SG_CUTOFF in the fully filtered database.
# The disulfide list's own plot_distances method takes the distance type,
# comparison and cutoff directly, so no separate extraction step is needed.
pdb_fully_filtered.SSList.plot_distances(
    distance_type="sg", comparison="less", cutoff=SG_CUTOFF
)

In [None]:
# Deviation scatter plots for the fully filtered database
pdb_fully_filtered.SSList.plot_deviation_scatterplots(
    theme="auto",
    verbose=True,
)
