In [None]:
# proteusPy Disulfide Database Analysis
# Author: Eric G. Suchanek, PhD.
# Cα N, Cα, Sγ, Cβ, C',  Å ° ρ

# RCSB Disulfide Bond Structural Geometry Analysis
Eric G. Suchanek, PhD, Flux-Frontiers, Cincinnati OH

## Summary
In this notebook I present an analysis of the geometry of Disulfide Bonds extracted from the RCSB, using the ``proteusPy`` package. This analysis was performed against the entire unfiltered database in order to clean it for more rigorous statistical analysis. The fundamental issues being addressed revolve around the quality of the Disulfide Bond structures themselves. Since the structures studied are the result of X-ray crystallography, the atomic coordinates are subject to positional uncertainties. These uncertainties can lead to unusual and/or non-physical structures that shouldn't be included in subsequent structural analyses. 


In [None]:
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from scipy.stats import norm

import proteusPy as pp

# Filesystem anchors: the user's home directory, and the PDB directory taken
# from the PDB environment variable (falling back to <home>/pdb when unset).
HOME = Path.home()
PDB = Path(os.getenv("PDB", HOME / "pdb"))

# Configure the proteusPy master logger with this notebook's log file name and
# restrict the "proteusPy" module logger to WARNING and above.
pp.configure_master_logger("database_analysis.log")
pp.set_logger_level_for_module("proteusPy", logging.WARNING)

# Pick the plotly template from the theme reported by proteusPy; any value
# other than "light" is treated as dark.
theme = pp.get_theme()
pio.templates.default = "plotly_white" if theme == "light" else "plotly_dark"
print("Light Theme" if theme == "light" else "Dark Theme")

In [2]:
def extract_distances(disulfide_list, distance_type="sg", comparison="less", cutoff=4):
    """
    Collect one kind of distance from every disulfide and optionally filter it by a cutoff.

    :param disulfide_list: Iterable of objects exposing ``sg_distance`` and
        ``ca_distance`` attributes.
    :param distance_type: Which attribute to read: 'sg' (``sg_distance``) or
        'ca' (``ca_distance``).
    :param comparison: 'less' keeps values strictly below ``cutoff``;
        'greater' keeps values at or above ``cutoff``.
    :param cutoff: Threshold for the filter. A value equal to -1.0 disables
        filtering, in which case ``comparison`` is not inspected at all.
    :return: List of the selected distance values, in input order.
    :raises ValueError: If ``distance_type`` is not 'sg' or 'ca', or if filtering
        is requested and ``comparison`` is not 'less' or 'greater'.
    """
    if distance_type == "sg":
        attribute = "sg_distance"
    elif distance_type == "ca":
        attribute = "ca_distance"
    else:
        raise ValueError("Invalid distance_type. Must be 'sg' or 'ca'.")

    all_values = [getattr(ss, attribute) for ss in disulfide_list]

    # -1.0 is the "no cutoff" sentinel: hand everything back unfiltered.
    if cutoff == -1.0:
        return all_values

    if comparison == "less":
        return [value for value in all_values if value < cutoff]
    if comparison == "greater":
        return [value for value in all_values if value >= cutoff]
    raise ValueError("Invalid comparison. Must be 'less' or 'greater'.")


def plot_distances(distances, distance_type="sg", cutoff=4, flip=False):
    """
    Show a 100-bin plotly express histogram of the supplied distance values.

    :param distances: List of distance values to plot.
    :param distance_type: 'sg' or 'ca'; selects the column name, figure title
        and x-axis wording.
    :param cutoff: Value quoted in the x-axis title. A value equal to -1.0
        produces an "All ... Distances" title instead.
    :param flip: When true the x-axis title reads ">= cutoff"; otherwise it
        reads "< cutoff". Only the label changes, not the data.
    :raises ValueError: If ``distance_type`` is not 'sg' or 'ca'.
    """
    unfiltered = cutoff == -1.0

    if distance_type == "sg":
        column_name = "SG Distance"
        title = "Sγ Distance Distribution"
        if unfiltered:
            xtitle = "All Sγ-Sγ Distances"
        elif flip:
            xtitle = f"Sγ-Sγ Distance >= {cutoff}Å"
        else:
            xtitle = f"Sγ Distance < {cutoff}Å"
    elif distance_type == "ca":
        column_name = "Ca Distance"
        title = "Cα Distance Distribution"
        if unfiltered:
            xtitle = "All Cα-Cα Distances"
        elif flip:
            xtitle = f"Cα-Cα Distance >= {cutoff}Å"
        else:
            xtitle = f"Cα Distance < {cutoff}Å"
    else:
        raise ValueError("Invalid distance_type. Must be 'sg' or 'ca'.")

    # Single-column frame so plotly can address the data by column name.
    frame = pd.DataFrame(distances, columns=[column_name])

    histogram = px.histogram(frame, x=column_name, nbins=100, title=title)
    histogram.update_layout(xaxis_title=xtitle, yaxis_title="Frequency", bargap=0.2)
    histogram.show()


def plot_histograms(df):
    """
    Show three 300-bin histograms, one each for the Bondlength_Deviation,
    Angle_Deviation and Ca_Distance columns.

    :param df: DataFrame containing the disulfide information.
    :type df: pd.DataFrame
    """
    # (column, figure title, x-axis label) for each histogram, in display order.
    panels = (
        ("Bondlength_Deviation", "Bond Length Deviation", "Bond Length Deviation"),
        ("Angle_Deviation", "Angle Deviation", "Angle Deviation"),
        ("Ca_Distance", "Cα Distance Distribution", "Cα Distance"),
    )
    for column, heading, axis_label in panels:
        histogram = px.histogram(df, x=column, nbins=300, title=heading)
        histogram.update_layout(xaxis_title=axis_label, yaxis_title="Frequency")
        histogram.show()


def highlight_worst_structures(df, top_n=10):
    """
    Mark and label the most extreme disulfides on a deviation scatter plot,
    then show a bar chart counting those disulfides per PDB_ID.

    The "worst" rows are the union (duplicates removed) of the ``top_n``
    largest values of Bondlength_Deviation, Angle_Deviation and Ca_Distance.

    :param df: DataFrame containing the disulfide information.
    :type df: pd.DataFrame
    :param top_n: Number of worst structures to take per metric.
    :type top_n: int
    """
    metrics = ("Bondlength_Deviation", "Angle_Deviation", "Ca_Distance")

    # Top-N per metric, stacked in metric order, with repeated rows collapsed.
    outliers = pd.concat(
        [df.nlargest(top_n, metric) for metric in metrics]
    ).drop_duplicates()

    # How many flagged disulfides each PDB entry contributes.
    per_pdb = outliers.groupby("PDB_ID").size().reset_index(name="Count")

    # All structures as the background cloud, flagged ones overlaid as red crosses.
    scatter = px.scatter(
        df,
        x="Bondlength_Deviation",
        y="Angle_Deviation",
        title="Bond Length Deviation vs. Angle Deviation",
    )
    scatter.add_scatter(
        x=outliers["Bondlength_Deviation"],
        y=outliers["Angle_Deviation"],
        mode="markers",
        marker={"color": "red", "size": 10, "symbol": "x"},
        name="Worst Structures",
    )
    for _, record in outliers.iterrows():
        scatter.add_annotation(
            x=record["Bondlength_Deviation"],
            y=record["Angle_Deviation"],
            text=record["SS_Name"],
            showarrow=True,
            arrowhead=1,
        )
    scatter.show()

    # Separate figure: flagged-disulfide count per PDB_ID.
    bars = px.bar(
        per_pdb,
        x="PDB_ID",
        y="Count",
        title="Worst Structures Aggregated by PDB_ID",
    )
    bars.update_layout(xaxis_title="PDB_ID", yaxis_title="Count")
    bars.show()


def plot_scatterplots(df):
    """
    Show one scatter plot per metric (Bondlength_Deviation, Angle_Deviation,
    Ca_Distance and Sg_Distance) with the row index on the x-axis.

    Every figure is displayed. Previously the Cα Distance figure was built
    but replaced by the Sg Distance figure before ``show()`` was called, so
    it never appeared.

    :param df: DataFrame containing the disulfide information; must provide
        the four columns named above.
    :type df: pd.DataFrame
    """
    # (column, label) pairs; the label is used for both title and y-axis.
    panels = (
        ("Bondlength_Deviation", "Bondlength Deviation"),
        ("Angle_Deviation", "Angle Deviation"),
        ("Ca_Distance", "Cα Distance"),
        ("Sg_Distance", "Sg Distance"),
    )
    for column, label in panels:
        fig = px.scatter(df, x=df.index, y=column, title=label)
        fig.update_layout(xaxis_title="Row Index", yaxis_title=label)
        fig.show()


def get_unique_pdb_ids(data_frame):
    """
    Return the distinct values of the ``PDB_ID`` column, in order of first
    appearance.

    :param data_frame: DataFrame containing the disulfide information.
    :type data_frame: pd.DataFrame
    :return: List of unique pdb_id values.
    :rtype: list
    """
    pdb_column = data_frame["PDB_ID"]
    return pdb_column.unique().tolist()

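Global cutoff values for the Cα-Cα and Sγ-Sγ distances, in Å. Nominal values are 8.0 Å for Cα-Cα and 2.5 Å for the Sγ-Sγ bond length; the cell below initially sets both to -1.0, which means no filtering, so that the full distance distributions can be examined first.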

In [3]:
# Start with both distance cutoffs disabled: -1.0 is the "no filtering" value
# that extract_distances() checks for and that the unfiltered loader is given.
CA_CUTOFF = -1.0
SG_CUTOFF = -1.0

In [None]:
# instantiate the complete database from its source with no cutoffs
# (cutoff is the Cα-Cα limit, sg_cutoff the Sγ-Sγ limit; -1.0 means no filtering)
pdb_full = pp.DisulfideLoader(
    subset=False,
    verbose=True,
    cutoff=-1.0,
    sg_cutoff=-1.0,
)
# Print a summary of the loaded database.
# NOTE(review): memusg=False presumably skips the memory-usage report -- confirm.
pdb_full.describe(memusg=False)

As shown above, the default database has no Cα or Sγ cutoffs. This enables one to filter by Cα-Cα and Sγ-Sγ distances to prune unreasonable Disulfide bonds from the database. Let's start by creating the bond length and bond angle deviation dataframe. The `create_deviation_dataframe()` method compares the bond length and bond angle geometry of each Disulfide against idealized bond lengths and bond angles, and creates a dataframe with this information.


In [None]:
# Take the disulfide list from the unfiltered loader and build the deviation
# dataframe; later cells read its Bondlength_Deviation, Angle_Deviation,
# Ca_Distance and Sg_Distance columns.
sslist = pdb_full.SSList
dev_df = sslist.create_deviation_dataframe()

In [None]:
# Display summary statistics of the deviation dataframe as the cell output.
dev_df.describe()

Let's start by looking at the Cα-Cα distance distribution for the full dataset:

In [None]:
# Use CA_CUTOFF for both the extraction and the axis label so the plotted data
# and its title cannot disagree. On a top-to-bottom run CA_CUTOFF is still -1.0
# here, so every Cα-Cα distance is returned unfiltered.
ca_distances = extract_distances(
    pdb_full.SSList, distance_type="ca", comparison="less", cutoff=CA_CUTOFF
)
plot_distances(ca_distances, cutoff=CA_CUTOFF, distance_type="ca", flip=False)

The Cα distances also show anomalous near (close-contact) and far (> 8 Å) values, which would be structurally impossible.

In [None]:
# Sγ-Sγ distances under the current SG_CUTOFF. On a top-to-bottom run the
# constant is still -1.0 here, so nothing is filtered and the axis title reads
# "All Sγ-Sγ Distances".
sg_distances = extract_distances(
    pdb_full.SSList, distance_type="sg", comparison="less", cutoff=SG_CUTOFF
)
plot_distances(sg_distances, cutoff=SG_CUTOFF, distance_type="sg", flip=False)

## Structural Statistical Cutoff Considerations

We can calculate overall distribution statistics for the Cα and Sγ distances, as well as the bond lengths and bond angles, in an effort to rationally filter the Disulfide database. The goal is to produce a high-quality database with geometrically reasonable/realistic structures. Since model quality varies greatly across the entire database, as is shown by the distance distributions above, this seems like a prudent step.

In [None]:
# `norm` (scipy.stats) is imported in the notebook's import cell.

# Calculate the Z-score for the 95th percentile (inverse CDF of the standard
# normal); it is reused below as the number of standard deviations.
percentile = 95
z_score = norm.ppf(percentile / 100.0)

print(f"The Z-score for the {percentile}th percentile is approximately {z_score:.3f}")

# dev_df (built earlier) supplies the 'Bondlength_Deviation', 'Angle_Deviation',
# 'Ca_Distance' and 'Sg_Distance' columns used for every cutoff below.

# Set some parameters for the standard deviation and percentile methods
std = z_score

# Standard Deviation Method
distance_cutoff_std = pp.calculate_std_cutoff(
    dev_df, "Bondlength_Deviation", num_std=std
)
angle_cutoff_std = pp.calculate_std_cutoff(dev_df, "Angle_Deviation", num_std=std)
ca_cutoff_std = pp.calculate_std_cutoff(dev_df, "Ca_Distance", num_std=std)
sg_cutoff_std = pp.calculate_std_cutoff(dev_df, "Sg_Distance", num_std=std)

# Percentile Method
distance_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Bondlength_Deviation", percentile=percentile
)
angle_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Angle_Deviation", percentile=percentile
)
ca_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Ca_Distance", percentile=percentile
)
sg_cutoff_percentile = pp.calculate_percentile_cutoff(
    dev_df, "Sg_Distance", percentile=percentile
)

# Report both sets of cutoffs so they can be compared side by side.
print(f"Bond Length Deviation Cutoff ({std:.2f} Std Dev): {distance_cutoff_std:.2f}")
print(f"Angle Deviation Cutoff ({std:.2f} Std Dev): {angle_cutoff_std:.2f}")
print(f"Ca Distance Cutoff ({std:.2f} Std Dev): {ca_cutoff_std:.2f}")
print(f"Sg Distance Cutoff ({std:.2f} Std Dev): {sg_cutoff_std:.2f}")

print(
    f"\nBond Length Deviation Cutoff ({percentile:.2f}th Percentile): {distance_cutoff_percentile:.2f}"
)
print(
    f"Angle Deviation Cutoff ({percentile:.2f}th Percentile): {angle_cutoff_percentile:.2f}"
)
print(f"Ca Distance Cutoff ({percentile:.2f}th Percentile): {ca_cutoff_percentile:.2f}")
print(f"Sg Distance Cutoff ({percentile:.2f}th Percentile): {sg_cutoff_percentile:.2f}")

In [14]:
# Working cutoffs (Å) used by the filtered loaders and plots that follow.
# This rebinds the constants that were -1.0 earlier, so re-running an earlier
# plotting cell after this one applies these values instead of "no filtering".
# NOTE(review): presumably taken from the cutoff statistics printed by the
# preceding cell -- confirm and record which method produced them.
CA_CUTOFF = 6.69
SG_CUTOFF = 2.11


We can create our own custom ``DisulfideLoader`` object on the fly by instantiating it with our desired Cα and Sγ cutoffs. We first make one with a Cα-Cα cutoff and no Sγ-Sγ cutoff (setting a cutoff to -1.0 means no filtering):

In [None]:
# Loader filtered on the Cα-Cα distance only (sg_cutoff=-1.0 leaves Sγ-Sγ
# unfiltered).
pdb_ca_filtered = pp.DisulfideLoader(
    subset=False, verbose=True, cutoff=CA_CUTOFF, sg_cutoff=-1.0
)
pdb_ca_filtered.describe(memusg=True)
# Number of disulfides dropped relative to the unfiltered database.
ca_delta = pdb_full.TotalDisulfides - pdb_ca_filtered.TotalDisulfides
print(f"Total Disulfides Removed by Cα filtering: {ca_delta}")

Now let's try filtering against the Sγ cutoff:

In [None]:
# First plot: Sγ-Sγ distances strictly below SG_CUTOFF.
sg_distances = extract_distances(
    pdb_full.SSList, distance_type="sg", comparison="less", cutoff=SG_CUTOFF
)
plot_distances(sg_distances, distance_type="sg", cutoff=SG_CUTOFF, flip=False)


# Second plot: the complementary set, distances at or above SG_CUTOFF.
# Note that sg_distances is rebound, so after this cell it holds this set.
sg_distances = extract_distances(
    pdb_full.SSList, distance_type="sg", comparison="greater", cutoff=SG_CUTOFF
)
plot_distances(sg_distances, distance_type="sg", cutoff=SG_CUTOFF, flip=True)

In [None]:
# Loader filtered on the Sγ-Sγ distance only (cutoff=-1.0 leaves Cα-Cα
# unfiltered).
pdb_sg_filtered = pp.DisulfideLoader(
    subset=False, verbose=True, cutoff=-1.0, sg_cutoff=SG_CUTOFF
)
pdb_sg_filtered.describe(memusg=False)

In [None]:
# Number of disulfides dropped by the Sγ-Sγ filter relative to the unfiltered
# database.
sg_delta = pdb_full.TotalDisulfides - pdb_sg_filtered.TotalDisulfides
print(f"Total Disulfides Removed by Sg filtering: {sg_delta}")

Finally, fully filtered:

In [None]:
# Loader with both the Cα-Cα and the Sγ-Sγ cutoffs applied.
pdb_fully_filtered = pp.DisulfideLoader(
    subset=False, verbose=True, cutoff=CA_CUTOFF, sg_cutoff=SG_CUTOFF
)
pdb_fully_filtered.describe(memusg=True)

# Number of disulfides dropped by the combined filter relative to the
# unfiltered database.
total_delta = pdb_full.TotalDisulfides - pdb_fully_filtered.TotalDisulfides
print(f"Total Disulfides Removed by Cα and Sg filtering: {total_delta}")

This database therefore represents Disulfides that, at 95% confidence, have geometrically reasonable Cα-Cα and Sγ-Sγ distances. Let's have a look at the distance distributions for the unfiltered database. The utility functions enable us to extract Disulfides with the desired distance constraints: