In [None]:
# proteusPy Disulfide Database Analysis
# Author: Eric G. Suchanek, PhD.
# Last revision: 2025-03-06 18:28:53
#
# Cα N, Cα, Sγ, Cβ, C',  Å ° ρ
#
# This notebook takes about 5 minutes 20 seconds to run on an
# M3 Max MacBook Pro (2024) with 38GB of RAM

# RCSB Disulfide Bond Structural Geometry Analysis
Eric G. Suchanek, PhD, Flux-Frontiers, Cincinnati OH

## Summary
In this notebook I present an analysis of the geometry of Disulfide Bonds extracted from the RCSB, using the ``proteusPy`` package. This analysis was performed against the entire unfiltered database in order to clean it for more rigorous statistical analysis. The fundamental issues being addressed revolve around the quality of the Disulfide Bond structures themselves. Since the structures studied are the result of X-ray crystallography, the atomic coordinates are subject to positional uncertainties. These uncertainties can lead to unusual and/or non-physical structures that shouldn't be included in subsequent structural analyses.


In [None]:
from pathlib import Path
import os
import logging

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "png"  # or 'svg'

import proteusPy as pp
from proteusPy import CA_CUTOFF, SG_CUTOFF
from proteusPy import DisulfideStats, DisulfideVisualization


# Filesystem locations: the PDB directory comes from the PDB environment
# variable when it is set, otherwise it falls back to ~/pdb.
HOME = Path.home()
PDB = Path(os.environ.get("PDB", HOME / "pdb"))

# Logging setup: master logger configured with "database_analysis.log",
# and the "proteusPy" module logger limited to ERROR.
pp.configure_master_logger("database_analysis.log")
pp.set_logger_level_for_module("proteusPy", logging.ERROR)

# Apply the plotly theme used by the plots in this notebook.
pp.set_plotly_theme(theme="auto")

# Distance cutoffs. These assignments replace the CA_CUTOFF / SG_CUTOFF names
# imported from proteusPy above; -1.0 is the "no filtering" value, so the
# first plots below cover the whole database.
CA_CUTOFF = -1.0
SG_CUTOFF = -1.0
CA_MIN_CUTOFF = 1.0
SG_MIN_CUTOFF = 1.0

In [None]:
def get_unique_pdb_ids(data_frame: pd.DataFrame) -> list:
    """
    Collect the distinct ``PDB_ID`` entries of a disulfide DataFrame.

    :param data_frame: DataFrame of disulfide records; it must contain a
        ``PDB_ID`` column.
    :type data_frame: pd.DataFrame
    :return: The distinct ``PDB_ID`` values as a plain Python list.
    :rtype: list
    """
    pdb_id_column = data_frame["PDB_ID"]
    return pdb_id_column.unique().tolist()

Global cutoff values for the Cα-Cα and Sγ-Sγ distances, in Å. We initially set both to -1, indicating no filtering. This will extract the entire database.

In [None]:
# instantiate the complete database from its source with no cutoffs
# Load the complete, unfiltered database. Both cutoffs are written as the
# literal -1.0 (no filtering) rather than via CA_CUTOFF / SG_CUTOFF, so this
# cell still yields the full database if re-run after those names are rebound.
pdb_full = pp.DisulfideLoader(subset=False, verbose=True, cutoff=-1.0, sg_cutoff=-1.0)
# pdb_full.save()

In [None]:
# pdb_full = pp.Load_PDB_SS(verbose=True)

As shown above, the default database has no Cα or Sγ cutoffs. This enables one to filter by Cα-Cα and Sγ-Sγ distances in order to prune unreasonable Disulfide bonds from the database. Let's look at the bond length and bond angle deviations. This function compares the bond length and bond angle geometry of the input Disulfides against idealized bond lengths and bond angles, and plots the result. The function is available to any ``DisulfideList``. By using ``pdb_full.SSList`` we use the entire database.


In [None]:
# Bond length / bond angle deviation histograms over pdb_full.SSList, i.e. the
# entire unfiltered database.
pdb_full.SSList.plot_deviation_histograms()

In [None]:
# Same deviation data as scatterplots, again over the entire unfiltered
# database.
pdb_full.SSList.plot_deviation_scatterplots()

In [None]:
# Show the 10 worst structures (top_n=10) from the unfiltered database.
# NOTE(review): sample_percent=20 presumably limits the evaluation to a 20%
# sample of the list -- confirm against the proteusPy documentation.
pdb_full.SSList.display_worst_structures(top_n=10, sample_percent=20)

Let's start by looking at the Cα-Cα and Sγ-Sγ distance distributions for the full dataset:

In [None]:
# Cα-Cα distance distribution for the full database. On a top-to-bottom run
# CA_CUTOFF is still -1.0 here; the name is rebound to a real cutoff later in
# the notebook, so re-running this cell afterwards plots a different selection.
pdb_full.SSList.plot_distances(
    distance_type="ca", comparison="greater", cutoff=CA_CUTOFF
)

In [None]:
# Sγ-Sγ distance distribution for the full database. On a top-to-bottom run
# SG_CUTOFF is still -1.0 here; the name is rebound to a real cutoff later in
# the notebook, so re-running this cell afterwards plots a different selection.
pdb_full.SSList.plot_distances(
    distance_type="sg", comparison="greater", cutoff=SG_CUTOFF
)

The Cα distances also show anomalous near (close contact) and far (> 8 Å) distances, which would be structurally impossible.

## Structural Statistical Cutoff Considerations

We can calculate overall distribution statistics for the Cα-Cα and Sγ-Sγ distances, as well as for the bond lengths and bond angles, in an effort to rationally filter the Disulfide database. The goal is to produce a high-quality database with geometrically reasonable, realistic structures. Since model quality varies greatly across the entire database, as shown by the distance distributions above, this seems like a prudent step.

First we create the bond length and bond angle deviation dataframe:

In [None]:
# Compute cutoffs at the 95th percentile over the full, unfiltered disulfide
# list and keep the result in `cutoffs`.
# NOTE(review): `cutoffs` is not referenced again in the visible cells; the
# CA_CUTOFF / SG_CUTOFF values used later are typed in by hand.
cutoffs = DisulfideStats.calculate_cutoff_from_percentile(
    pdb_full.SSList, percentile=95, verbose=True
)

In [None]:
# Plot the cutoffs across the 80th-99th percentile range in 20 steps and save
# the figure as percentile_cutoffs.png.
#
# The output directory is configurable instead of being a machine-specific
# absolute path: set the PROTEUSPY_FIGURES_DIR environment variable to choose
# it, otherwise a "figures" directory under the current working directory is
# used. The directory is created if it does not exist yet.
FIGURES_DIR = Path(os.getenv("PROTEUSPY_FIGURES_DIR", "figures"))
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

fig, ax = DisulfideVisualization.plot_percentile_cutoffs(
    pdb_full,
    percentile_range=(80, 99),
    num_steps=20,
    # Passed as str, matching the type the original call supplied.
    save_path=str(FIGURES_DIR / "percentile_cutoffs.png"),
)

In [None]:
# Rebind the distance cutoffs (Å) used by the filtered loaders below:
# Cα-Cα first, then Sγ-Sγ.
# NOTE(review): presumably read off the 95th-percentile calculation above --
# confirm; the values are hard-coded here.
CA_CUTOFF, SG_CUTOFF = 6.71, 2.12


We can create our own custom ``DisulfideLoader`` object on the fly by instantiating it with our desired Cα and Sγ cutoffs. We first make one with a Cα-Cα cutoff and no Sγ-Sγ cutoff (setting it to -1.0 means no filtering):

In [None]:
# Loader that applies only the Cα-Cα cutoff; sg_cutoff=-1.0 leaves the Sγ-Sγ
# distances unfiltered.
pdb_ca_filtered = pp.DisulfideLoader(
    cutoff=CA_CUTOFF,
    sg_cutoff=-1.0,
    subset=False,
    verbose=True,
)

# Number of disulfides removed by the Cα filter, relative to the full database.
ca_delta = pdb_full.TotalDisulfides - pdb_ca_filtered.TotalDisulfides
print(f"\nTotal Disulfides Removed by Cα filtering: {ca_delta}")

In [None]:
# Cα-Cα distance distribution of the Cα-filtered set, selected with
# comparison="less" against CA_CUTOFF.
# NOTE(review): log=False presumably requests a linear axis -- confirm.
pdb_ca_filtered.SSList.plot_distances(
    distance_type="ca", comparison="less", cutoff=CA_CUTOFF, log=False
)

Now let's try filtering against the Sγ cutoff:

In [None]:
# Loader that applies only the Sγ-Sγ cutoff; cutoff=-1.0 leaves the Cα-Cα
# distances unfiltered.
pdb_sg_filtered = pp.DisulfideLoader(
    cutoff=-1.0,
    sg_cutoff=SG_CUTOFF,
    subset=False,
    verbose=True,
)

# Number of disulfides removed by the Sγ filter, relative to the full database.
sg_delta = pdb_full.TotalDisulfides - pdb_sg_filtered.TotalDisulfides
print(f"\nTotal Disulfides Removed by Sγ filtering: {sg_delta}")

In [None]:
# Sγ-Sγ distance distribution of the Sγ-filtered set, selected with
# comparison="less" against SG_CUTOFF.
# NOTE(review): log=False presumably requests a linear axis -- confirm.
pdb_sg_filtered.SSList.plot_distances(
    distance_type="sg", comparison="less", cutoff=SG_CUTOFF, log=False
)

Now filter by Sγ cutoff alone:

In [None]:
# The Sγ-only loader was already built earlier in this notebook with exactly
# these arguments (subset=False, verbose=True, cutoff=-1.0,
# sg_cutoff=SG_CUTOFF), and SG_CUTOFF has not been rebound since. Reuse it
# rather than reading the whole database a second time; its disulfide count
# is displayed so the cell still reports something.
pdb_sg_filtered.TotalDisulfides

In [None]:
# Number of disulfides removed by the Sγ filter, relative to the full database.
# NOTE(review): the same difference is already computed and printed in the
# earlier Sγ-filtering cell.
sg_delta = pdb_full.TotalDisulfides - pdb_sg_filtered.TotalDisulfides
print(f"Total Disulfides Removed by Sγ filtering: {sg_delta}")

Finally, fully filtered:

In [None]:
# Loader with both the Cα-Cα and the Sγ-Sγ cutoffs applied.
pdb_fully_filtered = pp.DisulfideLoader(
    cutoff=CA_CUTOFF,
    sg_cutoff=SG_CUTOFF,
    subset=False,
    verbose=True,
    quiet=False,
)

# Number of disulfides removed by the combined filter, relative to the full
# database.
total_delta = pdb_full.TotalDisulfides - pdb_fully_filtered.TotalDisulfides
print(f"\nTotal Disulfides Removed by Cα and Sγ filtering: {total_delta}")

This database therefore contains Disulfides whose Cα-Cα and Sγ-Sγ distances are geometrically reasonable at the 95% level. Let's have a look at the distance distributions for the filtered database. The utility functions enable us to extract Disulfides with the desired distance constraints:

In [None]:
# Cα-Cα distance distribution of the fully filtered set, selected with
# comparison="less" against CA_CUTOFF.
# NOTE(review): unlike the other filtered-distance plots, this call does not
# pass log=False -- confirm that the difference is intended.
pdb_fully_filtered.SSList.plot_distances(
    distance_type="ca", comparison="less", cutoff=CA_CUTOFF
)

In [None]:
# Sγ-Sγ distance distribution of the fully filtered set, selected with
# comparison="less" against SG_CUTOFF.
# NOTE(review): log=False presumably requests a linear axis -- confirm.
pdb_fully_filtered.SSList.plot_distances(
    distance_type="sg", comparison="less", cutoff=SG_CUTOFF, log=False
)

In [None]:
# Deviation scatterplots for the fully filtered set, for comparison with the
# same plots drawn from the unfiltered database near the top of the notebook.
pdb_fully_filtered.SSList.plot_deviation_scatterplots(verbose=True, theme="auto")

In [None]:
# end of notebook