In [None]:
# proteusPy Disulfide Database Analysis
# Author: Eric G. Suchanek, PhD.
# Cα N, Cα, Cβ, C', Sγ Å ° ρ

In [2]:
from pathlib import Path
import matplotlib.pyplot as plt
import pyvista as pv
import logging

import os
import pandas as pd
import numpy as np
import plotly.io as pio
import plotly.express as px

from proteusPy import (
    Load_PDB_SS,
    Disulfide,
    DisulfideList,
    DisulfideLoader,
    create_logger,
    configure_master_logger,
    get_theme,
)

# Filesystem locations: the PDB directory comes from the PDB environment
# variable when set, otherwise it falls back to ~/pdb.
HOME = Path.home()
PDB = Path(os.getenv("PDB", HOME / "pdb"))
PBAR_COLS = 78

# Send log records from this session to a dedicated log file.
configure_master_logger("database_analysis.log")

# Pick the plotly template that matches the detected theme; anything other
# than "light" is treated as dark.
theme = get_theme()
pio.templates.default = "plotly_white" if theme == "light" else "plotly_dark"
print("Light Theme" if theme == "light" else "Dark Theme")


Dark Theme


In [3]:
# Load the complete (subset=False) disulfide database and print its quick
# summary; the summary is what appears as this cell's output.
pdb_full = Load_PDB_SS(subset=False, verbose=True)
pdb_full.describe(quick=True)

-> load_PDB_SS(): Reading /Users/egs/miniforge3/envs/proteusPy/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
-> load_PDB_SS(): Done Reading /Users/egs/miniforge3/envs/proteusPy/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
PDB IDs present:                    36968
Disulfides loaded:                  175277
Average structure resolution:       2.19 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           6vxk_801B_806B
Cα distance cutoff:                 -1.00 Å
Sγ distance cutoff:                 -1.00 Å


In [8]:
# Build a loader with distance cutoffs applied (8.0 for `cutoff`, 3.0 for
# `sg_cutoff`).
# NOTE(review): the variable name suggests Cα-only filtering, but an Sγ cutoff
# is passed here too, and the saved `describe()` output later in the notebook
# reports an Sγ cutoff of -1.00 — that output appears to come from a different
# run of this cell. Re-run from a fresh kernel and confirm the intended cutoffs.
pdb_ca_filtered = DisulfideLoader(
    subset=False, verbose=True, cutoff=8.0, sg_cutoff=3.0
)


proteusPy: INFO 2024-11-20 22:05:31,585 - proteusPy.DisulfideLoader.__init__ - Filtering Ca: old: 175277, new: 174718
proteusPy: INFO 2024-11-20 22:05:31,764 - proteusPy.DisulfideLoader.__init__ - Filtering SG: old: 174718, new: 173598
proteusPy: INFO 2024-11-20 22:05:44,660 - proteusPy.DisulfideClass_Constructor.__init__ - Building SS classes...
proteusPy: INFO 2024-11-20 22:05:44,665 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating binary SS classes...
proteusPy: INFO 2024-11-20 22:05:47,449 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating sixfold SS classes...
proteusPy: INFO 2024-11-20 22:05:48,046 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating eightfold SS classes...
proteusPy: INFO 2024-11-20 22:05:48,711 - proteusPy.DisulfideClass_Constructor.build_yourself - Initialization complete.


In [7]:
# NOTE(review): the saved output of this cell is "IndexError: list index out of
# range", and the same call is repeated in a later cell. Re-run from a fresh
# kernel to confirm whether it still fails before relying on it.
pdb_ca_filtered.describe()

IndexError: list index out of range

In [4]:
import matplotlib.pyplot as plt


def extract_sg_distances(disulfide_list, cutoff=4):
    """
    Collect the ``sg_distance`` values that fall below a cutoff.

    :param disulfide_list: Iterable of objects exposing an ``sg_distance`` attribute.
    :param cutoff: Exclusive upper bound; only values strictly less than it
        are kept. Defaults to 4.
    :return: List of ``sg_distance`` values below ``cutoff``, in input order.
    """
    selected = []
    for disulfide in disulfide_list:
        if disulfide.sg_distance < cutoff:
            selected.append(disulfide.sg_distance)
    return selected


def extract_sg_bad_distances(disulfide_list, cutoff=4):
    """
    Collect the ``sg_distance`` values that are at or above a cutoff.

    :param disulfide_list: Iterable of objects exposing an ``sg_distance`` attribute.
    :param cutoff: Inclusive lower bound; only values greater than or equal
        to it are kept. Defaults to 4.
    :return: List of ``sg_distance`` values at or above ``cutoff``, in input order.
    """
    outliers = []
    for disulfide in disulfide_list:
        if disulfide.sg_distance >= cutoff:
            outliers.append(disulfide.sg_distance)
    return outliers


def extract_ca_distances(disulfide_list, cutoff=4):
    """
    Collect the ``ca_distance`` values that fall below a cutoff.

    :param disulfide_list: Iterable of objects exposing a ``ca_distance`` attribute.
    :param cutoff: Exclusive upper bound; only values strictly less than it
        are kept. Defaults to 4.
    :return: List of ``ca_distance`` values below ``cutoff``, in input order.
    """
    selected = []
    for disulfide in disulfide_list:
        if disulfide.ca_distance < cutoff:
            selected.append(disulfide.ca_distance)
    return selected


def extract_ca_bad_distances(disulfide_list, cutoff=4):
    """
    Collect the ``ca_distance`` values that are at or above a cutoff.

    :param disulfide_list: Iterable of objects exposing a ``ca_distance`` attribute.
    :param cutoff: Inclusive lower bound; only values greater than or equal
        to it are kept. Defaults to 4.
    :return: List of ``ca_distance`` values at or above ``cutoff``, in input order.
    """
    outliers = []
    for disulfide in disulfide_list:
        if disulfide.ca_distance >= cutoff:
            outliers.append(disulfide.ca_distance)
    return outliers


def plot_sg_distances(sg_distances, cutoff=4, flip=False):
    """
    Show a histogram of sg_distance values using plotly express.

    :param sg_distances: List of sg_distance values to plot.
    :param cutoff: Value shown in the x-axis label only; no filtering is
        performed here. Defaults to 4.
    :param flip: When False the x-axis label reads "< cutoff"; when True it
        reads ">= cutoff".
    """
    # The label describes how the caller filtered the data.
    if flip:
        axis_label = f"Sγ Distance >= {cutoff}"
    else:
        axis_label = f"Sγ Distance < {cutoff}"

    # Single-column frame so plotly express can address the data by name.
    column = "SG Distance"
    frame = pd.DataFrame(sg_distances, columns=[column])

    histogram = px.histogram(
        frame, x=column, nbins=50, title="Sγ Distance Distribution"
    )
    histogram.update_layout(
        xaxis_title=axis_label, yaxis_title="Frequency", bargap=0.2
    )
    histogram.show()


def plot_ca_distances(ca_distances, cutoff=4, flip=False):
    """
    Show a histogram of ca_distance values using plotly express.

    :param ca_distances: List of ca_distance values to plot.
    :param cutoff: Value shown in the x-axis label only; no filtering is
        performed here. Defaults to 4.
    :param flip: When False the x-axis label reads "< cutoff"; when True it
        reads ">= cutoff".
    """
    # The label describes how the caller filtered the data.
    if flip:
        axis_label = f"Cα Distance >= {cutoff}"
    else:
        axis_label = f"Cα Distance < {cutoff}"

    # Single-column frame so plotly express can address the data by name.
    column = "Ca Distance"
    frame = pd.DataFrame(ca_distances, columns=[column])

    histogram = px.histogram(
        frame, x=column, nbins=50, title="Cα Distance Distribution"
    )
    histogram.update_layout(
        xaxis_title=axis_label, yaxis_title="Frequency", bargap=0.2
    )
    histogram.show()

In [None]:
# Use a single cutoff for both the filter and the axis label so the
# "Sγ Distance < ..." label describes the data actually plotted. Previously the
# data were filtered at 3 while the label claimed 5.
sg_good_cutoff = 3
sg_distances = extract_sg_distances(pdb_full.SSList, cutoff=sg_good_cutoff)
plot_sg_distances(sg_distances, cutoff=sg_good_cutoff)

In [None]:
# Histogram of the Sγ distances at or above 4; flip=True switches the axis
# label to the ">=" wording.
# NOTE(review): this rebinds `sg_distances`, replacing the below-cutoff list
# built by the earlier cell.
sg_distances = extract_sg_bad_distances(pdb_full.SSList, cutoff=4)
plot_sg_distances(sg_distances, cutoff=4, flip=True)

In [None]:
# Histogram of the Cα distances strictly below 8 in the full database.
ca_distances = extract_ca_distances(pdb_full.SSList, cutoff=8)
plot_ca_distances(ca_distances, cutoff=8, flip=False)

In [None]:
# Histogram of the Cα distances at or above 8; flip=True switches the axis
# label to the ">=" wording.
# NOTE(review): this rebinds `ca_distances`, replacing the below-cutoff list
# built by the earlier cell.
ca_distances = extract_ca_bad_distances(pdb_full.SSList, cutoff=8)
plot_ca_distances(ca_distances, cutoff=8, flip=True)

We can filter the database by instantiating the ``DisulfideLoader`` object directly with appropriate cutoffs for the Cα-Cα and Sγ-Sγ distances. This instantiates the ``DisulfideLoader`` object and rebuilds its internal state by reading the master disulfide list file.

proteusPy: INFO 2024-11-20 21:33:33,441 - proteusPy.DisulfideClass_Constructor.__init__ - Building SS classes...
proteusPy: INFO 2024-11-20 21:33:33,449 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating binary SS classes...
proteusPy: INFO 2024-11-20 21:33:36,189 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating sixfold SS classes...
proteusPy: INFO 2024-11-20 21:33:36,754 - proteusPy.DisulfideClass_Constructor.build_yourself - Creating eightfold SS classes...
proteusPy: INFO 2024-11-20 21:33:37,402 - proteusPy.DisulfideClass_Constructor.build_yourself - Initialization complete.


173676

In [6]:
# Print the summary of the filtered loader (ID and disulfide counts,
# resolution, energy extremes, and the cutoffs in effect).
pdb_ca_filtered.describe()

PDB IDs present:                    36796
Disulfides loaded:                  173676
Average structure resolution:       2.19 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           6vxk_801B_806B
Cα distance cutoff:                 8.00 Å
Sγ distance cutoff:                 -1.00 Å


In [None]:
# NOTE(review): `calculate_total_size` is neither defined nor imported in the
# cells visible above; confirm where it comes from, otherwise this cell raises
# NameError on a fresh kernel.
calculate_total_size(pdb_ca_filtered)

In [None]:
# Build a loader with only the Sγ cutoff set to a real value (2.1); `cutoff`
# is passed as -1.0.
# NOTE(review): -1.0 presumably disables the Cα filter (the unfiltered loader
# reports its cutoffs as -1.00) — confirm against the DisulfideLoader docs.
pdb_sg_filtered = DisulfideLoader(
    subset=False, verbose=True, cutoff=-1.0, sg_cutoff=2.1
)
# Display the number of disulfides remaining as the cell output.
tot_ss_sg = pdb_sg_filtered.TotalDisulfides
tot_ss_sg

In [None]:
# Print the summary of the Sγ-filtered loader.
pdb_sg_filtered.describe()

In [None]:
# Build a loader with both cutoffs set: 8.0 for `cutoff` and 2.1 for
# `sg_cutoff`.
pdb_filtered = DisulfideLoader(subset=False, verbose=True, cutoff=8.0, sg_cutoff=2.1)
# Display the number of disulfides remaining as the cell output.
tot_ss_filt = pdb_filtered.TotalDisulfides
tot_ss_filt

In [None]:
# Print the summary of the loader filtered on both cutoffs.
pdb_filtered.describe()

In [None]:
# Apply a further bond-ideality filter to the already distance-filtered
# disulfide list and display how many entries remain.
# NOTE(review): the meaning and units of the argument 5 are not visible here
# (presumably a deviation tolerance) — confirm against
# `filter_by_bond_ideality` and consider naming the constant.
filtered_ss = pdb_filtered.SSList
filtered2 = filtered_ss.filter_by_bond_ideality(5)
len(filtered2)