In [None]:
# ZZ_scratch.ipynb - This is a general scratchpad used for code development and testing. Don't expect it to make a lot of sense.
# Eric G. Suchanek, PhD. 2024

In [None]:
from pathlib import Path
import os
import pandas as pd
from proteusPy import Load_PDB_SS, Disulfide, DisulfideList, DisulfideLoader

# Filesystem layout used by the rest of the notebook.
HOME = Path.home()
# $PDB overrides the default ~/pdb location.
PDB = Path(os.environ.get("PDB", HOME / "pdb"))

DATA_DIR = PDB.joinpath("data")
SAVE_DIR = HOME.joinpath("Documents", "proteusPyDocs", "classes")
REPO_DIR = HOME.joinpath("repos", "proteusPy", "data")

# One output folder per class scheme; each is created up front because the
# plotting cells pass these paths as savedir=.
OCTANT = SAVE_DIR.joinpath("octant")
OCTANT.mkdir(parents=True, exist_ok=True)

BINARY = SAVE_DIR.joinpath("binary")
BINARY.mkdir(parents=True, exist_ok=True)

SEXTANT = SAVE_DIR.joinpath("sextant")
SEXTANT.mkdir(parents=True, exist_ok=True)

# Not referenced by any cell shown here; presumably a progress-bar width — TODO confirm.
PBAR_COLS = 78

In [7]:
# Load the disulfide database with subset=False and print its summary.
# The recorded run of this cell reported ~30 GB of RAM in use, so this is a
# memory-heavy step. Every later cell that touches PDB_SS depends on it.
PDB_SS = Load_PDB_SS(subset=False, verbose=True)
PDB_SS.describe()

-> load_PDB_SS(): Reading /opt/homebrew/Caskroom/mambaforge/base/envs/ppydev/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
-> load_PDB_SS(): Done reading /opt/homebrew/Caskroom/mambaforge/base/envs/ppydev/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
PDB IDs present:                    35818
Disulfides loaded:                  120494
Average structure resolution:       2.34 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           1toz_456A_467A
Cα distance cutoff:                 8.00 Å
Total RAM Used:                     30.72 GB.


In [5]:
# Draw the "Binary" count-vs-class plot (base=2) and save it under BINARY.
# NOTE(review): an empty DataFrame is passed as the first argument — confirm
# plot_count_vs_class_df() is meant to be driven this way.
# NOTE(review): the recorded execution count of this cell (5) is lower than that
# of the cell that loads PDB_SS (7), i.e. the notebook was last run out of order.
df = pd.DataFrame()
fig = PDB_SS.plot_count_vs_class_df(
    df,
    title="Binary",
    save=True,
    savedir=BINARY,
    verbose=True,
    base=2,
)
fig.show()


proteusPy: INFO 2024-08-31 14:58:56,893 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving Binary plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/binary/Binary_Binary.png


In [6]:
# Binary -> eight-class incidence plot, light theme, saved under OCTANT.
# NOTE(review): the recorded output of this cell is an AttributeError (no
# 'eightclass_df' on the DisulfideClass_Constructor object) — presumably the
# loaded data predates the eight-class table; confirm before relying on it.
PDB_SS.plot_binary_to_eightclass_incidence(
    theme="light", save=True, verbose=True, savedir=OCTANT
)

AttributeError: 'DisulfideClass_Constructor' object has no attribute 'eightclass_df'

In [None]:
# Binary -> six-class incidence plot, light theme, saved under SEXTANT.
PDB_SS.plot_binary_to_sixclass_incidence(
    theme="light", save=True, verbose=True, savedir=SEXTANT
)

In [None]:
# Look up class ID "55555" with base=6 and display the result.
clslist6 = PDB_SS.tclass.sslist_from_classid("55555", base=6)
clslist6

In [None]:
# Look up class ID "77778" with base=8 and display the result.
clslist8 = PDB_SS.tclass.sslist_from_classid("77778", base=8)
clslist8

In [None]:
# Bind the three class tables held by the loader's tclass object.
# classdf is presumably the binary (base-2) table, going by the df2 name — TODO confirm.
df2 = PDB_SS.tclass.classdf
df6 = PDB_SS.tclass.sixclass_df
df8 = PDB_SS.tclass.eightclass_df

In [None]:
# First 32 rows of df2 (32 == 2**5; presumably one row per binary class — TODO confirm).
df2.head(32)

In [None]:
# Display the eight-class table bound in the earlier cell.
df8

In [None]:
# Display the six-class table bound in the earlier cell.
df6

In [None]:
def enumerate_class_fromlist(loader: DisulfideLoader, sslist, base=None):
    """
    Count the disulfides found for each class ID in ``sslist``.

    :param loader: Loader whose ``tclass.sslist_from_classid`` performs the lookup.
    :param sslist: Iterable of class ID strings; ``None`` entries are skipped.
    :param base: Optional value forwarded as ``base=`` to ``sslist_from_classid``.
        ``None`` (the default) omits the argument, so the lookup uses its own
        default exactly as before.
    :return: DataFrame with columns ``class_id`` and ``count``. Classes whose
        lookup returns ``None`` are left out.
    :rtype: pd.DataFrame
    """
    lookup_kwargs = {} if base is None else {"base": base}

    rows = []
    for cls in sslist:
        if cls is None:
            continue
        members = loader.tclass.sslist_from_classid(cls, **lookup_kwargs)
        # A class can have no disulfides; the lookup then yields None and the
        # class is skipped so class_id and count stay paired.
        if members is not None:
            rows.append((cls, len(members)))

    # Build the frame in one step; pd is the notebook-level pandas import.
    return pd.DataFrame(rows, columns=["class_id", "count"])

In [None]:
# Expand binary class "00000" into its six-class IDs and count the disulfides in each.
# NOTE(review): enumerate_class_fromlist() performs its lookups without a base=
# argument, whereas the direct six-class lookup in this notebook passes base=6 —
# confirm these counts really come from the six-class table.
cls = "00000"
sixcls = PDB_SS.tclass.binary_to_six_class(cls)
df = enumerate_class_fromlist(PDB_SS, sixcls)

# Most populated classes first.
sorted_df = df.sort_values(by="count", ascending=False)

# Last expression of the cell, so Jupyter renders the frame as a table
# instead of the plain-text dump print() produced.
sorted_df

In [None]:
# Expand binary class "00000" into its eight-class IDs and count the disulfides in each.
# `cls` and `df` are rebound here and are read again by a later plotting cell.
cls = "00000"
eightcls = PDB_SS.tclass.binary_to_eight_class(cls)

df = enumerate_class_fromlist(PDB_SS, eightcls)

# Most populated classes first.
sorted_df = df.sort_values(by="count", ascending=False)

# Last expression of the cell, so Jupyter renders the frame as a table
# instead of the plain-text dump print() produced.
sorted_df

In [None]:
# Binary -> eight-class incidence plot, light theme, nothing saved.
PDB_SS.plot_binary_to_eightclass_incidence(theme="light")

In [None]:
# Relies on `df` and `cls` left in the kernel by an earlier cell (hidden state).
# cls is the second positional argument — presumably the plot title, which the
# earlier "Binary" call passes as title=; TODO confirm.
PDB_SS.plot_count_vs_class_df(df, cls, theme="light")

In [None]:
# Rebinds `df` (previously the per-class count table) to the six-class table and displays it.
df = PDB_SS.tclass.sixclass_df
df

In [None]:
# Call the loader method, as the other incidence-plot cells do; no free function
# of this name is defined or imported in this notebook, so the bare call raises
# NameError on a fresh kernel.
PDB_SS.plot_binary_to_sixclass_incidence(theme="light")

In [None]:
# Call the loader method, as the other incidence-plot cells do; no free function
# of this name is defined or imported in this notebook, so the bare call raises
# NameError on a fresh kernel.
PDB_SS.plot_binary_to_eightclass_incidence(theme="light")

In [None]:
# NOTE(review): every other incidence-plot call in this notebook selects the
# theme with theme="light"; this one passes light=False — confirm the method
# still accepts a `light` keyword.
PDB_SS.plot_binary_to_sixclass_incidence(light=False)

In [None]:
# Display the eight-class table.
# head(-10) returns every row except the last 10 — NOTE(review): confirm that
# was intended rather than head(10) or tail(10).
eight = PDB_SS.tclass.eightclass_df
eight.head(-10)

In [None]:
# Extract class "87784" from the loader and display the result.
sslist2 = PDB_SS.extract_class("87784")
sslist2

In [None]:
# Run the loader's classes-vs-cutoff plot. The meaning of the (0.1, 20)
# arguments is not visible in this notebook — TODO confirm against the method's docs.
PDB_SS.plot_classes_vs_cutoff(0.1, 20)

In [None]:
# Bind the loader's SSDict and display it. build_ss_from_idlist() in this
# notebook treats it as a mapping from PDB ID to indices into SSList.
# The loader reported 35818 PDB IDs, so expect a very large display.
ssdict = PDB_SS.SSDict
ssdict

In [None]:
def build_ss_from_idlist(loader, idlist):
    """
    Collect every disulfide belonging to the requested PDB entries.

    The loader's ``SSDict`` is walked in its own order, so the result follows
    that order rather than the order of ``idlist``.

    :param loader: Object exposing ``SSDict`` (PDB ID -> indices into ``SSList``)
        and ``SSList``.
    :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']
    :return: DisulfideList named "RCSB_list" holding the matching disulfides
    """
    matches = DisulfideList([], "RCSB_list")
    for pdb_id, ss_indices in loader.SSDict.items():
        if pdb_id not in idlist:
            continue
        # Each stored index points at one disulfide in the loader's flat SSList.
        for idx in ss_indices:
            matches.append(loader.SSList[idx])
    return matches

In [None]:
# Disulfides for two PDB entries. `sslist` is rebound to the full SSList by later cells.
sslist = build_ss_from_idlist(PDB_SS, ["4yys", "2q7q"])
sslist

In [None]:
# Slice the loader directly and show the pdb_id of the result.
# Bound to ss_slice rather than `slice` so the built-in slice type is not
# shadowed for the rest of the kernel session.
ss_slice = PDB_SS[10:20]
ss_slice.pdb_id

In [None]:
# Same idea on the loader's SSList: rebind sslist to the full list, slice it, show pdb_id.
sslist = PDB_SS.SSList
slice2 = sslist[100:200]
slice2.pdb_id

In [None]:
# Build a Disulfide from five torsion values (presumably degrees — TODO confirm)
# and pretty-print it.
tors = [-60, -60, 90, -60, -60]
ss1 = Disulfide(torsions=tors)
ss1.pprint_all()

In [None]:
# Fetch 2q7q_75D_140D — the entry describe() reported as the lowest-energy
# disulfide — and read its bond-length and bond-angle ideality.
# Only the last expression's value is displayed by Jupyter; QUIET = False
# presumably makes the properties emit their own diagnostics — TODO confirm.
ss1 = PDB_SS["2q7q_75D_140D"]
ss1.QUIET = False
ss1.bond_length_ideality
ss1.bond_angle_ideality

In [None]:
# Rebuild a model disulfide from ss1's dihedrals and check its bond-length ideality.
dihed = ss1.dihedrals
modelled_min = Disulfide("model", quiet=False)
modelled_min.dihedrals = dihed
modelled_min.build_yourself()
modelled_min.bond_length_ideality

In [None]:
# investigate the distal N->Ca distance. 8/15/24 -egs-

# Bond-angle ideality of the model rebuilt from ss1's dihedrals.
modelled_min.bond_angle_ideality

In [None]:
# Rebind sslist to the loader's full SSList (feeds the deviation table further down)
# and show how many disulfides it holds.
sslist = PDB_SS.SSList
len(sslist)

In [None]:
import pandas as pd
from tqdm import tqdm


def create_disulfide_dataframe(disulfide_list):
    """
    Tabulate per-disulfide geometry metrics.

    Each disulfide contributes one row with the columns ``PDB_ID``, ``SS_Name``,
    ``Angle_Deviation``, ``Distance_Deviation`` and ``Ca_Distance``, read from its
    ``pdb_id``, ``name``, ``bond_angle_ideality``, ``bond_length_ideality`` and
    ``ca_distance`` attributes. A tqdm progress bar tracks the loop.

    :param disulfide_list: Iterable of disulfide objects.
    :type disulfide_list: proteusPy.DisulfideList
    :return: DataFrame with one row per disulfide.
    :rtype: pd.DataFrame
    """
    column_names = (
        "PDB_ID",
        "SS_Name",
        "Angle_Deviation",
        "Distance_Deviation",
        "Ca_Distance",
    )

    records = []
    for ss in tqdm(disulfide_list, desc="Processing Disulfides"):
        # Attributes are read in the same order as before, in case any of the
        # properties report diagnostics when evaluated.
        pdb_id = ss.pdb_id
        ca_distance = ss.ca_distance
        angle_deviation = ss.bond_angle_ideality
        distance_deviation = ss.bond_length_ideality
        records.append(
            (pdb_id, ss.name, angle_deviation, distance_deviation, ca_distance)
        )

    # Pivot the per-disulfide records into one list per column.
    data = {
        name: [record[position] for record in records]
        for position, name in enumerate(column_names)
    }
    return pd.DataFrame(data)


# Example usage
# Assuming you have a list of disulfide objects called disulfide_list
# df = create_disulfide_dataframe(disulfide_list)
# print(df)

In [None]:
# Build the deviation table for whatever `sslist` currently holds (last bound to
# PDB_SS.SSList). All cutoff and plotting cells that follow read dev_df.
dev_df = create_disulfide_dataframe(sslist)

In [None]:
# Summary statistics for the numeric columns of dev_df.
dev_df.describe()

In [None]:
# Peek at the first rows of dev_df.
dev_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_histograms(df):
    """
    Draw side-by-side histograms of the three geometry metrics.

    One 21x6 figure with three panels, each a 30-bin histogram with a KDE
    overlay: ``Distance_Deviation``, ``Angle_Deviation`` and ``Ca_Distance``.
    The figure is shown immediately; nothing is returned.

    :param df: DataFrame containing the disulfide information.
    :type df: pd.DataFrame
    """
    # (column to plot, human-readable label) in left-to-right panel order.
    panels = (
        ("Distance_Deviation", "Distance Deviation"),
        ("Angle_Deviation", "Angle Deviation"),
        ("Ca_Distance", "Ca Distance"),
    )

    plt.figure(figsize=(21, 6))

    for position, (column, label) in enumerate(panels, start=1):
        # plt.subplot makes this panel the current axes for the calls that follow.
        plt.subplot(1, 3, position)
        sns.histplot(df[column], kde=True, bins=30)
        plt.title(f"{label} Distribution")
        plt.xlabel(label)
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

In [None]:
# plot_histograms(dev_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def highlight_worst_structures(df, top_n=10):
    """
    Scatter all disulfides by deviation and flag the worst offenders.

    Left panel: ``Distance_Deviation`` against ``Angle_Deviation`` for every row,
    with the ``top_n`` largest rows on either metric overdrawn as red crosses and
    annotated with their ``SS_Name``. Right panel: how many of those flagged rows
    each ``PDB_ID`` contributes. The figure is shown immediately.

    :param df: DataFrame containing the disulfide information.
    :type df: pd.DataFrame
    :param top_n: Number of worst rows taken per metric; a row that ranks in
        both sets is flagged once.
    :type top_n: int
    """
    # Union of the worst rows by distance deviation and by angle deviation.
    flagged = pd.concat(
        [
            df.nlargest(top_n, "Distance_Deviation"),
            df.nlargest(top_n, "Angle_Deviation"),
        ]
    ).drop_duplicates()

    # Number of flagged rows per PDB entry.
    flagged_per_pdb = flagged.groupby("PDB_ID").size().reset_index(name="Count")

    _, (scatter_ax, bar_ax) = plt.subplots(1, 2, figsize=(20, 6))

    xy_columns = {"x": "Distance_Deviation", "y": "Angle_Deviation"}

    # Background cloud: every structure.
    sns.scatterplot(data=df, label="All Structures", ax=scatter_ax, **xy_columns)

    # Foreground: the flagged rows.
    sns.scatterplot(
        data=flagged,
        color="red",
        label="Worst Structures",
        marker="X",
        s=100,
        ax=scatter_ax,
        **xy_columns,
    )

    # Label each flagged point with its disulfide name.
    flagged_points = zip(
        flagged["SS_Name"],
        flagged["Distance_Deviation"],
        flagged["Angle_Deviation"],
    )
    for ss_name, distance_dev, angle_dev in flagged_points:
        scatter_ax.annotate(
            ss_name,
            (distance_dev, angle_dev),
            textcoords="offset points",
            xytext=(5, 5),
            ha="right",
        )

    scatter_ax.set_title("Distance Deviation vs. Angle Deviation")
    scatter_ax.set_xlabel("Distance Deviation")
    scatter_ax.set_ylabel("Angle Deviation")
    scatter_ax.legend()

    # Flagged rows aggregated by PDB entry.
    sns.barplot(x="PDB_ID", y="Count", data=flagged_per_pdb, ax=bar_ax)
    bar_ax.set_title("Worst Structures Aggregated by PDB_ID")
    bar_ax.set_xlabel("PDB_ID")
    bar_ax.set_ylabel("Count")
    bar_ax.tick_params(axis="x", rotation=90)

    plt.tight_layout()
    plt.show()


# Example usage
# Assuming you have a DataFrame named dev_df
# highlight_worst_structures(dev_df, top_n=10)

In [None]:
# Flag the 10 worst rows per metric in dev_df.
highlight_worst_structures(dev_df, top_n=10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def highlight_worst_structures2(df, top_n=10):
    """
    Plot the deviation distributions and show which PDB entries hold the worst disulfides.

    Three panels are drawn: a histogram of ``Distance_Deviation``, a histogram of
    ``Angle_Deviation``, and a grouped bar chart counting, per ``PDB_ID``, how
    many of the ``top_n`` worst rows for each metric come from that entry.

    The two per-metric counts are merged into one long-form table and drawn with
    a single ``hue``-grouped bar plot. Two separate bar plots on the same axes
    hide each other where they overlap and, depending on how the installed
    seaborn lays out categories, can leave bars under the wrong PDB_ID label
    when the two metrics flag different entries.

    :param df: DataFrame with ``PDB_ID``, ``Distance_Deviation`` and
        ``Angle_Deviation`` columns.
    :type df: pd.DataFrame
    :param top_n: Number of worst rows taken per metric.
    :type top_n: int
    """
    # (column, human-readable label, bar colour) for each metric.
    metrics = (
        ("Distance_Deviation", "Distance Deviation", "blue"),
        ("Angle_Deviation", "Angle Deviation", "green"),
    )

    _, axes = plt.subplots(1, 3, figsize=(24, 6))

    per_metric_counts = []
    for ax, (column, label, _colour) in zip(axes, metrics):
        # Distribution of this metric over all rows.
        sns.histplot(df[column], kde=True, bins=30, ax=ax)
        ax.set_title(f"{label} Distribution")
        ax.set_xlabel(label)
        ax.set_ylabel("Frequency")

        # How many of this metric's top_n worst rows each PDB entry contributes.
        counts = (
            df.nlargest(top_n, column)
            .groupby("PDB_ID")
            .size()
            .reset_index(name="Count")
            .assign(Deviation=label)
        )
        per_metric_counts.append(counts)

    # One long-form table so both metrics share a single categorical axis.
    worst_counts = pd.concat(per_metric_counts, ignore_index=True)

    bar_ax = axes[2]
    sns.barplot(
        x="PDB_ID",
        y="Count",
        hue="Deviation",
        data=worst_counts,
        palette={label: colour for _column, label, colour in metrics},
        ax=bar_ax,
    )
    bar_ax.set_title("Worst Structures Aggregated by PDB_ID")
    bar_ax.set_xlabel("PDB_ID")
    bar_ax.set_ylabel("Count")
    bar_ax.tick_params(axis="x", rotation=90)
    # seaborn adds the hue legend itself, so no explicit legend() call is needed.

    plt.tight_layout()
    plt.show()


# Example usage
# Assuming you have a DataFrame named dev_df
# highlight_worst_structures2(dev_df, top_n=10)

In [None]:
# Histograms plus per-PDB counts of the 10 worst rows per metric in dev_df.
highlight_worst_structures2(dev_df, top_n=10)

In [None]:
import pandas as pd
import numpy as np


def calculate_std_cutoff(df, column, num_std=2):
    """
    Upper cutoff placed ``num_std`` standard deviations above the column mean.

    Both statistics come from pandas' ``Series.mean`` and ``Series.std`` with
    their default arguments.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param column: Name of the column to evaluate.
    :type column: str
    :param num_std: How many standard deviations above the mean to place the cutoff.
    :type num_std: int
    :return: ``mean + num_std * std`` of the column.
    :rtype: float
    """
    values = df[column]
    return values.mean() + num_std * values.std()


def calculate_percentile_cutoff(df, column, percentile=95):
    """
    Cutoff at the given percentile of a column, ignoring missing values.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param column: Name of the column to evaluate.
    :type column: str
    :param percentile: Percentile (0-100) at which to place the cutoff.
    :type percentile: int
    :return: Value of the column at that percentile.
    :rtype: float
    """
    # NaNs are removed first so they cannot influence the percentile.
    observed = df[column].dropna()
    return np.percentile(observed, percentile)


# Example usage
# distance_cutoff = calculate_percentile_cutoff(dev_df, 'Distance_Deviation', percentile=95)
# angle_cutoff = calculate_percentile_cutoff(dev_df, 'Angle_Deviation', percentile=95)

In [None]:
import pandas as pd
import numpy as np

# Assuming you have a DataFrame named dev_df with columns 'Distance_Deviation' and 'Angle_Deviation'

# Standard Deviation Method: mean + 3 standard deviations for each metric.
distance_cutoff_std, angle_cutoff_std, ca_cutoff_std = (
    calculate_std_cutoff(dev_df, metric, num_std=3)
    for metric in ("Distance_Deviation", "Angle_Deviation", "Ca_Distance")
)

# Percentile Method: 98th percentile for each metric.
distance_cutoff_percentile, angle_cutoff_percentile, ca_cutoff_percentile = (
    calculate_percentile_cutoff(dev_df, metric, percentile=98)
    for metric in ("Distance_Deviation", "Angle_Deviation", "Ca_Distance")
)

# Report both sets; the trailing "\n" leaves a blank line between them.
print(
    f"Distance Deviation Cutoff (3 Std Dev): {distance_cutoff_std}",
    f"Angle Deviation Cutoff (3 Std Dev): {angle_cutoff_std}",
    f"Ca Distance Cutoff (3 Std Dev): {ca_cutoff_std}\n",
    sep="\n",
)

print(
    f"Distance Deviation Cutoff (98th Percentile): {distance_cutoff_percentile}",
    f"Angle Deviation Cutoff (98th Percentile): {angle_cutoff_percentile}",
    f"Ca Distance Cutoff (98th Percentile): {ca_cutoff_percentile}",
    sep="\n",
)

In [None]:
import pandas as pd


def filter_by_cutoffs(df, distance_cutoff, angle_cutoff):
    """
    Keep only the rows that sit at or below both deviation cutoffs.

    Both comparisons are inclusive. Rows whose deviation is NaN fail the
    comparison and are therefore dropped. The input frame is not modified.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param distance_cutoff: Largest accepted ``Distance_Deviation``.
    :type distance_cutoff: float
    :param angle_cutoff: Largest accepted ``Angle_Deviation``.
    :type angle_cutoff: float
    :return: The rows of ``df`` satisfying both limits.
    :rtype: pd.DataFrame
    """
    within_distance = df["Distance_Deviation"].le(distance_cutoff)
    within_angle = df["Angle_Deviation"].le(angle_cutoff)
    return df.loc[within_distance & within_angle]


# Example usage
# Assuming you have a DataFrame named dev_df
# distance_cutoff = 5.0
# angle_cutoff = 10.0
# filtered_df = filter_by_cutoffs(dev_df, distance_cutoff, angle_cutoff)
# print(filtered_df)

In [None]:
# Row count of dev_df before filtering.
dev_df.shape[0]

In [None]:
# Keep rows with Distance_Deviation <= 1.0 and Angle_Deviation <= 10.0, then count them.
# NOTE(review): these thresholds are hard-coded rather than taken from the
# computed std / percentile cutoffs.
filt = filter_by_cutoffs(dev_df, 1.0, 10.0)
filt.shape[0]

In [None]:
# Row count again: filtering produced a separate frame (filt), so compare with the count before.
dev_df.shape[0]

In [None]:
# Number of missing (NaN) Angle_Deviation values in dev_df.
print(dev_df["Angle_Deviation"].isna().sum())

In [None]:
import pandas as pd
import numpy as np


def calculate_std_cutoff(df, column, num_std=2):
    """
    Return ``mean + num_std * std`` for ``df[column]``.

    NOTE(review): a function with this name and the same computation is already
    defined in an earlier cell of this notebook; whichever cell ran last wins.

    :param df: DataFrame containing the deviations.
    :param column: Name of the column to evaluate.
    :param num_std: How many standard deviations above the mean to place the cutoff.
    :return: The cutoff value.
    """
    series = df[column]
    centre = series.mean()
    spread = series.std()
    return centre + num_std * spread


def calculate_percentile_cutoff(df, column, percentile=95):
    """
    Return the given percentile of ``df[column]``, ignoring NaN values.

    NOTE(review): a function with this name and the same computation is already
    defined in an earlier cell of this notebook; whichever cell ran last wins.

    :param df: DataFrame containing the deviations.
    :param column: Name of the column to evaluate.
    :param percentile: Percentile (0-100) at which to place the cutoff.
    :return: The cutoff value.
    """
    non_missing = df[column].dropna()
    return np.percentile(non_missing, percentile)


# Assuming you have a DataFrame named dev_df with columns 'Distance_Deviation' and 'Angle_Deviation'

# Report how many angle deviations are missing before anything is dropped.
print(f"Missing values in 'Angle_Deviation': {dev_df['Angle_Deviation'].isna().sum()}")

# Rebind dev_df to the rows that have both deviations present.
# NOTE(review): this overwrites the notebook-wide dev_df, so earlier cells
# re-run after this one see the reduced frame.
dev_df = dev_df.dropna(subset=["Angle_Deviation", "Distance_Deviation"])

# Standard Deviation Method: mean + 3 standard deviations.
distance_cutoff_std, angle_cutoff_std = (
    calculate_std_cutoff(dev_df, metric, num_std=3)
    for metric in ("Distance_Deviation", "Angle_Deviation")
)

# Percentile Method: 98th percentile.
distance_cutoff_percentile, angle_cutoff_percentile = (
    calculate_percentile_cutoff(dev_df, metric, percentile=98)
    for metric in ("Distance_Deviation", "Angle_Deviation")
)

print(
    f"Distance Deviation Cutoff (3 Std Dev): {distance_cutoff_std}",
    f"Angle Deviation Cutoff (3 Std Dev): {angle_cutoff_std}",
    f"Distance Deviation Cutoff (98th Percentile): {distance_cutoff_percentile}",
    f"Angle Deviation Cutoff (98th Percentile): {angle_cutoff_percentile}",
    sep="\n",
)