In [None]:
# ZZ_scratch.ipynb - This is a general scratchpad used for code development and testing. Don't expect it to make a lot of sense.
# Eric G. Suchanek, PhD. 2024

In [1]:
from pathlib import Path
import os
import pandas as pd
from proteusPy import Load_PDB_SS, Disulfide, DisulfideList, DisulfideLoader

# Filesystem layout for this notebook; everything hangs off the user's home
# directory unless the PDB environment variable points elsewhere.
HOME = Path.home()
# PDB root: value of the PDB environment variable if set, otherwise ~/pdb.
PDB = Path(os.getenv("PDB", HOME / "pdb"))

# DATA_DIR and REPO_DIR are defined here but not referenced by the cells
# visible in this notebook.
DATA_DIR = PDB / "data"
SAVE_DIR = HOME / "Documents" / "proteusPyDocs" / "classes"
REPO_DIR = HOME / "repos" / "proteusPy" / "data"

# One output folder per class scheme, passed as `savedir` to the plot calls
# below. Each is created when this cell runs (missing parents included) and
# left untouched if it already exists.
OCTANT = SAVE_DIR / "octant"
OCTANT.mkdir(parents=True, exist_ok=True)

BINARY = SAVE_DIR / "binary"
BINARY.mkdir(parents=True, exist_ok=True)

SEXTANT = SAVE_DIR / "sextant"
SEXTANT.mkdir(parents=True, exist_ok=True)

# Not referenced in the visible cells; presumably a progress-bar width —
# confirm before removing.
PBAR_COLS = 78

In [2]:
# Load the disulfide database with subset=False and print its summary.
# NOTE: the recorded output of this cell reports about 44 GB of RAM in use
# after the load, so expect this cell to dominate a full re-run.
PDB_SS = Load_PDB_SS(subset=False, verbose=True)
PDB_SS.describe()

-> load_PDB_SS(): Reading /Volumes/NeuroTunes/Users/suchanek/mambaforge/envs/ppydev/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
-> load_PDB_SS(): Done reading /Volumes/NeuroTunes/Users/suchanek/mambaforge/envs/ppydev/lib/python3.11/site-packages/proteusPy/data/PDB_SS_ALL_LOADER.pkl... 
PDB IDs present:                    37075
Disulfides loaded:                  175649
Average structure resolution:       2.07 Å
Lowest Energy Disulfide:            2q7q_75D_140D
Highest Energy Disulfide:           6vxk_801B_806B
Cα distance cutoff:                 -1.00 Å
Total RAM Used:                     44.35 GB.


In [3]:
# NOTE(review): `df` is an empty DataFrame at this point, yet it is passed as
# the data argument of the plot call. The binary class table is available as
# PDB_SS.tclass.classdf (read in a later cell) — confirm which input was
# intended.
df = pd.DataFrame()
# save=True writes the figure under BINARY (see the log line in the output).
fig = PDB_SS.plot_count_vs_class_df(
    df,
    title="Binary",
    save=True,
    savedir=BINARY,
    verbose=True,
    base=2,
)
fig.show()


proteusPy: INFO 2024-09-01 13:05:30,695 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving Binary plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/binary/Binary_Binary.png


In [4]:
PDB_SS.plot_binary_to_eightclass_incidence(
    theme="light", save=True, verbose=True, savedir=OCTANT
)

proteusPy: INFO 2024-09-01 13:05:34,434 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00000 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/octant/00000_Octant.png
proteusPy: INFO 2024-09-01 13:05:34,844 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00002 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/octant/00002_Octant.png
proteusPy: INFO 2024-09-01 13:05:35,243 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00020 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/octant/00020_Octant.png
proteusPy: INFO 2024-09-01 13:05:35,644 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00022 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/octant/00022_Octant.png
proteusPy: INFO 2024-09-01 13:05:36,038 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00200 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes

In [5]:
PDB_SS.plot_binary_to_sixclass_incidence(
    theme="light", save=True, verbose=True, savedir=SEXTANT
)

proteusPy: INFO 2024-09-01 13:05:49,073 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00000 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/sextant/00000_Sextant.png
proteusPy: INFO 2024-09-01 13:05:49,174 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00002 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/sextant/00002_Sextant.png
proteusPy: INFO 2024-09-01 13:05:49,270 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00020 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/sextant/00020_Sextant.png
proteusPy: INFO 2024-09-01 13:05:49,369 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00022 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs/classes/sextant/00022_Sextant.png
proteusPy: INFO 2024-09-01 13:05:49,464 - proteusPy.DisulfideLoader.plot_count_vs_class_df - Saving 00200 plot to /Volumes/NeuroTunes/Users/suchanek/Documents/proteusPyDocs

In [6]:
clslist6 = PDB_SS.tclass.sslist_from_classid("55555", base=6)
clslist6

array(['7rtr_172D_180E', '4w6y_63A_83A', '7bdc_440A_445A',
       '1nys_104A_109A', '7bdb_440A_445A', '4w6x_63A_83A',
       '6fur_160A_167B', '1tfs_53A_58A', '2r52_96A_96B', '1e33_493P_499P',
       '2r53_96A_96B', '6cph_170D_187E', '1e2s_493P_499P',
       '3c0o_159A_164A', '4ww1_174A_181B', '7bdf_440A_445A',
       '3c0m_159A_164A', '1e3c_493P_499P', '4u5d_6E_6F', '5o0p_266A_335A',
       '5xea_106A_159A', '7bo1_440A_445A', '4glu_54A_97A',
       '7sx3_191B_261B', '7bnb_440A_445A', '4udt_160A_170B',
       '2cdx_54A_59A', '2h5f_69A_74A', '5o0r_266A_335A', '7b84_440A_445A',
       '7s8i_159A_168B', '7sx4_191B_261B', '5o0w_50E_117E',
       '7b8a_440A_445A', '7l8x_74A_561B', '6oaz_131E_136E',
       '1z52_159A_164A', '5o0n_266A_335A', '4ruq_100A_153A',
       '1ljz_60A_65A', '7bm1_440A_445A', '5o0o_266A_335A',
       '7bm3_440A_445A', '3qeu_157D_172E', '7bap_440A_445A',
       '4rus_100A_153A', '6oau_131D_136D', '7b9n_440A_445A',
       '5o0k_266A_335A', '2bnu_162A_169B', '6xo4_68A_78

In [7]:
clslist8 = PDB_SS.tclass.sslist_from_classid("77778", base=8)
clslist8

array(['4n42_157A_186A', '1kr0_159A_188A', '1kr1_159A_188A',
       '1kqz_159A_188A', '2xuc_208B_237B', '3oih_157A_186A',
       '3hu7_157A_186A', '7zy9_200A_228A', '2xvn_208A_237A',
       '3mu7_158A_187A', '4b16_162A_191A', '3o9n_157A_186A',
       '3eyo_102D_107D', '4b15_162A_191A', '3m7s_157A_186A',
       '2gsj_158A_187A', '3alu_1B_1C', '4tx6_208A_237A', '2xtk_208A_237A',
       '7zya_200A_228A', '3d5h_157A_186A', '5ine_92A_239A',
       '2xvp_208A_237A'], dtype=object)

In [8]:
# Short aliases for the class tables held by the loader's `tclass` object.
# df6 / df8 are the six- and eight-class tables; df2 (`classdf`) appears to be
# the binary table, judging by the class IDs shown by df2.head() below.
df2 = PDB_SS.tclass.classdf
df6 = PDB_SS.tclass.sixclass_df
df8 = PDB_SS.tclass.eightclass_df

In [9]:
df2.head(32)

Unnamed: 0,class_id,SS_Classname,FXN,ss_id,count,incidence,percentage
0,0,-LHSpiral,UNK,"[2qhe_27A_126A, 2qhe_44A_105A, 2qhe_51A_98A, 2...",41229,0.234724,23.47238
1,2,00002,UNK,"[1mfe_395H_450H, 2qhe_84A_96A, 4nz3_185A_200A,...",9391,0.053465,5.346458
2,20,-LHHook,UNK,"[1j5h_88A_93A, 3l4o_38C_86C, 3l75_144E_160E, 3...",4845,0.027583,2.758342
3,22,00022,UNK,"[3ejj_31A_31B, 2hew_70F_163F, 2vnc_505B_519B, ...",2426,0.013812,1.381164
4,200,-RHStaple,Allosteric,"[1j5h_37A_47A, 4lb7_2D_30D, 3l4o_181D_196D, 2h...",16147,0.091928,9.192765
5,202,00202,UNK,"[1zed_467A_474A, 2zxt_396A_398A, 7yeg_344E_361...",1396,0.007948,0.794767
6,220,00220,UNK,"[3rik_18A_23A, 1chv_14S_38S, 2qhe_29A_45A, 4nz...",7240,0.041219,4.121857
7,222,00222,UNK,"[4yys_56A_98A, 3l4o_36C_121C, 1qvn_58B_105B, 7...",6658,0.037905,3.790514
8,2000,02000,UNK,"[6dmb_234A_327A, 1mfe_137L_196L, 6fuf_110A_187...",7108,0.040467,4.046707
9,2002,02002,UNK,"[6snc_134A_194A, 3ejj_42X_84X, 4nzr_134L_194L,...",8044,0.045796,4.579588


In [10]:
df8

Unnamed: 0,class_id,ss_id,count,incidence,percentage
0,11212,"[3c34_202B_256B, 3c36_202B_256B, 4uip_195A_207...",5,0.000028,0.002847
1,11221,[2g6z_197A_219A],1,0.000006,0.000569
2,11222,"[5qj2_115B_125B, 7s1b_534A_587A, 6snw_151E_169...",19,0.000108,0.010817
3,11223,[2crd_13A_33A],1,0.000006,0.000569
4,11224,[2kd3_84A_142A],1,0.000006,0.000569
...,...,...,...,...,...
9694,88647,"[4okr_308A_311A, 4oku_308A_311A]",2,0.000011,0.001139
9695,88664,[6p48_352A_356A],1,0.000006,0.000569
9696,88738,[3cu7_1654A_1657A],1,0.000006,0.000569
9697,88745,[2ifi_3A_8A],1,0.000006,0.000569


In [11]:
df6

Unnamed: 0,class_id,ss_id,count,incidence,percentage
0,11111,"[3p11_171A_183A, 2lvf_8A_60A, 1nql_538A_555A, ...",5,0.000028,0.002847
1,11112,"[6u46_19A_58A, 7ewr_354B_392B, 7ewp_354B_392B,...",9,0.000051,0.005124
2,11121,"[3ltf_478A_491A, 7om4_227A_236A, 7bhf_485B_498...",35,0.000199,0.019926
3,11122,"[7lfs_170C_183C, 7oxf_3A_15A, 5w9m_912D_925D, ...",12,0.000068,0.006832
4,11123,"[6xk0_277A_288A, 6mdq_277A_288A, 5orf_90B_101B...",14,0.000080,0.007970
...,...,...,...,...,...
4698,66562,[4kiv_1A_80A],1,0.000006,0.000569
4699,66566,[1wuu_391A_391B],1,0.000006,0.000569
4700,66614,[4dxr_563A_8774B],1,0.000006,0.000569
4701,66632,[7dgd_524B_544B],1,0.000006,0.000569


In [12]:
def enumerate_class_fromlist(loader: DisulfideLoader, sslist, base=None):
    """
    Count the disulfides that belong to each class ID in ``sslist``.

    Every non-``None`` class ID is looked up with
    ``loader.tclass.sslist_from_classid``. IDs whose lookup returns ``None``
    are skipped; an empty (zero-length) result is kept with a count of 0.

    :param loader: Object exposing ``tclass.sslist_from_classid``.
    :type loader: DisulfideLoader
    :param sslist: Iterable of class ID strings; ``None`` entries are ignored.
    :param base: Optional class base forwarded to ``sslist_from_classid`` as
        ``base=`` (other cells call it with 6 and 8). When ``None`` (the
        default) the lookup is called without it, exactly as before.
    :return: DataFrame with columns ``class_id`` and ``count``, one row per
        class for which the lookup returned something, in input order.
    :rtype: pd.DataFrame
    """
    # Only pass `base` when the caller supplied one so that the lookup's own
    # default keeps applying for existing callers.
    lookup_kwargs = {} if base is None else {"base": base}

    class_ids = []
    counts = []
    for cls in sslist:
        if cls is None:
            continue
        members = loader.tclass.sslist_from_classid(cls, **lookup_kwargs)
        # It's possible to have 0 disulfides in a class; only a None result
        # means there is nothing to report for this ID.
        if members is not None:
            class_ids.append(cls)
            counts.append(len(members))

    return pd.DataFrame({"class_id": class_ids, "count": counts})

In [13]:
# Expand binary class "00000" into its six-class IDs and count the members of
# each one.
# NOTE(review): enumerate_class_fromlist calls sslist_from_classid without a
# `base`, whereas the six-class lookup in an earlier cell passes base=6 —
# confirm these counts really come from the six-class table.
cls = "00000"
sixcls = PDB_SS.tclass.binary_to_six_class(cls)
df = enumerate_class_fromlist(PDB_SS, sixcls)
# Sort by 'count', largest first
sorted_df = df.sort_values(by="count", ascending=False)

# Display the sorted DataFrame
print(sorted_df)

   class_id  count
30    45654     44
51    46664     38
94    56665     28
52    46665     27
29    45645     25
..      ...    ...
59    54555      1
63    54645      1
72    55545      1
64    54646      1
54    54454      1

[109 rows x 2 columns]


In [14]:
# Same analysis as the previous cell, for the eight-class expansion of binary
# class "00000". `df` and `cls` are rebound here; the later
# plot_count_vs_class_df(df, cls, ...) cell picks up these values when the
# notebook is run top to bottom.
cls = "00000"
eightcls = PDB_SS.tclass.binary_to_eight_class(cls)
# eightcls

df = enumerate_class_fromlist(PDB_SS, eightcls)
# Sort by 'count', largest first
sorted_df = df.sort_values(by="count", ascending=False)

# Display the sorted DataFrame
print(sorted_df)

    class_id  count
35     56757   5292
13     55757    464
73     57775    235
219    77767    140
61     57675     93
..       ...    ...
129    67875      1
130    68658      1
131    68675      1
132    68677      1
276    88555      1

[277 rows x 2 columns]


In [15]:
PDB_SS.plot_binary_to_eightclass_incidence(theme="light")

In [16]:
# Relies on notebook state: `df` and `cls` are whatever was bound last. In run
# order that is the eight-class count table and "00000" from two cells above.
PDB_SS.plot_count_vs_class_df(df, cls, theme="light")

In [None]:
# NOTE(review): the executed cells above call this method with theme="light";
# this unexecuted cell uses a `light` keyword instead — confirm it is still
# accepted by the current API.
PDB_SS.plot_binary_to_sixclass_incidence(light=False)

In [None]:
eight = PDB_SS.tclass.eightclass_df
# head(-10) returns every row except the last 10, not a 10-row preview.
# NOTE(review): if a short preview was intended, use head(10).
eight.head(-10)

In [None]:
sslist2 = PDB_SS.extract_class("87784")
sslist2

In [None]:
PDB_SS.plot_classes_vs_cutoff(0.1, 20)

In [None]:
ssdict = PDB_SS.SSDict
ssdict

In [None]:
def build_ss_from_idlist(loader, idlist):
    """
    Return a DisulfideList of Disulfides for a given list of PDBIDs

    Entries are collected in the iteration order of ``loader.SSDict``, not in
    the order of ``idlist``; IDs that are not keys of ``SSDict`` are ignored.

    :param loader: Object exposing ``SSDict`` (PDB ID -> sequence of indices)
        and ``SSList`` (indexable by those indices).
    :param idlist: List of PDBIDs, e.g. ['4yys', '2q7q']. A single ID given as
        a plain string is treated as a one-element list.
    :return: DisulfideList
    """
    # A bare string would otherwise be searched character-by-character /
    # by substring; wrap it so it behaves as one ID.
    wanted = {idlist} if isinstance(idlist, str) else set(idlist)

    res = DisulfideList([], "RCSB_list")
    for pdb_id, ss_indices in loader.SSDict.items():
        # Set membership keeps the scan over SSDict linear in its size.
        if pdb_id in wanted:
            for ss_index in ss_indices:
                res.append(loader.SSList[ss_index])
    return res

In [None]:
sslist = build_ss_from_idlist(PDB_SS, ["4yys", "2q7q"])
sslist

In [None]:
# Named ss_slice rather than `slice` so the built-in slice type is not
# shadowed for the rest of the kernel session.
ss_slice = PDB_SS[10:20]
ss_slice.pdb_id

In [None]:
sslist = PDB_SS.SSList
slice2 = sslist[100:200]
slice2.pdb_id

In [None]:
tors = [-60, -60, 90, -60, -60]
ss1 = Disulfide(torsions=tors)
ss1.pprint_all()

In [None]:
# 2q7q_75D_140D is the entry reported as "Lowest Energy Disulfide" in the
# describe() output near the top of the notebook.
ss1 = PDB_SS["2q7q_75D_140D"]
ss1.QUIET = False
# Only the last expression of a cell is echoed, so the bond_length_ideality
# value on the next line is not displayed unless the property itself prints
# something — NOTE(review): confirm, or wrap both in display().
ss1.bond_length_ideality
ss1.bond_angle_ideality

In [None]:
# Build a model disulfide that carries ss1's dihedral angles and inspect its
# bond-length ideality. Requires `ss1` from the previous cell.
dihed = ss1.dihedrals
modelled_min = Disulfide("model", quiet=False)
modelled_min.dihedrals = dihed
# presumably rebuilds the model's geometry from the assigned dihedrals —
# TODO confirm against the Disulfide API
modelled_min.build_yourself()
modelled_min.bond_length_ideality

In [None]:
# investigate the distal N->Ca distance. 8/15/24 -egs-

modelled_min.bond_angle_ideality

In [None]:
sslist = PDB_SS.SSList
len(sslist)

In [None]:
import pandas as pd
from tqdm import tqdm


def create_disulfide_dataframe(disulfide_list):
    """
    Tabulate per-disulfide geometry metrics.

    One row is produced per disulfide, in iteration order, with the columns
    PDB_ID, SS_Name, Angle_Deviation, Distance_Deviation and Ca_Distance.
    A tqdm progress bar labelled "Processing Disulfides" is shown while the
    list is traversed.

    :param disulfide_list: Iterable of disulfide objects exposing ``pdb_id``,
        ``name``, ``bond_angle_ideality``, ``bond_length_ideality`` and
        ``ca_distance``.
    :type disulfide_list: proteusPy.DisulfideList
    :return: DataFrame containing the disulfide information.
    :rtype: pd.DataFrame
    """
    labels = (
        "PDB_ID",
        "SS_Name",
        "Angle_Deviation",
        "Distance_Deviation",
        "Ca_Distance",
    )

    records = []
    for bond in tqdm(disulfide_list, desc="Processing Disulfides"):
        # Tuple elements are evaluated left to right, so the attributes are
        # read in a fixed order: id, Ca distance, angle, length, then name.
        ident, ca_dist, angle_dev, length_dev = (
            bond.pdb_id,
            bond.ca_distance,
            bond.bond_angle_ideality,
            bond.bond_length_ideality,
        )
        records.append((ident, bond.name, angle_dev, length_dev, ca_dist))

    # Transpose the records into one list per column before handing them to
    # pandas, so an empty input still yields all five columns.
    by_column = {
        label: [record[position] for record in records]
        for position, label in enumerate(labels)
    }
    return pd.DataFrame(by_column)


# Example usage
# Assuming you have a list of disulfide objects called disulfide_list
# df = create_disulfide_dataframe(disulfide_list)
# print(df)

In [None]:
# `sslist` is reused throughout the notebook: it was last rebound to
# PDB_SS.SSList two cells up, but earlier cells bind it to a two-entry
# build_ss_from_idlist result. Running cells out of order therefore changes
# what gets tabulated here.
dev_df = create_disulfide_dataframe(sslist)

In [None]:
dev_df.describe()

In [None]:
dev_df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def plot_histograms(df):
    """
    Draw side-by-side histograms of the three disulfide metrics.

    A single 21x6 inch figure with three panels is created — distance
    deviation, angle deviation and Ca distance — each a 30-bin histogram with
    a KDE overlay. The figure is shown; nothing is returned.

    :param df: DataFrame with columns ``Distance_Deviation``,
        ``Angle_Deviation`` and ``Ca_Distance``.
    :type df: pd.DataFrame
    """
    # (column, panel title, x-axis label), in left-to-right panel order.
    panels = (
        ("Distance_Deviation", "Distance Deviation Distribution", "Distance Deviation"),
        ("Angle_Deviation", "Angle Deviation Distribution", "Angle Deviation"),
        ("Ca_Distance", "Ca Distance Distribution", "Ca Distance"),
    )

    plt.figure(figsize=(21, 6))

    for position, (column, heading, x_label) in enumerate(panels, start=1):
        # Selecting the subplot makes it the current axes that histplot and
        # the pyplot labelling calls below act on.
        plt.subplot(1, 3, position)
        sns.histplot(df[column], kde=True, bins=30)
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel("Frequency")

    plt.tight_layout()
    plt.show()

In [None]:
# plot_histograms(dev_df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def highlight_worst_structures(df, top_n=10):
    """
    Scatter all disulfides by deviation and mark the worst offenders.

    Left panel: distance deviation against angle deviation for every row, with
    the ``top_n`` largest rows of each metric over-plotted as red crosses and
    labelled with their ``SS_Name``. Right panel: how many of those worst rows
    each ``PDB_ID`` contributes. The figure is shown; nothing is returned.

    :param df: DataFrame with columns ``PDB_ID``, ``SS_Name``,
        ``Distance_Deviation`` and ``Angle_Deviation``.
    :type df: pd.DataFrame
    :param top_n: Number of worst structures to take per metric.
    :type top_n: int
    """
    # Worst rows per metric; a row that ranks in both lists is kept once.
    by_distance = df.nlargest(top_n, "Distance_Deviation")
    by_angle = df.nlargest(top_n, "Angle_Deviation")
    outliers = pd.concat([by_distance, by_angle]).drop_duplicates()

    # Number of outlier rows contributed by each PDB entry.
    per_pdb = outliers.groupby("PDB_ID").size().reset_index(name="Count")

    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    scatter_ax, bar_ax = axes

    # Both scatter layers share the same coordinates and target axes.
    shared = dict(x="Distance_Deviation", y="Angle_Deviation", ax=scatter_ax)
    sns.scatterplot(data=df, label="All Structures", **shared)
    sns.scatterplot(
        data=outliers,
        color="red",
        label="Worst Structures",
        marker="X",
        s=100,
        **shared,
    )

    # Label each outlier with its disulfide name, nudged off the marker.
    for ss_name, distance_dev, angle_dev in zip(
        outliers["SS_Name"],
        outliers["Distance_Deviation"],
        outliers["Angle_Deviation"],
    ):
        scatter_ax.annotate(
            ss_name,
            (distance_dev, angle_dev),
            textcoords="offset points",
            xytext=(5, 5),
            ha="right",
        )

    scatter_ax.set_title("Distance Deviation vs. Angle Deviation")
    scatter_ax.set_xlabel("Distance Deviation")
    scatter_ax.set_ylabel("Angle Deviation")
    scatter_ax.legend()

    sns.barplot(x="PDB_ID", y="Count", data=per_pdb, ax=bar_ax)
    bar_ax.set_title("Worst Structures Aggregated by PDB_ID")
    bar_ax.set_xlabel("PDB_ID")
    bar_ax.set_ylabel("Count")
    bar_ax.tick_params(axis="x", rotation=90)

    plt.tight_layout()
    plt.show()


# Example usage
# Assuming you have a DataFrame named dev_df
# highlight_worst_structures(dev_df, top_n=10)

In [None]:
highlight_worst_structures(dev_df, top_n=10)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def highlight_worst_structures2(df, top_n=10):
    """
    Plot the deviation distributions and the PDB entries holding the worst disulfides.

    Three panels are drawn: a histogram of distance deviation, a histogram of
    angle deviation, and a grouped bar chart counting, per ``PDB_ID``, how many
    of the ``top_n`` worst rows for each metric come from that entry. The
    figure is shown; nothing is returned.

    :param df: DataFrame with columns ``PDB_ID``, ``Distance_Deviation`` and
        ``Angle_Deviation``.
    :type df: pd.DataFrame
    :param top_n: Number of worst structures to take per metric.
    :type top_n: int
    """
    distance_label = "Distance Deviation"
    angle_label = "Angle Deviation"

    # Worst rows for each metric, tagged with the metric they were ranked by.
    worst_distance = df.nlargest(top_n, "Distance_Deviation").assign(
        Metric=distance_label
    )
    worst_angle = df.nlargest(top_n, "Angle_Deviation").assign(Metric=angle_label)

    # One long-form table of counts per (PDB_ID, metric). Plotting both metrics
    # from a single table keeps every bar on a shared PDB_ID axis; two separate
    # barplot calls would each lay out their own category positions and tick
    # labels, so bars from different PDB entries could end up stacked under the
    # same label.
    worst_counts = (
        pd.concat([worst_distance, worst_angle])
        .groupby(["PDB_ID", "Metric"])
        .size()
        .reset_index(name="Count")
    )

    fig, axes = plt.subplots(1, 3, figsize=(24, 6))

    # Histogram for distance deviation
    sns.histplot(df["Distance_Deviation"], kde=True, bins=30, ax=axes[0])
    axes[0].set_title("Distance Deviation Distribution")
    axes[0].set_xlabel("Distance Deviation")
    axes[0].set_ylabel("Frequency")

    # Histogram for angle deviation
    sns.histplot(df["Angle_Deviation"], kde=True, bins=30, ax=axes[1])
    axes[1].set_title("Angle Deviation Distribution")
    axes[1].set_xlabel("Angle Deviation")
    axes[1].set_ylabel("Frequency")

    # Grouped bars: one colour per metric; seaborn adds the legend for `hue`.
    sns.barplot(
        x="PDB_ID",
        y="Count",
        hue="Metric",
        hue_order=[distance_label, angle_label],
        palette={distance_label: "blue", angle_label: "green"},
        data=worst_counts,
        ax=axes[2],
    )
    axes[2].set_title("Worst Structures Aggregated by PDB_ID")
    axes[2].set_xlabel("PDB_ID")
    axes[2].set_ylabel("Count")
    axes[2].tick_params(axis="x", rotation=90)

    plt.tight_layout()
    plt.show()


# Example usage
# Assuming you have a DataFrame named dev_df
# highlight_worst_structures2(dev_df, top_n=10)

In [None]:
highlight_worst_structures2(dev_df, top_n=10)

In [None]:
import pandas as pd
import numpy as np


def calculate_std_cutoff(df, column, num_std=2):
    """
    Upper cutoff placed ``num_std`` standard deviations above the column mean.

    Mean and standard deviation are taken with pandas' default
    ``Series.mean`` / ``Series.std`` settings.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param column: Name of the column to derive the cutoff from.
    :type column: str
    :param num_std: How many standard deviations above the mean to go.
    :type num_std: int or float
    :return: ``mean + num_std * std`` of the column.
    :rtype: float
    """
    values = df[column]
    return values.mean() + num_std * values.std()


def calculate_percentile_cutoff(df, column, percentile=95):
    """
    Cutoff at the given percentile of a column, ignoring missing values.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param column: Name of the column to derive the cutoff from.
    :type column: str
    :param percentile: Percentile (0-100) passed to ``np.percentile``.
    :type percentile: int
    :return: Value of the column at that percentile.
    :rtype: float
    """
    # NaN entries are removed first so they cannot affect the percentile.
    observed = df[column].dropna()
    return np.percentile(observed, percentile)


# Example usage
# distance_cutoff = calculate_percentile_cutoff(dev_df, 'Distance_Deviation', percentile=95)
# angle_cutoff = calculate_percentile_cutoff(dev_df, 'Angle_Deviation', percentile=95)

In [None]:
import pandas as pd
import numpy as np

# Requires dev_df with columns 'Distance_Deviation', 'Angle_Deviation' and
# 'Ca_Distance', plus the two cutoff helpers defined in the previous cell.

# Standard Deviation Method: mean + 3 standard deviations per metric
distance_cutoff_std = calculate_std_cutoff(dev_df, "Distance_Deviation", num_std=3)
angle_cutoff_std = calculate_std_cutoff(dev_df, "Angle_Deviation", num_std=3)
ca_cutoff_std = calculate_std_cutoff(dev_df, "Ca_Distance", num_std=3)

# Percentile Method: 98th percentile per metric
distance_cutoff_percentile = calculate_percentile_cutoff(
    dev_df, "Distance_Deviation", percentile=98
)
angle_cutoff_percentile = calculate_percentile_cutoff(
    dev_df, "Angle_Deviation", percentile=98
)
ca_cutoff_percentile = calculate_percentile_cutoff(dev_df, "Ca_Distance", percentile=98)

# The "3 Std Dev" / "98th Percentile" labels below are hard-coded; keep them in
# sync with the num_std / percentile arguments above when changing either.
print(f"Distance Deviation Cutoff (3 Std Dev): {distance_cutoff_std}")
print(f"Angle Deviation Cutoff (3 Std Dev): {angle_cutoff_std}")
print(f"Ca Distance Cutoff (3 Std Dev): {ca_cutoff_std}\n")

print(f"Distance Deviation Cutoff (98th Percentile): {distance_cutoff_percentile}")
print(f"Angle Deviation Cutoff (98th Percentile): {angle_cutoff_percentile}")
print(f"Ca Distance Cutoff (98th Percentile): {ca_cutoff_percentile}")

In [None]:
import pandas as pd


def filter_by_cutoffs(df, distance_cutoff, angle_cutoff):
    """
    Keep only the rows whose deviations are at or below both cutoffs.

    A row is retained when ``Distance_Deviation <= distance_cutoff`` and
    ``Angle_Deviation <= angle_cutoff``. Rows with a missing value in either
    column fail the comparison and are dropped. The original index and all
    columns are preserved.

    :param df: DataFrame containing the deviations.
    :type df: pd.DataFrame
    :param distance_cutoff: Largest accepted distance deviation (inclusive).
    :type distance_cutoff: float
    :param angle_cutoff: Largest accepted angle deviation (inclusive).
    :type angle_cutoff: float
    :return: Filtered DataFrame.
    :rtype: pd.DataFrame
    """
    within_distance = df["Distance_Deviation"].le(distance_cutoff)
    within_angle = df["Angle_Deviation"].le(angle_cutoff)
    return df[within_distance & within_angle]


# Example usage
# Assuming you have a DataFrame named dev_df
# distance_cutoff = 5.0
# angle_cutoff = 10.0
# filtered_df = filter_by_cutoffs(dev_df, distance_cutoff, angle_cutoff)
# print(filtered_df)

In [None]:
dev_df.shape[0]

In [None]:
# Cutoffs here are literals (distance 1.0, angle 10.0), not the std / percentile
# cutoffs computed in the earlier cell — NOTE(review): confirm that is intended.
filt = filter_by_cutoffs(dev_df, 1.0, 10.0)
# Row count after filtering; compare with dev_df.shape[0] in the adjacent cells.
filt.shape[0]

In [None]:
dev_df.shape[0]

In [None]:
print(dev_df["Angle_Deviation"].isna().sum())

In [None]:
import pandas as pd
import numpy as np


# calculate_std_cutoff() and calculate_percentile_cutoff() are not redefined
# here: the documented versions from the earlier cell are already in scope, and
# a second `def` would only shadow them with undocumented copies.

# Requires dev_df with columns 'Distance_Deviation' and 'Angle_Deviation'.

# Check for missing values
print(f"Missing values in 'Angle_Deviation': {dev_df['Angle_Deviation'].isna().sum()}")

# Drop rows missing either deviation before computing the cutoffs. dev_df is
# rebound (as before) so later cells see the filtered frame; applying the
# dropna again on a re-run removes nothing further.
dev_df = dev_df.dropna(subset=["Angle_Deviation", "Distance_Deviation"])

# Standard Deviation Method
distance_cutoff_std = calculate_std_cutoff(dev_df, "Distance_Deviation", num_std=3)
angle_cutoff_std = calculate_std_cutoff(dev_df, "Angle_Deviation", num_std=3)

# Percentile Method
distance_cutoff_percentile = calculate_percentile_cutoff(
    dev_df, "Distance_Deviation", percentile=98
)
angle_cutoff_percentile = calculate_percentile_cutoff(
    dev_df, "Angle_Deviation", percentile=98
)

print(f"Distance Deviation Cutoff (3 Std Dev): {distance_cutoff_std}")
print(f"Angle Deviation Cutoff (3 Std Dev): {angle_cutoff_std}")
print(f"Distance Deviation Cutoff (98th Percentile): {distance_cutoff_percentile}")
print(f"Angle Deviation Cutoff (98th Percentile): {angle_cutoff_percentile}")