In [6]:
from pathlib import Path

import pandas as pd
from biopandas.pdb import PandasPdb


In [7]:
def read_pdb_to_dataframe(
    pdb_path: Path,
) -> pd.DataFrame:
    """
    Read a PDB file into a DataFrame of its non-hydrogen ATOM records.

    Args:
        pdb_path (Path | str): Path to a local PDB file to read.

    Returns:
        pd.DataFrame: One row per ATOM record whose ``element_symbol`` is not ``'H'``.
        It carries every column produced by biopandas plus an ``IMGT`` column, which is
        ``residue_number`` and ``insertion`` concatenated as strings.
    """
    # biopandas inspects the file name with string methods, so hand it a plain str
    # even when the caller supplies a pathlib.Path.
    atom_records = PandasPdb().read_pdb(str(pdb_path)).df["ATOM"]
    heavy_atoms = atom_records.query("element_symbol!='H'")
    # assign() builds a new frame instead of writing into the filtered selection,
    # which avoids pandas' SettingWithCopyWarning.
    return heavy_atoms.assign(
        IMGT=lambda atoms: atoms["residue_number"].astype(str) + atoms["insertion"].astype(str)
    )

In [8]:
# Load the non-hydrogen atom table for the 6b0s structure from its local PDB file.
df = read_pdb_to_dataframe(pdb_path="/home/athenes/Paraplume/data_with_scripts/6b0s/6b0s.pdb")

In [15]:
# Show the column labels of the loaded atom table.
print(df.columns)

Index(['record_name', 'atom_number', 'blank_1', 'atom_name', 'alt_loc',
       'residue_name', 'blank_2', 'chain_id', 'residue_number', 'insertion',
       'blank_3', 'x_coord', 'y_coord', 'z_coord', 'occupancy', 'b_factor',
       'blank_4', 'segment_id', 'element_symbol', 'charge', 'line_idx',
       'IMGT'],
      dtype='object')


In [19]:
# Keep only rows from chains L and H, then collapse them to the distinct
# (occupancy, b_factor, chain_id, IMGT) combinations.
chain_mask = df["chain_id"].isin(["L", "H"])
selected = df.loc[chain_mask, ["occupancy", "b_factor", "chain_id", "IMGT"]].drop_duplicates()

In [21]:
# Preview the occupancy and b_factor columns of the selection.
print(selected.loc[:, ["occupancy", "b_factor"]])

      occupancy  b_factor
495         0.0      0.01
504         0.0      0.10
511         0.0      0.00
520         0.0      0.00
528         0.0      0.00
...         ...       ...
2176        0.0      0.00
2185        0.0      0.00
2193        0.0      0.00
2200        0.0      0.00
2207        0.0      0.00

[228 rows x 2 columns]


In [25]:
# Turn the b_factor score into a 0/1 prediction: 1 when the score is at least 0.5.
selected["predicted"] = selected["b_factor"].ge(0.5).astype(int)

# Boolean masks for each side of the confusion matrix; occupancy is the 0/1 label.
predicted_pos = selected["predicted"] == 1
predicted_neg = selected["predicted"] == 0
actual_pos = selected["occupancy"] == 1
actual_neg = selected["occupancy"] == 0

# Count the rows falling in each confusion-matrix cell.
tp = (predicted_pos & actual_pos).sum()
fp = (predicted_pos & actual_neg).sum()
fn = (predicted_neg & actual_pos).sum()
tn = (predicted_neg & actual_neg).sum()

# Rates fall back to 0 when their denominator is empty, to avoid dividing by zero.
if (tp + fn) > 0:
    tpr = tp / (tp + fn)
else:
    tpr = 0
if (fp + tn) > 0:
    fpr = fp / (fp + tn)
else:
    fpr = 0

# Report the counts and rates.
print(f"True Positives: {tp}")
print(f"False Positives: {fp}")
print(f"Total Actual Positives: {tp + fn}")
print(f"True Positive Rate (TPR): {tpr:.3f} ({tp} over {tp+fn})")
print(f"False Positive Rate (FPR): {fpr:.3f} ({fp} over {fp+tn})")


True Positives: 23
False Positives: 9
Total Actual Positives: 23
True Positive Rate (TPR): 1.000 (23 over 23)
False Positive Rate (FPR): 0.044 (9 over 205)
