In [3]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.PDB import PDBParser, Superimposer
from Bio.PDB.PDBExceptions import PDBException


# Column order of the per-pair result table (also the CSV header).
_PAIR_COLUMNS = ["pair", "length", "mutations", "identity_pct", "rmsd", "ca_count"]
# Columns included in the descriptive-statistics summary.
_NUMERIC_COLUMNS = ["length", "mutations", "identity_pct", "rmsd", "ca_count"]


def _read_pair_sequences(pair_dir):
    """Return the first sequence of each of the two FASTA files in *pair_dir*.

    Files are taken in sorted name order so the result does not depend on
    filesystem enumeration order. Returns ``None`` when the directory does
    not hold exactly two ``*.fasta`` files or when either file has no record.
    """
    fasta_files = sorted(pair_dir.glob("*.fasta"))
    if len(fasta_files) != 2:
        return None
    sequences = []
    for fasta_path in fasta_files:
        fasta_records = list(SeqIO.parse(fasta_path, "fasta"))
        if not fasta_records:
            return None
        sequences.append(str(fasta_records[0].seq))
    return sequences[0], sequences[1]


def _alpha_carbons(structure):
    """Return the alpha-carbon atoms of *structure* in iteration order.

    The element check excludes calcium ions, whose atom name is also "CA".
    """
    return [
        atom
        for atom in structure.get_atoms()
        if atom.get_name() == "CA" and atom.element == "C"
    ]


def analyze_pairs(pairs_root="pdb_mutant_pairs"):
    """Compare sequence identity and CA RMSD for every pair under *pairs_root*.

    Each ``pair_*`` sub-directory (with at least two underscores in its name)
    is expected to contain exactly two ``*.fasta`` and two ``*.pdb`` files.
    For each usable pair the function records the length of the first
    sequence, the number of differing positions, the percent identity and
    the RMSD after superimposing the alpha-carbon atoms of both structures.

    Side effects: prints summary statistics, skip counts and the ten pairs
    with the lowest RMSD, and writes ``pairs_analysis_summary.csv`` into
    *pairs_root*.

    Args:
        pairs_root: Directory holding the ``pair_*`` sub-directories.

    Returns:
        A ``(df, summary)`` tuple: the per-pair table sorted by descending
        identity, and the ``describe()`` statistics of its numeric columns
        (an empty frame with the same columns when no pair was usable).

    Raises:
        FileNotFoundError: If *pairs_root* is not an existing directory.
    """
    root = Path(pairs_root)
    if not root.is_dir():
        raise FileNotFoundError(f"Pairs directory not found: {root}")

    parser = PDBParser(QUIET=True)
    skipped = {'no_fasta': 0, 'no_pdb': 0, 'mismatch_ca': 0, 'rmsd_error': 0}
    records = []

    for pair_dir in sorted(root.glob("pair_*")):
        if not pair_dir.is_dir() or pair_dir.name.count('_') < 2:
            continue

        # Load sequences; a pair is counted as skipped at most once.
        sequences = _read_pair_sequences(pair_dir)
        if sequences is None:
            skipped['no_fasta'] += 1
            continue
        seq1, seq2 = sequences

        # Sequence identity, position by position without alignment.
        # NOTE: zip() stops at the shorter sequence and the denominator is
        # len(seq1), so a length difference only counts as mutations when
        # seq1 is the longer sequence.
        length = len(seq1)
        matches = sum(a == b for a, b in zip(seq1, seq2))
        identity_pct = 100.0 * matches / length if length > 0 else 0
        mutations = length - matches

        # Load structures (sorted for a deterministic fixed/moving order).
        pdb_files = sorted(pair_dir.glob("*.pdb"))
        if len(pdb_files) != 2:
            skipped['no_pdb'] += 1
            continue
        ca1 = _alpha_carbons(parser.get_structure("s1", pdb_files[0]))
        ca2 = _alpha_carbons(parser.get_structure("s2", pdb_files[1]))
        if len(ca1) != len(ca2):
            skipped['mismatch_ca'] += 1
            continue
        if not ca1:
            # Nothing to superimpose; an empty coordinate set cannot be fitted.
            skipped['rmsd_error'] += 1
            continue

        # Superimpose the second structure's CA atoms onto the first's.
        sup = Superimposer()
        try:
            sup.set_atoms(ca1, ca2)
        except PDBException:
            skipped['rmsd_error'] += 1
            continue

        records.append({
            "pair": pair_dir.name,
            "length": length,
            "mutations": mutations,
            "identity_pct": round(identity_pct, 2),
            "rmsd": round(sup.rms, 3),
            "ca_count": len(ca1),
        })

    # Explicit columns keep the frame well-formed even when no pair was usable.
    df = pd.DataFrame(records, columns=_PAIR_COLUMNS)
    df = df.sort_values("identity_pct", ascending=False)

    # Summary statistics
    if df.empty:
        summary = pd.DataFrame(columns=_NUMERIC_COLUMNS)
    else:
        summary = df.describe()[_NUMERIC_COLUMNS]
    print("Descriptive statistics for all pairs:")
    print(summary)

    # Print skip counts
    print("\nSkipped pairs:")
    for reason, count in skipped.items():
        print(f"  {reason}: {count}")

    # Save detailed results
    csv_path = root / "pairs_analysis_summary.csv"
    df.to_csv(csv_path, index=False)
    print(f"Detailed per-pair data saved to {csv_path}\n")

    # Top 10 lowest RMSD
    if not df.empty:
        print("Top 10 lowest RMSD pairs:")
        print(df.nsmallest(10, "rmsd"))

    return df, summary


# Script entry point: run the analysis with the default pairs directory
# ("pdb_mutant_pairs") when this file is executed directly.
if __name__ == "__main__":
    analyze_pairs()

Descriptive statistics for all pairs:
            length   mutations  identity_pct        rmsd      ca_count
count   122.000000  122.000000    122.000000  122.000000    122.000000
mean    451.426230    1.090164     99.087623    1.052139   1463.655738
std     589.306553    1.233178      1.928971    0.550857   2110.735533
min      13.000000    0.000000     88.570000    0.123000     15.000000
25%      67.000000    0.000000     99.400000    0.556500    365.000000
50%     165.500000    1.000000     99.830000    1.052500    784.000000
75%     560.000000    2.000000    100.000000    1.557500   1768.750000
max    2563.000000    6.000000    100.000000    1.994000  13160.000000

Skipped pairs:
  no_fasta: 0
  no_pdb: 11
  mismatch_ca: 1
  rmsd_error: 0
Detailed per-pair data saved to pdb_mutant_pairs/pairs_analysis_summary.csv

Top 10 lowest RMSD pairs:
               pair  length  mutations  identity_pct   rmsd  ca_count
96   pair_8DEC_8DEE    1317          2         99.85  0.123      4000
57  