In [None]:
# SAbDab Sequence Extractor (No Intercaat)
import os
import gzip
import pandas as pd
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1

def extract_chain_sequence(structure, chain_id):
    sequence = ""
    if chain_id not in structure[0]:
        return ""
    chain = structure[0][chain_id]
    for residue in chain:
        if residue.id[0] != " ":
            continue
        try:
            aa = seq1(residue.get_resname())
        except Exception:
            aa = "X"
        sequence += aa
    return sequence

df = pd.read_csv("sabdab_summary_all.tsv", sep="\t")
output_rows = []
parser = PDBParser(QUIET=True)

for _, row in df.iterrows():
    pdb_id = row["pdb"]
    pdb_file = os.path.join("pdbs", f"{pdb_id}.pdb.gz")
    if not os.path.exists(pdb_file):
        print(f"[SKIP] File not found: {pdb_file}")
        continue
    try:
        with gzip.open(pdb_file, "rt") as handle:
            structure = parser.get_structure(pdb_id, handle)
    except Exception as e:
        print(f"[SKIP] Error parsing {pdb_id}: {e}")
        continue

    heavy_chain = str(row["Hchain"]) if pd.notna(row["Hchain"]) else ""
    light_chain = str(row["Lchain"]) if pd.notna(row["Lchain"]) else ""
    antigen_chains = str(row["antigen_chain"]).split(" | ")

    heavy_seq = extract_chain_sequence(structure, heavy_chain) if heavy_chain else ""
    light_seq = extract_chain_sequence(structure, light_chain) if light_chain else ""
    antigen_seqs = [extract_chain_sequence(structure, chain.strip()) for chain in antigen_chains if chain.strip()]
    antigen_seq = "|".join(antigen_seqs)

    output_rows.append({
        "pdb": pdb_id,
        "Heavy_chain": heavy_chain,
        "Light_chain": light_chain,
        "Antigen_Chains": " | ".join(antigen_chains),
        "Heavy_Chain_Seq": heavy_seq,
        "Light_Chain_Seq": light_seq,
        "Antigen_Seq": antigen_seq
    })

out_df = pd.DataFrame(output_rows)
out_df.to_csv("sabdab_sequences_only.tsv", sep="\t", index=False)
print("Saved to sabdab_sequences_only.tsv")

[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9bew.pdb.gz
[SKIP] File not found: pdbs\9g13.pdb.gz
[SKIP] File not found: pdbs\9g13.pdb.gz
[SKIP] File not found: pdbs\9g13.pdb.gz
[SKIP] File not found: pdbs\9g13.pdb.gz
[SKIP] File not found: pdbs\8zss.pdb.gz
[SKIP] File not found: pdbs\8zsv.pdb.gz
[SKIP] File not found: pdbs\9bir.pdb.gz
[SKIP] File not found: pdbs\9euo.pdb.gz
[SKIP] File not found: pdbs\8w1v.pdb.gz
[SKIP] File not found: pdbs\8w1v.pdb.gz
[SKIP] File not found: pdbs\9bjk.pdb.gz
[SKIP] File not found: pdbs\8pn0.pdb.gz
[SKIP] File not found: pdbs\8pn0.pdb.gz
[SKIP] File not found: pdbs\8yky.pdb.gz
[SKIP] File not found: pdbs\8oyt.pdb.gz
[SKIP] File not found: pdbs\8oyt.pdb.gz
[SKIP] File not found: pdbs\8oyt.pdb.gz
[SKIP] File not found: pdbs\8s2e.pdb.gz
[SKIP] File not found: pdbs\8xio.pdb.gz


In [3]:
print("Unique PDB entries:", df["pdb"].nunique())
print("Total rows in input:", len(df))

Unique PDB entries: 9636
Total rows in input: 18971


In [4]:
import pandas as pd

# Load the previously saved output
df = pd.read_csv("sabdab_sequences_only.tsv", sep="\t")

# Drop duplicates based on pdb, heavy, light, and antigen sequences
dedup_df = df.drop_duplicates(
    subset=["pdb", "Heavy_Chain_Seq", "Light_Chain_Seq", "Antigen_Seq"]
)

# Save the deduplicated file
dedup_df.to_csv("sabdab_sequences_only_dedup.tsv", sep="\t", index=False)
print(f"Deduplicated file saved: {len(dedup_df)} unique entries")

Deduplicated file saved: 14482 unique entries
