In [None]:
# SAbDab Sequence Extractor (No Intercaat)
import os
import gzip
import pandas as pd
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1

def extract_chain_sequence(structure, chain_id):
    """Return the one-letter residue sequence of one chain in the first model.

    Args:
        structure: Parsed structure indexed by model number; the script
            passes the object returned by ``PDBParser.get_structure``.
        chain_id: Identifier of the chain to read from model 0.

    Returns:
        A string holding exactly one letter per residue whose hetero flag
        (``residue.id[0]``) is blank, in iteration order. Residue names
        that cannot be translated are written as ``"X"``. An empty string
        is returned when the structure has no first model or the chain is
        not present in it.
    """
    try:
        model = structure[0]
    except (KeyError, IndexError):
        # Nothing to read from: treat like a missing chain instead of
        # aborting the caller's batch loop.
        return ""
    if chain_id not in model:
        return ""

    letters = []
    for residue in model[chain_id]:
        # A non-blank hetero flag marks hetero residues and waters; skip them.
        if residue.id[0] != " ":
            continue
        try:
            aa = seq1(residue.get_resname())
        except Exception:
            # Deliberately best-effort: any translation failure becomes "X".
            aa = ""
        # seq1 translates whole three-letter groups only, so a residue name
        # shorter than three characters comes back as an empty string.
        # Emit "X" in that case so every residue contributes one letter.
        letters.append(aa or "X")
    return "".join(letters)

# Driver: for every row of the SAbDab summary table, open the matching
# gzipped PDB file under "pdbs/", pull out the heavy-chain, light-chain and
# antigen sequences, and collect them into one TSV.
df = pd.read_csv("sabdab_summary_all.tsv", sep="\t")
output_rows = []
parser = PDBParser(QUIET=True)

for _, row in df.iterrows():
    pdb_id = row["pdb"]
    pdb_file = os.path.join("pdbs", f"{pdb_id}.pdb.gz")
    if not os.path.exists(pdb_file):
        print(f"[SKIP] File not found: {pdb_file}")
        continue
    try:
        with gzip.open(pdb_file, "rt") as handle:
            structure = parser.get_structure(pdb_id, handle)
    except Exception as exc:
        # Best-effort batch run: one unreadable file must not stop the rest.
        print(f"[SKIP] Error parsing {pdb_id}: {exc}")
        continue

    heavy_chain = str(row["Hchain"]) if pd.notna(row["Hchain"]) else ""
    light_chain = str(row["Lchain"]) if pd.notna(row["Lchain"]) else ""

    # A missing antigen entry is read as NaN; str(NaN) would become the
    # bogus chain ID "nan", so handle it the same way as Hchain/Lchain.
    raw_antigen = row["antigen_chain"]
    if pd.notna(raw_antigen):
        # Multiple antigen chains are separated by " | " in the table.
        antigen_chains = [
            part.strip() for part in str(raw_antigen).split(" | ") if part.strip()
        ]
    else:
        antigen_chains = []

    heavy_seq = extract_chain_sequence(structure, heavy_chain) if heavy_chain else ""
    light_seq = extract_chain_sequence(structure, light_chain) if light_chain else ""
    # One entry per listed chain (empty when the chain is absent) so the
    # "|"-joined sequences line up with the chain IDs in "AntigenChains".
    antigen_seqs = [
        extract_chain_sequence(structure, chain) for chain in antigen_chains
    ]
    antigen_seq = "|".join(antigen_seqs)

    output_rows.append({
        "pdb": pdb_id,
        "Hchain": heavy_chain,
        "Lchain": light_chain,
        "AntigenChains": " | ".join(antigen_chains),
        "HeavySeq": heavy_seq,
        "LightSeq": light_seq,
        "AntigenSeq": antigen_seq,
    })

out_df = pd.DataFrame(output_rows)
out_df.to_csv("sabdab_sequences_only.tsv", sep="\t", index=False)
print("Saved to sabdab_sequences_only.tsv")