In [None]:
# Intercaat Interface Annotator
import pandas as pd
from intercaat.interface import InterfaceAnalyzer

def extract_chain_sequence_and_mask(structure, chain_id, interface_residue_ids):
    """Return a chain's one-letter sequence and a copy with interface residues bracketed.

    Interface residues are matched on their PDB residue number
    (``residue.id[1]``), which is the kind of value ``extract_res_nums``
    collects, rather than on the running position within the chain. The two
    only coincide for chains numbered 1..N without gaps.

    Relies on ``seq1`` (imported from ``Bio.SeqUtils`` elsewhere in the
    notebook) being defined in the notebook namespace at call time.

    Args:
        structure: Parsed structure; only the first model (``structure[0]``)
            is read.
        chain_id: Identifier of the chain to extract.
        interface_residue_ids: Iterable of residue numbers to mark. Values
            that cannot be converted to ``int`` are ignored; ``None`` is
            treated as empty.

    Returns:
        A ``(sequence, masked_sequence)`` tuple of strings. In
        ``masked_sequence`` every interface residue is written as ``[X]``.
        Returns ``("", "")`` when the chain is not in the first model.
    """
    model = structure[0]
    if chain_id not in model:
        return "", ""

    # Normalise to a set of ints: O(1) membership and tolerant of numbers that
    # arrive as strings.
    interface_numbers = set()
    for res_id in (interface_residue_ids or ()):
        try:
            interface_numbers.add(int(res_id))
        except (TypeError, ValueError):
            # Non-numeric identifiers cannot be compared with residue.id[1].
            continue

    letters = []
    masked_letters = []
    for residue in model[chain_id]:
        # Skip records whose first id field is not a blank (hetero atoms and
        # waters in Biopython's residue id convention).
        if residue.id[0] != " ":
            continue
        try:
            aa = seq1(residue.get_resname())
        except Exception:
            # Best-effort: any failure to translate the residue name yields "X".
            aa = "X"
        letters.append(aa)
        # NOTE(review): insertion codes (residue.id[2]) are ignored, so residues
        # sharing a number with an interface residue are all bracketed - confirm
        # this is acceptable for the numbering scheme of the input files.
        if residue.id[1] in interface_numbers:
            masked_letters.append(f"[{aa}]")
        else:
            masked_letters.append(aa)
    return "".join(letters), "".join(masked_letters)

In [None]:
def extract_res_nums(interface_residues):
    """Collect residue numbers from an iterable of interface-residue records.

    Each record is handled by the first rule that applies:

    * a tuple with at least two items contributes its second item as-is
      (e.g. ``('H', 101, 'LEU')`` gives ``101``);
    * an object with an ``id`` attribute contributes ``id[1]``;
    * anything else is passed to ``int()``; records that cannot be converted
      are skipped silently.

    The exact record layout depends on what the interface analysis returns,
    so this stays deliberately permissive.

    Returns:
        A list of the extracted values, in input order.
    """
    numbers = []
    for entry in interface_residues:
        if isinstance(entry, tuple) and len(entry) >= 2:
            numbers.append(entry[1])
            continue
        if hasattr(entry, 'id'):
            numbers.append(entry.id[1])
            continue
        try:
            number = int(entry)
        except Exception:
            # Best-effort: unrecognised records are dropped.
            continue
        numbers.append(number)
    return numbers

In [None]:
import os
import gzip
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1

# Annotate every entry of the input table with the heavy/light chain sequences
# and the residues Intercaat reports at the interface with the first antigen
# chain. Results are checkpointed to OUTPUT_TSV while the loop runs.

OUTPUT_TSV = "sabdab_with_sequences_and_interfaces.tsv"
CHECKPOINT_EVERY = 20  # write the output file after this many collected rows


def _clean_chain_id(value):
    """Return a chain id as a stripped string, or "" when the cell is empty.

    pandas represents empty cells as NaN, which is truthy, so a plain
    ``if value`` check would treat a missing chain as present.
    """
    if pd.isna(value):
        return ""
    return str(value).strip()


def _interface_res_ids(pdb_path, pdb_id, chain_id, antigen_chain_id):
    """Return interface residue numbers for one antibody chain.

    Returns an empty list when either chain id is empty or when the Intercaat
    call fails; a failure is reported but does not stop the run.
    """
    if not (chain_id and antigen_chain_id):
        return []
    try:
        # NOTE(review): the gzipped path is passed straight through - confirm
        # InterfaceAnalyzer accepts .pdb.gz input.
        analyzer = InterfaceAnalyzer(pdb_path, chain_id, [antigen_chain_id])
        return extract_res_nums(analyzer.get_interface_residues())
    except Exception as e:
        print(f"[WARNING] Intercaat failed for {pdb_id}: {e}")
        return []


def _write_output(rows):
    """Write all rows collected so far to OUTPUT_TSV as tab-separated text."""
    pd.DataFrame(rows).to_csv(OUTPUT_TSV, sep="\t", index=False)


df = pd.read_csv("sabdab_sequences_only.tsv", sep="\t")
output_rows = []
parser = PDBParser(QUIET=True)

for i, row in df.iterrows():
    pdb_id = row["pdb"]
    pdb_file = os.path.join("pdbs", f"{pdb_id}.pdb.gz")
    if not os.path.exists(pdb_file):
        print(f"[SKIP] File not found: {pdb_file}")
        continue
    try:
        with gzip.open(pdb_file, "rt") as handle:
            structure = parser.get_structure(pdb_id, handle)
    except Exception as e:
        print(f"[SKIP] Error parsing {pdb_id}: {e}")
        continue

    heavy_chain = _clean_chain_id(row["Hchain"])
    light_chain = _clean_chain_id(row["Lchain"])
    antigen_chains = []
    if pd.notna(row["AntigenChains"]):
        antigen_chains = str(row["AntigenChains"]).split(" | ")
    # Only the first listed antigen chain is used for the interface analysis.
    antigen_chain = antigen_chains[0].strip() if antigen_chains else ""

    # Each chain is analysed independently so that a failure on the heavy
    # chain does not prevent the light chain from being processed.
    heavy_res_ids = _interface_res_ids(pdb_file, pdb_id, heavy_chain, antigen_chain)
    light_res_ids = _interface_res_ids(pdb_file, pdb_id, light_chain, antigen_chain)

    heavy_seq, heavy_mask = extract_chain_sequence_and_mask(structure, heavy_chain, heavy_res_ids) if heavy_chain else ("", "")
    light_seq, light_mask = extract_chain_sequence_and_mask(structure, light_chain, light_res_ids) if light_chain else ("", "")

    output_rows.append({
        "pdb": pdb_id,
        "Hchain": heavy_chain,
        "Lchain": light_chain,
        "AntigenChains": " | ".join(antigen_chains),
        "HeavySeq": heavy_seq,
        "LightSeq": light_seq,
        "HeavyInterfaceResNums": heavy_res_ids,
        "LightInterfaceResNums": light_res_ids,
        "HeavyInterfaceMask": heavy_mask,
        "LightInterfaceMask": light_mask
    })

    # Checkpoint on the number of rows actually collected; the DataFrame index
    # also counts skipped entries and would misreport what was saved.
    if len(output_rows) % CHECKPOINT_EVERY == 0:
        _write_output(output_rows)
        print(f"Saved {len(output_rows)} entries to {OUTPUT_TSV}")

# Save any remaining entries at the end
_write_output(output_rows)
print(f"Saved all entries to {OUTPUT_TSV}")