# SAbDab Sequence Extractor
This notebook reads a SAbDab summary TSV file, downloads PDB structures, extracts the sequences for heavy, light, and antigen chains, and outputs the results into a new TSV file.

In [1]:
# SAbDab Sequence Extractor with Intercaat
# This notebook reads a SAbDab summary TSV file, extracts PDB structures and sequences, 
# and annotates interface residues using Intercaat.

import os
import time
import requests
import pandas as pd
import gzip
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# Check if the pdbs directory exists, if not, extract it
if not os.path.exists("pdbs"):
    import tarfile
    with tarfile.open("pdbs.tar.gz", "r") as tar:
        tar.extractall()
    print("Extracted pdbs.tar")
else:
    print("pdbs folder already exists")

# Import Intercaat
from intercaat.interface import InterfaceAnalyzer



pdbs folder already exists


In [2]:
# Function to extract and optionally mask interface residues
def extract_chain_sequence_and_mask(structure, chain_id, interface_residue_ids):
    sequence = ""
    masked_sequence = ""
    chain = structure[0][chain_id]
    i = 0  # for renumbering residues from 1
    for residue in chain:
        if residue.id[0] != " ":  # skip hetero/water
            continue
        try:
            aa = seq1(residue.get_resname())
        except Exception:
            aa = "X"
        i += 1
        sequence += aa
        if i in interface_residue_ids:
            masked_sequence += f"[{aa}]"
        else:
            masked_sequence += aa
    return sequence, masked_sequence


## Function to extract sequence from a chain

## Main processing block

In [None]:
df = pd.read_csv("sabdab_summary_all.tsv", sep="\t")
output_rows = []
parser = PDBParser(QUIET=True)

for _, row in df.iterrows():
    pdb_id = row["pdb"]
    pdb_file = os.path.join("pdbs", f"{pdb_id}.pdb.gz")

    if not os.path.exists(pdb_file):
        print(f"[SKIP] File not found: {pdb_file}")
        continue

    try:
        with gzip.open(pdb_file, "rt") as handle:
            structure = parser.get_structure(pdb_id, handle)
    except Exception as e:
        print(f"[SKIP] Error parsing {pdb_id}: {e}")
        continue

    heavy_chain = str(row["Hchain"]) if pd.notna(row["Hchain"]) else ""
    light_chain = str(row["Lchain"]) if pd.notna(row["Lchain"]) else ""
    antigen_chains = str(row["antigen_chain"]).split(" | ")
    antigen_chain = antigen_chains[0].strip() if antigen_chains else ""

    heavy_res_ids = []
    light_res_ids = []

    try:
        analyzer = InterfaceAnalyzer(structure)
        if heavy_chain and antigen_chain:
            heavy_interface = analyzer.get_interface(chain_1=heavy_chain, chain_2=antigen_chain)
            heavy_res_ids = [i + 1 for i, _ in enumerate(heavy_interface.residues_1)]
        if light_chain and antigen_chain:
            light_interface = analyzer.get_interface(chain_1=light_chain, chain_2=antigen_chain)
            light_res_ids = [i + 1 for i, _ in enumerate(light_interface.residues_1)]
    except Exception as e:
        print(f"[WARNING] Intercaat failed for {pdb_id}: {e}")

heavy_seq, heavy_mask = ("", "")
if heavy_chain and heavy_chain in structure[0]:
    heavy_seq, heavy_mask = extract_chain_sequence_and_mask(structure, heavy_chain, heavy_res_ids)
else:
    if heavy_chain:
        print(f"[WARNING] Heavy chain '{heavy_chain}' not found in {pdb_id}")

light_seq, light_mask = ("", "")
if light_chain and light_chain in structure[0]:
    light_seq, light_mask = extract_chain_sequence_and_mask(structure, light_chain, light_res_ids)
else:
    if light_chain:
        print(f"[WARNING] Light chain '{light_chain}' not found in {pdb_id}")

antigen_seqs = []
for chain in antigen_chains:
    chain_id = chain.strip()
    if chain_id and chain_id in structure[0]:
        antigen_seqs.append(extract_chain_sequence_and_mask(structure, chain_id, [])[0])
    else:
        if chain_id:
            print(f"[WARNING] Antigen chain '{chain_id}' not found in {pdb_id}")
antigen_seq = "|".join(antigen_seqs)
output_rows.append({
        "pdb": pdb_id,
        "Hchain": heavy_chain,
        "Lchain": light_chain,
        "AntigenChains": " | ".join(antigen_chains),
        "HeavySeq": heavy_seq,
        "LightSeq": light_seq,
        "AntigenSeq": antigen_seq,
        "HeavyInterfaceResNums": heavy_res_ids,
        "LightInterfaceResNums": light_res_ids,
        "HeavyInterfaceMask": heavy_mask,
        "LightInterfaceMask": light_mask
    })


IndentationError: unexpected indent (142847881.py, line 62)

## Write results to a TSV file

In [None]:
out_df = pd.DataFrame(output_rows)
out_df.to_csv("sabdab_with_sequences_and_interfaces.tsv", sep="\t", index=False)
print("Saved to sabdab_with_sequences_and_interfaces.tsv")
