# SAbDab Sequence Extractor
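This notebook reads a SAbDab summary TSV file, loads the corresponding PDB structures (expected as previously downloaded, gzip-compressed `<pdb_id>.pdb.gz` files in the working directory; entries whose file is missing are skipped), extracts the sequences of the heavy, light, and antigen chains, and writes the results to a new TSV file.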

In [6]:
import os
import time
import requests
import pandas as pd
import gzip
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO


## Function to extract sequence from a chain

In [7]:
def extract_chain_sequence(structure, chain_id):
    """Return the one-letter sequence of the first chain whose id is ``chain_id``.

    Models are scanned in iteration order and, within each model, chains in
    iteration order; the first chain with a matching ``id`` is used.

    Only residues whose ``id[0]`` is a single blank are kept (in Biopython
    this field is the hetero flag, blank for standard residues — so waters
    and other hetero entries are left out). Each kept residue name is
    converted with ``seq1`` and the letters are concatenated.

    Args:
        structure: Iterable of models, each an iterable of chains
            (e.g. a ``Bio.PDB`` structure).
        chain_id: Chain identifier to look for.

    Returns:
        The sequence string (possibly empty) for the matching chain, or
        ``None`` when no chain with that id exists.
    """
    matching_chains = (
        candidate
        for model in structure
        for candidate in model
        if candidate.id == chain_id
    )
    target = next(matching_chains, None)
    if target is None:
        return None

    letters = []
    for residue in target:
        hetero_flag = residue.id[0]
        if hetero_flag != " ":
            continue
        letters.append(seq1(residue.get_resname()))
    return "".join(letters)


## Main processing block

In [8]:
# Load the SAbDab summary table. The identifier columns are read as strings
# so pandas' type inference can never turn a numeric-looking value (for
# example a chain called "1") into a float such as 1.0. Missing markers
# such as "NA" are still parsed as NaN.
df = pd.read_csv(
    "sabdab_summary_all.tsv",
    sep="\t",
    dtype={"pdb": str, "Hchain": str, "Lchain": str, "antigen_chain": str},
)
output_rows = []
parser = PDBParser(QUIET=True)

# The summary can list the same PDB entry on several consecutive rows (one
# per chain pairing). Remember the most recently parsed structure so each
# file is decompressed and parsed once per run of rows instead of once per
# row. Only one structure is kept, so memory use stays flat.
cached_pdb_id = None
cached_structure = None

for _, row in df.iterrows():
    pdb_id = row["pdb"]
    pdb_file = f"{pdb_id}.pdb.gz"  # Compressed local file

    if pdb_id == cached_pdb_id:
        structure = cached_structure
    else:
        if not os.path.exists(pdb_file):
            print(f"[SKIP] File not found: {pdb_file}")
            continue

        try:
            with gzip.open(pdb_file, "rt") as handle:
                structure = parser.get_structure(pdb_id, handle)
        except Exception as e:
            # Best effort: a single unreadable file must not stop the run.
            print(f"[SKIP] Error parsing {pdb_id}: {e}")
            continue

        cached_pdb_id, cached_structure = pdb_id, structure

    heavy_chain = str(row["Hchain"]) if pd.notna(row["Hchain"]) else ""
    light_chain = str(row["Lchain"]) if pd.notna(row["Lchain"]) else ""

    # A missing antigen_chain must yield no antigen chains at all; passing
    # NaN through str() would produce the bogus chain id "nan".
    antigen_field = row["antigen_chain"]
    if pd.notna(antigen_field):
        antigen_chains = [
            chain.strip() for chain in str(antigen_field).split("|") if chain.strip()
        ]
    else:
        antigen_chains = []

    # extract_chain_sequence returns None when the chain is absent from the
    # structure; normalise that to "" so the column holds a single type.
    heavy_seq = (extract_chain_sequence(structure, heavy_chain) or "") if heavy_chain else ""
    light_seq = (extract_chain_sequence(structure, light_chain) or "") if light_chain else ""
    antigen_seqs = [extract_chain_sequence(structure, chain) for chain in antigen_chains]
    # Antigen chains that were not found (or are empty) are dropped here.
    antigen_seq = "|".join(filter(None, antigen_seqs))

    output_rows.append({
        "pdb": pdb_id,
        "Hchain": heavy_chain,
        "Lchain": light_chain,
        "AntigenChains": " | ".join(antigen_chains),
        "HeavySeq": heavy_seq,
        "LightSeq": light_seq,
        "AntigenSeq": antigen_seq
    })


[SKIP] File not found: 9cph.pdb.gz
[SKIP] File not found: 9d7o.pdb.gz
[SKIP] File not found: 9ffm.pdb.gz
[SKIP] File not found: 9iut.pdb.gz
[SKIP] File not found: 9iut.pdb.gz
[SKIP] File not found: 9ml7.pdb.gz
[SKIP] File not found: 9e7d.pdb.gz
[SKIP] File not found: 9e7d.pdb.gz
[SKIP] File not found: 9isf.pdb.gz
[SKIP] File not found: 9ish.pdb.gz
[SKIP] File not found: 9ish.pdb.gz
[SKIP] File not found: 9btl.pdb.gz
[SKIP] File not found: 9f91.pdb.gz
[SKIP] File not found: 9f91.pdb.gz
[SKIP] File not found: 9gf8.pdb.gz
[SKIP] File not found: 9gs5.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9mi0.pdb.gz
[SKIP] File not found: 9fnt.pdb.gz
[SKIP] File not found: 9fnt.pdb.gz
[SKIP] File not found: 9gu0.pdb.gz
[SKIP] File not found: 9gu0.pdb.gz
[SKIP] File not found: 8zca.pdb.gz
[SKIP] File not found: 8zca.pdb.gz
[SKIP] File not foun

## Write results to a TSV file

In [9]:
# Columns are listed explicitly (same names and order as the per-row dicts)
# so the header row is still written when output_rows is empty; without it
# an empty run produces a header-less file that cannot be read back.
output_columns = [
    "pdb",
    "Hchain",
    "Lchain",
    "AntigenChains",
    "HeavySeq",
    "LightSeq",
    "AntigenSeq",
]
out_df = pd.DataFrame(output_rows, columns=output_columns)
out_df.to_csv("sabdab_with_sequences.tsv", sep="\t", index=False)
print("Saved to sabdab_with_sequences.tsv")

Saved to sabdab_with_sequences.tsv


In [10]:
import os

# Keep every entry of the current working directory whose name carries the
# .pdb.gz suffix, in the order os.listdir() reports them.
pdb_gz_files = list(filter(lambda entry: entry.endswith('.pdb.gz'), os.listdir()))

# Report how many such entries were found
print(f"Number of .pdb.gz files: {len(pdb_gz_files)}")


Number of .pdb.gz files: 7460
