# SAbDab Sequence Extractor
This notebook reads a SAbDab summary TSV file, downloads PDB structures, extracts the sequences for heavy, light, and antigen chains, and outputs the results into a new TSV file.

In [None]:
import os
import time
import requests
import pandas as pd
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

## Function to download PDB files

In [None]:
def download_pdb(pdb_id, out_dir="pdbs", timeout=30):
    """Return the local path of a PDB file, downloading it from RCSB if needed.

    Args:
        pdb_id: PDB identifier; it is upper-cased to build the download URL
            and used as-is for the local file name.
        out_dir: Directory used as the on-disk cache (created if missing).
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The path ``<out_dir>/<pdb_id>.pdb``, or ``None`` when the download
        failed (non-200 HTTP status or a network error).
    """
    os.makedirs(out_dir, exist_ok=True)
    pdb_file = os.path.join(out_dir, f"{pdb_id}.pdb")
    if os.path.exists(pdb_file):
        # Already cached from an earlier run; no network access needed.
        return pdb_file

    url = f"https://files.rcsb.org/download/{str(pdb_id).upper()}.pdb"
    try:
        # Without a timeout a stalled connection would block the batch forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP failures so the caller
        # can skip this entry instead of aborting the whole run.
        print(f"Failed to download {pdb_id}: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to download {pdb_id}")
        return None

    # Write to a temporary name first: an interrupted write must not leave a
    # truncated file that later runs would mistake for a valid cache entry.
    partial_file = pdb_file + ".part"
    with open(partial_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    os.replace(partial_file, pdb_file)

    time.sleep(1)  # Wait 1 second between downloads
    return pdb_file

## Function to extract sequence from a chain

In [None]:
def extract_chain_sequence(structure, chain_id):
    """Build the one-letter sequence of the first chain whose id matches.

    Models are scanned in order, and within each model the chains in order;
    the first chain with ``chain.id == chain_id`` decides the result.

    Args:
        structure: Iterable of models, each an iterable of chains.
        chain_id: Chain identifier to look for.

    Returns:
        The concatenated one-letter codes of the chain's residues, or
        ``None`` if no chain with that id exists in any model.
    """
    for model in structure:
        for chain in model:
            if chain.id != chain_id:
                continue
            letters = []
            for residue in chain:
                # Keep only residues whose first id field is blank; entries
                # with any other value in that field are left out.
                if residue.id[0] != " ":
                    continue
                letters.append(seq1(residue.get_resname()))
            return "".join(letters)
    return None

## Main processing block

In [None]:
# Read every column as text so identifiers keep their exact spelling; this
# guards against pandas inferring a numeric dtype for number-like IDs.
df = pd.read_csv("sabdab_summary_all.tsv", sep="\t", dtype=str)
output_rows = []
parser = PDBParser(QUIET=True)

for _, row in df.iterrows():
    pdb_id = row["pdb"]
    if pd.isna(pdb_id):
        # Nothing to download or parse without an identifier.
        continue

    heavy_chain = str(row["Hchain"]) if pd.notna(row["Hchain"]) else ""
    light_chain = str(row["Lchain"]) if pd.notna(row["Lchain"]) else ""

    # Several antigen chains are separated by "|". A missing value must yield
    # an empty list, not the literal text "nan" treated as a chain ID.
    raw_antigen = row["antigen_chain"]
    if pd.notna(raw_antigen):
        antigen_chains = [
            part.strip() for part in str(raw_antigen).split("|") if part.strip()
        ]
    else:
        antigen_chains = []

    pdb_file = download_pdb(pdb_id)
    if not pdb_file:
        continue

    try:
        structure = parser.get_structure(pdb_id, pdb_file)
    except Exception as e:
        # Best-effort batch: report the problematic entry and move on.
        print(f"Error parsing {pdb_id}: {e}")
        continue

    # A chain that is requested but absent from the structure gives None;
    # normalise it to "" so it matches the "no chain requested" case.
    heavy_seq = (extract_chain_sequence(structure, heavy_chain) or "") if heavy_chain else ""
    light_seq = (extract_chain_sequence(structure, light_chain) or "") if light_chain else ""
    antigen_seqs = [extract_chain_sequence(structure, chain) for chain in antigen_chains]
    # Drop antigen chains that were not found (None) or have no residues ("").
    antigen_seq = "|".join(filter(None, antigen_seqs))

    output_rows.append({
        "pdb": pdb_id,
        "Hchain": heavy_chain,
        "Lchain": light_chain,
        "AntigenChains": " | ".join(antigen_chains),
        "HeavySeq": heavy_seq,
        "LightSeq": light_seq,
        "AntigenSeq": antigen_seq,
    })

## Write results to a TSV file

In [None]:
# Turn the collected row dictionaries into a table and save it as
# tab-separated text without the DataFrame index column.
output_path = "sabdab_with_sequences.tsv"
out_df = pd.DataFrame(data=output_rows)
out_df.to_csv(output_path, index=False, sep="\t")
print(f"Saved to {output_path}")