# SAbDab Sequence Extractor

In [None]:
import os
import pandas as pd
from biopandas.pdb import PandasPdb
from Bio.SeqUtils import seq1

In [4]:
def extract_sequence_from_pdb(pdb_df:PandasPdb, chain_id:str, record_name='ATOM'):
    """
    Extracts the sequence of a specified chain from a BioPandas PDB DataFrame.

    Args:
        pdb_df (PandasPdb): A PandasPdb object containing the PDB data.
        chain_id (str): The chain ID for which to extract the sequence.
        record_name (str): The type of record to extract the sequence from ('SEQRES' or 'ATOM').

    Returns:
        str: The single-letter amino acid sequence of the specified chain.
            An empty string is returned when no record matches the chain.

    Raises:
        ValueError: If record_name is neither 'SEQRES' nor 'ATOM'.
    """

    ## Get sequence from SEQRES records
    if record_name == 'SEQRES':
        other_records = pdb_df.df['OTHERS']
        seqres_records = other_records[other_records['record_name'] == 'SEQRES']

        letters = []
        for entry in seqres_records['entry']:
            # Whitespace-tokenise instead of indexing into split(" "): the
            # number of padding spaces depends on the width of the serial
            # number, so fixed list positions are unreliable.
            # Assumed token layout: serial, chain ID, residue count, residues...
            # NOTE(review): a record with a blank chain ID would shift the
            # tokens by one; such records are not handled here.
            tokens = entry.split()
            if len(tokens) < 4:
                continue
            record_chain = tokens[1]
            # Compare against the requested chain; the record's chain is kept
            # in its own variable so the parameter is never overwritten.
            if record_chain == chain_id:
                letters.extend(seq1(residue) for residue in tokens[3:])
        return ''.join(letters)

    ## Get sequence from ATOM records
    elif record_name == 'ATOM':
        atom_records = pdb_df.df['ATOM']
        chain_atoms = atom_records[atom_records['chain_id'] == chain_id]
        # Collapse atoms to one row per residue (first atom kept, file order
        # preserved). The insertion code is part of the residue identity:
        # numbering schemes that reuse a residue number (e.g. 100, 100A, 100B)
        # would otherwise lose every inserted residue.
        residues = chain_atoms.drop_duplicates(subset=['residue_number', 'insertion'])
        return ''.join(seq1(name) for name in residues['residue_name'])

    else:
        raise ValueError("Invalid record name. Use 'SEQRES' or 'ATOM'.")

In [None]:
## Load the SAbDab summary table (tab-separated)
structures_df = pd.read_csv("sabdab_summary_all.tsv", sep="\t")

## Keep only rows where Hchain, Lchain, and antigen_chain are all present
structures_df = structures_df.dropna(subset=['Hchain', 'Lchain', 'antigen_chain'])

## Drop rows whose antigen chain entry equals the heavy or the light chain entry
structures_df = structures_df[
    (structures_df['antigen_chain'] != structures_df['Hchain'])
    & (structures_df['antigen_chain'] != structures_df['Lchain'])
]

print(f"Total structures: {len(structures_df)}")

Total structures: 11289


In [13]:
## Empty table that will receive one row of IDs and sequences per structure
sequences_df = pd.DataFrame(
    columns=[
        "pdb_id",
        "h_chain_id",
        "l_chain_id",
        "antigen_ids",
        "h_chain_seq",
        "l_chain_seq",
        "antigen_seqs",
    ]
)

## Directory holding the gzipped PDB files
## (an alternative "./pdbs_test" directory was used here previously)
base_path_to_pdbs = "./pdbs"

In [None]:
## Collect one plain dict per structure and build the DataFrame once after the
## loop. Appending with pd.concat on every iteration copies the whole frame each
## time, and re-running the cell would append duplicate rows to the old frame.
sequence_records = []

for _, row in structures_df.iterrows():
    pdb_id = row['pdb']
    h_chain_id = row['Hchain']
    l_chain_id = row['Lchain']
    ## Multiple antigen chains are separated by ' | ' in the summary file
    antigen_ids = row['antigen_chain'].split(' | ')

    pdb_file = f"{base_path_to_pdbs}/{pdb_id}.pdb.gz"

    if not os.path.exists(pdb_file):
        print(f"File {pdb_file} does not exist. Skipping.")
        continue

    print(f"Processing {pdb_file}")
    pdb_df = PandasPdb().read_pdb(pdb_file)

    print("  -> Getting H chain sequence")
    h_chain_seq = extract_sequence_from_pdb(pdb_df, h_chain_id, record_name='ATOM')
    print("  -> Getting L chain sequence")
    l_chain_seq = extract_sequence_from_pdb(pdb_df, l_chain_id, record_name='ATOM')
    print("  -> Getting Antigen sequences")
    antigen_seqs = [
        extract_sequence_from_pdb(pdb_df, antigen_id, record_name='ATOM')
        for antigen_id in antigen_ids
    ]

    sequence_records.append({
        "pdb_id": pdb_id,
        "h_chain_id": h_chain_id,
        "l_chain_id": l_chain_id,
        "antigen_ids": '|'.join(antigen_ids),
        "h_chain_seq": h_chain_seq,
        "l_chain_seq": l_chain_seq,
        ## Join rather than append-then-strip: an empty first/last sequence
        ## keeps its slot, so positions stay aligned with antigen_ids.
        "antigen_seqs": '|'.join(antigen_seqs),
    })

## Explicit column list keeps the layout stable even when no file was found
sequences_df = pd.DataFrame(
    sequence_records,
    columns=["pdb_id", "h_chain_id", "l_chain_id", "antigen_ids", "h_chain_seq", "l_chain_seq", "antigen_seqs"],
)

Processing ./pdbs/9ml4.pdb.gz
  -> Getting H chain sequence
  -> Getting L chain sequence


In [12]:
## Show the collected sequences table as this cell's output.
## NOTE(review): the saved output has an "Unnamed: 0" column that none of the
## visible cells create — presumably the frame was reloaded from a CSV in an
## earlier session; confirm by re-running on a fresh kernel.
sequences_df

Unnamed: 0,pdb_id,h_chain_id,l_chain_id,antigen_ids,h_chain_seq,l_chain_seq,antigen_seqs
0,1a3r,H,L,P,VQLQQSGAELVRPGASVKLSCTTSGFNIKDIYIHWVKQRPEQGLEW...,DIVMTQSPSSLTVTTGEKVTMTCKSSQTQKNYLTWYQQKPGQSPKL...,VKAETRLNPDLQPTE
1,1a2y,B,A,C,QVQLQESGPGLVAPSQSLSITCTVSGFSLTGYGVNWVRQPPGKGLE...,DIVLTQSPASLSASVGETVTITCRASGNIHNYLAWYQQKQGKSPQL...,KVFGRCELAAAMKRHGLANYRGYSLGNWVCAAKFESNFNTQATNRN...
2,1a14,H,L,N,QVQLQQSGAELVKPGASVRMSCKASGYTFTNYNMYWVKQSPGQGLE...,DIELTQTTSSLSASLGDRVTISCRASQDISNYLNWYQQNPDGTVKL...,RDFNNLTKGLCTINSWHIYGKDNAVRIGEDSDVLVTREPYVSCDPD...
