## Training Data Preparation Pipeline (Antigen Chain Matching)

This pipeline loads TheraSAbDab data, downloads PDBs, fetches antigen/target sequences from UniProt, matches antigens to PDB chains, and enables downstream analysis with correct chain IDs.

In [2]:
import os
import requests
import pandas as pd
from Bio.PDB import PDBParser, PPBuilder
from Bio import pairwise2
from Bio.Seq import Seq

In [3]:
# Configuration
source_file = "TheraSAbDab_SeqStruc_OnlineDownload.csv"
pdb_dir = "./pdb_files"
os.makedirs(pdb_dir, exist_ok=True)

In [4]:
# Load Dataset
df = pd.read_csv(source_file)[[
    'Therapeutic', '100% SI Structure', '99% SI Structure', '95-98% SI Structure', 'Target', 'HeavySequence', 'LightSequence']]

structure_cols = ['100% SI Structure', '99% SI Structure', '95-98% SI Structure']

# Filter out rows with structure is 'None;None' in all structure columns
df = df[~df[structure_cols].apply(lambda x: x.str.contains('None;None', na=False)).all(axis=1)]
# Filter out rows with structure is None in all structure columns
df = df[~df[structure_cols].apply(lambda x: x.isnull()).all(axis=1)]

# Replace 'None;None' with None
df[structure_cols] = df[structure_cols].replace('None;None', None)
# Replace "None;" with "" in structure columns
df[structure_cols] = df[structure_cols].replace("None;", "", regex=True)

In [5]:
# Define representative PDB structure for each row
def get_representative_structure(row):
    for col in structure_cols:
        if pd.notna(row[col]) and row[col] != 'None;None':
            pdb_id = row[col].split('/')[0].strip().split(':')[0]
            pdb_chains = row[col].split('/')[0].strip().split(':')[1]
            return {'pdb_id': pdb_id, 'pdb_chains': pdb_chains}
    return None

df['rep_struct'] = df.apply(get_representative_structure, axis=1)
df['rep_struct_pdb_id'] = df['rep_struct'].apply(lambda x: x['pdb_id'] if x is not None else None)
df['rep_struct_pdb_chains'] = df['rep_struct'].apply(lambda x: x['pdb_chains'] if x is not None else None)
df = df.drop(columns=['rep_struct', '100% SI Structure', '99% SI Structure', '95-98% SI Structure'])

In [6]:
# PDB Downloader Function
def download_pdb(pdb_id):
    url = f"https://files.rcsb.org/download/{pdb_id.upper()}.pdb"
    out_path = os.path.join(pdb_dir, f"{pdb_id}.pdb")
    if not os.path.exists(out_path):
        r = requests.get(url)
        r.raise_for_status()
        with open(out_path, 'w') as f:
            f.write(r.text)
    return out_path

In [7]:
# UniProt and sequence matching helpers
def get_uniprot_sequence(target_name):
    """
    Query UniProt for the canonical sequence given a target name. Returns the first protein sequence found.
    """
    # Clean up target name (e.g., use just the first synonym, replace /)
    target_name = target_name.split(';')[0].replace('/', '_').replace(' ', '')
    url = f"https://rest.uniprot.org/uniprotkb/search?query={target_name}&fields=accession"
    resp = requests.get(url)
    if resp.ok:
        results = resp.json()
        if "results" in results and results["results"]:
            uniprot_id = results["results"][0]["primaryAccession"]
            # Now get the sequence
            seq_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
            seq_fasta = requests.get(seq_url).text
            lines = seq_fasta.splitlines()
            seq = ''.join([l for l in lines if not l.startswith(">")])
            return seq
    return None

def get_pdb_chain_sequences(pdb_path):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("pdb", pdb_path)
    ppb = PPBuilder()
    chain_seqs = {}
    for model in structure:
        for chain in model:
            seqs = [str(pp.get_sequence()) for pp in ppb.build_peptides(chain)]
            if seqs:
                # Concatenate all peptide fragments in this chain
                chain_seqs[chain.id] = ''.join(seqs)
    return chain_seqs

def find_best_chain_match(chain_seqs, antigen_seq):
    best_chain = None
    best_score = -1
    for chain_id, seq in chain_seqs.items():
        score = pairwise2.align.globalxx(seq, antigen_seq, score_only=True)
        if score > best_score:
            best_score = score
            best_chain = chain_id
    return best_chain, best_score

In [9]:
# Main matching loop: Download PDB, get antigen sequence, match to PDB chain, store result
for idx, row in df.iterrows():
    pdb_id = row['rep_struct_pdb_id']
    pdb_chains = row['rep_struct_pdb_chains']  # e.g., "HL"
    target = row['Target']
    pdb_path = os.path.join(pdb_dir, f"{pdb_id}.pdb")

    # Download PDB if needed
    try:
        download_pdb(pdb_id)
    except Exception as e:
        print(f"Failed to download {pdb_id}. Skipping... {e}")
        continue

    # 1. Get UniProt sequence for antigen
    antigen_seq = get_uniprot_sequence(target)
    if not antigen_seq:
        print(f"Could not find UniProt sequence for target: {target}")
        df.at[idx, "antigen_chain"] = None
        continue

    # 2. Get all chain sequences in PDB
    chain_seqs = get_pdb_chain_sequences(pdb_path)
    if not chain_seqs:
        print(f"No chain sequences found in {pdb_id}")
        df.at[idx, "antigen_chain"] = None
        continue

    # 3. Find best matching chain to antigen
    antigen_chain, score = find_best_chain_match(chain_seqs, antigen_seq)
    if not antigen_chain:
        print(f"No matching antigen chain for {pdb_id}")
        df.at[idx, "antigen_chain"] = None
        continue

    ab_chains = list(pdb_chains)
    print(f"{row['Therapeutic']}: Downloaded {pdb_id}.pdb. Antibody chains = {ab_chains}, Antigen chain = {antigen_chain}, Score = {score}")

    # Store result for downstream use
    df.at[idx, "antigen_chain"] = antigen_chain

Failed to download 7vux. Skipping... HTTPSConnectionPool(host='files.rcsb.org', port=443): Max retries exceeded with url: /download/7VUX.pdb (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000013388D8F520>: Failed to resolve 'files.rcsb.org' ([Errno 11001] getaddrinfo failed)"))


ConnectionError: HTTPSConnectionPool(host='rest.uniprot.org', port=443): Max retries exceeded with url: /uniprotkb/search?query=ITGA2B_CD41&fields=accession (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000013388D8F0D0>: Failed to resolve 'rest.uniprot.org' ([Errno 11001] getaddrinfo failed)"))

In [10]:
# Save updated dataframe with antigen chain info
df.to_csv("therasabdab_with_antigen_chain.csv", index=False)