## Training Data Preparation Pipeline

In [1]:
import os
import subprocess
import requests
import pandas as pd
# from Bio.SeqUtils import seq1

In [None]:
## Configuration
# Directory that receives the downloaded PDB files; created here if it is missing.
pdb_dir = "./pdb_files"
os.makedirs(pdb_dir, exist_ok=True)

# Input CSV that the dataset-loading cell reads with pandas.
source_file = "TheraSAbDab_SeqStruc_OnlineDownload.csv"

In [40]:
## Load Dataset
structure_cols = ['100% SI Structure', '99% SI Structure', '95-98% SI Structure']

df = pd.read_csv(source_file)[
    ['Therapeutic', *structure_cols, 'Target', 'HeavySequence', 'LightSequence']
]

## A cell carries no usable structure when it is missing OR contains the
## 'None;None' placeholder. Both cases are tested together so that rows mixing
## NaN and 'None;None' across the structure columns are dropped as well.
## astype(str) keeps the check working even if a whole column was parsed as NaN.
no_structure = df[structure_cols].apply(
    lambda col: col.isna() | col.astype(str).str.contains('None;None', regex=False)
)

## Filter out rows without a usable structure in every structure column.
## .copy() gives an independent frame so the assignments below do not trigger
## SettingWithCopyWarning.
df = df.loc[~no_structure.all(axis=1)].copy()

## Replace 'None;None' with a missing value. mask() is used instead of
## replace(..., None) because older pandas releases ignore an explicit None
## there and pad-fill from the previous row instead.
df[structure_cols] = df[structure_cols].mask(df[structure_cols].eq('None;None'))
## Replace "None;" with "" in structure columns
df[structure_cols] = df[structure_cols].replace("None;", "", regex=True)

In [42]:
## Define representative PDB structure for each row
def get_representative_structure(row, cols=None):
    """Pick the first usable PDB entry from a row's structure columns.

    Columns are scanned in order. Within a cell, ';'-separated parts are
    scanned in order and, for each part, only the first '/'-separated entry
    is considered. An entry is usable when it has the form ``<pdb_id>:<chains>``
    with a non-empty id other than the literal 'None'.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row indexed by column name.
    cols : sequence of str, optional
        Columns to scan, in priority order. Defaults to the notebook-level
        ``structure_cols``.

    Returns
    -------
    dict or None
        ``{'pdb_id': ..., 'pdb_chains': ...}`` for the first usable entry,
        or None when no column holds one.
    """
    if cols is None:
        cols = structure_cols

    for col in cols:
        value = row[col]
        # Missing values (NaN / None / pd.NA) are not strings; skip them.
        if not isinstance(value, str):
            continue
        for part in value.split(';'):
            entry = part.split('/')[0].strip()
            # partition() never raises, unlike split(':')[1] on an entry
            # that has no colon.
            pdb_id, sep, pdb_chains = entry.partition(':')
            if sep and pdb_id and pdb_id != 'None':
                return {'pdb_id': pdb_id, 'pdb_chains': pdb_chains}
    return None

In [None]:
## PDB Downloader Function
def download_pdb(pdb_id, timeout=30):
    """Download a PDB file from RCSB into ``pdb_dir`` unless it already exists.

    Parameters
    ----------
    pdb_id : str
        PDB identifier. It is upper-cased for the download URL and used as
        given for the local file name.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    str
        Path of the local ``.pdb`` file (whether freshly downloaded or
        already present).

    Raises
    ------
    ValueError
        If ``pdb_id`` is not a non-empty string.
    requests.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    """
    if not isinstance(pdb_id, str) or not pdb_id.strip():
        raise ValueError(f"Invalid PDB id: {pdb_id!r}")

    url = f"https://files.rcsb.org/download/{pdb_id.upper()}.pdb"
    out_path = os.path.join(pdb_dir, f"{pdb_id}.pdb")
    if not os.path.exists(out_path):
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Write the raw bytes: text mode would re-encode the payload and
        # translate newlines on Windows.
        # Write to a temporary name first so an interrupted write never leaves
        # a truncated file that a later run would treat as already downloaded.
        tmp_path = out_path + ".part"
        with open(tmp_path, 'wb') as f:
            f.write(r.content)
        os.replace(tmp_path, out_path)
    return out_path

In [46]:
## Get representative structure information
# Temporary column holding the dict (or None) returned for each row.
df['rep_struct'] = df.apply(get_representative_structure, axis=1)

# Unpack the two dict fields into their own columns; rows without a
# representative structure get None in both.
df['rep_struct_pdb_id'] = df['rep_struct'].apply(
    lambda rep: None if rep is None else rep['pdb_id']
)
df['rep_struct_pdb_chains'] = df['rep_struct'].apply(
    lambda rep: None if rep is None else rep['pdb_chains']
)

# The temporary column and the raw structure columns are no longer needed.
df = df.drop(
    ['rep_struct', '100% SI Structure', '99% SI Structure', '95-98% SI Structure'],
    axis='columns',
)

In [48]:
## Preview the prepared table (the rich display below elides the middle rows)
df

Unnamed: 0,Therapeutic,Target,HeavySequence,LightSequence,rep_struct_pdb_id,rep_struct_pdb_chains
1,Abazistobart,PDCD1/CD279/PD1,EVKLVESGGGLVQPGGSLRLSCAASGFAFSSYDMSWVRQAPGKRLE...,EIVLTQSPATLSLSPGERATLSCRASQSISNFLHWYQQKPGQAPRL...,7vux,HL
2,Abciximab,ITGA2B/CD41,EVQLQQSGTVLARPGASVKMSCEASGYTFTNYWMHWVKQRPGQGLE...,EIVLTQSPVTLSVTPGDSVSLSCRASRDISNNLHWFQQTSHESPRL...,6v4p,CD
3,Abelacimab,F11,QVQLLESGGGLVQPGGSLRLSCAASGFTFSTAAMSWVRQAPGKGLE...,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNDVSWYQQLPGTAPK...,6r8x,CB
10,Acimtamig,FCGR3A/CD16/CD16A;TNFRSF8/CD30,QVQLVQSGAEVKKPGESLKVSCKASGYTFTSYYMHWVRQAPGQGLE...,SYVLTQPSSVSVAPGQTATISCGGHNIGSKNVHWYQQRPGQSPVLV...,7seg,AB
14,Adalimumab,TNF/TNFA,EVQLVESGGGLVQPGRSLRLSCAASGFTFDDYAMHWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQGIRNYLAWYQQKPGKAPKL...,6cr1,HL
...,...,...,...,...,...,...
1120,Zemocimig,F9a;F10,QVQLVESGGGLVQPGGSLRLSCAASGFTFSYYDIQWVRQAPGKGLE...,EIVLTQSPATLSVSPGERATLSCRASRSVRRELAWYQQKPGQAPEL...,8gv0,AB
1121,Zenocutuzumab,ERBB3/HER3;ERBB2/CD340/HER2,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,5o4o,BA
1122,Zeripatamig,IAP/CD47;CD19,EVQLLESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLE...,DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKL...,8rp8,AB
1128,Zinlirvimab,HIV-1 gp120 V3,QVQLQESGPGLVKPSETLSVTCSVSGDSMNNYYWTWIRQSPGKGLE...,SYVRPLSVALGETARISCGRQALGSRAVQWYQHRPGQAPILLIYNN...,6orn,AB


In [None]:
## Loop through each row and download the PDB file
for therapeutic, rep_struct_pdb_id, rep_struct_pdb_chain in zip(
    df['Therapeutic'], df['rep_struct_pdb_id'], df['rep_struct_pdb_chains']
):
    # Rows without a representative structure have nothing to download.
    if pd.isna(rep_struct_pdb_id):
        print(f"{therapeutic}: no representative structure. Skipping...")
        continue

    try:
        pdb_path = download_pdb(rep_struct_pdb_id)
    except Exception as exc:
        # Best-effort: report the reason and move on to the next entry.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"Failed to download {rep_struct_pdb_id}: {exc!r}. Skipping...")
        continue

    print(f"{therapeutic}: Downloaded {rep_struct_pdb_id} with chains {rep_struct_pdb_chain} to {pdb_path}")


Abazistobart: Downloaded 7vux with chains HL to ./pdb_files\7vux.pdb
Abciximab: Downloaded 6v4p with chains CD to ./pdb_files\6v4p.pdb
Abelacimab: Downloaded 6r8x with chains CB to ./pdb_files\6r8x.pdb
Acimtamig: Downloaded 7seg with chains AB to ./pdb_files\7seg.pdb
Adalimumab: Downloaded 6cr1 with chains HL to ./pdb_files\6cr1.pdb
Adintrevimab: Downloaded 7u2d with chains HL to ./pdb_files\7u2d.pdb
Aducanumab: Downloaded 6cnr with chains HL to ./pdb_files\6cnr.pdb
Afasevikumab: Downloaded 6ppg with chains HL to ./pdb_files\6ppg.pdb
Alemtuzumab: Downloaded 1bey with chains HL to ./pdb_files\1bey.pdb
Alomfilimab: Downloaded 7joo with chains HL to ./pdb_files\7joo.pdb
Amatuximab: Downloaded 7ued with chains HL to ./pdb_files\7ued.pdb
Amivantamab: Downloaded 6wvz with chains HL to ./pdb_files\6wvz.pdb
Amlenetug: Downloaded 8b9v with chains HL to ./pdb_files\8b9v.pdb
Amubarvimab: Downloaded 7cdi with chains HL to ./pdb_files\7cdi.pdb
Amulirafusp: Downloaded 4kaq with chains HL to ./pdb_fi

In [None]:
for _, row in df.iterrows():
    pdb_id_full = row['100% SI Structure'] or row['99% SI Structure'] or row['95-98% SI Structure']
    pdb_code = pdb_id_full.split(':')[0]
    chain_ids = pdb_id_full.split(':')[1] if ':' in pdb_id_full else ''

    try:
        pdb_path = download_pdb(pdb_code)