Generate Boltz YAML files from a CSV containing SMILES codes.
Each molecule gets its own YAML file for protein-ligand structure prediction.

Code generated with Claude Sonnet 4.5.

In [None]:
#!/usr/bin/env python3


import pandas as pd
import os
import yaml
from pathlib import Path

# ============================================================================
# CONFIGURATION - Edit these values
# ============================================================================
# Every constant below is passed by main() into generate_yaml_files(); nothing
# else in this script reads them.

# Input CSV file (output from previous script). Must contain the column named
# by SMILES_COLUMN; optional 'ligand_chembl_id' / 'molregno' columns are used
# to name the output files.
INPUT_CSV = "P08173_random_subset.csv"

# Protein sequence (one-letter amino-acid codes). Whitespace and newlines are
# stripped before it is written, so the literal may be wrapped across lines.
PROTEIN_SEQUENCE = """RYETVEMVFIATVTGSLSLVTVVGNILVMLSIKVNRQLQTVNNYFLFSLACADLIIGAFSMNLYTVYIIKGYWPLGAVVCDLWLALDYVVSNASVMNLLIISFDRYFCVTKPLTYPARRTTKMAGLMIAAAWVLSFVLWAPAILFWQFVVGKRTVPDNQCFIQFLSNPAVTFGTAIAAFYLPVVIMTVLYIHISLASRSAARERKVTRTIFAILLAFILTWTPYNVMVLVNTFCQSCIPDTVWSIGYWLCYVNSTINPACYALCNATFKKTFRHLL"""

# Protein ID, written as the 'id' of the protein entry.
# NOTE(review): this is a Python string, so it is dumped as the quoted scalar
# '[A]' rather than as a YAML list containing A - confirm this is the chain
# name Boltz is expected to see.
PROTEIN_ID = "[A]"

# Ligand ID, written as the 'id' of the ligand entry (same quoting caveat as
# PROTEIN_ID).
LIGAND_ID = "[B]"

# MSA file path (relative or absolute), written as the 'msa' of the protein
# entry. The script does not check that the file exists.
MSA_PATH = "./msa/protein.a3m"

# Output directory for YAML files (created if missing)
OUTPUT_DIR = "boltz_inputs"

# SMILES column name in CSV (adjust if different)
SMILES_COLUMN = "std_smiles"  # or "canonical_smiles"

# ============================================================================
# AFFINITY PREDICTION SETTINGS
# ============================================================================

# Enable affinity prediction: adds a 'properties' section with an 'affinity'
# entry to every generated file (only if AFFINITY_BINDER is also non-empty).
PREDICT_AFFINITY = True

# Binder chain ID for affinity prediction (e.g., "[A]" for protein, "[B]" for ligand).
# Written verbatim as 'binder'; here it matches LIGAND_ID.
AFFINITY_BINDER = "[B]"

# ============================================================================
# TEMPLATE SETTINGS
# ============================================================================

# Enable template usage: adds a 'templates' section to every generated file
# (only if PDB_DIRECTORY is also non-empty).
USE_TEMPLATE = True

# PDB file path for template. Despite the name, this is the path of a single
# PDB file, not a directory; it is written verbatim as the template's 'pdb'.
PDB_DIRECTORY = "M4R_receptor.pdb"

# Force template usage (written as the template's 'force')
TEMPLATE_FORCE = True

# Template threshold (written as the template's 'threshold').
# NOTE(review): presumably a distance cutoff in Angstroms per the Boltz
# template schema - confirm units and sensible range.
TEMPLATE_THRESHOLD = 1

# ============================================================================


def clean_protein_sequence(sequence: str) -> str:
    """
    Collapse a protein sequence onto a single line with no whitespace.

    Parameters:
    sequence: Raw protein sequence string, possibly containing spaces,
              tabs or line breaks

    Returns:
    The same residues in the same order with every whitespace character
    dropped
    """
    # Keep only non-whitespace characters, preserving their order.
    residues = (symbol for symbol in sequence if not symbol.isspace())
    return ''.join(residues)


def create_boltz_yaml(protein_seq: str, protein_id: str, smiles: str, 
                      ligand_id: str, msa_path: str, predict_affinity: bool = False,
                      affinity_binder: str = None, use_template: bool = False,
                      pdb_path: str = None, template_force: bool = False,
                      template_threshold: int = 1) -> dict:
    """
    Assemble the Boltz input document for one protein-ligand pair.

    Parameters:
    protein_seq: Protein amino acid sequence
    protein_id: Protein chain ID
    smiles: Ligand SMILES string
    ligand_id: Ligand chain ID
    msa_path: Path to MSA file
    predict_affinity: Whether to include affinity prediction
    affinity_binder: Chain ID for affinity binder
    use_template: Whether to use template
    pdb_path: Path to PDB template file
    template_force: Whether to force template usage
    template_threshold: Template threshold value

    Returns:
    Dictionary that can be written as YAML. It always holds 'version' and
    'sequences'; 'properties' is added only when predict_affinity is set and
    affinity_binder is non-empty, and 'templates' only when use_template is
    set and pdb_path is non-empty.
    """
    # The two mandatory sequence entries: the protein chain, then the ligand.
    protein_entry = {
        'protein': {
            'id': protein_id,
            'sequence': protein_seq,
            'msa': msa_path
        }
    }
    ligand_entry = {
        'ligand': {
            'id': ligand_id,
            'smiles': smiles
        }
    }

    document = {'version': 1, 'sequences': [protein_entry, ligand_entry]}

    # Optional affinity request; needs both the flag and a binder chain.
    wants_affinity = predict_affinity and affinity_binder
    if wants_affinity:
        affinity_entry = {'affinity': {'binder': affinity_binder}}
        document['properties'] = [affinity_entry]

    # Optional structural template; needs both the flag and a PDB path.
    wants_template = use_template and pdb_path
    if wants_template:
        template_entry = {
            'pdb': pdb_path,
            'force': template_force,
            'threshold': template_threshold
        }
        document['templates'] = [template_entry]

    return document


def _yaml_filename(row, idx, columns) -> str:
    """
    Pick the output filename for one CSV row.

    Preference order: 'ligand_chembl_id' if the column exists and the cell is
    not missing, then 'molregno', then the zero-padded row index.
    """
    if 'ligand_chembl_id' in columns and not pd.isna(row['ligand_chembl_id']):
        return f"{row['ligand_chembl_id']}.yaml"
    if 'molregno' in columns and not pd.isna(row['molregno']):
        return f"molecule_{int(row['molregno'])}.yaml"
    return f"molecule_{idx:04d}.yaml"


def generate_yaml_files(csv_file: str, protein_seq: str, protein_id: str,
                       ligand_id: str, msa_path: str, output_dir: str,
                       smiles_column: str, predict_affinity: bool = False,
                       affinity_binder: str = None, use_template: bool = False,
                       pdb_path: str = None, template_force: bool = False,
                       template_threshold: int = 1):
    """
    Read CSV and generate one Boltz YAML file per usable SMILES row.

    Progress, warnings and a final summary are printed to stdout. Problems
    with the input (missing CSV, missing SMILES column) are reported and the
    function returns early without creating the output directory. Rows with a
    missing or blank SMILES are skipped. If two rows map to the same filename
    the later one overwrites the earlier one; this is reported and the file is
    counted once.

    Parameters:
    csv_file: Path to input CSV file
    protein_seq: Protein sequence (whitespace is removed before use)
    protein_id: Protein chain identifier
    ligand_id: Ligand chain identifier
    msa_path: Path to MSA file
    output_dir: Directory to save YAML files (created if missing)
    smiles_column: Name of column containing SMILES codes
    predict_affinity: Whether to include affinity prediction
    affinity_binder: Chain ID for affinity binder
    use_template: Whether to use template
    pdb_path: Path to PDB template file
    template_force: Whether to force template usage
    template_threshold: Template threshold value

    Returns:
    None
    """

    # Read CSV file
    print(f"Reading CSV file: {csv_file}")
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: Cannot find CSV file '{csv_file}'")
        print("Please check the file path and try again.")
        return

    print(f"Found {len(df)} molecules in CSV")

    # Check if SMILES column exists
    if smiles_column not in df.columns:
        print(f"\nError: Column '{smiles_column}' not found in CSV")
        print(f"Available columns: {', '.join(map(str, df.columns))}")
        return

    # Only touch the filesystem once the input is known to be usable, so a
    # bad CSV path or column name does not leave an empty directory behind.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Clean protein sequence
    clean_seq = clean_protein_sequence(protein_seq)

    print(f"\nProtein sequence length: {len(clean_seq)} amino acids")
    print(f"Output directory: {output_dir}/")

    # Print affinity prediction settings
    if predict_affinity:
        print(f"Affinity prediction: ENABLED (binder chain: {affinity_binder})")
        if not affinity_binder:
            # create_boltz_yaml drops the section when no binder is given.
            print("⚠️  No affinity binder given; 'properties' will be omitted")
    else:
        print("Affinity prediction: DISABLED")

    # Print template settings
    if use_template:
        print("Templates: ENABLED")
        print(f"  PDB: {pdb_path}")
        print(f"  Force: {template_force}")
        print(f"  Threshold: {template_threshold}")
        if not pdb_path:
            # create_boltz_yaml drops the section when no PDB path is given.
            print("⚠️  No template PDB path given; 'templates' will be omitted")
    else:
        print("Templates: DISABLED")

    print("\nGenerating YAML files...\n")

    # Filenames written during THIS run. Used for the success count and for
    # the example preview, so files left over from earlier runs are ignored.
    written = set()

    for idx, row in df.iterrows():
        raw_smiles = row[smiles_column]

        # Treat NaN, empty and whitespace-only cells alike, and strip stray
        # padding so it does not end up inside the YAML value.
        smiles = "" if pd.isna(raw_smiles) else str(raw_smiles).strip()
        if not smiles:
            print(f"⚠️  Skipping row {idx}: No SMILES code")
            continue

        # Create YAML dictionary
        yaml_data = create_boltz_yaml(
            protein_seq=clean_seq,
            protein_id=protein_id,
            smiles=smiles,
            ligand_id=ligand_id,
            msa_path=msa_path,
            predict_affinity=predict_affinity,
            affinity_binder=affinity_binder,
            use_template=use_template,
            pdb_path=pdb_path,
            template_force=template_force,
            template_threshold=template_threshold
        )

        filename = _yaml_filename(row, idx, df.columns)
        if filename in written:
            print(f"⚠️  Row {idx}: {filename} was already written in this run; "
                  "overwriting it")

        # Write YAML file
        try:
            with open(out_dir / filename, 'w', encoding='utf-8') as f:
                yaml.dump(yaml_data, f, default_flow_style=False, sort_keys=False)
        except (OSError, yaml.YAMLError) as e:
            print(f"⚠️  Error writing {filename}: {e}")
            continue

        # Count each distinct file once, even if a duplicate row rewrote it.
        if filename not in written:
            written.add(filename)
            if len(written) % 10 == 0:
                print(f"  Generated {len(written)} YAML files...")

    print(f"\n{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Successfully generated: {len(written)} YAML files")
    print(f"Output directory: {output_dir}/")
    print(f"{'='*60}\n")

    # Show an example: the alphabetically first file written in this run.
    if written:
        example = min(written)
        print(f"Example YAML file ({example}):")
        print("-" * 60)
        print((out_dir / example).read_text(encoding='utf-8'))


def main():
    """
    Print the banner and run the generator with the module-level settings.
    """
    rule = "=" * 60
    print(rule)
    print("BOLTZ YAML FILE GENERATOR")
    print(rule)
    print()

    # Map each configuration constant onto the generator's keyword argument.
    settings = {
        'csv_file': INPUT_CSV,
        'protein_seq': PROTEIN_SEQUENCE,
        'protein_id': PROTEIN_ID,
        'ligand_id': LIGAND_ID,
        'msa_path': MSA_PATH,
        'output_dir': OUTPUT_DIR,
        'smiles_column': SMILES_COLUMN,
        'predict_affinity': PREDICT_AFFINITY,
        'affinity_binder': AFFINITY_BINDER,
        'use_template': USE_TEMPLATE,
        'pdb_path': PDB_DIRECTORY,
        'template_force': TEMPLATE_FORCE,
        'template_threshold': TEMPLATE_THRESHOLD,
    }
    generate_yaml_files(**settings)


# Run the generator only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


BOLTZ YAML FILE GENERATOR

Reading CSV file: P08173_random_subset.csv
Found 100 molecules in CSV

Protein sequence length: 276 amino acids
Output directory: boltz_inputs/
Affinity prediction: ENABLED (binder chain: [B])

Generating YAML files...

  Generated 10 YAML files...
  Generated 20 YAML files...
  Generated 30 YAML files...
  Generated 40 YAML files...
  Generated 50 YAML files...
  Generated 60 YAML files...
  Generated 70 YAML files...
  Generated 80 YAML files...
  Generated 90 YAML files...
  Generated 100 YAML files...

SUMMARY
Successfully generated: 100 YAML files
Output directory: boltz_inputs/

Example YAML file (CHEMBL1020.yaml):
------------------------------------------------------------
version: 1
sequences:
- protein:
    id: '[A]'
    sequence: RYETVEMVFIATVTGSLSLVTVVGNILVMLSIKVNRQLQTVNNYFLFSLACADLIIGAFSMNLYTVYIIKGYWPLGAVVCDLWLALDYVVSNASVMNLLIISFDRYFCVTKPLTYPARRTTKMAGLMIAAAWVLSFVLWAPAILFWQFVVGKRTVPDNQCFIQFLSNPAVTFGTAIAAFYLPVVIMTVLYIHISLASRSAARERKVTRTIFAILLAFILTWT