## Imports and Setup

In [3]:
from pathlib import Path
import pandas as pd
import logging

# Root-logger setup for this notebook: INFO threshold, "timestamp - level - message" layout
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Imports from moremi_biokit.proteins
from moremi_biokit.proteins import (
    BatchAntibodyProcessor,
    ProteinValidator,
    ProteinRanker,
    ScoringConfig,       # For custom scoring (optional)
    MetricCategory,      # For understanding metrics (optional)
)

from moremi_biokit.proteins.utils import(
    list_internal_pdb_ids
)

# Display limits for DataFrames rendered in this notebook (up to 50 columns, 1000 characters wide)
pd.options.display.max_columns = 50
pd.options.display.width = 1000

## Define Output Directory and Example Sequences

In [4]:
# Define a directory to store results from this notebook.
# The path is relative, so it is created under the kernel's current working directory.
NOTEBOOK_OUTPUT_DIR = Path("protein_notebook_outputs")
NOTEBOOK_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"Output directory for this notebook: {NOTEBOOK_OUTPUT_DIR.resolve()}\n")

# Example protein sequences.
# Each record carries a "name" and a "sequence"; "antigen_id" is optional.
protein_sequences_list = [
    {
        "name": "AB_001_Heavy",
        "sequence": "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKAGISGSGGSYFDYWGQGTLVTVSS",
        "antigen_id": "1FBI" # Optional: specify an antigen PDB ID for this protein
    },
    {
        "name": "AB_002_Light",
        "sequence": "DIQMTQSPSSLSASVGDRVTITCRASQGISRWLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQYNSYPPTFGQGTKVEIK",
        # No antigen_id specified, will use default or validator's target_antigen_pdb_id
    },
    {
        "name": "AB_003_Heavy_Problematic",
        "sequence": "EVHLLESGGGLVQPGGSMKLSCVASGFTFSDAYYMNWVRQSPEKGLEWVAEIRNKPYNYETYYSDSVKGRFTISRDDSKNAVYLQMNGLRAEDTGIYYCARYDYDILTGYYYYMDVWGKGTTVTVSS",
        "antigen_id": "2VIR"
    },
    {
        "name": "AB_004_Short_Invalid",
        "sequence": "ACDEFGHIKLMNPQRSTVWY", # Too short, should fail validation
    }
]

# Create a dummy FASTA-like input file for BatchAntibodyProcessor.
# Only the name (header line) and the sequence are written; "antigen_id" is not carried into the file.
dummy_sequence_file = NOTEBOOK_OUTPUT_DIR / "example_proteins.fasta"
# An explicit encoding keeps the file identical regardless of the platform's default locale.
with open(dummy_sequence_file, 'w', encoding='utf-8') as f:
    for ab in protein_sequences_list:
        # .get() so a record with a missing or empty sequence is skipped rather than raising KeyError
        if ab.get('sequence'):
            f.write(f">{ab['name']}\n") # Can add [antigen_id={ab.get('antigen_id', '')}] if BatchProcessor supports it
            f.write(f"{ab['sequence']}\n")
print(f"Created dummy sequence file: {dummy_sequence_file}")

# Show a short sample (at most five) of the internal PDB IDs that can be used as antigens
print("\nSome available internal antigen PDB IDs:")
internal_antigens = list_internal_pdb_ids()
print(internal_antigens[:5] if internal_antigens else "No internal antigen PDBs found.")

Output directory for this notebook: /home/mino_solo/moremi_toolkits/components/moremi-biokit/protein_notebook_outputs

Created dummy sequence file: protein_notebook_outputs/example_proteins.fasta

Some available internal antigen PDB IDs:
['6mpv', 'ANZ81575.1 PstS', 'CAA48354.1 HBV', 'CAA87404.1', 'CCE35749.1 mmpL3']


## Using BatchAntibodyProcessor

In [3]:
# Everything produced by the batch run goes into its own sub-folder of the notebook output directory
batch_output_dir = NOTEBOOK_OUTPUT_DIR / "batch_processor_results"
batch_output_dir.mkdir(parents=True, exist_ok=True)

print(f"Running BatchAntibodyProcessor... Output will be in: {batch_output_dir}\n")

# Build the processor from the FASTA-like file written earlier.
# generate_pdf / generate_csv toggle the individual per-protein reports:
# PDFs are requested here, individual CSVs are not.
processor = BatchAntibodyProcessor(
    input_file=str(dummy_sequence_file),
    output_dir=str(batch_output_dir),
    generate_pdf=True,
    generate_csv=False,
)

# Kick off processing of every sequence in the input file
processor.process_batch()

# Summarise where to look for the results
print(
    f"\nBatch processing complete. Check the directory: {batch_output_dir}",
    "This directory should contain:",
    "- Overall ranking CSV and PDF (if protein_report_generator is available).",
    "- `protein_reports/` subdirectory with individual PDF reports (because generate_pdf=True).",
    "- `pdbs/` subdirectory with downloaded/used PDB files.",
    "- Log files (`processing.log`, `failed_proteins.txt`).",
    sep="\n",
)

Running BatchAntibodyProcessor... Output will be in: antibody_notebook_outputs/batch_processor_results

2025-05-09 11:11:23,419 - INFO - No target_antigen_pdb_id provided, randomly selected: pdb4HJOA
2025-05-09 11:11:23,424 - INFO - Using internal antigen 'pdb4HJOA', materialised at './antibody_notebook_outputs/batch_processor_results/pdbs/pdb4HJOA.pdb'
2025-05-09 11:11:23,429 - INFO - 🚀 Starting validation of 4 antibodies...

🔍 Processing 4 antibodies from example_antibodies.fasta
2025-05-09 11:11:23,431 - INFO - Found 4 antibody sequences in antibody_notebook_outputs/batch_processor_results/sequences.txt
2025-05-09 11:11:23,432 - INFO - 🌲 Starting batch processing of 4 antibodies...

🌲 Processing 4 antibodies from antibody_notebook_outputs/batch_processor_results/sequences.txt
2025-05-09 11:11:23,433 - INFO - Processing antibody 1/4:
2025-05-09 11:11:23,435 - INFO - 
🧪 Processing antibody sequence: EVQLVESGGGLVQPGGSLRL...
2025-05-09 11:11:23,436 - INFO - ├── Calculating basic propert

## Antibody Validation

In [None]:
# Output locations for the step-by-step (validate first, rank afterwards) walkthrough
step_by_step_output_dir = NOTEBOOK_OUTPUT_DIR / "step_by_step_results"
step_by_step_output_dir.mkdir(parents=True, exist_ok=True)
validator_pdb_path = step_by_step_output_dir / "validator_pdbs"
validator_pdb_path.mkdir(parents=True, exist_ok=True)

# Instantiate ProteinValidator
# We can specify a target_antigen_pdb_id, or it will pick one randomly from internal PDBs if available.
# Here the first internal antigen ID is used when the list is non-empty; otherwise None is passed.
chosen_antigen_for_validator = internal_antigens[0] if internal_antigens else None
validator = ProteinValidator(
    pdb_files_path=str(validator_pdb_path),
    target_antigen_pdb_id=chosen_antigen_for_validator
)

print(f"Using target antigen for Validator: {validator.target_antigen_pdb_id}\n")

# Raw sequence strings; only needed by the commented-out validate_proteins alternative below.
# .get() skips records whose sequence is missing or empty instead of raising KeyError.
sequences_to_validate = [ab['sequence'] for ab in protein_sequences_list if ab.get('sequence')]

# Using process_proteins (which takes a file path)
# For this example, let's re-use the dummy_sequence_file for simplicity with process_proteins
print(f"Validating sequences from file: {dummy_sequence_file}")
validation_results = validator.process_proteins(str(dummy_sequence_file), str(step_by_step_output_dir / "validation_run"))

# Alternatively, to process a list of sequences one by one:
# validated_metrics_list = validator.validate_proteins(sequences_to_validate) 
# print(f"Validated {len(validated_metrics_list)} proteins successfully via validate_proteins method.")

# Keep only the successful entries; later cells (ranking, custom scoring) consume this list
successful_metrics_list = validator.get_successful_metrics(validation_results)

print(f"\nTotal sequences processed by validator: {len(validation_results)}")
print(f"Number of successfully validated proteins: {len(successful_metrics_list)}")
print(f"Number of failed validations: {len(validation_results) - len(successful_metrics_list)}")

if successful_metrics_list:
    print("\nExample metrics for the first successfully validated protein:")
    # NOTE(review): the keys read below are assumed to be present in to_dict() output - confirm against the library
    first_valid_metrics_dict = successful_metrics_list[0].to_dict()
    print(f"  Sequence: {first_valid_metrics_dict['sequence'][:30]}...")
    print(f"  Antigen Used: {first_valid_metrics_dict['antigen']}")
    print(f"  Molecular Weight: {first_valid_metrics_dict['molecular_weight']}")
    print(f"  Total Score (from validator internal scoring): {first_valid_metrics_dict['total_score']}")
    # print(first_valid_metrics_dict['metrics']['protparam'])
    # You can explore first_valid_metrics_dict['metrics'] for detailed results from each tool
else:
    print("\nNo proteins were successfully validated in the step-by-step approach.")

## Antibody Ranking

In [3]:
# Rank the proteins that passed validation; with nothing to rank, just say so.
if not successful_metrics_list:
    print("\nNo successfully validated proteins to rank.")
else:
    # Dedicated folder for everything the ranker writes
    ranker_output_dir = step_by_step_output_dir / "ranking_run"
    ranker_output_dir.mkdir(parents=True, exist_ok=True)

    # Individual protein reports are requested in both formats (PDF and CSV)
    ranker = ProteinRanker(generate_pdf=True, generate_csv=True)
    ranker.set_output_directory(str(ranker_output_dir))

    print(f"\nRanking {len(successful_metrics_list)} validated proteins...")
    print(f"Ranker outputs (including individual reports if enabled) will be in: {ranker_output_dir}\n")

    ranked_df = ranker.rank_proteins(successful_metrics_list)

    print("Top ranked proteins (DataFrame view):")
    display(ranked_df.head())

    # The same results, fetched as a list of dictionaries
    ranked_dicts = ranker.get_ranking_results_as_dict()
    if ranked_dicts:
        # Show a few selected fields only; the full dict might be too verbose
        first_entry = ranked_dicts[0]
        print("\nExample of first ranked protein (dictionary view):")
        print(f"  Sequence: {first_entry['sequence'][:30]}...")
        print(f"  Total Score: {first_entry['total_score']}")
        print(f"  Antigen: {first_entry['antigen']}")
        print(f"  Molecular Formula: {first_entry['molecular_formula']}")

    # Pointers to the files the ranker is expected to have produced
    print(
        f"\nRanking complete. Check {ranker_output_dir} for:",
        "- Overall ranking CSV (`rankings/rankings_*.csv`).",
        "- Overall ranking PDF (`rankings/ranking_report_*.pdf`) if protein_report_generator is available.",
        "- `protein_reports/` subdirectory with individual PDF and CSV reports (as generate_pdf=True, generate_csv=True).",
        sep="\n",
    )

NameError: name 'successful_metrics_list' is not defined

## Custom Scoring (Optional)

In [2]:
# Re-rank the validated proteins with a user-defined weighting of the metric categories.
if successful_metrics_list:
    custom_scoring_output_dir = step_by_step_output_dir / "custom_scoring_run"
    custom_scoring_output_dir.mkdir(parents=True, exist_ok=True)

    # Define custom weights, one entry per metric category.
    # The values below add up to 1.0.
    # NOTE(review): whether ScoringConfig requires the weights to sum to 1.0 is not visible here - confirm.
    custom_weights = {
        MetricCategory.BINDING_AFFINITY: 0.40, # Increased weight
        MetricCategory.STRUCTURE: 0.15,        # Decreased weight
        MetricCategory.GLYCOSYLATION: 0.10,
        MetricCategory.AGGREGATION: 0.10,
        MetricCategory.PROTPARAM: 0.10,
        MetricCategory.IMMUNOGENICITY: 0.05,
        MetricCategory.CONSERVANCY: 0.05,
        MetricCategory.STABILITY: 0.05,
        MetricCategory.EPITOPE: 0.00,      # Zero weight
        MetricCategory.DEVELOPABILITY: 0.00 # Zero weight
    }

    # Start from a default config and override only the category weights
    custom_config = ScoringConfig()
    custom_config.category_weights = custom_weights

    # Instantiate ProteinRanker with custom config
    custom_ranker = ProteinRanker(
        config=custom_config,
        generate_pdf=False, # Disable individual PDFs for this run
        generate_csv=True   # Keep individual CSV reports enabled
    )
    custom_ranker.set_output_directory(str(custom_scoring_output_dir))

    # Plain string: there is nothing to interpolate in this message
    print("\nRanking with custom scoring config...")
    print(f"Custom ranker outputs will be in: {custom_scoring_output_dir}\n")
    custom_ranked_df = custom_ranker.rank_proteins(successful_metrics_list)

    print("Top ranked proteins (custom scoring):")
    display(custom_ranked_df.head())
else:
    print("\nSkipping custom scoring example as no proteins were successfully validated.")

NameError: name 'successful_metrics_list' is not defined