# Batch Processing with NovoMD

This notebook demonstrates how to process multiple molecules efficiently using the NovoMD API.

Use cases:
- Processing compound libraries
- Virtual screening workflows
- Building molecular databases

In [None]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

import matplotlib.pyplot as plt
import pandas as pd
import requests
from tqdm import tqdm

# Configuration
# Connection settings are read from the environment when available, so a real
# API key never has to be saved inside the notebook. The literal values are the
# fallbacks used when the variables are not set.
BASE_URL = os.environ.get("NOVOMD_BASE_URL", "http://localhost:8010").rstrip("/")
API_KEY = os.environ.get("NOVOMD_API_KEY", "your-api-key")
# Headers sent with every request to the API
headers = {"Content-Type": "application/json", "X-API-Key": API_KEY}

## 1. Sample Dataset

Let's create a sample dataset of drug-like molecules:

In [None]:
# Sample molecules (common drugs and drug-like compounds).
# Each record carries a display name, the SMILES string sent to the API, and a
# therapeutic category used later for grouping.
molecules_df = pd.DataFrame([
    {"name": "Aspirin", "smiles": "CC(=O)OC1=CC=CC=C1C(=O)O", "category": "NSAID"},
    {"name": "Ibuprofen", "smiles": "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", "category": "NSAID"},
    {"name": "Acetaminophen", "smiles": "CC(=O)NC1=CC=C(C=C1)O", "category": "Analgesic"},
    {"name": "Caffeine", "smiles": "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", "category": "Stimulant"},
    {"name": "Nicotine", "smiles": "CN1CCCC1C2=CN=CC=C2", "category": "Stimulant"},
    {"name": "Metformin", "smiles": "CN(C)C(=N)NC(=N)N", "category": "Antidiabetic"},
    # Omeprazole carries a 5-methoxy group on the benzimidazole ring; without it
    # the SMILES describes a different (des-methoxy) compound.
    {"name": "Omeprazole", "smiles": "CC1=CN=C(C(=C1OC)C)CS(=O)C2=NC3=C(N2)C=C(C=C3)OC", "category": "PPI"},
    {"name": "Atorvastatin", "smiles": "CC(C)C1=C(C(=C(N1CCC(CC(CC(=O)O)O)O)C2=CC=C(C=C2)F)C3=CC=CC=C3)C(=O)NC4=CC=CC=C4", "category": "Statin"},
    {"name": "Lisinopril", "smiles": "NCCCC[C@H](N[C@@H](CCC1=CC=CC=C1)C(=O)O)C(=O)N2CCC[C@H]2C(=O)O", "category": "ACE Inhibitor"},
    {"name": "Amlodipine", "smiles": "CCOC(=O)C1=C(NC(=C(C1C2=CC=CC=C2Cl)C(=O)OC)C)COCCN", "category": "CCB"},
    {"name": "Simvastatin", "smiles": "CCC(C)(C)C(=O)OC1CC(C=C2C1C(C(C=C2)C)CCC3CC(CC(=O)O3)O)C", "category": "Statin"},
    {"name": "Losartan", "smiles": "CCCCC1=NC(=C(N1CC2=CC=C(C=C2)C3=CC=CC=C3C4=NNN=N4)CO)Cl", "category": "ARB"},
    {"name": "Gabapentin", "smiles": "NCC1(CCCCC1)CC(=O)O", "category": "Anticonvulsant"},
    {"name": "Sertraline", "smiles": "CNC1CCC(C2=CC=CC=C12)C3=CC(=C(C=C3)Cl)Cl", "category": "SSRI"},
    {"name": "Fluoxetine", "smiles": "CNCCC(C1=CC=CC=C1)OC2=CC=C(C=C2)C(F)(F)F", "category": "SSRI"},
])

print(f"Dataset: {len(molecules_df)} molecules")
molecules_df

## 2. Sequential Processing

In [None]:
def process_molecule(smiles, name=None, force_field="AMBER", timeout=30):
    """Send one SMILES string to the API and return a flat result record.

    This function never raises: every failure is reported through the returned
    dict so that a batch run keeps going when a single molecule fails.

    Args:
        smiles: SMILES string sent as the ``smiles`` field of the request body.
        name: Optional label copied unchanged into the returned record.
        force_field: Value sent as the ``force_field`` field (default "AMBER").
        timeout: Seconds to wait for the HTTP request (default 30).

    Returns:
        On success, a dict with ``success=True``, ``name``, ``smiles`` and every
        non-list entry of the response's ``metadata`` mapping.
        On failure, a dict with ``success=False``, ``name``, ``smiles`` and an
        ``error`` message.
    """
    failure = {'success': False, 'name': name, 'smiles': smiles}
    try:
        response = requests.post(
            f"{BASE_URL}/smiles-to-omd",
            headers=headers,
            json={"smiles": smiles, "force_field": force_field},
            timeout=timeout
        )

        # Error responses are not guaranteed to carry a JSON body; report the
        # HTTP status instead of a bare decoder message.
        try:
            result = response.json()
        except ValueError:
            result = None
        if not isinstance(result, dict):
            return {**failure,
                    'error': f"HTTP {response.status_code}: response was not a JSON object"}

        if result.get('success'):
            metadata = result.get('metadata')
            if not isinstance(metadata, dict):
                return {**failure, 'error': "Response is missing 'metadata'"}
            return {
                'success': True,
                'name': name,
                'smiles': smiles,
                **{k: v for k, v in metadata.items()
                   if not isinstance(v, list)}  # Exclude coordinate arrays
            }

        # Prefer the error reported in the body; fall back to the HTTP status and
        # a short excerpt of the body when no 'error' field is present.
        error = result.get('error') or f"HTTP {response.status_code}: {response.text[:200]}"
        return {**failure, 'error': error}
    except Exception as e:
        # Deliberate best-effort catch-all (timeouts, connection errors, ...).
        # The exception type is included because str(e) alone can be empty or
        # ambiguous.
        return {**failure, 'error': f"{type(e).__name__}: {e}"}

In [None]:
# Baseline run: one request at a time, in dataset order
print("Processing molecules sequentially...")
start_time = time.time()

# Walk the three input columns in lock-step and tag each record with its category
results = []
columns = zip(molecules_df['smiles'], molecules_df['name'], molecules_df['category'])
for smiles, name, category in tqdm(columns, total=len(molecules_df)):
    results.append({**process_molecule(smiles, name), 'category': category})

elapsed = time.time() - start_time
print(f"\nProcessed {len(results)} molecules in {elapsed:.2f} seconds")
print(f"Average: {elapsed/len(results):.2f} seconds per molecule")

## 3. Parallel Processing (Faster)

In [None]:
def process_molecules_parallel(df, max_workers=4):
    """Process every row of ``df`` concurrently with a thread pool.

    Args:
        df: DataFrame with ``smiles``, ``name`` and ``category`` columns.
        max_workers: Number of worker threads, i.e. the maximum number of
            requests in flight at once (default 4).

    Returns:
        A list with one result dict per row of ``df``, in the same order as the
        rows of ``df``. Each dict is the output of ``process_molecule`` with the
        row's ``category`` added.
    """
    # Pre-size the output so each result lands at its input position. Completion
    # order varies from run to run and would otherwise make the output
    # non-reproducible.
    results = [None] * len(df)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks, remembering where each one belongs
        futures = {}
        for position, (_, row) in enumerate(df.iterrows()):
            future = executor.submit(process_molecule, row['smiles'], row['name'])
            futures[future] = (position, row['category'])

        # Collect results as they complete so the progress bar reflects real progress
        for future in tqdm(as_completed(futures), total=len(futures)):
            position, category = futures[future]
            result = future.result()
            result['category'] = category
            results[position] = result

    return results

In [None]:
# Process in parallel and time the whole batch.
# The timer brackets only the call to process_molecules_parallel, so the
# reported figures include thread-pool start-up and all request latency.
print("Processing molecules in parallel (4 workers)...")
start_time = time.time()

# The output goes into its own variable; `results` from the sequential run is
# left untouched.
results_parallel = process_molecules_parallel(molecules_df, max_workers=4)

# Wall-clock duration of the batch; the per-molecule average is this total
# divided by the number of results, not the latency of a single request.
elapsed = time.time() - start_time
print(f"\nProcessed {len(results_parallel)} molecules in {elapsed:.2f} seconds")
print(f"Average: {elapsed/len(results_parallel):.2f} seconds per molecule")

## 4. Analyze Results

In [None]:
# Tabulate the per-molecule records from the sequential run
results_df = pd.DataFrame(results)

# Report how many molecules were processed successfully
success_count = results_df['success'].sum()
print(f"Success rate: {success_count}/{len(results_df)} ({100*success_count/len(results_df):.1f}%)")

# List the molecules that could not be processed, with the reported error
failed = results_df.loc[~results_df['success']]
if not failed.empty:
    print(f"\nFailed molecules:")
    print(failed.loc[:, ['name', 'smiles', 'error']])

In [None]:
# Keep an independent copy of the rows whose request succeeded
successful_df = results_df.loc[results_df['success']].copy()

# Columns shown in the overview table, rounded to two decimals for display
summary_cols = [
    'name',
    'category',
    'molecular_weight',
    'num_atoms_with_h',
    'radius_of_gyration',
    'sasa',
    'dipole_moment',
]
successful_df.loc[:, summary_cols].round(2)

## 5. Group Analysis by Category

In [None]:
# Per-category mean and standard deviation of the numeric descriptors
numeric_cols = [
    'molecular_weight',
    'num_atoms_with_h',
    'radius_of_gyration',
    'sasa',
    'molecular_volume',
    'globularity',
]

category_stats = (
    successful_df
    .groupby('category')[numeric_cols]
    .agg(['mean', 'std'])
    .round(2)
)
category_stats

In [None]:
# Two side-by-side views of the successful results:
#   left  - distribution of molecular weight within each drug category
#   right - SASA against molecular weight, coloured by category
# (matplotlib.pyplot is imported as `plt` in the notebook's import cell.)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot of molecular weight by category
successful_df.boxplot(column='molecular_weight', by='category', ax=axes[0])
# With `by=`, pandas adds a figure-level "Boxplot grouped by category" title that
# sits above both panels; clear it so only the per-axes titles remain.
fig.suptitle('')
axes[0].set_title('Molecular Weight by Drug Category')
axes[0].set_xlabel('Category')
axes[0].set_ylabel('Molecular Weight (Da)')
# Rotate the category labels on the left panel so long names do not overlap
plt.sca(axes[0])
plt.xticks(rotation=45, ha='right')

# Scatter plot: one series per category so each gets its own colour and legend entry
for category in successful_df['category'].unique():
    subset = successful_df[successful_df['category'] == category]
    axes[1].scatter(subset['molecular_weight'], subset['sasa'],
                    label=category, s=100, alpha=0.7)

axes[1].set_xlabel('Molecular Weight (Da)')
axes[1].set_ylabel('SASA (Å²)')
axes[1].set_title('Surface Area vs Molecular Weight')
# Place the legend outside the axes so it does not cover data points
axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

## 6. Export Results

In [None]:
# Columns written to the export files, in output order
export_cols = [
    'name', 'category', 'smiles',
    'molecular_weight', 'num_atoms_with_h', 'num_heavy_atoms',
    'radius_of_gyration', 'asphericity', 'globularity',
    'sasa', 'molecular_volume', 'dipole_moment', 'total_charge',
]

# Write the successful results to CSV without the DataFrame index
successful_df.loc[:, export_cols].to_csv('batch_results.csv', index=False)
print("Results exported to batch_results.csv")

In [None]:
# Write the same columns as a JSON array of row objects (for web applications)
successful_df.loc[:, export_cols].to_json(
    'batch_results.json',
    orient='records',
    indent=2,
)
print("Results exported to batch_results.json")

## 7. Rate Limiting Considerations

NovoMD enforces rate limiting (default: 100 requests/minute). For large batches, pace your requests so they stay below the limit:

In [None]:
def process_with_rate_limit(df, requests_per_minute=90):
    """Process molecules one at a time, starting at most ``requests_per_minute`` requests per minute.

    Requests are paced by their start times: consecutive requests begin at least
    ``60 / requests_per_minute`` seconds apart. Time spent waiting for a response
    counts toward that interval, and no sleep follows the final request.

    Args:
        df: DataFrame with ``smiles``, ``name`` and ``category`` columns.
        requests_per_minute: Maximum number of requests to start per minute
            (default 90). Must be positive.

    Returns:
        A list with one result dict per row of ``df``, in row order. Each dict is
        the output of ``process_molecule`` with the row's ``category`` added.

    Raises:
        ValueError: If ``requests_per_minute`` is not positive.
    """
    if requests_per_minute <= 0:
        raise ValueError("requests_per_minute must be positive")

    delay = 60 / requests_per_minute
    results = []
    # Earliest moment the next request may start. A monotonic clock is used so
    # wall-clock adjustments cannot shorten or stretch the interval.
    next_start = time.monotonic()

    for _, row in tqdm(df.iterrows(), total=len(df)):
        wait = next_start - time.monotonic()
        if wait > 0:
            time.sleep(wait)
        next_start = time.monotonic() + delay

        result = process_molecule(row['smiles'], row['name'])
        result['category'] = row['category']
        results.append(result)

    return results

# For very large datasets, use this approach:
# results = process_with_rate_limit(large_dataset_df, requests_per_minute=90)