# OEPandas - Advanced Features

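This notebook demonstrates advanced OEPandas features: file I/O, design units, performance optimization, conformer and fingerprint generation, data quality filtering, and building feature matrices for machine learning.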

In [None]:
import oepandas as oepd
import pandas as pd
from openeye import oechem
from pathlib import Path
import tempfile

## 1. File I/O

### Reading Different File Formats

In [None]:
# OEPandas can read various formats.
# Each reader below is left commented out because it expects an input file on
# disk; uncomment the one that matches your data and point it at a real path.

# SD files (with properties)
# df = oepd.read_sdf("molecules.sdf")

# OEB binary files (compressed or uncompressed)
# df = oepd.read_oeb("molecules.oeb.gz")

# SMILES files
# df = oepd.read_smi("molecules.smi")

# CSV with SMILES column (molecule_columns names the column holding SMILES)
# df = oepd.read_molecule_csv("data.csv", molecule_columns="SMILES")

# OERecord databases
# df = oepd.read_oedb("records.oedb")

print("Multiple file format readers available!")

### Writing Data

Export molecular data to various formats:

In [None]:
# Small in-memory dataset: one record per molecule
sample_data = [
    {"SMILES": "CC(=O)Oc1ccccc1C(=O)O", "Name": "Aspirin", "Activity": 7.5},
    {"SMILES": "CC(C)Cc1ccc(cc1)C(C)C(=O)O", "Name": "Ibuprofen", "Activity": 6.8},
]
# Build the plain DataFrame and hand it straight to OEPandas, naming the
# column that holds the SMILES strings
df = oepd.read_molecule_csv(pd.DataFrame(sample_data), molecule_columns="SMILES")

# The temporary directory (and every file written into it) is removed as soon
# as the `with` block exits, so the outputs only exist inside this block
with tempfile.TemporaryDirectory() as tmpdir:
    output_dir = Path(tmpdir)

    # SDF
    sdf_path = output_dir / "output.sdf"
    df.oechem.write_sdf(sdf_path, molecule_column="SMILES")
    print("✓ Wrote SDF file")

    # OEB (compressed, per the .gz suffix)
    oeb_path = output_dir / "output.oeb.gz"
    df.oechem.write_oeb(oeb_path, molecule_column="SMILES")
    print("✓ Wrote compressed OEB file")

    # SMILES
    smi_path = output_dir / "output.smi"
    df.oechem.write_smi(smi_path, molecule_column="SMILES")
    print("✓ Wrote SMILES file")

    print(f"\nAll files written to: {output_dir}")

## 2. Working with Design Units

Design units are OpenEye's container for protein-ligand complexes:

In [None]:
# The example below is commented out because it needs a design unit file on
# disk; uncomment it once you have one.

# Read design unit file (if you have one)
# df_du = oepd.read_oedu("complexes.oedu")

# Extract components into their own columns
# df_du["Ligand"] = df_du.Design_Unit.get_ligands()
# df_du["Protein"] = df_du.Design_Unit.get_proteins()

# Calculate ligand properties
# df_du["LigandMW"] = df_du.Ligand.apply(oechem.OECalculateMolecularWeight)

# Calculate protein properties (count residues by exhausting the iterator)
# df_du["NumResidues"] = df_du.Protein.apply(
#     lambda mol: sum(1 for _ in oechem.OEGetResidues(mol))
# )

print("Design unit functionality available for protein-ligand complexes")

## 3. Performance Optimization

### Bulk Operations

In [None]:
# Time a per-molecule calculation over a larger dataset.
# Series.apply still calls the function once per molecule, but it replaces a
# hand-written Python loop over rows.
import time

# Create a larger dataset
large_smiles = [
    "CC(=O)Oc1ccccc1C(=O)O",
    "CC(C)Cc1ccc(cc1)C(C)C(=O)O",
    "CC(=O)Nc1ccc(cc1)O",
] * 100  # 300 molecules

df_large = pd.DataFrame({"SMILES_str": large_smiles})
df_large = oepd.read_molecule_csv(df_large, molecule_columns="SMILES_str")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can jump if the system
# clock is adjusted while the cell runs
start = time.perf_counter()
atom_counts = df_large.SMILES_str.apply(lambda mol: mol.NumAtoms() if mol else 0)
elapsed = time.perf_counter() - start

print(f"Calculated atom counts for {len(df_large)} molecules in {elapsed:.4f} seconds")
print(f"Average: {elapsed/len(df_large)*1000:.2f} ms per molecule")

### Memory-Efficient Iteration

In [None]:
# The DataFrame is already in memory; processing it in fixed-size batches
# keeps each intermediate result small

def process_molecules_efficiently(df, batch_size=100, molecule_column="SMILES_str"):
    """Compute molecular weights for one column of ``df``, one batch at a time.

    Args:
        df: DataFrame containing the molecule column.
        batch_size: Number of rows handled per batch; must be at least 1.
        molecule_column: Name of the column holding the molecules. Defaults to
            ``"SMILES_str"``, the column used throughout this notebook.

    Returns:
        A list with one entry per row of ``df``, in row order: the molecular
        weight for truthy entries and ``0`` for falsy ones (e.g. ``None``).
        Note that this list still grows to ``len(df)`` entries; only the
        per-batch intermediate is bounded by ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value
            would otherwise make ``range`` empty and silently return ``[]``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    molecules = df[molecule_column]
    results = []

    for start in range(0, len(molecules), batch_size):
        # Positional slicing, so the index labels of df do not matter
        batch = molecules.iloc[start:start + batch_size]
        batch_results = batch.apply(
            lambda mol: oechem.OECalculateMolecularWeight(mol) if mol else 0
        )
        results.extend(batch_results)

    return results

# Process df_large in batches of 50 molecules; mw_values gets one entry per row
mw_values = process_molecules_efficiently(df_large, batch_size=50)
print(f"Processed {len(mw_values)} molecules in batches")

## 4. Advanced Molecular Operations

### Conformer Generation

In [None]:
# Generate 3D conformers
from openeye import oeomega

def generate_conformers(mol, max_confs=10):
    """Build conformers on a copy of ``mol`` using OEOmega.

    Args:
        mol: Molecule to expand, or ``None``.
        max_confs: Value passed to ``OEOmega.SetMaxConfs``.

    Returns:
        The copied molecule after a successful OEOmega run, or ``None`` when
        ``mol`` is ``None`` or the OEOmega call reports failure. The input
        molecule itself is never passed to OEOmega.
    """
    if mol is None:
        return None

    builder = oeomega.OEOmega()
    builder.SetMaxConfs(max_confs)
    builder.SetIncludeInput(False)
    builder.SetStrictStereo(False)

    # OEOmega works in place, so run it on a copy and leave the caller's
    # molecule untouched
    candidate = mol.CreateCopy()
    return candidate if builder(candidate) else None

# Apply to a molecule: take the first entry of the SMILES_str column
sample_mol = df_large.SMILES_str.iloc[0]
# Both guards rely on truthiness, so nothing is printed when the entry is
# falsy or when generate_conformers() returns None (its failure value)
if sample_mol:
    mol_with_confs = generate_conformers(sample_mol)
    if mol_with_confs:
        print(f"Generated {mol_with_confs.NumConfs()} conformers")

### Molecular Fingerprints

In [None]:
# Calculate molecular fingerprints for similarity searching
from openeye import oegraphsim

def calc_fingerprint(mol):
    """Return a circular fingerprint for ``mol``.

    Args:
        mol: Molecule to fingerprint, or ``None``.

    Returns:
        A new ``OEFingerPrint`` filled by ``OEMakeFP`` with
        ``OEFPType_Circular``, or ``None`` when ``mol`` is ``None``.
    """
    if mol is not None:
        circular_fp = oegraphsim.OEFingerPrint()
        # OEMakeFP fills the fingerprint object in place
        oegraphsim.OEMakeFP(circular_fp, mol, oegraphsim.OEFPType_Circular)
        return circular_fp
    return None

# Add fingerprints to a small sample of the data.
# .copy() makes sample_df an independent DataFrame: assigning a new column to
# the bare result of head() can raise pandas' SettingWithCopyWarning, and the
# copy also guarantees df_large is left unchanged.
sample_df = df_large.head(10).copy()
sample_df["Fingerprint"] = sample_df.SMILES_str.apply(calc_fingerprint)

print(f"Calculated fingerprints for {len(sample_df)} molecules")

## 5. Data Quality and Filtering

### Filtering Invalid Molecules

In [None]:
# Work on an independent copy of the first 20 rows
df_sample = df_large.head(20).copy()

# A missing entry (None) counts as invalid; otherwise defer to IsValid()
df_sample["IsValid"] = df_sample["SMILES_str"].apply(
    lambda mol: mol is not None and mol.IsValid()
)

print(f"Valid molecules: {df_sample.IsValid.sum()}")
print(f"Invalid molecules: {(~df_sample.IsValid).sum()}")

# Keep only the rows flagged as valid
df_valid = df_sample.loc[df_sample["IsValid"]]
print(f"\nDataFrame filtered to {len(df_valid)} valid molecules")

### Property-Based Filtering

In [None]:
# Apply Lipinski's Rule of Five
def passes_ro5(mol):
    """Report whether ``mol`` satisfies all four Lipinski Rule-of-Five limits.

    Args:
        mol: Molecule to evaluate, or ``None``.

    Returns:
        ``False`` when ``mol`` is ``None``; otherwise ``True`` only if the
        molecular weight is at most 500, XLogP at most 5, the H-bond donor
        count at most 5 and the H-bond acceptor count at most 10.
    """
    if mol is None:
        return False

    # NOTE(review): OEGetXLogP is documented as part of the OEMolProp toolkit
    # (oemolprop) — confirm it is reachable through oechem in your install
    mol_weight = oechem.OECalculateMolecularWeight(mol)
    xlogp = oechem.OEGetXLogP(mol)
    donors = oechem.OECount(mol, oechem.OEIsHBondDonor())
    acceptors = oechem.OECount(mol, oechem.OEIsHBondAcceptor())

    within_limits = (
        mol_weight <= 500,
        xlogp <= 5,
        donors <= 5,
        acceptors <= 10,
    )
    return all(within_limits)

# Evaluate every molecule once, then store the flags as a new column
ro5_flags = df_sample["SMILES_str"].apply(passes_ro5)
df_sample["PassesRO5"] = ro5_flags

print(f"Molecules passing Lipinski's Rule of Five: {df_sample.PassesRO5.sum()}")
print(f"Molecules failing: {(~df_sample.PassesRO5).sum()}")

## 6. Integration with Other Libraries

OEPandas integrates seamlessly with the scientific Python ecosystem:

In [None]:
# Build a descriptor table for ML on a copy, leaving df_sample untouched
df_ml = df_sample.copy()

# One calculator per descriptor column, in the order the columns are added.
# Each is wrapped in a lambda so the oechem attribute is only looked up when a
# molecule is actually processed.
# NOTE(review): OEGetXLogP and OEGet2dPSA are documented under the OEMolProp
# toolkit (oemolprop) — confirm they are reachable through oechem
descriptor_calculators = {
    "MW": lambda mol: oechem.OECalculateMolecularWeight(mol),
    "LogP": lambda mol: oechem.OEGetXLogP(mol),
    "TPSA": lambda mol: oechem.OEGet2dPSA(mol),
    "RotBonds": lambda mol: oechem.OECount(mol, oechem.OEIsRotor()),
}

for column_name, calculator in descriptor_calculators.items():
    # Falsy entries (e.g. a missing molecule) get 0 rather than a calculation;
    # the default argument pins this iteration's calculator
    df_ml[column_name] = df_ml.SMILES_str.apply(
        lambda mol, calculator=calculator: calculator(mol) if mol else 0
    )

# Feature matrix for ML
feature_columns = ["MW", "LogP", "TPSA", "RotBonds"]
X = df_ml[feature_columns].values

print(f"Feature matrix shape: {X.shape}")
print(f"\nFeature statistics:")
print(df_ml[feature_columns].describe())

## Summary

This notebook covered:
- Reading and writing multiple file formats
- Working with design units
- Performance optimization techniques
- Advanced molecular operations (conformers, fingerprints)
- Data quality and filtering
- Integration with ML workflows

For more information, see the [OEPandas documentation](https://github.com/scott-arne/oepandas).