In [None]:
# ============================================================================
# PharmacoNet Batch Modeling - PRODUCTION READY VERSION
# ============================================================================
# Complete rewrite - all bugs fixed
# 1. Enable T4 GPU in Runtime > Change runtime type
# 2. Run this single cell
# 3. Upload your CSV (columns: PDB_code, Ligand_ID)
# 4. Get .pm pharmacophore model files
# ============================================================================

# Banner: rule, title, rule, followed by one blank line.
print(
    "=" * 70,
    "PHARMACONET BATCH PROCESSOR - PRODUCTION v2.0",
    "=" * 70,
    "",
    sep="\n",
)

# ============================================================================
# STEP 1: GPU Check
# ============================================================================
print("STEP 1: Checking GPU...")
import torch

# Name the first CUDA device when one is visible, otherwise announce CPU mode;
# either message is followed by a blank line.
print(
    (
        f"‚úì GPU: {torch.cuda.get_device_name(0)}"
        if torch.cuda.is_available()
        else "‚ö† No GPU - will use CPU (slower)"
    ),
    end="\n\n",
)

# ============================================================================
# STEP 2: Install Dependencies
# ============================================================================
print("STEP 2: Installing dependencies...")
# System packages (Open Babel plus libraries listed for the PyMOL build).
# All apt output, including errors, is discarded by the redirects, so a failed
# install is not visible here.
!apt-get update -qq > /dev/null 2>&1
!apt-get install -y -qq libopenbabel-dev openbabel libglew-dev libpng-dev libxml2-dev libfreetype6-dev > /dev/null 2>&1

# PyTorch wheels are pulled from the CUDA 11.8 index.
# NOTE(review): torch was already imported earlier in this cell, so the running
# kernel presumably keeps the build it loaded then; only the subprocesses
# started later would see a reinstalled version - confirm.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# numpy is pinned to 2.0.0; every other package takes whatever pip resolves.
!pip install -q numpy==2.0.0 numba omegaconf molvoxel gdown tqdm rdkit openbabel-wheel biopython pandas psutil
!pip install -q pymol-open-source
# PharmacoNet itself is installed straight from its GitHub repository.
!pip install -q git+https://github.com/SeonghwanSeo/PharmacoNet.git
# This message is unconditional: none of the commands above are checked for
# success.
print("‚úì All dependencies installed")
print()

# ============================================================================
# STEP 3: Setup Working Directory
# ============================================================================
print("STEP 3: Setting up workspace...")
import os
from pathlib import Path

# Root of everything this notebook writes; it also becomes the working
# directory so later relative paths resolve against it.
WORK_DIR = Path("/content/pharmaconet_batch")
WORK_DIR.mkdir(exist_ok=True)
os.chdir(WORK_DIR)

# Fixed sub-folders: uploaded CSV, generated models, generated helper package.
for _subdir in ("input", "output", "utils"):
    (WORK_DIR / _subdir).mkdir(exist_ok=True)

print(f"‚úì Workspace: {WORK_DIR}", end="\n\n")

# ============================================================================
# STEP 4: Create Python Modules
# ============================================================================
print("STEP 4: Generating Python modules...")

# Create utils/__init__.py
# An empty __init__.py turns utils/ into a package so the generated modeling.py
# can run `from utils.parse_rcsb_pdb import ...`.
(WORK_DIR / "utils" / "__init__.py").write_text("")

# Create utils/parse_rcsb_pdb.py
parse_code = '''import os
from dataclasses import dataclass
from pathlib import Path
from urllib.request import urlopen
import numpy as np
import pymol
from openbabel import pybel

PathLike = str | Path

@dataclass
class LigandInform:
    """Record describing one HET ligand extracted by parse_pdb."""
    # 1-based position of the ligand's HET record within the PDB header.
    order: int
    # Ligand identifier: the first token of the HET record.
    id: str
    # Single-character chain label synthesized by parse_pdb (the characters
    # following the chain named on the last SEQRES line, one per HET record).
    pdbchain: str
    # Chain identifier as written in the HET record.
    authchain: str
    # Residue number from the HET record, converted with int().
    residx: int
    # Mean of the ligand's atom coordinates as (x, y, z).
    center: tuple[float, float, float]
    # Path of the per-ligand PDB file written into save_dir.
    file_path: PathLike
    # Name assembled from the HETNAM records, or None when none was found.
    name: str | None
    # Synonyms; parse_pdb currently always passes None.
    synonyms: str | None

def download_pdb(pdb_code: str, output_file: PathLike, timeout: float = 60.0):
    """Download a PDB-format entry from RCSB and store it at output_file.

    The response is read completely before anything touches the target path,
    and the text is written to a sibling ".part" file that is renamed into
    place at the end. A failed or interrupted download therefore never leaves
    a truncated output_file behind for callers that treat an existing file as
    a valid cache.

    Args:
        pdb_code: PDB identifier; it is lower-cased to build the URL.
        output_file: Destination path of the downloaded file.
        timeout: Seconds to wait on the network before giving up.

    Raises:
        Exception: Any network, decoding or file-system error is printed and
            then re-raised unchanged.
    """
    url = f"https://files.rcsb.org/download/{pdb_code.lower()}.pdb"
    part_file = f"{output_file}.part"
    try:
        with urlopen(url, timeout=timeout) as response:
            text = response.read().decode("utf-8")
        with open(part_file, "w", encoding="utf-8") as f:
            f.write(text)
        # Atomic on the same file system: output_file is either absent or complete.
        os.replace(part_file, output_file)
    except Exception as e:
        print(f"Error downloading PDB: {e}")
        raise
    finally:
        # Only present when the write or the rename failed part-way.
        if os.path.exists(part_file):
            os.remove(part_file)

def parse_pdb(pdb_code: str, protein_path: PathLike, save_dir: PathLike) -> list[LigandInform]:
    """Extract every HET ligand of a PDB file into its own file and describe it.

    Args:
        pdb_code: Prefix used when naming the extracted ligand files.
        protein_path: Path of the PDB file to read.
        save_dir: Directory that receives one "<pdb_code>_<chain>_<ligand>.pdb"
            file per HET record (existing files are reused).

    Returns:
        One LigandInform per HET record, in header order; an empty list when
        the file has no HET records.

    Note:
        The HETNAM and SEQRES entries are read by subscript without a presence
        check, so a file that has HET but lacks either of them raises whatever
        that lookup raises.
    """
    try:
        # Only the first molecule yielded by pybel is used.
        protein = next(pybel.readfile("pdb", str(protein_path)))
        if "HET" not in protein.data.keys():
            return []
        
        # Header records are split into individual lines on the newline
        # character (the backslash is doubled because this source is stored
        # inside a non-raw string template).
        het_lines = protein.data["HET"].split("\\n")
        hetnam_lines = protein.data["HETNAM"].split("\\n")
        # Parsed but not used below: synonyms are always reported as None.
        hetsyn_lines = protein.data["HETSYN"].split("\\n") if "HETSYN" in protein.data.keys() else []
        het_id_list = tuple(line.strip().split()[0] for line in het_lines)
        
        # Build ligand id -> name from the HETNAM lines.
        ligand_name_dict = {}
        for line in hetnam_lines:
            line = line.strip()
            if line.startswith(het_id_list):
                # Line begins with a ligand id: first token is the key.
                key, *strings = line.split()
                ligand_name_dict[key] = " ".join(strings)
            else:
                # Otherwise the first token is discarded and the second is the
                # key (presumably a continuation number precedes the id -
                # confirm against the PDB format). Needs at least two tokens.
                _, key, *strings = line.split()
                if ligand_name_dict.get(key, "").endswith("-"):
                    # A name ending in "-" is continued without a separating space.
                    ligand_name_dict[key] += " ".join(strings)
                else:
                    ligand_name_dict[key] = ligand_name_dict.get(key, "") + " " + " ".join(strings)
        
        # PyMOL is started with "-cq" (presumably no GUI / quiet - confirm) and
        # is used only to cut each ligand out of the full structure.
        pymol.finish_launching(["pymol", "-cq"])
        pymol.cmd.load(str(protein_path))
        ligand_inform_list = []
        # Second token of the last SEQRES line; ord() below requires it to be
        # a single character.
        last_chain = protein.data["SEQRES"].split("\\n")[-1].split()[1]
        
        for idx, line in enumerate(het_lines):
            vs = line.strip().split()
            if len(vs) == 4:
                ligid, authchain, residue_idx, _ = vs
            else:
                # Chain and residue number are fused in the second token:
                # first character is the chain, the remainder the residue number.
                ligid, authchain, residue_idx = vs[0], vs[1][0], vs[1][1:]
            
            # Ligands get the characters that follow last_chain, one per HET record.
            pdbchain = chr(ord(last_chain) + idx + 1)
            identify_key = f"{pdb_code}_{pdbchain}_{ligid}"
            ligand_path = os.path.join(save_dir, f"{identify_key}.pdb")
            
            # Reuse a previously extracted ligand file when it exists.
            if not os.path.exists(ligand_path):
                pymol.cmd.select(identify_key, f"resn {ligid} and resi {residue_idx} and chain {authchain}")
                pymol.cmd.save(ligand_path, identify_key)
            
            ligand = next(pybel.readfile("pdb", ligand_path))
            # Ligand center = arithmetic mean of its atom coordinates.
            x, y, z = np.mean([atom.coords for atom in ligand.atoms], axis=0).tolist()
            inform = LigandInform(idx + 1, ligid, pdbchain, authchain, int(residue_idx), (x, y, z), 
                                ligand_path, ligand_name_dict.get(ligid), None)
            ligand_inform_list.append(inform)
        
        return ligand_inform_list
    finally:
        # Runs on every exit path, including the early "no HET" return that
        # happens before PyMOL was launched.
        # NOTE(review): the bare except also swallows SystemExit and
        # KeyboardInterrupt; that may be intentional if pymol.cmd.quit() exits
        # the interpreter - confirm before narrowing it.
        try:
            pymol.cmd.reinitialize()
            pymol.cmd.quit()
        except:
            pass
'''
# Materialize the template above as utils/parse_rcsb_pdb.py, the module that
# the generated modeling.py imports download_pdb and parse_pdb from.
(WORK_DIR / "utils" / "parse_rcsb_pdb.py").write_text(parse_code)

# Create modeling.py - CLEAN VERSION
modeling_code = '''#!/usr/bin/env python3
import argparse
import logging
import os
import sys
from pathlib import Path

import pmnet
from pmnet.module import PharmacoNet
from pmnet.pharmacophore_model import PharmacophoreModel
from utils.parse_rcsb_pdb import download_pdb, parse_pdb

# Process exit codes. batch_modeling.py branches on the numeric values
# (0, 3 and 4 get their own summary sections; anything else counts as failed).
SUCCESS = 0
EXIT = 1
FAIL = 2
NO_LIGAND = 3
LIGAND_NOT_FOUND = 4


def _select_device(want_cuda):
    """Return "cuda" only when it was requested and a device is usable, else "cpu"."""
    if not want_cuda:
        return "cpu"
    import torch  # local import: only needed to probe for a GPU

    if torch.cuda.is_available():
        return "cuda"
    logging.warning("--cuda requested but no CUDA device is available - using CPU")
    return "cpu"


def main(args):
    """Build pharmacophore models for the ligands of one protein structure.

    Args:
        args: Parsed command-line namespace with the attributes pdb, protein,
            prefix, out_dir, ligand_id, suffix, weight_path, cuda and force.

    Returns:
        SUCCESS when every selected ligand has a model file (new or already
        present), NO_LIGAND when the structure has no HET ligands, or
        LIGAND_NOT_FOUND when args.ligand_id matches none of them.

    Raises:
        Exception: When neither --pdb nor --protein is given, or when the
            --protein file does not exist. Errors from downloading, parsing or
            modeling propagate unchanged.
    """
    # Validate before doing any work, so a bad invocation fails with a clear
    # message instead of a path error or a wasted model load.
    if not (args.pdb or args.protein):
        raise Exception("Missing --pdb or --protein")

    # With --protein only, fall back to the file name so PREFIX is never None
    # (None would break the default output path and yield "None_..." files).
    PREFIX = args.prefix or args.pdb or Path(args.protein).stem
    SAVE_DIR = Path(args.out_dir) if args.out_dir else Path("./result") / PREFIX
    SAVE_DIR.mkdir(exist_ok=True, parents=True)

    # Get protein file
    if args.pdb:
        protein_path = str(SAVE_DIR / f"{PREFIX}.pdb")
        if not os.path.exists(protein_path):
            logging.info(f"Downloading {args.pdb}...")
            download_pdb(args.pdb, protein_path)
        else:
            logging.info(f"Using cached {protein_path}")
    else:
        protein_path = args.protein
        if not os.path.exists(protein_path):
            raise Exception(f"Protein file not found: {protein_path}")

    # Parse ligands
    inform_list = parse_pdb(PREFIX, protein_path, SAVE_DIR)

    if len(inform_list) == 0:
        logging.warning("No ligands detected in PDB")
        return NO_LIGAND

    # Filter by ligand_id if specified (case-insensitive)
    if args.ligand_id:
        original_count = len(inform_list)
        wanted = args.ligand_id.upper()
        inform_list = [inf for inf in inform_list if inf.id.upper() == wanted]
        if len(inform_list) == 0:
            logging.warning(f"Ligand {args.ligand_id} not found (had {original_count} ligands)")
            return LIGAND_NOT_FOUND
        logging.info(f"Filtered to ligand {args.ligand_id}")

    # The network is created lazily: when there are no ligands, or every model
    # file already exists, the weights are never loaded.
    module = None

    # Process each ligand
    for inform in inform_list:
        model_path = SAVE_DIR / f"{PREFIX}_{inform.pdbchain}_{inform.id}_model.{args.suffix}"

        if (not args.force) and os.path.exists(model_path):
            logging.info(f"Skipping {model_path} (exists)")
            continue

        if module is None:
            module = PharmacoNet(_select_device(args.cuda), weight_path=args.weight_path)
            logging.info("PharmacoNet loaded")

        logging.info(f"Processing ligand {inform.id} (chain {inform.pdbchain})...")
        pharmacophore_model = module.run(protein_path, ref_ligand_path=inform.file_path, center=inform.center)
        pharmacophore_model.save(str(model_path))
        logging.info(f"Saved {model_path}")

    return SUCCESS

if __name__ == "__main__":
    # Command-line entry point: the process exit status is main()'s return
    # code, or FAIL when main() raises.
    parser = argparse.ArgumentParser()
    parser.add_argument("--pdb", type=str)
    parser.add_argument("--ligand_id", type=str)
    parser.add_argument("--protein", type=str)
    parser.add_argument("--out_dir", type=str)
    parser.add_argument("--prefix", type=str)
    parser.add_argument("--suffix", choices=("pm", "json"), default="pm")
    parser.add_argument("--weight_path", type=str)
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="include the traceback when a fatal error is reported")

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    try:
        # sys.exit raises SystemExit, which the handler below does not catch.
        sys.exit(main(args))
    except Exception as e:
        # -v was previously parsed but ignored; it now appends the traceback.
        logging.error(f"FATAL ERROR: {e}", exc_info=args.verbose)
        sys.exit(FAIL)
'''
# Materialize the template above as modeling.py in the workspace root;
# batch_modeling.py launches it once per CSV entry.
(WORK_DIR / "modeling.py").write_text(modeling_code)

# Create batch_modeling.py - CLEAN VERSION
batch_code = '''#!/usr/bin/env python3
import argparse
import csv
import gc
import logging
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path

def cleanup():
    """Best-effort cleanup between entries; never raises for ordinary errors.

    Empties the CUDA cache of this process when torch and a GPU are available,
    runs the garbage collector three times, and asks pkill to terminate any
    process whose command line contains "pymol". Failures are logged at debug
    level and otherwise ignored, but KeyboardInterrupt and SystemExit are no
    longer swallowed, so the batch can still be interrupted during cleanup.
    """
    try:
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
    except Exception as exc:
        logging.debug("GPU cleanup skipped: %s", exc)

    for _ in range(3):
        gc.collect()

    try:
        subprocess.run(['pkill', '-f', 'pymol'], stderr=subprocess.DEVNULL, timeout=2)
    except (OSError, subprocess.SubprocessError) as exc:
        # pkill missing, not permitted, or slower than the 2 second budget.
        logging.debug("pkill pymol skipped: %s", exc)

def get_memory_stats():
    """Return a one-line memory report such as "GPU: 0.0/15.0GB | RAM: 12%".

    Each part is included only when its library is importable (and, for the
    GPU part, a CUDA device is available); with neither, the result is an
    empty string. Ordinary errors are logged at debug level and ignored, but
    KeyboardInterrupt and SystemExit are not swallowed.

    NOTE(review): the GPU figure is torch's allocation counter for this
    process; the per-entry modeling runs in a child process, so its usage is
    presumably not reflected here - confirm.
    """
    stats = []
    try:
        import torch
        if torch.cuda.is_available():
            alloc = torch.cuda.memory_allocated() / 1024**3
            total = torch.cuda.get_device_properties(0).total_memory / 1024**3
            stats.append(f"GPU: {alloc:.1f}/{total:.1f}GB")
    except Exception as exc:
        logging.debug("GPU stats unavailable: %s", exc)
    try:
        import psutil
        mem = psutil.virtual_memory()
        stats.append(f"RAM: {mem.percent:.0f}%")
    except Exception as exc:
        logging.debug("RAM stats unavailable: %s", exc)
    return " | ".join(stats)

def parse_csv(csv_path):
    """Read the batch CSV and return one entry per row that names a PDB code.

    Header names are matched case-insensitively after stripping whitespace:
    the PDB code comes from "pdb_code" or "pdb", the optional ligand from
    "ligand_id" or "ligand". Both values are upper-cased; rows without a PDB
    code are dropped.

    The file is decoded as "utf-8-sig" so that a byte-order mark written by
    spreadsheet exports does not become part of the first header name (which
    would make every row look like it has no PDB code). Surplus cells in a row
    longer than the header are ignored.

    Args:
        csv_path: Path of the CSV file.

    Returns:
        A list of dicts shaped like {'pdb': str, 'ligand': str or None}.
    """
    entries = []
    # newline='' is what the csv module expects for files it parses itself.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            row_norm = {}
            for key, value in row.items():
                if not isinstance(key, str):
                    # DictReader collects surplus cells under the key None.
                    continue
                row_norm[key.lower().strip()] = value.strip() if value else None
            pdb = row_norm.get('pdb_code') or row_norm.get('pdb')
            ligand = row_norm.get('ligand_id') or row_norm.get('ligand')
            if pdb:
                entries.append({'pdb': pdb.upper(), 'ligand': ligand.upper() if ligand else None})
    return entries

def check_exists(pdb, ligand, output_dir, suffix):
    """Tell whether output_dir/<pdb> already holds a matching model file.

    With a ligand, a file named <pdb>_<anything>_<ligand>_model.<suffix> is
    required; without one, any <pdb>_<anything>_model.<suffix> counts.
    Returns False when the per-PDB folder does not exist.
    """
    target_dir = output_dir / pdb
    if target_dir.exists():
        middle = f"*_{ligand}" if ligand else "*"
        matches = target_dir.glob(f"{pdb}_{middle}_model.{suffix}")
        # True as soon as the glob yields anything at all.
        return any(True for _ in matches)
    return False

def _entry_label(entry):
    """Return "PDB" or "PDB - LIGAND" for one batch entry."""
    ligand = entry['ligand']
    return f"{entry['pdb']} - {ligand}" if ligand else entry['pdb']


def _write_section(f, title, entries, render, trailing_blank=True):
    """Write one titled block of the report; does nothing for an empty list."""
    if not entries:
        return
    rule = "=" * 70
    print(rule, file=f)
    print(f"{title} ({len(entries)})", file=f)
    print(rule, file=f)
    for entry in entries:
        print(render(entry), file=f)
    if trailing_blank:
        print(file=f)


def _failed_line(entry):
    """Format a failed entry, appending its error text when one was recorded."""
    line = f"  ‚úó {_entry_label(entry)}"
    if 'error' in entry:
        line += f" ({entry['error']})"
    return line


def save_summary(successful, timeout, no_ligand, not_found, failed, skipped, output_dir, start, end):
    """Write the batch report to output_dir/summary.txt.

    The report has a header with start time, end time and duration, a table of
    counts per outcome, and then one listing per non-empty outcome list. The
    file is written as UTF-8 explicitly because the entry markers are not
    ASCII and the locale default is not guaranteed to encode them.

    Args:
        successful, timeout, no_ligand, not_found, failed, skipped: Lists of
            entry dicts with 'pdb' and 'ligand' keys; failed entries may also
            carry an 'error' key.
        output_dir: Directory (a pathlib.Path) that receives summary.txt.
        start, end: datetime objects bounding the run.
    """
    rule = "=" * 70
    with open(output_dir / "summary.txt", 'w', encoding='utf-8') as f:
        print(rule, file=f)
        print("PHARMACONET BATCH RESULTS", file=f)
        print(rule, file=f)
        print(file=f)
        print(f"Started:  {start.strftime('%Y-%m-%d %H:%M:%S')}", file=f)
        print(f"Finished: {end.strftime('%Y-%m-%d %H:%M:%S')}", file=f)
        print(f"Duration: {end - start}", file=f)
        print(file=f)

        total = len(successful) + len(timeout) + len(no_ligand) + len(not_found) + len(failed) + len(skipped)
        print(f"Total:            {total}", file=f)
        print(f"Successful:       {len(successful)}", file=f)
        print(f"Timeout (>30min): {len(timeout)}", file=f)
        print(f"No Ligands:       {len(no_ligand)}", file=f)
        print(f"Ligand Not Found: {len(not_found)}", file=f)
        print(f"Failed:           {len(failed)}", file=f)
        print(f"Skipped (cached): {len(skipped)}", file=f)
        print(file=f)

        _write_section(f, "SUCCESSFUL", successful,
                       lambda e: f"  ‚úì {_entry_label(e)}")
        _write_section(f, "TIMEOUT - EXCEEDED 30 MINUTES", timeout,
                       lambda e: f"  ‚è± {_entry_label(e)}")
        _write_section(f, "NO LIGANDS IN PDB", no_ligand,
                       lambda e: f"  ‚äò {e['pdb']}")
        _write_section(f, "LIGAND NOT FOUND", not_found,
                       lambda e: f"  ‚äò {e['pdb']} - requested: {e['ligand']}")
        _write_section(f, "FAILED", failed, _failed_line)
        # The last section closes the file, so it gets no trailing blank line.
        _write_section(f, "SKIPPED - ALREADY EXISTS", skipped,
                       lambda e: f"  ‚äô {_entry_label(e)}", trailing_blank=False)

def main(args):
    """Run modeling.py once per CSV entry and write output_dir/summary.txt.

    Every entry runs in its own subprocess with a 30 minute limit, so a crash
    or hang in one entry cannot take down the batch. Outcomes are sorted by the
    child's exit status: 0 success, 3 no ligands in the structure, 4 requested
    ligand not found, anything else failed.

    Args:
        args: Parsed command-line namespace with the attributes input_csv,
            output_dir, suffix, cuda, force and skip_existing.
    """
    start_time = datetime.now()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse CSV
    entries = parse_csv(args.input_csv)
    logging.info(f"Loaded {len(entries)} entries from CSV")

    if not entries:
        # Nothing usable in the CSV at all (empty file or unexpected header
        # names). Report it as a problem rather than as a finished batch.
        logging.error("No usable rows in CSV - expected a header row with "
                      "PDB_code (or pdb) and optionally Ligand_ID (or ligand)")
        save_summary([], [], [], [], [], [], output_dir, start_time, datetime.now())
        return

    # Check for existing
    skipped = []
    if args.skip_existing and not args.force:
        to_process = []
        for entry in entries:
            if check_exists(entry['pdb'], entry['ligand'], output_dir, args.suffix):
                skipped.append(entry)
            else:
                to_process.append(entry)
        if skipped:
            logging.info(f"Skipping {len(skipped)} already completed")
        entries = to_process

    if not entries:
        logging.info("All entries complete!")
        save_summary([], [], [], [], [], skipped, output_dir, start_time, datetime.now())
        return

    logging.info(f"Processing {len(entries)} entries (30min timeout each)")
    logging.info("")

    # Locate modeling.py next to this script so the batch does not depend on
    # the directory it is started from.
    modeling_script = Path(__file__).resolve().parent / "modeling.py"

    successful = []
    timeout_list = []
    no_ligand = []
    not_found = []
    failed = []

    for i, entry in enumerate(entries, 1):
        pdb = entry['pdb']
        ligand = entry['ligand']
        desc = f"{pdb}" + (f" (ligand: {ligand})" if ligand else "")

        logging.info("="*70)
        logging.info(f"[{i}/{len(entries)}] {desc}")
        logging.info("="*70)

        if i % 5 == 0 or i == 1:
            logging.info(f"Memory: {get_memory_stats()}")

        # Build command
        cmd = [sys.executable, str(modeling_script), "--pdb", pdb,
               "--out_dir", str(output_dir / pdb), "--suffix", args.suffix]
        if ligand:
            cmd.extend(["--ligand_id", ligand])
        if args.cuda:
            cmd.append("--cuda")
        if args.force:
            cmd.append("--force")

        try:
            result = subprocess.run(cmd, check=False, capture_output=True, text=True,
                                    timeout=1800)

            # Exit statuses mirror the constants defined in modeling.py.
            if result.returncode == 0:
                successful.append(entry)
                logging.info("‚úì SUCCESS")
            elif result.returncode == 3:
                no_ligand.append(entry)
                logging.warning("‚äò No ligands in PDB")
            elif result.returncode == 4:
                not_found.append(entry)
                logging.warning(f"‚äò Ligand {ligand} not found")
            else:
                entry_err = entry.copy()
                entry_err['error'] = f"Exit code {result.returncode}"
                failed.append(entry_err)
                logging.error(f"‚úó FAILED (exit {result.returncode})")
                if result.stderr:
                    # Only the tail, which holds the child's final error message.
                    logging.error(result.stderr[-500:])

        except subprocess.TimeoutExpired:
            timeout_list.append(entry)
            logging.warning("‚è± TIMEOUT (>30min) - Continuing to next entry...")

        except Exception as e:
            entry_err = entry.copy()
            entry_err['error'] = str(e)
            failed.append(entry_err)
            logging.error(f"‚úó ERROR: {e}")

        finally:
            cleanup()
            if i % 5 == 0:
                logging.info("üßπ Cleanup done")

        logging.info("")

    end_time = datetime.now()
    save_summary(successful, timeout_list, no_ligand, not_found, failed, skipped, output_dir, start_time, end_time)

    logging.info("="*70)
    logging.info("BATCH COMPLETE")
    logging.info("="*70)
    logging.info(f"Total: {len(entries) + len(skipped)}")
    logging.info(f"Successful: {len(successful)}")
    logging.info(f"Timeout: {len(timeout_list)}")
    logging.info(f"Failed: {len(failed) + len(no_ligand) + len(not_found)}")
    logging.info(f"Skipped: {len(skipped)}")
    logging.info(f"Summary: {output_dir}/summary.txt")

if __name__ == "__main__":
    # Command-line entry point for the batch runner.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_csv", type=str, default="input/pdb_list.csv")
    parser.add_argument("--output_dir", type=str, default="output")
    parser.add_argument("--suffix", choices=("pm", "json"), default="pm")
    parser.add_argument("--cuda", action="store_true")
    parser.add_argument("--force", action="store_true")
    # A store_true flag that defaults to True can never be switched off.
    # BooleanOptionalAction keeps --skip_existing (still the default) and adds
    # --no-skip_existing for callers that want every entry re-submitted.
    parser.add_argument("--skip_existing", action=argparse.BooleanOptionalAction, default=True)

    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    main(args)
'''
(WORK_DIR / "batch_modeling.py").write_text(batch_code)
!chmod +x {WORK_DIR}/batch_modeling.py

print("‚úì Created utils/parse_rcsb_pdb.py")
print("‚úì Created modeling.py")
print("‚úì Created batch_modeling.py")
print()

# ============================================================================
# STEP 5: Upload CSV
# ============================================================================
# Instructions shown above the upload widget, followed by one blank line.
print(
    "=" * 70,
    "STEP 5: Upload your CSV file",
    "=" * 70,
    "Required columns: PDB_code, Ligand_ID",
    "Example:",
    "  PDB_code,Ligand_ID",
    "  1A2B,HEM",
    "  3C4D,ATP",
    "",
    sep="\n",
)

from google.colab import files
import shutil

uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded: stop the cell here.
    print("‚ùå No file uploaded - cannot continue")
    raise SystemExit("Upload cancelled")

# Only the first uploaded file is used; it is moved to the fixed location that
# the batch step reads.
csv_file = next(iter(uploaded))
shutil.move(csv_file, WORK_DIR / "input" / "pdb_list.csv")
print(f"‚úì CSV uploaded: {WORK_DIR}/input/pdb_list.csv", end="\n\n")

# ============================================================================
# STEP 6: Run Batch Processing
# ============================================================================
print("="*70)
print("STEP 6: Running batch processing")
print("="*70)
print("Configuration:")
print("  ‚Ä¢ Timeout: 30 minutes per entry (non-fatal)")
print("  ‚Ä¢ Output: .pm pharmacophore models only")
print("  ‚Ä¢ PyMOL visualization: DISABLED (for speed)")
print("  ‚Ä¢ Memory cleanup: After every entry")
print("  ‚Ä¢ Auto-skip: Previously completed entries")
print()
print("Starting batch job...")
print()

!cd /content/pharmaconet_batch && python batch_modeling.py \
    --input_csv input/pdb_list.csv \
    --output_dir output \
    --cuda \
    --skip_existing

print()

# ============================================================================
# STEP 7: Download Results
# ============================================================================
print("=" * 70, "STEP 7: Download results", "=" * 70, sep="\n")

# Echo the report written by the batch script, framed by dashed rules.
summary_path = WORK_DIR / "output" / "summary.txt"
if not summary_path.exists():
    print("‚ö† No summary.txt found")
else:
    print("üìÑ Summary Report:", "-" * 70, summary_path.read_text(), "-" * 70, sep="\n")

print("", "Creating ZIP archive...", sep="\n")
output_zip = "/content/pharmacophore_results.zip"
# make_archive appends ".zip" itself, so it is given the name without it.
shutil.make_archive(output_zip.removesuffix(".zip"), "zip", WORK_DIR / "output")
print(f"‚úì Created {output_zip}")

files.download(output_zip)
print("‚úì Download started!", end="\n\n")

# Closing banner with the result locations and the resume hint.
print(
    "=" * 70,
    "‚úÖ COMPLETE!",
    "=" * 70,
    f"Results location: {WORK_DIR}/output/",
    f"Summary report: {WORK_DIR}/output/summary.txt",
    "",
    "üí° TIP: Re-run this cell to resume - already completed entries are skipped",
    "=" * 70,
    sep="\n",
)