In [5]:
from pycdxml import cdxml_styler, cdxml_converter

# Apply style from an existing file
# The same CDXML file is used both as the style template (style_source) and as
# the document being restyled; the restyled copy is requested at ./output.cdxml.
styler = cdxml_styler.CDXMLStyler(style_source="Untitled ACS Document 1996-7.cdxml")
styler.apply_style_to_file('Untitled ACS Document 1996-7.cdxml', outpath='./output.cdxml')

# Apply style to document
# The return value of apply_style_to_doc is discarded and `doc` is reused below,
# so the styling is presumably applied to `doc` in place -- TODO confirm against pycdxml.
doc = cdxml_converter.read_cdxml('Untitled ACS Document 1996-7.cdxml')
styler.apply_style_to_doc(doc)

# Convert to base64 encoded CDX, ignoring unknown attributes
# With ignore_unknown_attribute=True the recorded output of this cell shows one
# "Found unknown attribute ... Ignoring attribute." line per skipped attribute.
try:
    b64 = cdxml_converter.to_b64_cdx(doc, ignore_unknown_attribute=True)
    print("Successfully converted to base64:", b64[:50] + "...")  # Print first 50 chars
except Exception as e:
    # Any failure in the conversion (or in building the preview string above)
    # is reported as a message instead of stopping the notebook.
    print(f"Conversion failed: {e}")

Found unknown attribute 'MonomerRenderingStyle with value 'graphic'. Ignoring attribute.
Found unknown attribute 'AtomID with value '1'. Ignoring attribute.
Found unknown attribute 'AtomID with value '2'. Ignoring attribute.
Found unknown attribute 'AtomID with value '3'. Ignoring attribute.
Found unknown attribute 'AtomID with value '4'. Ignoring attribute.
Found unknown attribute 'AtomID with value '5'. Ignoring attribute.
Found unknown attribute 'AtomID with value '6'. Ignoring attribute.
Found unknown attribute 'AtomID with value '7'. Ignoring attribute.
Found unknown attribute 'AtomID with value '8'. Ignoring attribute.
Found unknown attribute 'AtomID with value '9'. Ignoring attribute.
Found unknown attribute 'AtomID with value '10'. Ignoring attribute.
Found unknown attribute 'AtomID with value '11'. Ignoring attribute.
Found unknown attribute 'AtomID with value '12'. Ignoring attribute.
Found unknown attribute 'AtomID with value '13'. Ignoring attribute.
Found unknown attribute

Successfully converted to base64: VmpDRDAxMDAEAwIBAAAAAAAAAAAAAACAiBMAAAMAEwAAAENoZW...


In [50]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, rdFMCS
import xml.etree.ElementTree as ET
from xml.dom import minidom
import os
from concurrent.futures import ThreadPoolExecutor

# ========== CONFIGURATION ==========
# Spreadsheet read by main() and the folder the generated CDXML files are written to.
INPUT_EXCEL = "HGODEL0036_HGP4548_S202508-98.cut2.xlsx"
OUTPUT_DIR = "cdxml_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no error if the folder already exists

# ACS 1996 Document Style Parameters
ACS_STYLE = dict(
    bond_length=14.4,   # Classic ACS bond length (smaller = more compact)
    font_size=6,        # Smaller font size
    page_width=540,
    page_height=540,
    margin=50,          # Reduced margin (not referenced by the code visible here)
)

# R-group name -> (RGB tuple used for highlighting, human-readable colour name)
COLOR_MAPPING = dict(
    R1=((1, 0, 0), "Red"),
    R2=((0, 1, 0), "Green"),
    R3=((0, 0, 1), "Blue"),
)

# ========== CDXML GENERATION ==========
def create_cdxml(mol, atom_colors, output_path):
    """Write a molecule to ``output_path`` as CDXML laid out with the ACS_STYLE settings.

    A copy of ``mol`` is stripped of hydrogens, kekulized and given fresh 2D
    coordinates. Each atom becomes an ``<n>`` node and each bond a ``<b>``
    element inside a single ``<fragment>`` on one page.

    Args:
        mol: RDKit molecule. It is copied before any change is made.
        atom_colors: Mapping of atom index -> RGB tuple. ``(1, 0, 0)``,
            ``(0, 1, 0)`` and ``(0, 0, 1)`` select colour ids 33, 34 and 35;
            any other value, or a missing index, falls back to black (id 32).
            NOTE(review): indices are looked up after RemoveHs(); if the input
            carried explicit hydrogens the numbering may have shifted -- confirm.
        output_path: Destination file; overwritten if it already exists.

    Returns:
        None. The pretty-printed XML is written to ``output_path`` as UTF-8.
    """
    # Fraction of the computed layout spacing that is kept ("ACS 1996 scaling").
    coord_scale = 0.5
    bond_length = ACS_STYLE["bond_length"]
    page_width = ACS_STYLE["page_width"]
    page_height = ACS_STYLE["page_height"]
    center_x = page_width / 2
    center_y = page_height / 2

    # The compaction factor is folded into the layout-unit -> page-point factor.
    # Editing the Point3D returned by GetAtomPosition() has no effect on the
    # conformer (it is a copy), so scaling it in a separate loop is a no-op.
    # NOTE(review): the drawn bond length is this factor times RDKit's layout
    # bond length, which may differ from the BondLength attribute written on
    # each bond below -- confirm which of the two is intended.
    points_per_unit = coord_scale * bond_length

    default_color_id = "32"  # black
    highlight_ids = (
        ((1, 0, 0), "33"),  # R1 Red
        ((0, 1, 0), "34"),  # R2 Green
        ((0, 0, 1), "35"),  # R3 Blue
    )

    # Remove hydrogens and create working copy
    mol = Chem.RemoveHs(Chem.Mol(mol))

    # Kekulize first so every bond has an integer order, then lay out in 2D
    Chem.Kekulize(mol, clearAromaticFlags=True)
    AllChem.Compute2DCoords(mol)

    # CDXML structure
    cdxml = ET.Element("CDXML", {
        "version": "4.0",
        "xmlns": "http://www.cambridgesoft.com/xml/cdxml.dtd"
    })

    # Color table
    colors = ET.SubElement(cdxml, "colortable")
    for color_id, (r, g, b) in (
        ("0", ("1", "1", "1")),   # White
        ("32", ("0", "0", "0")),  # Black
        ("33", ("1", "0", "0")),  # Red (R1)
        ("34", ("0", "1", "0")),  # Green (R2)
        ("35", ("0", "0", "1")),  # Blue (R3)
    ):
        ET.SubElement(colors, "color", {"id": color_id, "r": r, "g": g, "b": b})

    # Page setup with ACS style
    page = ET.SubElement(cdxml, "page", {
        "WidthPages": "1",
        "HeightPages": "1",
        "BoundingBox": f"0 0 {page_width} {page_height}",
        "Background": "0 0 1 1 1"
    })
    fragment = ET.SubElement(page, "fragment")

    # Add atoms, centred on the page with the y axis flipped
    conf = mol.GetConformer()
    for i, atom in enumerate(mol.GetAtoms()):
        pos = conf.GetAtomPosition(i)

        color_id = default_color_id
        if i in atom_colors:
            # Equality test (not a dict lookup) so unrecognised or unhashable
            # colour values simply fall back to black.
            color_id = next(
                (cid for rgb, cid in highlight_ids if atom_colors[i] == rgb),
                default_color_id,
            )

        x = pos.x * points_per_unit + center_x
        y = page_height - (pos.y * points_per_unit + center_y)
        ET.SubElement(fragment, "n", {
            "id": str(i+1),
            "Element": str(atom.GetAtomicNum()),
            "p": f"{x:.2f} {y:.2f}",
            "Font": "3",
            "PointSize": str(ACS_STYLE["font_size"]),
            "Color": color_id
        })

    # Add bonds; atom references use the same 1-based ids as the <n> nodes
    for bond in mol.GetBonds():
        ET.SubElement(fragment, "b", {
            "id": str(bond.GetIdx()+1),
            "B": str(bond.GetBeginAtomIdx()+1),
            "E": str(bond.GetEndAtomIdx()+1),
            "Order": str(int(bond.GetBondTypeAsDouble())),
            "BondLength": str(bond_length),
            "Color": default_color_id
        })

    # Write to file; explicit UTF-8 so the result does not depend on the
    # platform's default text encoding.
    pretty_xml = minidom.parseString(ET.tostring(cdxml)).toprettyxml(indent="  ")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(pretty_xml)

# ========== PROCESSING FUNCTIONS ==========
def process_compound(row, idx, output_dir):
    """Colour one spreadsheet row's compound by R-group and save it as CDXML.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'Smiles' entry
            and optional '<R-group>_Reagent' entries for each COLOR_MAPPING key.
        idx: Row index; reported 1-based in messages and in the file name.
        output_dir: Directory that receives ``compound_<idx+1>.cdxml``.

    Returns:
        ``(ok, message)``: ``ok`` is True when the file was written, False for
        an empty or unparsable SMILES or when any exception was raised.
    """
    try:
        has_smiles = pd.notna(row.get('Smiles'))
        smiles = str(row['Smiles']) if has_smiles else None
        if not smiles:
            return False, f"Row {idx+1}: Empty SMILES"

        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return False, f"Row {idx+1}: Invalid SMILES"

        Chem.SanitizeMol(mol)
        atom_colors = {}

        # Map R-groups to colors; a later R-group overwrites atoms claimed by an earlier one
        for rname, (color, _) in COLOR_MAPPING.items():
            rcol = f"{rname}_Reagent"
            if rcol not in row or pd.isna(row[rcol]):
                continue

            reagent = Chem.MolFromSmiles(str(row[rcol]))
            if not reagent:
                continue

            # Largest substructure shared by the compound and this reagent
            mcs = rdFMCS.FindMCS([mol, reagent],
                                 bondCompare=rdFMCS.BondCompare.CompareOrderExact,
                                 timeout=5)
            if mcs.numAtoms <= 0:
                continue

            # Every occurrence of the shared substructure gets the R-group colour
            pattern = Chem.MolFromSmarts(mcs.smartsString)
            atom_colors.update(
                (atom_idx, color)
                for match in mol.GetSubstructMatches(pattern)
                for atom_idx in match
            )

        # Save CDXML
        output_path = os.path.join(output_dir, f"compound_{idx+1}.cdxml")
        create_cdxml(mol, atom_colors, output_path)
        return True, f"Row {idx+1}: Success (ACS 1996 style)"

    except Exception as e:
        return False, f"Row {idx+1}: Error - {str(e)}"

def main():
    """Read INPUT_EXCEL and write one CDXML file per row into OUTPUT_DIR.

    Rows are handed to process_compound on a thread pool; the per-row messages
    are printed in row order, followed by a success tally. Returns None; if
    the spreadsheet cannot be read, the error is printed and nothing else runs.
    """
    try:
        df = pd.read_excel(INPUT_EXCEL, engine='openpyxl')
        print(f"Processing {len(df)} compounds from {INPUT_EXCEL}")
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return

    with ThreadPoolExecutor() as executor:
        # Submit every row first, then collect in submission order
        futures = [
            executor.submit(process_compound, row, idx, OUTPUT_DIR)
            for idx, row in df.iterrows()
        ]

        success_count = 0
        for pending in futures:
            ok, message = pending.result()
            print(message)
            success_count += 1 if ok else 0

    print(f"\nCompleted: {success_count}/{len(df)} successful")
    print(f"ACS 1996 style CDXML files saved to: {os.path.abspath(OUTPUT_DIR)}")

# Entry point: run the whole workflow only when __name__ is "__main__"
# (i.e. not when this code is imported as a module).
if __name__ == "__main__":
    main()

Processing 28 compounds from HGODEL0036_HGP4548_S202508-98.cut2.xlsx
Row 1: Success (ACS 1996 style)
Row 2: Success (ACS 1996 style)
Row 3: Success (ACS 1996 style)
Row 4: Success (ACS 1996 style)
Row 5: Success (ACS 1996 style)
Row 6: Success (ACS 1996 style)
Row 7: Success (ACS 1996 style)
Row 8: Success (ACS 1996 style)
Row 9: Success (ACS 1996 style)
Row 10: Success (ACS 1996 style)
Row 11: Success (ACS 1996 style)
Row 12: Success (ACS 1996 style)
Row 13: Success (ACS 1996 style)
Row 14: Success (ACS 1996 style)
Row 15: Success (ACS 1996 style)
Row 16: Success (ACS 1996 style)
Row 17: Success (ACS 1996 style)
Row 18: Success (ACS 1996 style)
Row 19: Success (ACS 1996 style)
Row 20: Success (ACS 1996 style)
Row 21: Success (ACS 1996 style)
Row 22: Success (ACS 1996 style)
Row 23: Success (ACS 1996 style)
Row 24: Success (ACS 1996 style)
Row 25: Success (ACS 1996 style)
Row 26: Success (ACS 1996 style)
Row 27: Success (ACS 1996 style)
Row 28: Success (ACS 1996 style)

Completed: 28/2

In [47]:
# NOTE(review): `results_df` is not defined in any cell visible here, and this
# cell's execution count (47) is lower than the preceding cell's (50), so it
# appears to rely on leftover kernel state -- confirm it survives Restart & Run All.
results_df.to_excel("r_group.xlsx")