In [8]:
import argparse
import base64
import html
import io
import os

import pandas as pd
from pandarallel import pandarallel
from PIL import Image
from rdkit import Chem
from rdkit.Chem import AllChem, MolStandardize, Draw

import athena_smiles_standardisation

# Initialise pandarallel once, before the functions below are called:
# process_csv_file calls Series.parallel_apply, which is presumably made
# available by this call (confirm against the pandarallel documentation).
pandarallel.initialize()

def generate_inchikey(smiles):
    """
    Generate an InChIKey from a SMILES string.

    Args:
        smiles: SMILES string to convert. Anything that is not a non-blank
            string (e.g. None or NaN coming from a missing CSV cell) is
            treated as invalid input.

    Returns:
        The InChIKey as a string, or None if the input is invalid, cannot be
        parsed into a molecule, yields an empty key, or the conversion raises.
    """
    # Reject missing / blank values up front instead of handing them to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Collapse an empty key to None so callers only have to check for a
        # single "no result" value.
        return Chem.MolToInchiKey(mol) or None
    except Exception:
        # Best-effort conversion: a failing molecule must not abort a whole
        # batch. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return None

# Column names tried, in order, when the caller does not name the SMILES column.
_SMILES_COLUMN_CANDIDATES = (
    'SMILES', 'smiles', 'Smiles',
    'Canonical_SMILES', 'canonical_smiles', 'Canonical SMILES',
)

# Number of leading InChIKey characters kept as the matching prefix.
_INCHIKEY_PREFIX_LENGTH = 14


def _read_delimited_file(file_path, separators=(',', '\t', ';')):
    """
    Read a delimited text file, picking the separator from its header row.

    Only the header is parsed for each candidate separator; the one that
    splits it into the most columns wins (ties go to the earlier candidate).
    Reading with the wrong separator usually "succeeds" with a single mangled
    column, so relying on read errors to trigger a fallback does not work.

    Raises:
        Whatever pandas raised last if no candidate separator could parse
        the header.
    """
    if not isinstance(file_path, (str, os.PathLike)):
        # A file-like object can only be consumed once, so it cannot be probed.
        return pd.read_csv(file_path, sep=separators[0])

    best_sep, best_width, last_error = None, -1, None
    for sep in separators:
        try:
            width = len(pd.read_csv(file_path, sep=sep, nrows=0).columns)
        except Exception as exc:
            last_error = exc
            continue
        if width > best_width:
            best_sep, best_width = sep, width

    if best_sep is None:
        raise last_error
    return pd.read_csv(file_path, sep=best_sep)


def process_csv_file(file_path, smiles_column=None):
    """
    Process a delimited text file (comma, tab or semicolon) to extract SMILES.

    Args:
        file_path: Path of the file to read.
        smiles_column: Name of the column holding SMILES. If None, the first
            of the common SMILES column names present in the file is used,
            falling back to the file's first column.

    Returns:
        A dataframe with the original data plus the columns
        'original_smiles', 'std_smiles', 'inchikey' and 'inchikey_prefix'.
        Rows whose SMILES could not be standardized or whose InChIKey could
        not be generated are dropped. If anything goes wrong (unreadable
        file, unknown column, ...) the error is printed and an empty
        dataframe is returned.
    """
    standardizer = athena_smiles_standardisation.SMILESStandardizer()

    try:
        df = _read_delimited_file(file_path)

        # If smiles_column not specified, try to detect it
        if smiles_column is None:
            smiles_column = next(
                (name for name in _SMILES_COLUMN_CANDIDATES if name in df.columns),
                None,
            )
            if smiles_column is None and len(df.columns) > 0:
                # Try the first column if nothing else matches
                smiles_column = df.columns[0]

        if smiles_column not in df.columns:
            raise ValueError(f"Could not find SMILES column. Available columns: {df.columns.tolist()}")

        # Keep the raw input next to its standardized form
        df['original_smiles'] = df[smiles_column].copy()
        df['std_smiles'] = df[smiles_column].parallel_apply(standardizer.standardize)

        # Remove rows with invalid SMILES; copy so the column assignments
        # below write to an independent frame rather than a view.
        df = df.dropna(subset=['std_smiles']).copy()

        # Generate InChIKeys
        df['inchikey'] = df['std_smiles'].parallel_apply(generate_inchikey)
        # Slicing a string is too cheap to be worth shipping to worker
        # processes; the isinstance check also keeps NaN keys from raising.
        df['inchikey_prefix'] = df['inchikey'].map(
            lambda key: key[:_INCHIKEY_PREFIX_LENGTH] if isinstance(key, str) and key else None
        )

        # Remove rows with invalid InChIKeys
        df = df.dropna(subset=['inchikey_prefix'])

        return df
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # frame so the caller can carry on with the other file.
        print(f"Error processing file {file_path}: {str(e)}")
        return pd.DataFrame()

def find_overlaps(df1, df2):
    """
    Find overlapping compounds between two dataframes based on InChIKey prefixes.

    Args:
        df1, df2: Dataframes carrying the columns 'original_smiles',
            'std_smiles', 'inchikey' and 'inchikey_prefix'. An empty frame
            (with or without those columns) is accepted.

    Returns:
        A dataframe with one row per (df1 row, df2 row) pair sharing the same
        'inchikey_prefix'. 'index_file1' / 'index_file2' are 0-based row
        positions in the respective input. Rows are ordered by position in
        df1, then by position in df2. When there is no overlap the result is
        empty but still has the full set of columns.
    """
    key = 'inchikey_prefix'
    fields = ['original_smiles', 'std_smiles', 'inchikey']
    columns = [
        'index_file1', 'index_file2',
        'original_smiles_file1', 'original_smiles_file2',
        'std_smiles_file1', 'std_smiles_file2',
        'inchikey_file1', 'inchikey_file2',
        key,
    ]

    if df1.empty or df2.empty:
        return pd.DataFrame(columns=columns)

    def _side(df, tag):
        # Keep only the needed columns, tag them with the file label and
        # record each row's position for later reference.
        side = df.reset_index(drop=True)[fields + [key]]
        side = side.rename(columns={name: f'{name}_{tag}' for name in fields})
        side.insert(0, f'index_{tag}', side.index)
        # A missing prefix never counts as a match; drop it here because a
        # join would otherwise pair missing values with each other.
        return side.dropna(subset=[key])

    left = _side(df1, 'file1')
    right = _side(df2, 'file2')
    if left.empty or right.empty:
        return pd.DataFrame(columns=columns)

    # One hash join instead of scanning all of df2 for every row of df1.
    merged = left.merge(right, on=key, how='inner')
    # Pin the row order explicitly rather than relying on the join's ordering.
    merged = merged.sort_values(['index_file1', 'index_file2'])
    return merged[columns].reset_index(drop=True)

def _structure_cell(smiles, size=(300, 200)):
    """Return the HTML for one structure cell: an inline PNG, or an error note."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError("SMILES could not be parsed")
        image = Draw.MolToImage(mol, size=size)
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
    except Exception:
        # Rendering is best-effort; the row still shows the SMILES text.
        return "Error generating image"
    encoded = base64.b64encode(buffer.getvalue()).decode('utf-8')
    return f'<img src="data:image/png;base64,{encoded}" class="structure-img"/>'


def visualize_structures(overlaps_df, output_html='overlap_structures.html'):
    """
    Generate an HTML file to visualize matching structures side by side.

    Args:
        overlaps_df: Dataframe with the columns 'index_file1', 'index_file2',
            'original_smiles_file1', 'original_smiles_file2' and
            'inchikey_prefix' (one row per match).
        output_html: Path of the HTML file to write.

    Returns:
        None. The page is written to ``output_html``; for a non-empty input a
        confirmation line is printed as well.

    Each structure is rendered independently, so one SMILES that cannot be
    drawn only replaces its own image with "Error generating image". All
    values taken from the dataframe are HTML-escaped before being written.
    """
    if overlaps_df.empty:
        with open(output_html, 'w', encoding='utf-8') as f:
            f.write("<html><body><h2>No matching structures found.</h2></body></html>")
        return

    page_head = """
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            body { font-family: Arial, sans-serif; }
            table { border-collapse: collapse; width: 100%; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
            th { background-color: #f2f2f2; }
            tr:nth-child(even) { background-color: #f9f9f9; }
            .structure-img { width: 200px; height: auto; }
            h2 { color: #2c3e50; }
        </style>
        <title>Hermes - Matching Molecular Structures</title>
    </head>
    <body>
        <h2>Matching Molecular Structures</h2>
        <p>Total matches found: """ + str(len(overlaps_df)) + """</p>
        <table>
            <tr>
                <th>Match #</th>
                <th>File 1 Index</th>
                <th>File 1 Structure</th>
                <th>File 1 SMILES</th>
                <th>File 2 Index</th>
                <th>File 2 Structure</th>
                <th>File 2 SMILES</th>
                <th>InChIKey Prefix</th>
            </tr>
    """

    page_tail = """
        </table>
    </body>
    </html>
    """

    # Collect the pieces and join once instead of growing one string per row.
    parts = [page_head]
    # Number matches by position: the dataframe's index labels are not
    # guaranteed to be 0..n-1 (e.g. after filtering).
    for match_number, (_, row) in enumerate(overlaps_df.iterrows(), start=1):
        smiles1 = row['original_smiles_file1']
        smiles2 = row['original_smiles_file2']
        cells = [
            str(match_number),
            html.escape(str(row['index_file1'])),
            _structure_cell(smiles1),
            html.escape(str(smiles1)),
            html.escape(str(row['index_file2'])),
            _structure_cell(smiles2),
            html.escape(str(smiles2)),
            html.escape(str(row['inchikey_prefix'])),
        ]
        parts.append(
            "\n                <tr>\n"
            + "".join(f"                    <td>{cell}</td>\n" for cell in cells)
            + "                </tr>\n"
        )
    parts.append(page_tail)

    with open(output_html, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"Visualization saved to {output_html}")


# Driver: standardise both input files, match them on InChIKey prefix and
# write the side-by-side HTML report.
print("\nHermes - SMILES Comparison Tool")
print("============================\n")

# File 1
print("Processing file 1")
df1 = process_csv_file("../assets/df1.csv")
print("Found {} valid structures in file 1".format(len(df1)))

# File 2
print("\nProcessing file 2")
df2 = process_csv_file("../assets/df2.csv")
print("Found {} valid structures in file 2".format(len(df2)))

# Compare and report
print("\nFinding overlaps...")
overlaps_df = find_overlaps(df1, df2)

visualize_structures(overlaps_df)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

Hermes - SMILES Comparison Tool

Processing file 1


[32m2025-04-20 13:42:30.035[0m | [31m[1mERROR   [0m | [36mathena_smiles_standardisation[0m:[36mstandardize[0m:[36m109[0m - [31m[1mFailed to Standardize :: OC1=C(C=C(C=C1)[As](O)(O)=O)[N+]([O-])=O[0m
[31m[1mNoneType[0m:[1m None[0m
[32m2025-04-20 13:42:30.068[0m | [31m[1mERROR   [0m | [36mathena_smiles_standardisation[0m:[36mstandardize[0m:[36m109[0m - [31m[1mFailed to Standardize :: CC(Cl)Cl.Cl* |lp:2:3,3:3,4:3,m:5:1.0|[0m
[31m[1mNoneType[0m:[1m None[0m


Found 17 valid structures in file 1

Processing file 2


[32m2025-04-20 13:42:30.431[0m | [31m[1mERROR   [0m | [36mathena_smiles_standardisation[0m:[36mstandardize[0m:[36m109[0m - [31m[1mFailed to Standardize :: N[Pt](N)(Cl)Cl[0m
[31m[1mNoneType[0m:[1m None[0m


Found 68 valid structures in file 2

Finding overlaps...
Visualization saved to overlap_structures.html


In [7]:
# Show the table of matching compounds produced by find_overlaps in the cell above.
overlaps_df

Unnamed: 0,index_file1,index_file2,original_smiles_file1,original_smiles_file2,std_smiles_file1,std_smiles_file2,inchikey_file1,inchikey_file2,inchikey_prefix
0,5,25,OC(=O)CC1=C(NC2=C(Cl)C=CC=C2Cl)C=CC=C1,OC(=O)CC1=C(NC2=C(Cl)C=CC=C2Cl)C=CC=C1,O=C([O-])Cc1ccccc1Nc1c(Cl)cccc1Cl,O=C([O-])Cc1ccccc1Nc1c(Cl)cccc1Cl,DCOPUUMXTXDBNB-UHFFFAOYSA-M,DCOPUUMXTXDBNB-UHFFFAOYSA-M,DCOPUUMXTXDBNB
