# Conformer Generation Pipeline

## **Step 1: Input SMILES and Reference Conformer**

In [None]:
import time

# Capture the wall-clock start time (seconds since the epoch, as returned by time.time()).
# NOTE(review): presumably used further down to report the total runtime — confirm.
start_time: float = time.time()

In [None]:
import sys

# Append /app to Python's module search path so that modules located there can be imported.
# NOTE(review): presumably this is where atk_conformer_generation_pipeline lives — confirm.
sys.path.append('/app')

In [None]:
from atk_conformer_generation_pipeline.utils import *
from atk_conformer_generation_pipeline.variables import *
import os
import glob
import re
import subprocess
from termcolor import colored

In [None]:
# Switch the working directory to /work, then print it via the shell to confirm the change.
os.chdir("/work")
!pwd

**Change the below variables accordingly**

In [None]:
# Create the output directory if it does not exist yet and make it the working
# directory, so every relative path used below resolves inside it.
# `output_dir` is not defined in this notebook; it presumably comes from the
# star-import of the pipeline `variables` module — confirm.
os.makedirs(output_dir, exist_ok=True)
os.chdir(output_dir)

In [None]:
!pwd

In [None]:
import sys
# Raise the maximum depth of the Python interpreter stack to 10000. The limit caps
# how deep recursive calls can go before a RecursionError is raised.
# NOTE(review): which downstream step needs the higher limit is not visible here.
sys.setrecursionlimit(10000)

**Importing the necessary libraries**

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import shutil
import time
import os
import re
import shutil
import pandas as pd
import numpy as np
from numpy import loadtxt
import csv
from typing import *
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from matplotlib.gridspec import GridSpec

In [None]:
%%time

### Clear out the files and directories left behind by a previous execution so that
### stale outputs cannot be mistaken for results of the current run

file_and_dir_to_remove: List[str] = [
    init_conf_xyz,
    opt_conf_SMILES_file,
    similarity_output_csv,
    feasible_geometries_csv,
    infeasible_geometries_csv,
    feasible_geometries_xyz,
    infeasible_geometries_xyz,
    pairwise_RMSDs_dat,
    pairwise_RMSDs_csv,
    cluster_reps_csv,
    cluster_reps_xyz,
    cluster_rep_prefix,
    cluster_reps_dir,
    clusters_RMSD_stats_csv,
    clusters_energy_stats_csv,
    opt_cluster_reps_csv,
    opt_conf_energy_csv,
    opt_conf_sdf,
]

# `remove_paths` and the path variables above are not defined in this notebook;
# they presumably come from the pipeline star-imports.
remove_paths(file_and_dir_to_remove)

# Step 2: Loading GNNIS & TD Conformers 

In [None]:
def loading_sdf_file(sdf_file):
    """Load every record of an SDF file as conformers of one molecule.

    The first readable record provides the molecule (atoms and bonds); the
    conformer of each readable record is then appended to it. Records that
    cannot be parsed are reported and skipped.

    Args:
        sdf_file: Path to the SDF file. Hydrogens are kept (removeHs=False).

    Returns:
        A molecule holding one conformer per readable record, or None when
        no record could be read. Callers must handle the None case.
    """
    supplier = Chem.SDMolSupplier(sdf_file, removeHs=False)

    # Becomes the container molecule once the first readable record is seen
    mol = None

    for i, m in enumerate(supplier):
        if m is None:
            print(f"[Warning] Molecule {i} could not be read. Check formatting in SDF.")
            continue

        if mol is None:
            mol = Chem.Mol(m)
            mol.RemoveAllConformers()  # start with clean conformer list

        # assignId=True lets RDKit assign the ID of the added conformer, so
        # setting an ID on the source record's conformer first has no effect
        # on the result.
        # NOTE(review): assumes every record has the same atoms in the same
        # order as the first one; this is not checked here.
        mol.AddConformer(m.GetConformer(), assignId=True)
    return mol

mol_gnnis=loading_sdf_file("conformers_gnnis.sdf")
mol_td=loading_sdf_file("conformers_TD.sdf")

# Generating Conformers using RDKit

In [None]:
%%time

import time
import sys


# Generate conformers for the input SMILES with RDKit.
# `generate_conformers`, `inp_smiles` and `num_conf_rdkit` are not defined in this
# notebook; they presumably come from the pipeline star-imports.
mol_rdkit_ini: Chem.Mol= generate_conformers(inp_smiles, num_conf_rdkit)  # Call the function to generate conformers

### Optimize the generated conformers and save the optimized coordinates
# The second return value is discarded (presumably the MMFF energies — confirm).
mol_rdkit, _ = mmff_optimize_conformers(mol_rdkit_ini)     # Call the function to optimize conformers
save_conformers_to_sdf(mol_rdkit,"conformers_RDKit.sdf")

# Combining all the conformers from TD, GNNIS and RDKit

In [None]:
def simple_combine_conformers(mol1, mol2):
    """Return a copy of ``mol1`` carrying the conformers of both molecules.

    The conformers of ``mol1`` come first, followed by those of ``mol2``;
    RDKit assigns the IDs of the added conformers (assignId=True). Neither
    input molecule is modified.

    Args:
        mol1: Molecule whose atoms/bonds are used for the combined molecule.
        mol2: Molecule whose conformers are appended after those of ``mol1``.

    Returns:
        A new molecule with all conformers of ``mol1`` and ``mol2``.
    """
    combined_mol = Chem.Mol(mol1)
    combined_mol.RemoveAllConformers()

    # Iterate over the conformers themselves rather than GetConformer(i):
    # GetConformer looks a conformer up by its ID, which only coincides with
    # the position when the IDs happen to be exactly 0..n-1.
    # NOTE(review): both molecules must contain the same atoms in the same
    # order for the combined coordinates to be meaningful; not checked here.
    for source in (mol1, mol2):
        for conf in source.GetConformers():
            combined_mol.AddConformer(conf, assignId=True)

    return combined_mol

# Usage: merge RDKit + GNNIS conformers first, then add the TD conformers
mol_1 = simple_combine_conformers(mol_rdkit, mol_gnnis)
mol_2=simple_combine_conformers(mol_1, mol_td)
save_conformers_to_sdf(mol_2, init_conf_sdf)

# Applying cis/trans filter

In [None]:
# Run the external dihedral filter script on the combined conformers. It receives the
# input SMILES, the combined SDF and the name of the SDF it should write.
# NOTE(review): the filtering criteria live in /app/dihedral_filter.py, not visible here.
!python /app/dihedral_filter.py "{inp_smiles}" {init_conf_sdf} filtered_initial_generated_conformers.sdf
#Reading it as mol
# `mol` is None if no record of the filtered file could be read (see loading_sdf_file).
mol=loading_sdf_file("filtered_initial_generated_conformers.sdf")

In [None]:
# Find the number of atoms in the filtered molecule (explicit atoms only; the SDF was
# loaded with removeHs=False, so hydrogens present in the file are counted).
num_atoms_generated_conf: int = mol.GetNumAtoms()

# Step 3: Calculating Energy of the Conformers using ANI2x

In [None]:
import torchani
from rdkit.Geometry import Point3D
import torch

def ani_optimize_conformers(opt_mol: Chem.Mol, model_name: str = 'ANI2x', 
                           max_iter: int = 50, batch_size: int = 20) -> Tuple[Chem.Mol, Dict[int, float]]:
    """Optimize the conformers of a molecule with a TorchANI model, batch by batch.

    Each batch of conformers is relaxed with one L-BFGS run that minimizes the
    summed ANI energy of the batch. The relaxed coordinates are written back
    into ``opt_mol`` in place.

    Args:
        opt_mol (Chem.Mol): The molecule with conformers to optimize (modified in place).
        model_name (str): ANI model to use; 'ANI1ccx' and 'ANI2x' are supported.
        max_iter (int): Maximum L-BFGS iterations per batch.
        batch_size (int): Number of conformers optimized together (must be >= 1).

    Returns:
        Tuple[Chem.Mol, Dict[int, float]]: The same molecule object and a dictionary
                                           mapping conformer ID to its ANI energy after
                                           optimization, in kcal/mol.

    Raises:
        ValueError: If ``model_name`` is not supported, ``batch_size`` is smaller
            than 1, or the atom symbols cannot be converted for the model.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Conversion factor from Hartree to kcal/mol
    HARTREE_TO_KCAL_MOL = 627.509474

    # Set up device and load ANI model ONCE
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if model_name == 'ANI1ccx':
        model = torchani.models.ANI1ccx().to(device)
    elif model_name == 'ANI2x':
        model = torchani.models.ANI2x().to(device)
    else:
        raise ValueError(f"Unsupported ANI model: {model_name}")

    # Set model to evaluation mode
    model.eval()

    # Maps conformer ID -> ANI energy (kcal/mol) of the optimized conformer
    ani_energies: Dict[int, float] = {}

    # Species tensor for all atoms, built ONCE. The symbols are passed as a list:
    # joining them into one string would split two-letter symbols such as "Cl"
    # into separate characters and make supported molecules look unsupported.
    symbols = [atom.GetSymbol() for atom in opt_mol.GetAtoms()]

    try:
        species = model.species_to_tensor(symbols).to(device).unsqueeze(0)
    except Exception as err:
        raise ValueError(f"ANI model does not support all atom types in the molecule") from err

    # Use the real conformer IDs instead of assuming they are exactly 0..n-1
    conf_ids = [conf.GetId() for conf in opt_mol.GetConformers()]
    num_conformers = len(conf_ids)
    print(f"Optimizing {num_conformers} conformers...")

    # Process conformers in batches to bound memory use
    for batch_start in range(0, num_conformers, batch_size):
        batch_conf_ids = conf_ids[batch_start:batch_start + batch_size]
        batch_end = batch_start + len(batch_conf_ids)
        print(f"Processing conformers {batch_start}-{batch_end-1}")

        # One (num_atoms, 3) coordinate tensor per conformer, stacked into a batch
        batch_coords = [
            torch.tensor(opt_mol.GetConformer(conf_id).GetPositions(),
                         dtype=torch.float32, device=device)
            for conf_id in batch_conf_ids
        ]
        batch_coordinates = torch.stack(batch_coords).requires_grad_(True)
        batch_species = species.repeat(len(batch_coords), 1)

        # The coordinates are the only parameters being optimized
        optimizer = torch.optim.LBFGS([batch_coordinates], 
                                     max_iter=max_iter,
                                     tolerance_grad=1e-4,  # Looser convergence for speed
                                     tolerance_change=1e-6)

        def closure():
            # L-BFGS re-evaluates the objective several times per step
            optimizer.zero_grad()
            _, energies = model((batch_species, batch_coordinates))
            total_energy = energies.sum()
            total_energy.backward()
            return total_energy

        # Optimize the batch
        batch_t0 = time.time()
        optimizer.step(closure)
        opt_time = time.time() - batch_t0
        print(f"Batch optimization took {opt_time:.2f} seconds")

        # Extract results and update molecule
        with torch.no_grad():
            _, final_energies = model((batch_species, batch_coordinates))
            # detach() first: the coordinate tensor is a leaf that requires grad
            optimized_coords = batch_coordinates.detach().cpu().numpy()

            for i, conf_id in enumerate(batch_conf_ids):
                conf = opt_mol.GetConformer(conf_id)

                # Update conformer coordinates
                for atom_idx, pos in enumerate(optimized_coords[i]):
                    point = Point3D(float(pos[0]), float(pos[1]), float(pos[2]))
                    conf.SetAtomPosition(atom_idx, point)

                # Store energy converted to kcal/mol
                energy_hartree = final_energies[i].item()
                ani_energies[conf_id] = energy_hartree * HARTREE_TO_KCAL_MOL

        # Clear GPU cache to free memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return opt_mol, ani_energies


In [None]:
%%time

### Optimize the filtered conformers with ANI and save the optimized coordinates
# `mol` is optimized in place; `opt_mol` refers to the same molecule object.
opt_mol, ani_energies = ani_optimize_conformers(mol)     # Call the function to optimize conformers
save_conformers_to_sdf(opt_mol,opt_conf_sdf)


# Number of conformers that went through the ANI optimization
num_opt_conf: int= opt_mol.GetNumConformers()

### Save the energies of the optimized conformers to a CSV file
# One row per conformer: (conformer_id, energy_in_kcalpermol)
ani_energies_items : List[Tuple[int, float]] = list(ani_energies.items())
energy_DF: pd.DataFrame = pd.DataFrame(ani_energies_items, columns=['conformer_id', 'energy_in_kcalpermol'])
energy_DF.to_csv(opt_conf_energy_csv, index=False)

In [None]:
%%time

### Convert the 3D geometries of the optimized conformers into SMILES and save them
# NOTE(review): `convert_conformers_to_smiles` comes from the pipeline star-imports; it is
# passed the optimized-conformer SDF and the SMILES output file — confirm its exact contract.
convert_conformers_to_smiles(opt_conf_sdf,opt_conf_SMILES_file)

In [None]:
### Process optimized conformers to calculate Tanimoto similarity and separate feasible and infeasible geometries.
# `energy_DF` is rebound to the second return value; the cells below treat it as the
# energy table of the feasible conformers only.
# NOTE(review): whether the returned frame keeps the original conformer_id values and
# index labels is decided inside `process_conformers` (not visible here) — the later
# filename and `.loc` lookups depend on it.
infeasible_geom_DF, energy_DF=process_conformers(opt_conf_SMILES_file,opt_conf_sdf,feasible_geometries_sdf,infeasible_geometries_sdf,similarity_output_csv,infeasible_geometries_csv,inp_smiles,num_opt_conf,energy_DF)


In [None]:
%%time

### Count the conformers with infeasible and feasible geometries
num_infeasible_geom: int = len(infeasible_geom_DF)
num_feasible_geom: int = len(energy_DF)

# Append (not overwrite) the feasible count to the running outputs file
with open("outputs.txt", 'a') as file:
    print(f'Number_of_feasible_geometries: {num_feasible_geom}', file=file)

print(f"Number of conformers with infeasible geometries: {num_infeasible_geom}")
print(f"Number of conformers with feasible geometries: {num_feasible_geom}")
print(
    "Total number of conformers for which the geometry feasibility was checked:",
    num_infeasible_geom + num_feasible_geom,
)
#print("Total number of conformers generated:", num_conf)

In [None]:
%%time

### Calculate the relative energies of conformers and write the results to a CSV file.
# The cells below read the columns 'conformer_id', 'energy_in_kcalpermol' and
# 'rel_energy_in_kcalpermol' from the returned frame.
# NOTE(review): the reference energy used is defined inside
# `calculate_relative_energies` (not visible here) — presumably the minimum; confirm.
rel_energy_DF: pd.DataFrame=calculate_relative_energies(energy_DF,feasible_geometries_csv)


In [None]:
%%time

# One square figure with a single axes for the histogram
fig, ax = plt.subplots(figsize=(4, 4))

### Relative energy distribution of the conformers with feasible geometries
n_bins=10
# density=False -> bar heights are raw counts per bin (True would normalize the area to 1)
# histtype='step' -> unfilled outline instead of the default 'bar' histogram
ax.hist(
    rel_energy_DF['rel_energy_in_kcalpermol'],
    bins=n_bins,
    density=False,
    color='black',
    histtype='step',
    fill=False,
    lw=2,
)
ax.set_xlabel('Rel. ANI-2x Energy (kcal/mol)')
ax.set_ylabel('Count')
ax.set_title('GRT-ani')
ax.grid(False)

### Display the plot
plt.show()

### Write the figure to disk (no extension given, so Matplotlib's default format is used)
fig.savefig("rel_ANI-2x_energies-count_histogram", bbox_inches='tight', pad_inches=0.04, transparent = False)

## **Step 4: Clustering using Autograph**

In [None]:
os.makedirs("conformers_xyz", exist_ok=True)
output_path = "conformers_xyz/conf_.xyz"

# Shell command uses curly braces for variables:
!obabel -isdf {feasible_geometries_sdf} -oxyz -O {output_path} -m

In [None]:
# 4. Shift the XYZ file numbering from 1-based (conf_1.xyz, ...) to 0-based (conf_0.xyz, ...)
# The files are staged in a temporary sub-directory first so that a renamed file can
# never overwrite one that has not been processed yet.
xyz_dir = 'conformers_xyz'
temp_dir = os.path.join(xyz_dir, 'temp_rename')
os.makedirs(temp_dir, exist_ok=True)


def _conf_number(name):
    """Return the integer between 'conf_' and '.xyz' in a conformer file name."""
    return int(name.split('_')[1].split('.')[0])


files = [f for f in os.listdir(xyz_dir) if f.startswith('conf_') and f.endswith('.xyz')]
files.sort(key=_conf_number)

for f in files:
    current_index = _conf_number(f)
    new_index = current_index - 1
    new_name = f'conf_{new_index}.xyz'
    shutil.move(os.path.join(xyz_dir, f), os.path.join(temp_dir, new_name))
    print(f"Renamed: {f} -> {new_name}")

# Bring the renamed files back and drop the (now empty) staging directory
for staged in os.listdir(temp_dir):
    shutil.move(os.path.join(temp_dir, staged), os.path.join(xyz_dir, staged))
os.rmdir(temp_dir)

print(f"All files renamed to zero-based indexing in 'conformers_xyz'. Total files: {len(files)}")


In [None]:
def update_energy_csv_with_filenames(df, prefix="conf", extension=".xyz",
                                     output_file="energy_autograph.csv"):
    """
    Build the energy table used as AutoGraph input and save it as CSV.

    Each 'conformer_id' is turned into a file name "<prefix>_<id><extension>";
    the result has the two columns 'filenames' and 'ANI' (taken from
    'rel_energy_in_kcalpermol'), sorted by ascending 'ANI'. The input
    DataFrame is left untouched.

    Parameters:
    df (pd.DataFrame): Table with the columns 'conformer_id' and
        'rel_energy_in_kcalpermol'
    prefix (str): Prefix for the xyz files (e.g., "conf")
    extension (str): File extension (e.g., ".xyz")
    output_file (str): Path of the CSV file to write

    Returns:
    pd.DataFrame: The sorted two-column table that was written to output_file
    """

    # Build a fresh frame instead of adding columns to `df`: the caller's
    # DataFrame is reused by later cells and must not gain extra columns.
    df_final = pd.DataFrame(
        {
            'filenames': [f"{prefix}_{conf_id}{extension}" for conf_id in df['conformer_id']],
            'ANI': df['rel_energy_in_kcalpermol'].to_numpy(),
        },
        index=df.index,
    )

    # Need for Autograph
    # (not in-place, so no SettingWithCopyWarning can be triggered)
    df_final = df_final.sort_values(by="ANI", ascending=True)

    # Save the updated CSV
    df_final.to_csv(output_file, index=False)

    print(f"Updated CSV saved as: {output_file}")
    print(f"Sample rows:")
    print(df_final.head(10))

    return df_final

if __name__ == "__main__":
    # Update your energy.csv file
    updated_df = update_energy_csv_with_filenames(rel_energy_DF)


In [None]:
#!pip install pandas==1.3.5

# Remove the results of a previous clustering run so old and new output cannot mix
if os.path.isdir('cluster_result_Autograph'):
    shutil.rmtree('cluster_result_Autograph')

# Answers fed to AutoGraph's interactive prompts on stdin, one per line.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python ("y" "energy_autograph.csv" would become
# "yenergy_autograph.csv" and shift all following answers).
# NOTE(review): the meaning of each prompt is defined by AutoGraph.py, not visible here.
inputs = "\n".join([
    "continue",
    "y",
    "conformers_xyz",
    "cluster_result_Autograph",
    "y",
    "energy_autograph.csv",
    "ANI",
])

process = subprocess.run(
    ["python", "/work/AutoGraph/AutoGraph.py"],
    input=inputs,
    text=True
)

# Surface a failed run here instead of failing later on a missing output directory
if process.returncode != 0:
    print(f"[Warning] AutoGraph exited with return code {process.returncode}.")


## **Step 5: Identifying Cluster Representative**

Identifying the minimum energy conformer within each cluster as its representative

In [None]:
# Collect the conformer indices of the cluster centres written by AutoGraph
# Centre files are named like conf_123.xyz; anything else in the directory is ignored
center_pattern = re.compile(r'conf_(\d+)\.xyz$')
center_matches = (
    center_pattern.match(entry)
    for entry in os.listdir("cluster_result_Autograph/centers")
)
# The order follows os.listdir, i.e. it is not sorted
indices = [int(hit.group(1)) for hit in center_matches if hit]


In [None]:
#Saving Cluster representatives
# Copy every record whose position in the feasible-geometry SDF is a cluster-centre index
suppl = Chem.SDMolSupplier(feasible_geometries_sdf, removeHs=False)
writer = Chem.SDWriter("cluster_rep_conformers.sdf")

center_positions = set(indices)
for idx, mol in enumerate(suppl):
    if mol is None or idx not in center_positions:
        continue
    writer.write(mol)
writer.close()

# Energy table of the representatives, re-referenced to the lowest representative energy.
# NOTE(review): `.loc` selects by index *label*; this matches the SDF positions used
# above only if rel_energy_DF is indexed 0..n-1 in SDF order — confirm.
# NOTE(review): the CSV is sorted by energy while the SDF keeps file order.
cluster_reps_DF = rel_energy_DF.loc[indices]
lowest_rep_energy = cluster_reps_DF['energy_in_kcalpermol'].min()
cluster_reps_DF['rel_energy_in_kcalpermol'] = cluster_reps_DF['energy_in_kcalpermol'] - lowest_rep_energy
cluster_reps_DF = cluster_reps_DF.sort_values(by="rel_energy_in_kcalpermol", ascending=True)
cluster_reps_DF.to_csv(cluster_reps_csv, index=False)

In [None]:
# Make sure the target folder for the per-cluster files exists
os.makedirs(cluster_reps_dir, exist_ok=True)

# Load the cluster representatives written by the previous cell
supplier = Chem.SDMolSupplier("cluster_rep_conformers.sdf", removeHs=False)

# Write each representative to its own SDF file. The number in the file name is the
# record's position in the SDF, so a skipped (unreadable) record leaves a gap.
for i, mol in enumerate(supplier):
    if mol is not None:
        output_path = os.path.join(cluster_reps_dir, f"rep_of_cluster_{i}.sdf")
        writer = Chem.SDWriter(output_path)
        writer.write(mol)
        writer.close()
