In [1]:
import multiprocessing
import os
import shutil
import subprocess
import time

import pandas as pd
from decimer import DECIMER as dc
from joblib import Parallel, delayed
from natsort import natsorted
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools as pt

In [4]:
def SmilesToImage(path, smiles_col='mol', name_col='NAME_'):
    '''Render every molecule listed in a CSV file to a PNG image.

    The CSV at ``path`` is read with pandas, the SMILES strings in
    ``smiles_col`` are converted to RDKit molecules, and one ``<name>.png``
    per row is written into a directory that sits next to the CSV and is
    named after it (``/data/set.csv`` -> ``/data/set/``).

    Parameters
    ----------
    path : str
        Path to the input CSV file.
    smiles_col : str, optional
        Column holding the SMILES strings (default ``'mol'``).
    name_col : str, optional
        Column whose values become the PNG file names (default ``'NAME_'``).

    Returns
    -------
    None
        Images are written to disk; progress and timing are printed.
    '''
    start_time = time.time()
    df = pd.read_csv(path)
    pt.AddMoleculeColumnToFrame(df, smilesCol=smiles_col)
    # os.path.join drops an empty dirname, so a bare relative file name such
    # as "set.csv" maps to "set" instead of the filesystem-root path "/set".
    stem = os.path.splitext(os.path.basename(path))[0]
    gen_dir = os.path.join(os.path.dirname(path), stem)
    print(f'{len(df)} Images saving to: {gen_dir}')
    # exist_ok keeps the cell re-runnable once the output directory exists.
    os.makedirs(gen_dir, exist_ok=True)
    for mol, name in zip(df['ROMol'], df[name_col]):
        Draw.MolToFile(mol, os.path.join(gen_dir, f'{name}.png'))
    print(f'The time taken to generate images for {len(df)} moecules is:', time.time()-start_time,'seconds')

In [5]:
# Draw one PNG per row of dcm_random.csv into a "dcm_random" folder next to the CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
SmilesToImage("/home/administrator/satvik/sandbox/molGAN/databases/dcm_random.csv")

200 Images saving to: /home/administrator/satvik/sandbox/molGAN/databases/dcm_random
The time taken to generate images for 200 moecules is: 0.9935050010681152 seconds


In [16]:
def ImageToSmiles(folder_path):
    '''Predict SMILES for every image file in a folder and save them as a CSV.

    Each regular file in ``folder_path`` is passed to DECIMER's "Canonical"
    model through ``joblib.Parallel``. The predictions are collected in a
    DataFrame with the columns ``Image_Name`` (file name without extension)
    and ``SMILES``, naturally sorted by image name, and written to
    ``<folder_path>/<folder name>.csv``.

    Parameters
    ----------
    folder_path : str
        Directory containing the images to convert.

    Returns
    -------
    pandas.DataFrame
        The ``Image_Name`` / ``SMILES`` table that was written to disk. The
        same frame is also published as the notebook global ``con_df``.
    '''
    # `con_df` used to be the only way to reach the result, so it is still
    # published for cells that read it; new code should use the return value.
    global con_df
    start_time = time.time()

    # abspath also strips a trailing separator, so basename() yields the
    # folder name rather than an empty string.
    folder_path = os.path.abspath(folder_path)
    csv_path = os.path.join(folder_path, f'{os.path.basename(folder_path)}.csv')

    def ImageConverter(image_path):
        # Return a (name, smiles) pair instead of a comma-joined string so a
        # comma inside a file name cannot shift the columns.
        smiles = dc.predict_SMILES(image_path, "Canonical")
        return os.path.splitext(os.path.basename(image_path))[0], smiles

    # The result CSV lives in the scanned folder; skip it (and anything that
    # is not a regular file) so a second run does not feed it to the model.
    candidates = (os.path.join(folder_path, entry) for entry in os.listdir(folder_path))
    images_list = [p for p in candidates if os.path.isfile(p) and p != csv_path]

    # Leave one core free, but never ask joblib for zero workers, which it
    # rejects on a single-core machine.
    n_jobs = max(1, multiprocessing.cpu_count() - 1)
    con_data = Parallel(n_jobs=n_jobs)(delayed(ImageConverter)(image) for image in images_list)

    con_df = pd.DataFrame(
        natsorted(con_data, key=lambda pair: pair[0]),
        columns=['Image_Name', 'SMILES'],
    )
    con_df.to_csv(csv_path, index=False)

    print(f"The time taken to convert {len(con_data)} images is {time.time()-start_time} seconds")
    return con_df

In [17]:
# Predict SMILES for the files in the dcm_random folder; the resulting CSV is written into that same folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ImageToSmiles("/home/administrator/satvik/sandbox/molGAN/databases/dcm_random")

The time taken to convert 3 images is 7.4657862186431885 seconds


In [None]:
def SDFtoCanCSV(self):
    '''Convert an SDF file into a CSV that carries a canonical-SMILES column.

    The SDF is loaded with RDKit PandasTools, a non-isomeric SMILES string is
    generated for every molecule and stored in a ``can-smiles`` column, the
    molecule-object column ``ROMol`` is dropped, and the table is written next
    to the input as ``<input file name>-can_smiles.csv`` (the input's own
    extension stays part of that name).

    Parameters
    ----------
    self : str
        Path to the SDF file. Despite the name, this is a plain path and not
        an instance; the name is kept so existing calls continue to work.

    Returns
    -------
    pandas.DataFrame
        The table that was written to disk (all SDF columns except ``ROMol``,
        plus ``can-smiles``).
    '''
    start_time = time.time()
    sdf = pt.LoadSDF(self)
    sdf['can-smiles'] = [Chem.MolToSmiles(mol, isomericSmiles=False) for mol in sdf['ROMol']]
    df = sdf.drop(['ROMol'], axis=1)
    # os.path.join drops an empty dirname, so a bare relative file name is
    # written beside the input instead of at the filesystem root.
    out_path = os.path.join(os.path.dirname(self), f'{os.path.basename(self)}-can_smiles.csv')
    df.to_csv(out_path, index=False)
    print(f'Canonical SMILES have been written to "can-smiles" in the csv file for {len(df)} molecules.')
    print(f'Time Taken: {time.time() - start_time} seconds')
    return df