In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolToSmiles
import re
import tqdm

In [2]:
# Load the antidiabetic molecule table; the cells below read its 'SMILES' column.
df = pd.read_csv(
    filepath_or_buffer='../data/antidiabetic/antidiabetic_molecules_smiles.csv',
)

In [3]:
def remove_slashes(text: str) -> str:
    """
    Strip every forward slash and backslash character out of a string.

    Parameters:
    text (str): The string to clean.

    Returns:
    str: A copy of ``text`` with all forward slashes and backslashes deleted;
         every other character is kept in its original order.
    """
    # A single character class matches either kind of slash.
    slash_pattern = re.compile(r'[\\/]')
    return slash_pattern.sub('', text)

# Overwrite the SMILES column with slash-free strings.
# NOTE(review): in SMILES, '/' and '\' mark directional bonds (double-bond
# cis/trans configuration), so this step also discards that information —
# confirm that is intended.
df['SMILES'] = df['SMILES'].map(remove_slashes)

In [4]:
def smiles_to_mol(df: pd.DataFrame, smiles_column: str = 'SMILES') -> pd.DataFrame:
    """
    Converts SMILES strings to molecule objects with hydrogen atoms appended.

    Each entry of ``smiles_column`` is parsed with ``Chem.MolFromSmiles`` and,
    when parsing succeeds, passed through ``Chem.AddHs``. Entries that cannot be
    converted are stored as ``None`` so the new column stays aligned row-for-row
    with the input.

    Parameters:
    df (pd.DataFrame): Input DataFrame with SMILES column. It is modified in
        place: the 'MOL' column is added to (or overwritten on) this object.
    smiles_column (str): Name of the column containing SMILES strings (default 'SMILES').

    Returns:
    pd.DataFrame: The same DataFrame object, with a 'MOL' column holding the
        molecule objects (``None`` where the entry is not a string or does not
        parse).
    """
    def _to_mol_with_hs(smiles):
        # Non-string cells (e.g. NaN for a missing value) are treated the same
        # way as unparsable SMILES instead of aborting the whole conversion.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.AddHs(mol)

    df['MOL'] = [_to_mol_with_hs(smiles) for smiles in df[smiles_column]]
    return df

In [5]:
# Attach the 'MOL' column built from the 'SMILES' column.
df = smiles_to_mol(df, smiles_column='SMILES')

In [6]:
def generate_smiles_variants(smiles_str):
    mol = Chem.MolFromSmiles(smiles_str)
    smiles_variants = set()
    for i in range(10000):
        smiles_variants.add(MolToSmiles(mol, canonical=False, doRandom=True))
    return smiles_variants

# Generate SMILES variants for every molecule and write them to a text file,
# one variant per line. The path lives in a single variable so the completion
# message always names the file that was actually written.
variants_path = '../data/antidiabetic/ANTIDIABETIC_smiles_variants1.txt'

with open(variants_path, 'w') as file:
    # Only the SMILES column is needed, so iterate over it directly.
    for smiles in tqdm.tqdm(df['SMILES'], total=len(df)):
        for variant in generate_smiles_variants(smiles):
            file.write(variant + '\n')

print("SMILES variants successfully written to {}".format(variants_path))

100%|███████████████████████████████████████████| 46/46 [00:28<00:00,  1.60it/s]

SMILES variants successfully written to ANTIDIABETIC_smiles_variants.txt



