In [1]:
from rdkit import Chem
from rdkit.Chem import MolToSmiles
import pandas as pd

def enumerate_smiles(smiles, n_variants=10):
    """Return alternative SMILES spellings of a single molecule.

    Args:
        smiles: SMILES string to parse. Non-string values (for example the
            NaN pandas produces for an empty CSV cell) and blank strings are
            treated as unparseable.
        n_variants: Number of random SMILES to draw. Duplicate draws are
            collapsed, so fewer than ``n_variants`` strings may be returned.

    Returns:
        A list of unique SMILES strings in no particular order; an empty list
        when the input cannot be parsed into a molecule.
    """
    # Chem.MolFromSmiles raises on non-string arguments instead of returning
    # None, so filter those (and blank strings, which carry no molecule) here.
    if not isinstance(smiles, str) or not smiles.strip():
        return []
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return []
    # doRandom=True randomizes the atom traversal; the set drops repeated draws.
    return list({MolToSmiles(mol, doRandom=True) for _ in range(n_variants)})

In [4]:
# Augment each header-less SMILES CSV in place with randomized SMILES variants.
#
# NOTE: this cell overwrites its own input files, so it is not idempotent:
# re-running it augments the already-augmented data again.

paths = ['pos_train.csv', 'neg_train.csv', 'pos_test.csv', 'neg_test.csv']
for file_path in paths:
    # header=None: the files have no header row, so pandas labels the first
    # column with the integer 0.
    df = pd.read_csv("model_data/" + file_path, header=None)

    # Convert the SMILES column to a list
    original_smiles = df[0].tolist()

    # Draw 3 random variants per SMILES (duplicates are collapsed by
    # enumerate_smiles, so some molecules contribute fewer).
    augmented_smiles = [
        variant
        for smiles in original_smiles
        for variant in enumerate_smiles(smiles, n_variants=3)
    ]

    if augmented_smiles:
        # Reuse the source column label (0) so the variants are appended as
        # new rows of the SMILES column rather than landing in a separate
        # column padded with NaN. Any other columns of df stay empty for the
        # augmented rows.
        augmented_df = pd.DataFrame({0: augmented_smiles})
        updated_df = pd.concat([df, augmented_df], ignore_index=True)
    else:
        # Nothing to append; keep the data unchanged.
        updated_df = df

    # header=False keeps the file header-less, matching how it is read above;
    # otherwise the column label would come back as a data row next time.
    updated_df.to_csv("model_data/" + file_path, index=False, header=False)

    print(f"Augmented SMILES have been added to {file_path}")

Augmented SMILES have been added to pos_train.csv
Augmented SMILES have been added to neg_train.csv
Augmented SMILES have been added to pos_test.csv
Augmented SMILES have been added to neg_test.csv
