In [1]:
import pandas as pd
from rdkit import Chem
from rdkit import RDLogger
from rdkit.Chem import SDMolSupplier

In [2]:
# Function to validate and canonicalize SMILES
def validate_smiles(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Check for explicit valence issues
        for atom in mol.GetAtoms():
            if atom.GetExplicitValence() > Chem.GetPeriodicTable().GetDefaultValence(atom.GetAtomicNum()):
                return None
        # Attempt kekulization to catch "Can't kekulize mol" issues
        try:
            Chem.Kekulize(mol)
        except:
            return None
        # Additional sanitization to catch other potential issues
        try:
            Chem.SanitizeMol(mol)
        except:
            return None
        # Canonicalize the SMILES string
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)
        return canonical_smiles
    except:
        return None

### **COCONUT**

In [3]:
# Read the SMILES file
coconut_file_path = "./COCONUT_DB_canonical.smi"

# Read the file into a DataFrame, assuming the SMILES strings are in the first column
coconut_df = pd.read_csv(coconut_file_path, sep='\t', header=None, names=['SMILES'])

# Remove IDs from the SMILES strings
coconut_df['SMILES'] = coconut_df['SMILES'].str.split().str[0]

# Set display options to show more characters
pd.set_option('display.max_colwidth', None)

# Print the head of the DataFrame with the full content
print("Initial DataFrame:")
print(coconut_df.head())
print(f"Total number of data points: {coconut_df.shape[0]}")

# Apply the validation function to filter out invalid SMILES
valid_smiles = coconut_df['SMILES'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['SMILES'])

print(valid_smiles_df.head())

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "cleaned_coconut.txt"
valid_smiles_df['SMILES'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")

Initial DataFrame:
                                                                                      SMILES
0                                             O=C1OC2C(C(=C)C)CC1C3(O)CC4OC54C(=O)OC[CH]253C
1    O=C(O)C=1C(=O)C(O)(CC(=O)C1N)C2OC(COC(=O)C)C(OC(=O)C(N=CS)=CC)C(OC3OC(C)C(O)C(OC)C3)C2O
2  O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C(OC)=CC=C(OC)C6C(=O)C5=C(OC)C=C4CC32C(O)C7=CC(=CC(OC)=C17)C
3       O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C=CC=C(O)C6C(=O)C5=C(OC)C=C4CC32C(O)C7=CC(=CC(OC)=C17)C
4                 O=C(OC1CC(C)(CCOC(=O)C(C)(C)C)C2CC(C)(C)CC2C1OC(=O)C=3C=CC=CC3)C=4C=CC=CC4
Total number of data points: 407270


[14:12:16] Explicit valence for atom # 20 C, 6, is greater than permitted
[14:12:17] Explicit valence for atom # 3 B, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 11 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 7 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is greater than permitted
[14:12:21] Explicit valence for atom # 1 N, 4, is

                                                                                    SMILES
1  CC=C(N=CS)C(=O)OC1C(COC(C)=O)OC(C2(O)CC(=O)C(N)=C(C(=O)O)C2=O)C(O)C1OC1CC(OC)C(O)C(C)O1
2           COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C(=O)c4c(OC)ccc(OC)c4C5=O)CC31C2O
3                COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(OC)c5c4C(=O)c4cccc(O)c4C5=O)CC31C2O
4                   CC1(C)CC2C(OC(=O)c3ccccc3)C(OC(=O)c3ccccc3)CC(C)(CCOC(=O)C(C)(C)C)C2C1
5                 COc1cc(C)cc2c1C(=O)OC1C(O)C=CC3c4c(cc(O)c5c4C(=O)c4cccc(O)c4C5=O)CC31C2O
Total number of valid data points: 377188
SMILES strings have been saved to cleaned_coconut.txt


### **SuperNatural 3.0**

In [14]:
# Read the CSV file
supernatural_file_path = "./supernatural3_full_canonical.csv"

supernatural_df = pd.read_csv(supernatural_file_path, sep=';')

# Set display options to show more characters
pd.set_option('display.max_colwidth', None)

# Print the original number of data points
print("Initial DataFrame for supernatural3_full_canonical.csv:")
print(supernatural_df.head())
print(f"Total number of data points: {supernatural_df.shape[0]}")

# Print the column names
print("Column names in the CSV file:")
print(supernatural_df.columns.tolist())

Initial DataFrame for supernatural3_full_canonical.csv:
             id  parent_id  \
0     SN0000001  SN0000001   
1  SN0000001-01  SN0000001   
2  SN0000001-02  SN0000001   
3  SN0000001-03  SN0000001   
4  SN0000001-04  SN0000001   

                                                                                                     traditional_name  \
0  7-methyl-9-{[(2S,3R,4S,5S,6R)-3,4,5-trihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy}-1H,2H,3H-cyclopenta[c]chromen-4-one   
1  7-methyl-9-{[(2S,3R,4S,5R,6R)-3,4,5-trihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy}-1H,2H,3H-cyclopenta[c]chromen-4-one   
2  7-methyl-9-{[(2S,3R,4S,5S,6R)-3,4,5-trihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy}-1H,2H,3H-cyclopenta[c]chromen-4-one   
3  7-methyl-9-{[(2S,3R,4S,5S,6R)-3,4,5-trihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy}-1H,2H,3H-cyclopenta[c]chromen-4-one   
4  7-methyl-9-{[(2S,3R,4S,5S,6R)-3,4,5-trihydroxy-6-(hydroxymethyl)oxan-2-yl]oxy}-1H,2H,3H-cyclopenta[c]chromen-4-one   

                                    

In [17]:
# Apply the validation function to filter out invalid SMILES
valid_smiles = supernatural_df['smiles'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['smiles'])

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "supernatural_cleaned.txt"
valid_smiles_df['smiles'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")


[13:34:36] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 10 11 12 16 25 39 42 49 50 51 52 67 69 72 73
[13:35:00] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 10 11 12 15 29 30 32 49 50 51 52 53 54 62 64
[13:35:13] Can't kekulize mol.  Unkekulized atoms: 0 2 39 40 41 42 61
[13:35:15] Explicit valence for atom # 10 C, 5, is greater than permitted
[13:35:22] Can't kekulize mol.  Unkekulized atoms: 5 6 36 37 38 39 58
[13:35:51] Can't kekulize mol.  Unkekulized atoms: 0 2 22 23 24 25 27 28 29
[13:36:08] Can't kekulize mol.  Unkekulized atoms: 2 4 5 6 7 15 16 17 18 21 24
[13:36:18] Can't kekulize mol.  Unkekulized atoms: 5 6 32 33 34 35 54
[13:36:24] SMILES Parse Error: syntax error while parsing: [H]OC(=O)C([H])([H])C([H])([H])C([H])([H])C(=O)N([H])C(=C(\[H])C([H])([H])[H])\C(=O)N1C([H])([H])C([H])([H])C([H])([H])[C@@]1([H])C(=O)N([H])[C@]([H])(C(=O)N([H])[C@@]([H])(C(=O)N([H])[C@@]([H])(C(=O)N([H])[C@@]([H])(C(=O)N([H])[C@]([H])(C(=O)N([H])[C@@]([H])(C(=O)N([H])[C@@]([H])(

Total number of valid data points: 1115706
SMILES strings have been saved to supernatural_cleaned.txt


### **LOTUS**

In [3]:
lotus_file_path = "./LOTUS_DB.smi"

lotus_df = pd.read_csv(lotus_file_path, sep='\t', header=None, names=['SMILES', 'identifier'])

# Print the original number of data points
print("Initial DataFrame for LOTUS_DB.smi:")
print(lotus_df.head())
print(f"Total number of data points: {lotus_df.shape[0]}")



Initial DataFrame for LOTUS_DB.smi:
                                              SMILES  identifier
0  CO[C@H](C=C(C)C)C[C@@H](C)[C@H]1CC[C@@]2(C)[C@...  LTS0257199
1  CC(=O)OC1C(OC2C(OC3CC(O)CC4=CCC5C6CC7OC8(CCC(C...  LTS0124597
2  C[C@H](CO)[C@H]1OC(=O)C=C2C1=C[C@H]1OC(=O)[C@]...  LTS0121510
3                                    CS(=O)(=O)C=CCO  LTS0110032
4             COc1cccc2c1C(=O)O[C@H](CCC[C@H](C)O)C2  LTS0103990
Total number of data points: 276518


In [4]:
# Apply the validation function to filter out invalid SMILES
valid_smiles = lotus_df['SMILES'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['SMILES'])

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "lotus_cleaned.txt"
valid_smiles_df['SMILES'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")

Total number of valid data points: 270358
SMILES strings have been saved to lotus_cleaned.txt


### **Zinc**

In [3]:
# Read the text file
zinc_file_path = "./250k_rndm_zinc_drugs_clean_sorted.txt"

# Read the file into a DataFrame, assuming the SMILES strings are in the first column
zinc_df = pd.read_csv(zinc_file_path, sep='\t', header=None, names=['SMILES'])

# Print the original number of data points
print("Initial DataFrame for 250k_rndm_zinc_drugs_clean_sorted.txt:")
print(zinc_df.head())
print(f"Total number of data points: {zinc_df.shape[0]}")

Initial DataFrame for 250k_rndm_zinc_drugs_clean_sorted.txt:
                    SMILES
0              C1CSCCSCCS1
1       Br[C@H]1CC[NH2+]C1
2           CCSC[C@H](C)CS
3      IC[C@@H]1CCC[NH2+]1
4  OC[C@@H](Br)[C@H](Br)CO
Total number of data points: 249456


In [4]:

# Apply the validation function to filter out invalid SMILES
valid_smiles = zinc_df['SMILES'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['SMILES'])

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "zinc_cleaned.txt"
valid_smiles_df['SMILES'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")

Total number of valid data points: 154168
SMILES strings have been saved to zinc_cleaned.txt


### **Tox21**

In [9]:
# Read the SDF file
tox21_file_path = "./Tox21/tox21_10k_data_all.sdf"

# Read the SDF file and extract SMILES strings
suppl = Chem.SDMolSupplier(tox21_file_path)
smiles_list = []
for mol in suppl:
    if mol is not None:
        smiles_list.append(Chem.MolToSmiles(mol))

# Convert the list of SMILES strings to a DataFrame
smiles_df = pd.DataFrame(smiles_list, columns=['SMILES'])

# Print the first few rows and the total number of data points
print("Extracted SMILES strings:")
print(smiles_df.head())
print(f"Total number of data points: {smiles_df.shape[0]}")

[14:07:33] Explicit valence for atom # 3 Cl, 2, is greater than permitted
[14:07:33] ERROR: Could not sanitize molecule ending on line 21572
[14:07:33] ERROR: Explicit valence for atom # 3 Cl, 2, is greater than permitted
[14:07:34] Explicit valence for atom # 2 Si, 8, is greater than permitted
[14:07:34] ERROR: Could not sanitize molecule ending on line 346021
[14:07:34] ERROR: Explicit valence for atom # 2 Si, 8, is greater than permitted
[14:07:34] Explicit valence for atom # 3 Cl, 2, is greater than permitted
[14:07:34] ERROR: Could not sanitize molecule ending on line 446665
[14:07:34] ERROR: Explicit valence for atom # 3 Cl, 2, is greater than permitted
[14:07:34] Explicit valence for atom # 1 Cl, 2, is greater than permitted
[14:07:34] ERROR: Could not sanitize molecule ending on line 619150
[14:07:34] ERROR: Explicit valence for atom # 1 Cl, 2, is greater than permitted


Extracted SMILES strings:
                                                                                                                                                                                                                              SMILES
0                                                                                                                                                                      C[n+]1c2cc(N)ccc2cc2ccc(N)cc21.Nc1ccc2cc3ccc(N)cc3nc2c1.[Cl-]
1                                                                                                                                                      O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(Br)c([O-])c(Br)cc12.[Na+].[Na+]
2  CO[C@@H]1[C@@H](OC)[C@H](C)[C@@](O)(CC(=O)[O-])O[C@H]1[C@H](C)[C@H]1O[C@@]2(CC[C@@](C)([C@H]3CC[C@@](C)([C@@H]4O[C@@H]([C@H]5O[C@](C)(O)[C@H](C)C[C@@H]5C)C[C@@H]4OC4C[C@H](OC)[C@@H](OC)[C@H](C)O4)O3)O2)C[C@H](O)[C@H]1C.[NH4+]
3                                                         

[14:07:35] Explicit valence for atom # 2 Si, 8, is greater than permitted
[14:07:35] ERROR: Could not sanitize molecule ending on line 810094
[14:07:35] ERROR: Explicit valence for atom # 2 Si, 8, is greater than permitted


In [11]:
# Apply the validation function to filter out invalid SMILES
valid_smiles = smiles_df['SMILES'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['SMILES'])

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "tox21_cleaned.txt"
valid_smiles_df['SMILES'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")



Total number of valid data points: 9390
SMILES strings have been saved to tox21_cleaned.txt


### **PubChem**

In [12]:
# Read the text file
pubchem_file_path = "./pubchem_1k_smiles.txt"

# Read the file into a DataFrame, assuming the SMILES strings are in the first column
pubchem_df = pd.read_csv(pubchem_file_path, sep='\t', header=None, names=['SMILES'])

# Print the original number of data points
print("Initial DataFrame for pubchem_1k_smiles.txt:")
print(pubchem_df.head())
print(f"Total number of data points: {pubchem_df.shape[0]}")




Initial DataFrame for pubchem_1k_smiles.txt:
                                                         SMILES
0                        CN(c1ccccc1)c1ccccc1C(=O)NCC1(O)CCOCC1
1                    CC[NH+](CC)C1CCC([NH2+]C2CC2)(C(=O)[O-])C1
2                            COCC(CNC(=O)c1ccc2c(c1)NC(=O)C2)OC
3                                  OCCn1cc(CNc2cccc3c2CCCC3)nn1
4  CCCCCCc1ccc(C#Cc2ccc(C#CC3=CC=C(CCC)CC3)c(C3CCCCC3)c2)c(F)c1
Total number of data points: 999


In [13]:
# Apply the validation function to filter out invalid SMILES
valid_smiles = pubchem_df['SMILES'].apply(validate_smiles).dropna()

# Convert the Series of valid SMILES to a DataFrame
valid_smiles_df = pd.DataFrame(valid_smiles, columns=['SMILES'])

# Print the total number of valid data points
print(f"Total number of valid data points: {valid_smiles_df.shape[0]}")

# Save the valid SMILES strings to a text file
smiles_file = "pubchem_cleaned.txt"
valid_smiles_df['SMILES'].to_csv(smiles_file, index=False, header=False)

print(f"SMILES strings have been saved to {smiles_file}")

Total number of valid data points: 526
SMILES strings have been saved to pubchem_cleaned.txt
