## Converting raw SMILES data to canonicalized/cleaned data

In [None]:
#import
import pandas as pd
from rdkit import Chem

In [None]:
#load the raw SMILES table from disk and pull out its "smiles" column
smiles_df = pd.read_csv(filepath_or_buffer="smiles.csv")
smiles_only = smiles_df.loc[:, "smiles"]

In [None]:
#parse every SMILES string into an RDKit molecule object
#(MolFromSmiles gives None for strings it cannot parse, so the list may contain None)
molecules = []
for smile in smiles_only:
    molecules.append(Chem.MolFromSmiles(smile))



In [None]:
#using pickling because the conversion process is extremely memory-intensive:
#reload the molecule list from molecules.pkl instead of re-parsing the SMILES
#NOTE: pickle can execute arbitrary code - only load files you created yourself

import pickle

with open('molecules.pkl', mode='rb') as pkl_file:
    molecules = pickle.load(pkl_file)

In [None]:
#splitting the molecule list into seven consecutive slices:
#six slices of 100,000 entries each, and a seventh holding everything that is left

chunk_size = 100000

array_1, array_2, array_3, array_4, array_5, array_6 = (
    molecules[start:start + chunk_size]
    for start in range(0, 6 * chunk_size, chunk_size)
)
array_7 = molecules[6 * chunk_size:]

In [4]:
#examining the molecule list: how many entries does it hold?
#(len counts every list entry, including any None placeholders)

len(molecules)
#706862 total molecules


706862

In [None]:
#sanitizing each array using RDKit - converting mol to SMILES to mol
#isomericSmiles=False leaves stereo information out of the SMILES,
#canonical=True asks RDKit for its canonical SMILES form

sanitized_1 = []

for molecule in array_1:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_1.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_1.pkl', 'wb') as f:
    pickle.dump(sanitized_1, f)

In [6]:
#sanitizing chunk 2: mol -> stereo-free canonical SMILES -> mol
sanitized_2 = []

for molecule in array_2:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_2.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_2.pkl', 'wb') as f:
    pickle.dump(sanitized_2, f)

In [7]:
#sanitizing chunk 3: mol -> stereo-free canonical SMILES -> mol
sanitized_3 = []

for molecule in array_3:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_3.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_3.pkl', 'wb') as f:
    pickle.dump(sanitized_3, f)

In [None]:
#sanitizing chunk 4: mol -> stereo-free canonical SMILES -> mol
sanitized_4 = []

for molecule in array_4:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_4.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_4.pkl', 'wb') as f:
    pickle.dump(sanitized_4, f)

In [4]:
#sanitizing chunk 5: mol -> stereo-free canonical SMILES -> mol
sanitized_5 = []

for molecule in array_5:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_5.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_5.pkl', 'wb') as f:
    pickle.dump(sanitized_5, f)



In [5]:
#sanitizing chunk 6: mol -> stereo-free canonical SMILES -> mol
sanitized_6 = []

for molecule in array_6:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_6.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_6.pkl', 'wb') as f:
    pickle.dump(sanitized_6, f)



In [6]:
#sanitizing chunk 7: mol -> stereo-free canonical SMILES -> mol
sanitized_7 = []

for molecule in array_7:
    if molecule is None:
        continue  # nothing to sanitize for entries that are None
    Chem.SanitizeMol(molecule) # sanitize
    smiles_no_stereo = Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=True)
    mol_no_stereo = Chem.MolFromSmiles(smiles_no_stereo)
    # MolFromSmiles returns None when it cannot parse its input; keep such
    # results out of the list so the later MolToSmiles conversion cannot
    # fail on a None entry
    if mol_no_stereo is not None:
        sanitized_7.append(mol_no_stereo)

#save this chunk to disk so it can be reloaded without recomputing
with open('sanitized_7.pkl', 'wb') as f:
    pickle.dump(sanitized_7, f)



In [4]:
#reload sanitized chunk 1 from its pickle file (only unpickle files you created yourself)
with open('sanitized_1.pkl', mode='rb') as pkl_file:
    sanitized_1 = pickle.load(pkl_file)

In [5]:
#reload sanitized chunk 2 from its pickle file (only unpickle files you created yourself)
with open('sanitized_2.pkl', mode='rb') as pkl_file:
    sanitized_2 = pickle.load(pkl_file)

In [6]:
#reload sanitized chunk 3 from its pickle file (only unpickle files you created yourself)
with open('sanitized_3.pkl', mode='rb') as pkl_file:
    sanitized_3 = pickle.load(pkl_file)

In [7]:
#reload sanitized chunk 4 from its pickle file (only unpickle files you created yourself)
with open('sanitized_4.pkl', mode='rb') as pkl_file:
    sanitized_4 = pickle.load(pkl_file)

In [8]:
#reload sanitized chunk 5 from its pickle file (only unpickle files you created yourself)
with open('sanitized_5.pkl', mode='rb') as pkl_file:
    sanitized_5 = pickle.load(pkl_file)

In [9]:
#reload sanitized chunk 6 from its pickle file (only unpickle files you created yourself)
with open('sanitized_6.pkl', mode='rb') as pkl_file:
    sanitized_6 = pickle.load(pkl_file)

In [10]:
#reload sanitized chunk 7 from its pickle file (only unpickle files you created yourself)
with open('sanitized_7.pkl', mode='rb') as pkl_file:
    sanitized_7 = pickle.load(pkl_file)

In [None]:
#joining the seven sanitized chunks, in order, into one new list

sanitized = []
for part in (sanitized_1, sanitized_2, sanitized_3, sanitized_4,
             sanitized_5, sanitized_6, sanitized_7):
    sanitized.extend(part)

In [None]:
#remove duplicates
#RDKit Mol objects are compared/hashed by object identity, so set(sanitized)
#would keep every molecule; instead, treat two molecules as duplicates when
#they produce the same SMILES string from Chem.MolToSmiles.
#The first occurrence is kept, so the original order is preserved.
seen_smiles = set()
unique_molecules = []
for molecule in sanitized:
    if molecule is None:
        continue  # a None entry cannot be converted to SMILES
    smiles_key = Chem.MolToSmiles(molecule)
    if smiles_key not in seen_smiles:
        seen_smiles.add(smiles_key)
        unique_molecules.append(molecule)
sanitized = unique_molecules

In [None]:
#turning every sanitized molecule back into a SMILES string
#(the resulting list is what gets saved as the new csv, sanitized_smiles)

smiles_list = []
for mol in sanitized:
    smiles_list.append(Chem.MolToSmiles(mol))


In [19]:
#one-column table of SMILES strings, written to csv without the index column
df = pd.DataFrame({'SMILES': smiles_list})
df.to_csv(path_or_buf='sanitized_smiles.csv', index=False)