# In [None]:
import pandas as pd
from rdkit import Chem
import pickle

def sanitize_smiles(smiles_list):
    """Parse SMILES entries and rebuild them from stereo-free canonical SMILES.

    Each accepted entry is parsed, sanitized, written back out with
    ``isomericSmiles=False, canonical=True`` and parsed again, so the
    returned molecule is built from a SMILES string without stereo
    annotations.

    Args:
        smiles_list: Iterable of SMILES entries. Entries that are not
            strings (for example the float NaN pandas produces for an
            empty CSV cell, or ``None``) and strings that
            ``Chem.MolFromSmiles`` rejects are skipped.

    Returns:
        list: The re-parsed molecule for every accepted entry, in input
        order.
    """
    sanitized = []
    for smile in smiles_list:
        # A non-string argument makes Chem.MolFromSmiles raise rather than
        # return None, so filter those out before calling into RDKit.
        if not isinstance(smile, str):
            continue
        molecule = Chem.MolFromSmiles(smile)
        if molecule is None:
            continue
        # NOTE(review): MolFromSmiles appears to sanitize by default, so this
        # call may be redundant; it is kept to preserve existing behavior.
        Chem.SanitizeMol(molecule)
        smiles_no_stereo = Chem.MolToSmiles(
            molecule, isomericSmiles=False, canonical=True
        )
        sanitized.append(Chem.MolFromSmiles(smiles_no_stereo))
    return sanitized

def sanitize_noncanon(smiles_list):
    """Parse SMILES entries and rebuild them from stereo-free non-canonical SMILES.

    Each accepted entry is parsed, sanitized, written back out with
    ``isomericSmiles=False, canonical=False`` and parsed again, so the
    returned molecule is built from a SMILES string without stereo
    annotations and without canonical atom ordering applied by the writer.

    Args:
        smiles_list: Iterable of SMILES entries. Entries that are not
            strings (for example the float NaN pandas produces for an
            empty CSV cell, or ``None``) and strings that
            ``Chem.MolFromSmiles`` rejects are skipped.

    Returns:
        list: The re-parsed molecule for every accepted entry, in input
        order.
    """
    sanitized_noncanon = []
    for smile in smiles_list:
        # A non-string argument makes Chem.MolFromSmiles raise rather than
        # return None, so filter those out before calling into RDKit.
        if not isinstance(smile, str):
            continue
        molecule = Chem.MolFromSmiles(smile)
        if molecule is None:
            continue
        # NOTE(review): MolFromSmiles appears to sanitize by default, so this
        # call may be redundant; it is kept to preserve existing behavior.
        Chem.SanitizeMol(molecule)
        smiles_no_stereo_noncanon = Chem.MolToSmiles(
            molecule, isomericSmiles=False, canonical=False
        )
        sanitized_noncanon.append(Chem.MolFromSmiles(smiles_no_stereo_noncanon))
    return sanitized_noncanon

# Number of CSV rows parsed per pandas chunk.
chunk_size = 10**6
input_file = '../msse-Chem277B-Project/dataset/smiles.csv'

# Maps canonical stereo-free SMILES -> (canonical mol, non-canonical mol).
# Keying on the SMILES *string* is what makes de-duplication work: RDKit Mol
# objects do not appear to define value-based equality/hashing, so putting
# them in a set() removes nothing and only scrambles their order, which
# would break the row-by-row pairing of the two output columns.
# A dict keeps first-seen order, so the output is deterministic.
unique_pairs = {}

# Create/overwrite both pickle files. Every chunk is dumped as its own pickle
# record, so reading them back requires repeated pickle.load() calls.
with open('sanitized.pkl', 'wb') as sanitized_file, \
        open('sanitized_noncanon.pkl', 'wb') as sanitized_noncanon_file:
    for chunk in pd.read_csv(input_file, chunksize=chunk_size):
        smiles_list = chunk["smiles"].tolist()
        sanitized_chunk = sanitize_smiles(smiles_list)
        sanitized_noncanon_chunk = sanitize_noncanon(smiles_list)

        # Both helpers are expected to skip exactly the same inputs; pairing
        # by position is only valid when that holds.
        if len(sanitized_chunk) != len(sanitized_noncanon_chunk):
            raise ValueError(
                "canonical and non-canonical results differ in length "
                f"({len(sanitized_chunk)} != {len(sanitized_noncanon_chunk)})"
            )

        pickle.dump(sanitized_chunk, sanitized_file)
        pickle.dump(sanitized_noncanon_chunk, sanitized_noncanon_file)

        for mol, mol_noncanon in zip(sanitized_chunk, sanitized_noncanon_chunk):
            # A failed SMILES round-trip would surface here as None; such a
            # pair cannot be written out as SMILES, so leave it out.
            if mol is None or mol_noncanon is None:
                continue
            canonical_smiles = Chem.MolToSmiles(
                mol, isomericSmiles=False, canonical=True
            )
            # First occurrence of each canonical SMILES wins.
            unique_pairs.setdefault(canonical_smiles, (mol, mol_noncanon))

# Unique molecules, index-aligned between the two lists.
all_sanitized = [pair[0] for pair in unique_pairs.values()]
sanitized_noncanon = [pair[1] for pair in unique_pairs.values()]
smiles_list = list(unique_pairs)
smiles_noncanon = [
    Chem.MolToSmiles(molecule, isomericSmiles=False, canonical=False)
    for molecule in sanitized_noncanon
]

df = pd.DataFrame({'SMILES': smiles_list, 'SMILES_NONCANON': smiles_noncanon})
df.to_csv('sanitized_noncanon_smiles.csv', index=False)
