In [1]:
import pandas as pd
import numpy as np
import os
from copy import copy
from tqdm import tqdm
from glob import glob
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow import keras

from rdkit.Chem import MolStandardize, MolFromSmiles, MolToSmiles

from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras.models import Sequential


2022-11-15 12:05:00.528373: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-15 12:05:00.622233: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
with open("../data/raw/ZINC_results.smi") as f:
    smiles = [s.split("\t")[0].rstrip() for s in f]
smiles[:4]


['c1ccc(cc1)[N-]S(=O)(=O)CCCCCCC(=O)NO',
 'c1cc(ccc1[N+](=O)[O-])OC[C@H]2CO2',
 'c1cc(ccc1[N+](=O)[O-])OC[C@@H]2CO2',
 'c1cc(c(cc1[N+](=O)[O-])[N+](=O)[O-])Cl']

In [3]:
one_hot_nature = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si', '#', '(', ')', '+', '-', '1', '2', '3',
               '4', '5', '6', '7', '8', '=', '[', ']', '@', 'c', 'n', 'o', 's', 'X', '.']
print("Nature Token size: ", len(one_hot_nature))
atoms = ["Al", "As", "B", "Br", "C", "Cl", "F", "H", "I", "K", "Li", "N", "Na", "O", "P", "S", "Se", "Si", "Te"]
special = ["(", ")", "[", "]", "=", "#", "%", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "+", "-", "se", "te", "c", "n", "o", "s"]
print("Molecular Informatics Token size: ", len(atoms) + len(special))

Nature Token size:  35
Molecular Informatics Token size:  44


In [4]:
class SmilesTokenizer(object):
    def __init__(self):
        atoms = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si']
        special = ['#', '(', ')', '+', '-', '1', '2', '3', '4', '5', '6', '7', '8', '=', '[', ']', '@', 'c', 'n', 'o', 's', 'X', '.']
        padding = ["G", "E"]

        self.table = sorted(atoms, key=len, reverse=True) + special + padding
        table_len = len(self.table)

        self.table_2_chars = list(filter(lambda x: len(x) == 2, self.table))
        self.table_1_chars = list(filter(lambda x: len(x) == 1, self.table))

        self.one_hot_dict = {}
        for i, symbol in enumerate(self.table):
            vec = np.zeros(table_len, dtype=np.float32)
            vec[i] = 1
            self.one_hot_dict[symbol] = vec

    def tokenize(self, smiles):
        smiles = smiles + " "
        N = len(smiles)
        token = []
        i = 0
        while i < N:
            c1 = smiles[i]
            c2 = smiles[i : i + 2]

            if c2 in self.table_2_chars:
                token.append(c2)
                i += 2
                continue

            if c1 in self.table_1_chars:
                token.append(c1)
                i += 1
                continue

            i += 1

        return token

    def one_hot_encode(self, smiles, pad_len=-1):
        one_hot = ['C', 'N', 'O', 'H', 'F', 'Cl', 'P', 'B', 'Br', 'S', 'I', 'Si', '#', '(', ')', '+', '-', '1', '2', '3',
                '4', '5', '6', '7', '8', '=', '[', ']', '@', 'c', 'n', 'o', 's', 'X', '.']
        
        smiles = smiles + '.'
        if pad_len < 0:
            vec = np.zeros((len(smiles), len(one_hot)))
        else:
            vec = np.zeros((pad_len, len(one_hot)))
        cont = True
        j = 0
        i = 0
        while cont:
            
            try:
                if smiles[i + 1] in ['r', 'i', 'l']:
                    sym = smiles[i:i + 2]
                    i += 2
                else:
                    sym = smiles[i]
                    i += 1
            except:
                print(f'smiles[i + 1] not working, value smiles {smiles}')
            if sym in one_hot:
                vec[j, one_hot.index(sym)] = 1
            else:
                vec[j, one_hot.index('X')] = 1
            j += 1
            if smiles[i] == '.' or j >= (pad_len - 1) and pad_len > 0:
                vec[j, one_hot.index('.')] = 1
                cont = False
        return vec


In [5]:
class Preprocessor(object):
    def __init__(self):
        self.normarizer = MolStandardize.normalize.Normalizer()
        self.lfc = MolStandardize.fragment.LargestFragmentChooser()

    def process(self, smi):
        mol = MolFromSmiles(smi)
        if mol:
            mol = self.normarizer.normalize(mol)
            mol = self.lfc.choose(mol)
            smi = MolToSmiles(mol, isomericSmiles=False, canonical=True)
            return smi
        else:
            return None


In [6]:
pp = Preprocessor()

print(f"input SMILES num: {len(smiles)}")
print("start preprocessing...")

smiles = [pp.process(smi) for smi in tqdm(smiles)]
# drop duplicates
smiles = list(set([s for s in smiles if s]))

# token limits (1 to 140)
st = SmilesTokenizer()
smiles_tokenized = [st.tokenize(smi) for smi in tqdm(smiles)]
smiles_processed = []

# err = 0
# err_tokens = []
# for i in range(len(smiles)):
#     if smiles[i] != "".join(smiles_tokenized[i]):
#         print("=====================================")
#         print(len(smiles[i]), " :", smiles[i])
#         print(len(smiles_tokenized[i]), " :" ,smiles_tokenized[i])
#         for char in smiles[i]:
#             if char not in smiles_tokenized[i]:
#                 err_tokens.append(char)
#         err += 1
# print("Error: ", err)
# print("Error Tokens: ", err_tokens)

for tokenized in smiles_tokenized:
    if 1 <= len(tokenized) <= 140:
        smiles_processed.append(tokenized)

print(f"output SMILES num: {len(smiles_processed)}")



input SMILES num: 5000
start preprocessing...


100%|██████████| 5000/5000 [00:03<00:00, 1263.42it/s]
100%|██████████| 4999/4999 [00:00<00:00, 62850.19it/s]

output SMILES num: 4999





In [7]:
def _pad(tokenized_smi):
    return (
        ["G"] + tokenized_smi + ["E"]
    )

def _padding(data):
    padded_smiles = [_pad(t_smi) for t_smi in data]
    return padded_smiles

In [8]:
# add paddings
print("".join(smiles_processed[0]))
smiles_processed = _padding(smiles_processed)
print("".join(smiles_processed[0]))


Cc1cc(C[NH+](C)Cc2ccc(F)cc2Cl)n[nH]1
GCc1cc(C[NH+](C)Cc2ccc(F)cc2Cl)n[nH]1E


In [16]:
# one hot encode
x, y = [], []

# for tp_smi in smiles_processed[:5]:
#     print("-----------------------------------")
#     print("".join(tp_smi[:-1]))
#     print("".join(tp_smi[1:]))

smiles_processed = smiles_processed[:50]

for tp_smi in smiles_processed:
    print("===========================")
    _x = st.one_hot_encode("".join(tp_smi[:-1]))
    x.append(_x)
    _y = st.one_hot_encode("".join(tp_smi[1:]))
    y.append(_y)

# Problem with different shapes

# x = np.array(x, dtype=np.float32)
# y = np.array(y, dtype=np.float32)
# print(x.shape)
# x




ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (50,) + inhomogeneous part.