In [1]:
import tensorflow as tf
import re
import pickle
import sys
import os

# Expose only the GPU with index 4 to this process. The index is specific to
# the machine this notebook was run on -- adjust it for other hardware.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
gpus = tf.config.experimental.list_physical_devices("GPU")
# Turn on memory growth for every visible GPU so TensorFlow allocates device
# memory as needed instead of reserving all of it at start-up.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Background on the TFRecord format written by this notebook:
# https://www.tensorflow.org/tutorials/load_data/tfrecord

2024-05-06 11:05:01.991038: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-06 11:05:02.085959: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-06 11:05:02.086016: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-06 11:05:02.088195: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-06 11:05:02.102391: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Tokenizers for the SMILES (input) and IUPAC (target) vocabularies.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load tokenizer files that come from a trusted source.
# The files are opened in `with` blocks so the handles are closed right away.
with open("tokenizer_smiles.pkl", "rb") as tokenizer_file:
    tokenizer_SMILES = pickle.load(tokenizer_file)
with open("tokenizer_iupac.pkl", "rb") as tokenizer_file:
    tokenizer_IUPAC = pickle.load(tokenizer_file)

SMILES_max_length = 74  # length every SMILES token sequence is padded to
IUPAC_max_length = 133  # length every IUPAC token sequence is padded to
# Divided by num_chunk to get the number of examples per TFRecord file;
# presumably the number of lines in path_to_file -- TODO confirm.
Total_file_size = 102400
path_to_file = "Split_STOUT_IWOMI_data.txt"
num_chunk = 10  # Total number of training files
file_index = 0  # index used in the name of the first TFRecord file written

In [3]:
def preprocess_sentence(w: str) -> str:
    """Wrap a sequence in the ``<start>`` and ``<end>`` marker tokens.

    The text itself is passed through unchanged; only the two markers are
    added, each separated from the text by a single space.

    Args:
        w (str): Input sequence (a SMILES string or an IUPAC name).

    Returns:
        str: ``"<start> " + w + " <end>"``.
    """
    return "".join(["<start> ", w, " <end>"])


def create_dataset(line: str) -> list:
    """Split one tab-separated line and mark up each field.

    Surrounding whitespace (including the trailing newline) is stripped, the
    line is split on tab characters, and every resulting field is passed
    through ``preprocess_sentence``.

    Args:
        line (str): One line of the data file, with fields separated by tabs
            (SMILES first, then the IUPAC name).

    Returns:
        list: The preprocessed fields, in the order they appear in the line.
    """
    fields = line.strip().split("\t")
    return list(map(preprocess_sentence, fields))

In [4]:
def tokenize(
    smiles: str,
    IUPAC: str,
    SMILES_max_length: int = SMILES_max_length,
    IUPAC_max_length: int = IUPAC_max_length,
) -> tuple:
    """Convert one SMILES / IUPAC pair into fixed-length token-id arrays.

    Each string is encoded with its module-level tokenizer
    (``tokenizer_SMILES`` / ``tokenizer_IUPAC``) and then padded at the end
    ("post") to the requested length.

    Args:
        smiles (str): Preprocessed SMILES string.
        IUPAC (str): Preprocessed IUPAC name.
        SMILES_max_length (int): Length the SMILES sequence is padded to.
        IUPAC_max_length (int): Length the IUPAC sequence is padded to.

    Returns:
        tuple: ``(smiles_tensor, iupac_tensor)``, the padded results of
        ``pad_sequences`` for the single SMILES and IUPAC sequence.
    """
    pad = tf.keras.preprocessing.sequence.pad_sequences

    # Each string is wrapped in a one-element list because the tokenizer
    # API operates on batches of texts.
    smiles_tensor = pad(
        tokenizer_SMILES.texts_to_sequences([smiles]),
        maxlen=SMILES_max_length,
        padding="post",
    )
    iupac_tensor = pad(
        tokenizer_IUPAC.texts_to_sequences([IUPAC]),
        maxlen=IUPAC_max_length,
        padding="post",
    )
    return smiles_tensor, iupac_tensor

In [5]:
def _bytes_feature(value: bytes) -> tf.train.Feature:
    """Wrap a single bytes value in a ``tf.train.Feature``.

    Args:
        value (bytes): The payload. An eager tensor is also accepted and is
            converted with ``.numpy()`` first.

    Returns:
        tf.train.Feature: Feature whose ``bytes_list`` holds exactly ``value``.
    """
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # BytesList won't unpack a string from an EagerTensor, so extract
        # the underlying value before wrapping it.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

In [6]:
def get_feature(SMILES_tensor: tf.Tensor, IUPAC_tensor: tf.Tensor) -> bytes:
    """Serialize one SMILES / IUPAC pair as a ``tf.train.Example``.

    The raw bytes of each array are stored under the keys ``"input_smiles"``
    and ``"target_iupac"``. Only the bytes are stored, not the dtype or
    shape, so whatever reads the records back has to know both.

    Args:
        SMILES_tensor: Tokenized and padded SMILES array. Despite the
            annotation this must be a NumPy array (as returned by
            ``tokenize``), since ``tobytes()`` is called on it.
        IUPAC_tensor: Tokenized and padded IUPAC array; same requirement.

    Returns:
        bytes: The serialized ``tf.train.Example``.
    """
    feature = {
        # tobytes() replaces ndarray.tostring(), which is deprecated and no
        # longer available in NumPy 2.x; both produce the same bytes.
        "input_smiles": _bytes_feature(SMILES_tensor.tobytes()),
        "target_iupac": _bytes_feature(IUPAC_tensor.tobytes()),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature))
    serialized = example.SerializeToString()
    return serialized

In [7]:
# Output directory for the TFRecord shards; created on first run.
os.makedirs("Training_data", exist_ok=True)


def _write_shard(serialized_examples: list, shard_index: int) -> str:
    """Write serialized examples to one TFRecord file and return its path."""
    shard_path = (
        "Training_data/" + "train-" + path_to_file + "_%02d.tfrecord" % shard_index
    )
    # The context manager closes the writer, so the records are flushed to
    # disk before success is reported.
    with tf.io.TFRecordWriter(shard_path) as writer:
        for serialized_example in serialized_examples:
            writer.write(serialized_example)
    print("%s write to tfrecord success!" % shard_path)
    return shard_path


# Number of examples stored in each TFRecord file.
chunk_size = Total_file_size // num_chunk
if chunk_size < 1:
    raise ValueError("Total_file_size must be at least as large as num_chunk")

# NOTE: file_index comes from the configuration cell and is advanced here, so
# re-run the configuration cell before re-running this one to restart at 00.
processed_lines = []
with open(path_to_file, "r") as file:
    for line in file:
        if not line.strip():
            # A blank line has no SMILES / IUPAC pair to unpack.
            continue
        SMILES, IUPAC_names = create_dataset(line)
        SMILES_tensor, IUPAC_tensor = tokenize(SMILES, IUPAC_names)
        feature = get_feature(SMILES_tensor, IUPAC_tensor)
        processed_lines.append(feature)
        if len(processed_lines) == chunk_size:
            tfrecord_name = _write_shard(processed_lines, file_index)
            file_index = file_index + 1
            processed_lines = []

# Examples left over when the number of lines is not a multiple of chunk_size
# go into a final, smaller shard instead of being silently dropped.
if processed_lines:
    tfrecord_name = _write_shard(processed_lines, file_index)
    file_index = file_index + 1
    processed_lines = []

  "input_smiles": _bytes_feature(SMILES_tensor.tostring()),
  "target_iupac": _bytes_feature(IUPAC_tensor.tostring()),


Training_data/train-Split_STOUT_IWOMI_data.txt_00.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_01.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_02.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_03.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_04.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_05.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_06.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_07.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_08.tfrecord write to tfrecord success!
Training_data/train-Split_STOUT_IWOMI_data.txt_09.tfrecord write to tfrecord success!
