In [1]:
import tensorflow as tf
from tensorflow.python.keras import layers, losses
import matplotlib.pyplot as plt
import os

# Data location:

In [2]:
data_folder = "organic-formula-name"

dataset_directory = os.path.join(os.path.dirname("data"), "data", data_folder)
train_directory = os.path.join(dataset_directory, "train")
test_directory = os.path.join(dataset_directory, "test")

print("In data:", os.listdir(dataset_directory))
print("In train:", os.listdir(train_directory))
print("In test:", os.listdir(test_directory))

In data: ['formulas.txt', 'names.txt', 'test', 'train']
In train: ['formula', 'formulas-90%.txt', 'name', 'names-90%.txt']
In test: ['formula', 'formulas-10%.txt', 'name', 'names-10%.txt']


### Sample file:

In [3]:
sample_file = os.path.join(train_directory, "formula", "xaa.txt")

print(open(sample_file).read())

FileNotFoundError: [Errno 2] No such file or directory: 'data\\organic-formula-name\\train\\formula\\xaa.txt'

# Data collection:

In [None]:
seed = 32 # Random seed for data shuffling and transformations
validation_split = 0.2 # Proportion of train data used to validate the model

In [None]:
raw_train_data_source = tf.keras.utils.text_dataset_from_directory(
    train_directory,
    subset = "training",
    seed = seed,
    validation_split = validation_split,
)

In [None]:
raw_validation_data_source = tf.keras.utils.text_dataset_from_directory(
    train_directory,
    subset = "validation",
    seed = seed,
    validation_split = validation_split,
)

In [None]:
raw_test_data_source = tf.keras.utils.text_dataset_from_directory(
    test_directory,
)

# Data pre-processing:

In [None]:
def data_standardization(input_data):
    input_data = tf.strings.lower(input_data) # CH3-CH=CH-CH(NO2)Br -> ch3-ch=ch-ch(no2)br
    input_data = tf.strings.regex_replace(input_data, "[^a-zà-ú]", ' ') # ch3-ch=ch-ch(no2)br -> ch  ch ch ch no  br
    return tf.strings.regex_replace(input_data, "\s+", ' ') # ch  ch ch ch no  br  -> ch ch ch ch no br

In [None]:
max_features = 2048 # Sets a boundary for len(vectorize_layer.get_vocabulary())

In [None]:
sequence_length = 16 # Vectorized string's dimension

vectorize_layer = layers.TextVectorization(
    output_mode = "int",
    max_tokens = max_features,
    standardize = data_standardization,
    output_sequence_length = sequence_length
)

In [None]:
# Makes a text-only dataset (without labels), then calls adapt
train_text = raw_train_data_source.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [None]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [None]:
# Retrieves a batch of 32 reviews and labels from the dataset
text_batch, label_batch = next(iter(raw_train_data_source))
first_review, first_label = text_batch[0], label_batch[0]

print("Review:", first_review)
print("Label:", raw_train_data_source.class_names[first_label])
print("Vectorized review:", vectorize_text(first_review, first_label))

In [None]:
print("Vocabulary size:", len(vectorize_layer.get_vocabulary()))

In [None]:
print("Vocabulary:", sorted(vectorize_layer.get_vocabulary(), key = len))

In [None]:
train_data_source = raw_train_data_source.map(vectorize_text)
validation_data_source = raw_validation_data_source.map(vectorize_text)
test_data_source = raw_test_data_source.map(vectorize_text)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE # ??

train_data_source = train_data_source.cache().prefetch(buffer_size = AUTOTUNE)
validation_data_source = validation_data_source.cache().prefetch(buffer_size = AUTOTUNE)
test_data_source = test_data_source.cache().prefetch(buffer_size = AUTOTUNE)

# Model creation:

In [None]:
embedding_dim = 16 # ??

In [None]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.Dense(256, activation = "relu"),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1),
])

### Loss function and optimizer:

In [None]:
model.compile(
    optimizer = "adam",
    metrics = tf.metrics.BinaryAccuracy(threshold = 0.0),
    loss = losses.BinaryCrossentropy(from_logits = True),
)

### Training:

In [None]:
epochs = 5

history = model.fit(
    epochs = epochs,
    x = train_data_source,
    validation_data = validation_data_source,
)

### Compiling:

In [None]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation("sigmoid")
])

export_model.compile(
    optimizer = "adam", 
    metrics = ["accuracy"],
    loss = losses.BinaryCrossentropy(from_logits = False), 
)

# Model evaluation:

### Using test data:

In [None]:
loss, accuracy = model.evaluate(test_data_source)

print("Test loss:", loss)
print("Test accuracy:", accuracy)

### Using raw test data:

In [None]:
loss, accuracy = export_model.evaluate(raw_test_data_source)

print("Raw test loss:", loss)
print("Raw test accuracy:", accuracy)

### Accuracy graph:

In [None]:
history_dict = history.history
history_dict.keys()

accuracy = history_dict["binary_accuracy"]
validation_accuracy = history_dict["val_binary_accuracy"]
loss = history_dict["loss"]
validation_loss = history_dict["val_loss"]

epochs = range(1, len(accuracy) + 1)

plt.plot(epochs, accuracy, "bo", label = "Training accuracy") # Blue dots
plt.plot(epochs, validation_accuracy, "b", label = "Validation accuracy") # Blue line
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(loc = "lower right")

plt.show()

### Loss graph:

In [None]:
plt.plot(epochs, loss, "ro", label = "Training loss") # Red dots
plt.plot(epochs, validation_loss, "r", label = "Validation loss") # Red line
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()

plt.show()

# Model predictions:

In [None]:
examples = [
    "H3C-CH2",
    "ChCh",
    "CH3-CO-O-CH2-CH3",
    "CH3-CH2-O-CH2-CH3",
    "CH3-CH2-CH=CH-COOH",
    "ch3chch2ch(ch2ch2ch3)cooh",
    "ch3(Ch3)Chch2Ch(Ch3)Ch2Ch(Ch2Ch2Ch3)Ch3",
    "benceno",
    "2-cloropentanato",
    "di 2-cloropentanil éter",
    "2-bromo-2-cloropropano",
    "metanoato de isopropilo",
    "orto-difenilciclohexano",
    "2-bromo-2-cloropropil yododecil éter",
    "3-cloro-2-fluoro-hexa-1,3-dien-5-in-1-ona",
    "4-amino-2,6,6-tricloro-7,7-difluoro-89-metil-3-nitro-1,1-diyodononaconta-1,3-dien-5-ona",
]

for example in examples:
    percentage = export_model.predict([example])[0][0] * 100
    print("Formula" if percentage < 50 else "Name", "(" + "%.2f" % percentage + " %):", example)