In [1]:
import os
import tensorflow as tf
from tensorflow.python.keras import layers, losses
import matplotlib.pyplot as plt

# Data location:

In [2]:
data_location = "../data/split"

first_category = "inorganic"
second_category = "organic"

data_folder = "name" + "-" + first_category + "-" + second_category

In [3]:
dataset_directory = os.path.join(data_location, data_folder)
train_directory = os.path.join(dataset_directory, "train")
test_directory = os.path.join(dataset_directory, "test")

print("In data:", os.listdir(dataset_directory))
print("In train:", os.listdir(train_directory))
print("In test:", os.listdir(test_directory))

In data: ['test', 'train']
In train: ['inorganic', 'inorganic-90%.txt', 'organic', 'organic-90%.txt']
In test: ['inorganic', 'inorganic-10%.txt', 'organic', 'organic-10%.txt']


### Sample file:

In [4]:
sample_file = os.path.join(train_directory, first_category, "ab.txt")

print("Sample file:", open(sample_file).read())

Sample file: manganito argÃ©ntico



# Data collection:

In [5]:
seed = 32 # Random seed for data shuffling and transformations
validation_split = 0.2 # Proportion of train data used to validate the model

In [6]:
raw_train_data_source = tf.keras.preprocessing.text_dataset_from_directory(
    train_directory,
    subset = "training",
    seed = seed,
    validation_split = validation_split,
)

Found 25888 files belonging to 2 classes.
Using 20711 files for training.


In [7]:
raw_validation_data_source = tf.keras.preprocessing.text_dataset_from_directory(
    train_directory,
    subset = "validation",
    seed = seed,
    validation_split = validation_split,
)

Found 25888 files belonging to 2 classes.
Using 5177 files for validation.


In [8]:
raw_test_data_source = tf.keras.preprocessing.text_dataset_from_directory(
    test_directory,
)

Found 2876 files belonging to 2 classes.


# Data pre-processing:

In [9]:
# Registers data_standarization

# TODO use model class to avoid code repetition

@tf.keras.utils.register_keras_serializable()
def data_standardization(input_data): # CH3-CH=CH-CH(NO2)Br
    input_data = tf.strings.lower(input_data) # ch3-ch=ch-ch(no2)br
    input_data = tf.strings.regex_replace(input_data, "[^a-zà-ú]", ' ') # ch  ch ch ch no  br
    return tf.strings.regex_replace(input_data, "\s+", ' ') # ch ch ch ch no br

In [10]:
max_features = 2048 # Sets a boundary for len(vectorize_layer.get_vocabulary())

In [11]:
sequence_length = 16 # Vectorized string's dimension

vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    output_mode = "int",
    max_tokens = max_features,
    standardize = data_standardization,
    output_sequence_length = sequence_length
)

In [12]:
# Makes a text-only dataset (without labels), then calls adapt
train_text = raw_train_data_source.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [13]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [14]:
# Retrieves a batch of 32 reviews and labels from the dataset
text_batch, label_batch = next(iter(raw_train_data_source))
first_review, first_label = text_batch[0], label_batch[0]

print("Review:", first_review)
print("Label:", raw_train_data_source.class_names[first_label])
print("Vectorized review:", vectorize_text(first_review, first_label))

Review: tf.Tensor(b'3-cloro-3-fluoro-1-nitropropano\r\n', shape=(), dtype=string)
Label: organic
Vectorized review: (<tf.Tensor: shape=(1, 16), dtype=int64, numpy=
array([[  5,   6, 248,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]], dtype=int64)>, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


In [15]:
print("Vocabulary size:", len(vectorize_layer.get_vocabulary()))

Vocabulary size: 2048


In [16]:
print("Vocabulary:", sorted(vectorize_layer.get_vocabulary(), key = len))

Vocabulary: ['', 'v', 'i', 'n', 'l', 'p', 'o', 'm', 's', 'g', 'de', 'ol', 'en', 'ii', 'iv', 'in', 'vi', 'di', 'iii', 'eno', 'ona', 'bis', 'ino', 'vii', 'but', 'oro', 'hex', 'oxo', 'sec', 'oct', 'éter', 'inil', 'enil', 'etil', 'tris', 'dien', 'diol', 'prop', 'enal', 'cinc', 'buta', 'pent', 'inal', 'diin', 'yodo', 'viii', 'hexa', 'boro', 'hept', 'terc', 'neón', '[UNK]', 'bromo', 'cloro', 'metil', 'amina', 'nitro', 'amino', 'ácido', 'óxido', 'dieno', 'cromo', 'plomo', 'cobre', 'osmio', 'bario', 'sodio', 'talio', 'diino', 'trien', 'galio', 'radio', 'plata', 'litio', 'cesio', 'renio', 'penta', 'rodio', 'indio', 'cerio', 'itrio', 'dioro', 'triol', 'butil', 'tulio', 'curio', 'torio', 'diona', 'fenil', 'erbio', 'hepta', 'etano', 'etilo', 'ciano', 'eteno', 'butan', 'buten', 'hexen', 'etoxi', 'etino', 'xenón', 'vinil', 'radón', 'lejía', 'hexan', 'hasio', 'fenol', 'fluoro', 'etinil', 'propil', 'yoduro', 'etenil', 'níquel', 'hierro', 'estaño', 'dienil', 'yodato', 'uranio', 'borato', 'niobio', 'cal

In [17]:
train_data_source = raw_train_data_source.map(vectorize_text)
validation_data_source = raw_validation_data_source.map(vectorize_text)
test_data_source = raw_test_data_source.map(vectorize_text)

In [18]:
AUTOTUNE = tf.data.experimental.AUTOTUNE # ??

train_data_source = train_data_source.cache().prefetch(buffer_size = AUTOTUNE)
validation_data_source = validation_data_source.cache().prefetch(buffer_size = AUTOTUNE)
test_data_source = test_data_source.cache().prefetch(buffer_size = AUTOTUNE)

# Model creation:

In [19]:
embedding_dim = 2 ** 10  # 2 ** 10 seems to be optimal, 2 ** 4 was default

In [20]:
def create_model ():
  model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.Dense(256, activation = "relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1),
  ])
  
  model.compile(
    optimizer = "adam",
    metrics = tf.metrics.BinaryAccuracy(threshold = 0.0),
    loss = losses.BinaryCrossentropy(from_logits = True),
  )
  
  return model

model = create_model()
  

### Training:

In [None]:
epochs = 5

history = model.fit(
    epochs = epochs,
    x = train_data_source,
    validation_data = validation_data_source,
)

Epoch 1/5

### Compiling:

In [None]:
export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation("sigmoid")
])

export_model.compile(
    optimizer = "adam", 
    metrics = ["accuracy"],
    loss = losses.BinaryCrossentropy(from_logits = False), 
)

# Model evaluation:

### Using test data:

In [None]:
loss, accuracy = model.evaluate(test_data_source)

print("Test loss:", loss)
print("Test accuracy:", accuracy)

### Using raw test data:

In [None]:
loss, accuracy = export_model.evaluate(raw_test_data_source)

print("Raw test loss:", loss)
print("Raw test accuracy:", accuracy)

### Accuracy graph:

In [None]:
history_dict = history.history
history_dict.keys()

accuracy = history_dict["binary_accuracy"]
validation_accuracy = history_dict["val_binary_accuracy"]
loss = history_dict["loss"]
validation_loss = history_dict["val_loss"]

epochs = range(1, len(accuracy) + 1)

plt.plot(epochs, accuracy, "bo", label = "Training accuracy") # Blue dots
plt.plot(epochs, validation_accuracy, "b", label = "Validation accuracy") # Blue line
plt.title("Training and validation accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(loc = "lower right")

plt.show()

### Loss graph:

In [None]:
plt.plot(epochs, loss, "ro", label = "Training loss") # Red dots
plt.plot(epochs, validation_loss, "r", label = "Validation loss") # Red line
plt.title("Training and validation loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()

plt.show()

# Export model to a file

In [None]:
export_model.save("models/" + data_folder + "-model", save_format='tf')

# Model predictions:

In [None]:
def predict_categories(examples):
    for example in examples:
        prediction = export_model.predict([example])[0][0] * 100
        category = first_category if prediction < 50 else second_category
        print(category, "(" + "%.2f" % prediction + "%):", example)

### Inorganic formulas:

In [None]:
inorganic_formulas = [
    "mn2(Hpo3)3",
    "NaCl",
    "H2SO4",
    "CL203",
    "znCO2",
    "H2O",
    "FeNA2",
    "G2S2O5",
    "Cl2O⁺H2O",
]

predict_categories(inorganic_formulas)

### Inorganic names:

In [None]:
inorganic_names = [
    "arsenito diacido de sodio",
    "hipoclorito de sodio",
    "potasiuro de boro",
    "cloruro de sodio",
    "acido disulfuroso",
    "sulfurico"
]

predict_categories(inorganic_names) # 98.21%

### Organic formulas:

In [None]:
organic_formulas = [
    "Ch3ChCh(Ch3ChCh3)ChCh",
    "H3C-CH2",
    "H3C-CH2-CH2",
    "h c c h",
    "h c c h h c c h",
    "CH3-CO-O-CH2-CH3",
    "CH3-CH2-O-CH2-CH3",
    "CH3-CH2-CH=CH-COOH",
    "ch3chch2ch(ch2ch2ch3)cooh",
    "ch3(Ch3)Chch2Ch(Ch3)Ch2Ch(Ch2Ch2Ch3)Ch3",
]

predict_categories(organic_formulas)

### Organic names:

In [None]:
organic_names = [
    "arsano",
    "benceno",
    "naftaleno",
    "2-cloropentanato",
    "cloruro de propilo",
    "di 2-cloropentanil éter",
    "2-bromo-2-cloropropano",
    "metanoato de isopropilo",
    "orto-difenilciclohexano",
    "2-bromo-2-cloropropil yododecil éter",
    "3-cloro-2-fluoro-hexa-1,3-dien-5-in-1-ona",
    "4-amino-2,6,6-tricloro-7,7-difluoro-89-metil-3-nitro-1,1-diyodononaconta-1,3-dien-5-ona",
]

predict_categories(organic_names)