# Chemical formula prediction with less than 1 Levenshtein distance

## Introduction
* InChI describes molecular information in terms of layers.
* So one approach to constructing an InChI description is to determine each layer one by one.
* I first determined molecular chiralities from molecular images using CNN (see this [notebook](https://www.kaggle.com/wineplanetary/step-by-step-detection-1-99-acc-chirality)).
* Next, I determine chemical formulas using CNN.
* The dataset used in this notebook is available [here](https://www.kaggle.com/wineplanetary/bms-arranged-label), and was produced by [this notebook](https://www.kaggle.com/wineplanetary/understanding-inchi-format-and-arrange-train-label).


## Chemical Formula
* I have already reduced all InChIs in the train dataset into atoms using [this notebook](https://www.kaggle.com/wineplanetary/understanding-inchi-format-and-arrange-train-label)
* I determine chemical formulas from images by solving a multivariate regression problem.
* I achieved a <span style="color: red; font-weight: bold;">Levenshtein distance of less than 1</span> for chemical formulas using half of the train dataset!
* More data should decrease the Levenshtein distance further.

## References and Acknowledgements
* Dataset
 * https://www.kaggle.com/wineplanetary/bms-arranged-label
* Notebook
 * https://www.kaggle.com/wineplanetary/understanding-inchi-format-and-arrange-train-label
 * https://www.kaggle.com/wineplanetary/step-by-step-detection-1-99-acc-chirality

In [None]:
import os
import glob
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

import tensorflow as tf

import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.experimental import CosineDecay
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Conv2D, BatchNormalization, MaxPool2D

import Levenshtein

In [None]:
class CFG:
    """Notebook-wide settings for sampling, image loading and training."""

    # Reproducibility: shared by every seeded call in the notebook.
    seed = 12345

    # How many label rows to sample and what fraction to hold out.
    num_data = 1000 # change to 1000000
    test_size = 0.05

    # Options forwarded to flow_from_dataframe.
    img_size = 380
    color_mode = "grayscale"
    interpolation = "nearest"
    class_mode = "raw"
    shuffle = True
    batch_size = 32

    # Width of the model's final Dense layer.
    n_CLASS = 12

    # Optimisation schedule.
    init_lr = 1e-3
    epochs = 5

In [None]:
# Seed every random source used by the notebook with the same configured value.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
random.seed(CFG.seed)
np.random.seed(CFG.seed)
tf.random.set_seed(CFG.seed)

In [None]:
# Input locations: the arranged label CSV and the training image directory.
chbtmspath = "../input/bms-arranged-label/arranged_bms_train_labels.csv"
train_dir = "../input/bms-molecular-translation/train"

In [None]:
# Read the arranged label table (image paths plus per-atom columns) from disk.
data_org = pd.read_csv(filepath_or_buffer=chbtmspath)

In [None]:
# The data was truncated to 1,000,000 rows to limit computation time; using
# more rows should improve accuracy.
# atom_list names the label columns; its order is also the order in which atoms
# are written when a formula string is built from a row of counts.
atom_list = ["C", "H", "B", "Br", "Cl", "F", "I", "N", "O", "P", "S", "Si"]
data = (
    data_org[["image_path"] + atom_list]
    .copy()
    .sample(n=CFG.num_data, random_state=CFG.seed)
)

In [None]:
def train_trans_func(image):
    """Return the training image with every value divided by 255."""
    max_pixel_value = 255.
    return image / max_pixel_value

def val_trans_func(image):
    """Return the validation image with every value divided by 255."""
    max_pixel_value = 255.
    return image / max_pixel_value

In [None]:
# One generator per split, each applying its own pixel-scaling hook.
datagen_val = ImageDataGenerator(preprocessing_function=val_trans_func)
datagen_train = ImageDataGenerator(preprocessing_function=train_trans_func)

In [None]:
def create_train_set(train):
    """Build the batched image iterator for the training split.

    Args:
        train: DataFrame with an ``image_path`` column and one label column
            per entry of ``atom_list``.

    Returns:
        The iterator returned by ``datagen_train.flow_from_dataframe``.
    """
    side = CFG.img_size
    flow_options = dict(
        directory=None,  # no base directory: image_path values are used as given
        seed=CFG.seed,
        x_col="image_path",
        y_col=atom_list,
        target_size=(side, side),
        class_mode=CFG.class_mode,
        interpolation=CFG.interpolation,
        shuffle=CFG.shuffle,
        color_mode=CFG.color_mode,
        batch_size=CFG.batch_size,
    )
    return datagen_train.flow_from_dataframe(train, **flow_options)
    
def create_val_set(val):
    """Build the batched image iterator for the validation split.

    The iterator is deliberately not shuffled. Validation is run for a fixed
    number of steps, so a shuffled iterator would score a different random
    subset every epoch and make ``val_loss`` values (used to pick the best
    checkpoint) not comparable between epochs.

    Args:
        val: DataFrame with an ``image_path`` column and one label column per
            entry of ``atom_list``.

    Returns:
        The iterator returned by ``datagen_val.flow_from_dataframe``.
    """
    val_set = datagen_val.flow_from_dataframe(val,
                                              directory=None,
                                              seed=CFG.seed,
                                              x_col="image_path",
                                              y_col=atom_list,
                                              target_size=(CFG.img_size, CFG.img_size),
                                              class_mode=CFG.class_mode,
                                              interpolation=CFG.interpolation,
                                              # Fixed order keeps the evaluated subset stable.
                                              shuffle=False,
                                              color_mode=CFG.color_mode,
                                              batch_size=CFG.batch_size)
    return val_set

In [None]:
# Hold out CFG.test_size of the sampled rows as the validation split.
train, val = train_test_split(
    data,
    test_size=CFG.test_size,
    random_state=CFG.seed,
)

In [None]:
# Wrap each split in its batched image iterator (validation first, then train).
valid_set = create_val_set(val)
train_set = create_train_set(train)

In [None]:
def create_model():
    """Assemble the convolutional network used for formula regression.

    The network has four stages of two 3x3 ReLU convolutions followed by batch
    normalisation, with 16, 32, 64 and 64 filters. The first three stages end
    in 2x2 max pooling; the last is followed by global average pooling and a
    ReLU Dense layer with ``CFG.n_CLASS`` outputs.

    Returns:
        An uncompiled ``Sequential`` model taking
        ``(CFG.img_size, CFG.img_size, 1)`` inputs.
    """
    def conv(filters, **extra):
        # Every convolution shares kernel size, activation and padding.
        return Conv2D(filters, 3, activation="relu", padding="same", **extra)

    def pool():
        return MaxPool2D(pool_size=(2, 2), strides=None, padding="valid")

    # Layers are instantiated in network order, matching the original build order.
    layers = [
        conv(16, input_shape=(CFG.img_size, CFG.img_size, 1)),
        conv(16),
        BatchNormalization(),
        pool(),
    ]
    for filters in (32, 64):
        layers.extend([conv(filters), conv(filters), BatchNormalization(), pool()])
    layers.extend([
        conv(64),
        conv(64),
        BatchNormalization(),
        GlobalAveragePooling2D(),
        Dense(CFG.n_CLASS, activation="relu"),
    ])
    return Sequential(layers)

# Build the network and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
# Whole batches available per pass over each split (any partial batch is dropped).
step_size_valid = valid_set.n // valid_set.batch_size
step_size_train = train_set.n // train_set.batch_size

In [None]:
# Rebuild the network and initialise it from the stored weights file.
model = create_model()
model.load_weights("../input/bms-models/bms_formula_model.h5")

# Cosine learning-rate decay spanning every optimiser step of the run.
lr = CosineDecay(initial_learning_rate=CFG.init_lr,
                 decay_steps=step_size_train * CFG.epochs)

model.compile(optimizer=Adam(learning_rate=lr),
              loss="mean_squared_error",
              metrics=["mean_squared_error"])

# Keep the weights from the epoch with the lowest validation loss.
checkpoint_cb = ModelCheckpoint("bms_formula_best_model.h5",
                                save_best_only=True,
                                monitor="val_loss",
                                mode="min")

# batch_size is intentionally not passed: train_set and valid_set already yield
# batches, and Keras documents that batch_size should not be specified for
# generator / Sequence inputs.
history = model.fit(train_set,
                    validation_data=valid_set,
                    epochs=CFG.epochs,
                    steps_per_epoch=step_size_train,
                    validation_steps=step_size_valid,
                    callbacks=[checkpoint_cb])

# Also save the model as it stands after the final epoch.
model.save("bms_formula_model.h5")

## check model accuracy in a simple way

In [None]:
def arr2formula(atom_arr, atoms=None):
    """Convert a sequence of per-atom counts into a formula string.

    Counts are paired positionally with atom symbols. A count is truncated to
    an integer before it is written: a truncated count of 1 emits the bare
    symbol, a larger one emits the symbol followed by the count, and anything
    below 1 (including NaN) is omitted. Branching on the truncated value keeps
    a count such as 1.5 from being rendered as ``"C1"``.

    Args:
        atom_arr: Iterable of numeric counts, one per atom symbol.
        atoms: Optional sequence of atom symbols to pair with ``atom_arr``.
            Defaults to the module-level ``atom_list``.

    Returns:
        The concatenated formula string, e.g. ``"C6H12NO2"``; empty when no
        count reaches 1.
    """
    symbols = atom_list if atoms is None else atoms
    parts = []
    for atom, num in zip(symbols, atom_arr):
        if num >= 2:
            parts.append("%s%s" % (atom, int(num)))
        elif num >= 1:
            parts.append(atom)
    return "".join(parts)

# Mean Levenshtein distance between predicted and true formulas on train data.
# The mean is taken over the samples actually scored rather than a size derived
# from the config, so it stays correct if the iterator holds a different number
# of rows than CFG.num_data * (1 - CFG.test_size).
total_distance = 0
n_samples = 0
n_batches = len(train_set)

# train_set yields batches indefinitely; zipping with a finite range stops after
# one pass without fetching an extra, unused batch.
for i, (test, formula_list) in tqdm(zip(range(n_batches), train_set), total=n_batches):
    predicts = np.round(model.predict(test))
    for predict, formula in zip(predicts, formula_list):
        predict_formula = arr2formula(predict)
        true_formula = arr2formula(formula)
        # Show every sample of the first 10 batches as a quick sanity check.
        if i < 10:
            print("predicted, true = %s, %s" % (predict_formula, true_formula))
        total_distance += Levenshtein.distance(predict_formula, true_formula)
        n_samples += 1

# Report 0. when nothing was scored instead of dividing by zero.
lsval = total_distance / n_samples if n_samples else 0.

print("The Levenshtein distance of train data is %s" % lsval)