# Introduction
* InChI describes molecular information as a sequence of layers.
* So one approach to constructing an InChI description is to determine the layers one by one.
* First, I determine the chirality of chemical substances, which is indicated by the InChI layer starting with the prefix "s".
* The dataset used in this code is available [here](https://www.kaggle.com/wineplanetary/bms-arranged-label); it was produced by [this notebook](https://www.kaggle.com/wineplanetary/understanding-inchi-format-and-arrange-train-label).

## Chirality
* The stereochemical layer has a sublayer giving the type of stereochemistry information, which always has the prefix "s".
* The presence of this stereochemistry sublayer (presumably) shows that a chemical substance has chirality.
* I determine the stereochemistry information first, using a CNN.
![chirality (from wikipedia)](https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Zwitterion-Alanine.png/1920px-Zwitterion-Alanine.png)
Image from Wikipedia.


## References and Acknowledgements
* Dataset
 * https://www.kaggle.com/wineplanetary/bms-arranged-label
* Notebook
 * https://www.kaggle.com/wineplanetary/understanding-inchi-format-and-arrange-train-label
* Others
 * https://ja.wikipedia.org/wiki/InChI

In [None]:
# Install the third-party packages this notebook relies on.
!pip install albumentations
# NOTE(review): the visible cells import EfficientNetB4 from
# tensorflow.keras.applications and never import this package — confirm
# whether this install is still needed.
!pip install -U efficientnet

In [None]:
import os
import glob
import json
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf

import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.experimental import CosineDecay
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import albumentations as A

In [None]:
class CFG:
    """Hyper-parameters and constants shared by the cells of this notebook."""

    # --- reproducibility ---
    seed = 12345

    # --- input images ---
    img_size = 380  # images are resized to img_size x img_size
    interpolation = "nearest"  # resize method handed to flow_from_dataframe
    norm_mean = [0.485, 0.456, 0.406]  # per-channel mean given to A.Normalize
    norm_std = [0.229, 0.224, 0.225]  # per-channel std given to A.Normalize

    # --- labels ---
    class_mode = "binary"  # label encoding requested from flow_from_dataframe
    n_CLASS = 2

    # --- optimisation ---
    batch_size = 16
    init_lr = 1e-3  # starting value of the cosine learning-rate schedule
    epochs = 3

In [None]:
# Seed every random number generator used here with CFG.seed for reproducibility.
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so setting it here
# only influences processes launched after this cell runs.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
random.seed(CFG.seed)
np.random.seed(CFG.seed)
tf.random.set_seed(CFG.seed)

In [None]:
# Input locations: the arranged label table (CSV) and the directory of
# training images.
chbtmspath = "../input/bms-arranged-label/arranged_bms_train_labels.csv"
train_dir = "../input/bms-molecular-translation/train"

In [None]:
# Load the full arranged label table; later cells pick the columns they need.
data_org = pd.read_csv(filepath_or_buffer=chbtmspath)

In [None]:
# Keep only the image path and attach the "s" layer flag as a string label
# (the label is compared against "1" / "0" in the following cell).
data = data_org[["image_path"]].assign(label=data_org["s_flg"].astype("str"))

In [None]:
# Build a class-balanced subset: keep every positive ("1") row, draw the same
# number of negative ("0") rows, then shuffle and cap the total at 50000 rows
# to keep the computation time manageable (more data would likely improve
# accuracy).
true_data = data[data["label"] == "1"]
false_data = data[data["label"] == "0"]
# DataFrame.sample raises ValueError when asked for more rows than exist, so
# never request more negatives than are available. If negatives are the
# minority class, the subset is left slightly imbalanced rather than crashing.
false_data = false_data.sample(n=min(len(true_data), len(false_data)),
                               random_state=CFG.seed)
data = pd.concat([true_data, false_data], ignore_index=True)
# Same guard for the overall cap: small inputs are shuffled and kept whole.
data = data.sample(n=min(50000, len(data)), random_state=CFG.seed)

In [None]:
# Albumentations pipelines for the two splits. Both currently contain only a
# per-channel normalisation with CFG.norm_mean / CFG.norm_std; no random
# augmentation is applied.
# NOTE(review): the tf.keras.applications EfficientNet models document that
# input preprocessing is built into the model and that raw [0, 255] pixels are
# expected, so this Normalize step may normalise the images twice — confirm
# against the TensorFlow version in use before changing it, since any saved
# weights depend on the preprocessing they were trained with.
train_augumentation = A.Compose(
    [
        A.Normalize(
            mean=CFG.norm_mean,
            std=CFG.norm_std,
            max_pixel_value=255,
            p=1.0,
        ),
    ]
)

val_augumentation = A.Compose(
    [
        A.Normalize(
            mean=CFG.norm_mean,
            std=CFG.norm_std,
            max_pixel_value=255,
            p=1.0,
        ),
    ]
)


def train_trans_func(image):
    """Run one image through the training pipeline and return the result.

    The image is cast to uint8 before being handed to ``train_augumentation``.
    This function is used as the ``preprocessing_function`` of the training
    ImageDataGenerator.
    """
    as_uint8 = image.astype(np.uint8)
    transformed = train_augumentation(image=as_uint8)
    return transformed["image"]

def val_trans_func(image):
    """Run one image through the validation pipeline and return the result.

    The image is cast to uint8 before being handed to ``val_augumentation``.
    This function is used as the ``preprocessing_function`` of the validation
    ImageDataGenerator.
    """
    as_uint8 = image.astype(np.uint8)
    transformed = val_augumentation(image=as_uint8)
    return transformed["image"]

In [None]:
# One Keras image generator per split; each applies its own albumentations
# wrapper to every loaded image.
datagen_train = ImageDataGenerator(
    preprocessing_function=train_trans_func,
)
datagen_val = ImageDataGenerator(
    preprocessing_function=val_trans_func,
)

In [None]:
def create_train_set(train):
    """Build the shuffled training batch iterator from a dataframe.

    Args:
        train: DataFrame with an "image_path" column (file paths, used as-is
            because ``directory`` is None) and a "label" column.

    Returns:
        The iterator produced by ``datagen_train.flow_from_dataframe``,
        yielding batches of ``CFG.batch_size`` images resized to
        ``CFG.img_size`` x ``CFG.img_size``.
    """
    flow_options = dict(
        directory=None,
        x_col="image_path",
        y_col="label",
        target_size=(CFG.img_size, CFG.img_size),
        interpolation=CFG.interpolation,
        class_mode=CFG.class_mode,
        batch_size=CFG.batch_size,
        shuffle=True,
        seed=CFG.seed,
    )
    return datagen_train.flow_from_dataframe(train, **flow_options)
    
def create_val_set(val):
    """Build the validation batch iterator from a dataframe.

    Args:
        val: DataFrame with an "image_path" column (file paths, used as-is
            because ``directory`` is None) and a "label" column.

    Returns:
        The iterator produced by ``datagen_val.flow_from_dataframe``,
        yielding batches of ``CFG.batch_size`` images resized to
        ``CFG.img_size`` x ``CFG.img_size`` in dataframe order.
    """
    val_set = datagen_val.flow_from_dataframe(val,
                                 directory = None,
                                 seed=CFG.seed,
                                 x_col = "image_path",
                                 y_col = "label",
                                 target_size = (CFG.img_size, CFG.img_size),
                                 class_mode = CFG.class_mode,
                                 interpolation = CFG.interpolation,
                                 # Validation data must not be shuffled: a fixed
                                 # order keeps the evaluated rows identical every
                                 # epoch (validation_steps may drop a trailing
                                 # partial batch) and lets predictions be matched
                                 # back to the rows of `val`.
                                 shuffle = False,
                                 batch_size = CFG.batch_size)
    return val_set

In [None]:
# Hold out 5% of the rows for validation, keeping the label ratio identical in
# both parts (stratified split) and the split reproducible via CFG.seed.
train, val = train_test_split(
    data,
    stratify=data["label"],
    test_size=0.05,
    random_state=CFG.seed,
)

In [None]:
# Create the batch iterators (validation first, then training, as before).
valid_set, train_set = create_val_set(val), create_train_set(train)

In [None]:
def create_model():
    """Return an uncompiled binary classifier built on EfficientNetB4.

    The network is an ImageNet-initialised EfficientNetB4 without its
    classification head, followed by global average pooling and a single
    sigmoid unit, so the output is one probability per image.
    """
    backbone = EfficientNetB4(
        weights="imagenet",
        include_top=False,
        input_shape=(CFG.img_size, CFG.img_size, 3),
        drop_connect_rate=0.6,
    )
    return Sequential([
        backbone,
        GlobalAveragePooling2D(),
        Dense(1, activation="sigmoid"),
    ])


# Build the model once and print its architecture.
model = create_model()
model.summary()

In [None]:
# Number of full batches per epoch for each iterator; floor division means a
# trailing partial batch is not counted.
step_size_train, step_size_valid = (
    flow.n // flow.batch_size for flow in (train_set, valid_set)
)

In [None]:
# Start from a freshly built model for training.
model = create_model()

# The model's final layer applies a sigmoid, so the loss receives
# probabilities rather than logits.
loss = BinaryCrossentropy(name="binary_crossentropy", from_logits=False)

# Cosine decay of the learning rate from CFG.init_lr, spread over every
# optimizer step of the whole run (steps per epoch x epochs).
lr = CosineDecay(
    decay_steps=step_size_train * CFG.epochs,
    initial_learning_rate=CFG.init_lr,
)

model.compile(
    loss=loss,
    metrics=["binary_accuracy"],
    optimizer=Adam(learning_rate=lr),
)

# Save a checkpoint only when the validation loss improves.
checkpoint_cb = ModelCheckpoint(
    "bms_s_best_model.h5",
    monitor="val_loss",
    mode="min",
    save_best_only=True,
)

# Training is disabled in this notebook; uncomment to train and save.
# NOTE(review): the Keras Model.fit docs say not to pass `batch_size` when the
# input is a generator/Sequence (it already yields batches) — consider
# dropping that argument before re-enabling this call.
# history = model.fit(train_set,
#                     validation_data = valid_set,
#                     epochs = CFG.epochs,
#                     batch_size = CFG.batch_size,
#                     steps_per_epoch = step_size_train,
#                     validation_steps = step_size_valid,
#                     callbacks=[checkpoint_cb])

# model.save("bms_s_model.h5")