In [None]:
!pip install -q pydot graphviz

In [None]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Hyperparameters

In [None]:
# --- Training schedule -------------------------------------------------------
EPOCHS = 10              # `epochs` passed to model.fit
TRAINING_STEPS = 1000    # `steps_per_epoch` passed to model.fit
VALIDATION_STEPS = 100   # `validation_steps` passed to model.fit
EVALUATION_STEPS = 100   # `steps` passed to model.evaluate

# --- Input pipeline ----------------------------------------------------------
SHUFFLE_BUFFER = 200     # buffer size for Dataset.shuffle
BATCH_SIZE = 64          # examples per batch
IMAGE_DIM = 256          # images are resized to IMAGE_DIM x IMAGE_DIM

# Element symbols whose atom counts the model predicts; the order defines the
# order of the label columns and of the model outputs.
ATOMS_TO_COUNT = ["C", "H", "O", "S", "N", "Br", "F", "Cl", "P", "Si", "B", "I"]

# Dataset

In [None]:
# Root directory of the competition data. The notebook reads train_labels.csv and
# sample_submission.csv from here, and images from its "train" / "test" sub-directories.
INPUT_PATH = "../input/bms-molecular-translation"

In [None]:
# Load the training labels (columns used below: image_id, InChI) and preview them.
raw_train_df = pd.read_csv(os.path.join(INPUT_PATH, "train_labels.csv"))
raw_train_df.head()

In [None]:
# Relative image path: the first three characters of the id are nested
# directory names, e.g. "abcdef" -> "a/b/c/abcdef.png".
raw_train_df["ImagePath"] = raw_train_df["image_id"].map(
    lambda image_id: os.path.join(image_id[0], image_id[1], image_id[2], f"{image_id}.png")
)

In [None]:
# Check that the new ImagePath column looks right.
raw_train_df.head()

In [None]:
# Distribution of the number of "/"-separated layers in each InChI string
# (everything after the "InChI=" prefix).
raw_train_df.InChI.map(lambda x: x.split("=")[1].split("/")).map(len).value_counts()

In [None]:
# Rows whose InChI string has exactly 11 layers.
raw_train_df[raw_train_df.InChI.map(lambda x: x.split("=")[1].split("/")).map(len)==11]

In [None]:
# Layers of one specific row.
# NOTE(review): index 774948 is hard-coded — presumably picked from the 11-layer rows; confirm.
raw_train_df.InChI[774948].split("=")[1].split("/")

In [None]:
# Value counts of the first layer (the part right after "InChI=").
raw_train_df.InChI.map(lambda x: x.split("=")[1].split("/")[0]).value_counts()

In [None]:
# Value counts of the second layer, which the rest of the notebook parses as the chemical formula.
raw_train_df.InChI.map(lambda x: x.split("=")[1].split("/")[1]).value_counts()

In [None]:
# Demo of the count-normalising regex: an element symbol that is immediately followed by
# another symbol (or by the end of the string) has no explicit count, so "1" is appended.
# For this input the result is "C15H18Br1N5O2S1S1S1".
re.sub(r"([A-Z][a-z]*)(?=[A-Z]|$)",r"\g<1>1","C15H18BrN5O2SSS")

In [None]:
# One match per element: the symbol (capital + optional lowercase letters) and its optional count.
_ELEMENT_COUNT_RE = re.compile(r"([A-Z][a-z]*)(\d*)")


def _parse_formula(formula):
    """Return ``{element symbol: atom count}`` for a formula such as ``"C15H18BrN5O2S"``.

    A symbol without an explicit number counts as 1. If a symbol occurs more than
    once its counts are summed. Counts are returned as ``int``.
    """
    counts = {}
    for symbol, digits in _ELEMENT_COUNT_RE.findall(formula):
        counts[symbol] = counts.get(symbol, 0) + (int(digits) if digits else 1)
    return counts


# The formula is the second "/"-separated layer of the InChI string.
# Result: a list with one {symbol: count} dict per training row.
train_molecule_names = [
    _parse_formula(inchi.split("=")[1].split("/")[1])
    for inchi in raw_train_df.InChI
]

In [None]:
# One column per element symbol seen in the data; an element absent from a
# molecule is recorded as a count of 0.
names_df = pd.DataFrame(train_molecule_names).fillna(0)
names_df

In [None]:
# Summary statistics of the per-element count table.
names_df.describe()

In [None]:
# Attach the integer atom counts to the label frame (joined on the row index).
processed_train_df = raw_train_df.join(names_df.astype(int))

In [None]:
# Preview the combined frame: original columns plus one count column per element.
processed_train_df.head()

In [None]:
# Drop the intermediates to free kernel memory; only processed_train_df is used from here on.
# The cells above must be re-run before these names can be used again.
del raw_train_df, names_df, train_molecule_names

### Normalization

In [None]:
# Largest count of each element in ATOMS_TO_COUNT (same order), as a float array.
# Used to scale labels for training and to scale predictions back to counts.
# Computed with .max() on the label columns only, instead of a full describe() of the frame.
label_max = processed_train_df[ATOMS_TO_COUNT].max().astype(float).values

In [None]:
# Scale every label column by its maximum (label_max).
# NOTE: this overwrites the columns in place, so the cell is not idempotent — running it
# twice divides twice, and recomputing label_max afterwards yields the maxima of the scaled data.
processed_train_df[ATOMS_TO_COUNT] = processed_train_df[ATOMS_TO_COUNT]/label_max

In [None]:
# Summary statistics after scaling, transposed so that each column is one row.
processed_train_df.describe().T

### Train/Validation Split

In [None]:
# Split the labelled rows into a training and a validation frame. random_state=0 fixes the
# shuffle; no test_size is given, so scikit-learn's default proportion applies.
train_df, val_df = train_test_split(processed_train_df, random_state=0)

In [None]:
def parse_image(file_path, labels, data_dir):
    """Load one image from disk and return it together with its labels.

    Meant to be called through ``tf.py_function``: ``file_path`` and ``data_dir``
    must be eager string tensors, because ``.numpy()`` is called on both.

    Args:
        file_path: scalar string tensor, image path relative to the split directory.
        labels: returned unchanged as the second element of the result.
        data_dir: scalar string tensor, sub-directory of INPUT_PATH that holds the images.

    Returns:
        ``(image, labels)`` where ``image`` is a single-channel float32 tensor
        resized to ``(IMAGE_DIM, IMAGE_DIM)``.
    """
    full_path = os.path.join(
        INPUT_PATH,
        data_dir.numpy().decode('utf-8'),
        file_path.numpy().decode('utf-8'),
    )
    img = tf.io.read_file(full_path)
    # The image paths end in ".png", so use the PNG decoder explicitly rather than
    # relying on decode_jpeg happening to accept PNG input.
    img = tf.image.decode_png(img, channels=1)
    # uint8 -> float32, rescaled to [0, 1].
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (IMAGE_DIM, IMAGE_DIM))
    return img, labels

In [None]:
# Training pipeline: (relative path, scaled counts) -> decoded image + labels,
# then shuffled, batched and prefetched. parse_image runs through tf.py_function
# because it calls .numpy() on its inputs.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_df.ImagePath, train_df[ATOMS_TO_COUNT]))
    .map(
        lambda file_path, labels: tf.py_function(
            parse_image, [file_path, labels, "train"], [tf.float32, tf.float64]
        ),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    .shuffle(SHUFFLE_BUFFER)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Pull a single batch to check that the pipeline runs and to inspect the tensors.
list(train_dataset.take(1))

In [None]:
# Validation pipeline. The images live in the same "train" directory because val_df
# is a split of the labelled training data.
# Deliberately NOT shuffled: validation then always iterates the same batches in the same
# order, so val_loss (used for checkpointing and early stopping) is comparable across epochs.
val_dataset = tf.data.Dataset.from_tensor_slices((val_df.ImagePath, val_df[ATOMS_TO_COUNT]))
val_dataset = val_dataset.map(
    lambda file_path, labels: tf.py_function(parse_image, [file_path, labels, "train"], [tf.float32, tf.float64]),
    num_parallel_calls=tf.data.experimental.AUTOTUNE
)
val_dataset = val_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Show up to 32 images of one training batch on a 4x8 grid; each title is the
# label vector scaled back to atom counts (order follows ATOMS_TO_COUNT).
images, labels = next(iter(train_dataset.take(1)))
label_rows = np.array(labels)

plt.figure(figsize=(40, 20))
for idx in range(min(32, len(images))):
    plt.subplot(4, 8, idx + 1)
    plt.imshow(images[idx])
    plt.axis("off")
    plt.title(label_rows[idx] * label_max)

In [None]:
# Shapes of the label and image batches taken above.
labels.shape, images.shape

# Model

In [None]:
# Convolutional feature extractor shared by all per-element regressors:
# five Conv -> MaxPool -> Dropout stages followed by a final convolution and Flatten.
# Every convolution uses a ReLU activation; without one the convolutions are purely
# linear maps and max-pooling would be the only non-linearity in the extractor.
# The input size comes from IMAGE_DIM so it always matches the resize in parse_image.
feat_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(IMAGE_DIM, IMAGE_DIM, 1)),
    tf.keras.layers.MaxPool2D(2,2),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv2D(64, 3, activation="relu"),
    tf.keras.layers.MaxPool2D(2,2),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv2D(128, 3, activation="relu"),
    tf.keras.layers.MaxPool2D(2,2),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv2D(256, 3, activation="relu"),
    tf.keras.layers.MaxPool2D(2,2),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv2D(512, 3, activation="relu"),
    tf.keras.layers.MaxPool2D(2,2),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv2D(1024, 3, activation="relu"),
    tf.keras.layers.Flatten()
], name="FeatureModel")

In [None]:
# Layer-by-layer output shapes and parameter counts of the feature extractor.
feat_model.summary()

In [None]:
def get_regressor(symb):
    """Build the dense regression head for one element.

    Args:
        symb: element symbol; only used to name the sub-model ``"<symb>_Regressor"``.

    Returns:
        A ``tf.keras.Sequential`` of three ReLU dense layers (1024, 256, 64 units,
        dropout 0.2 after the first two) ending in a single linear output unit.
    """
    head_layers = [
        tf.keras.layers.Dense(1024, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1),
    ]
    return tf.keras.Sequential(head_layers, name=f"{symb}_Regressor")

In [None]:
# Full model: shared feature extractor -> one regression head per element ->
# outputs concatenated into a single (batch, len(ATOMS_TO_COUNT)) tensor whose
# column order follows ATOMS_TO_COUNT.
image_input = tf.keras.Input(shape=(IMAGE_DIM, IMAGE_DIM, 1), name="InputImage")

features = feat_model(image_input)

# One scalar prediction per element, in ATOMS_TO_COUNT order.
atom_outputs = [get_regressor(symb)(features) for symb in ATOMS_TO_COUNT]

# Use a proper Keras layer for the concatenation instead of a raw tf.concat on
# symbolic tensors. Concatenate needs at least two inputs.
outputs = tf.keras.layers.Concatenate(axis=1)(atom_outputs)

model = tf.keras.Model(image_input, outputs)

In [None]:
# Summary of the assembled model (feature extractor plus all regression heads).
model.summary()

In [None]:
# Render the model graph, including tensor shapes and the layers inside the nested sub-models.
tf.keras.utils.plot_model(model, show_shapes=True, expand_nested=True)

In [None]:
# Train on mean absolute error with Adam. MAE is also listed as a metric
# (alongside MSE) so that both appear by name in the training history.
model.compile(optimizer="adam", loss="mae", metrics=["mse", "mae"])

In [None]:
# Smoke test: run the model on the `images` batch loaded earlier and map the
# outputs back to (rounded) atom counts.
np.rint(model.predict(images) * label_max)

In [None]:
# Keep only the best model seen so far in 'model.h5'.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath='model.h5', save_best_only=True)

# Stop training after 3 epochs without improvement.
# Both callbacks use their default monitored quantity.
early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=3)

In [None]:
# Train for up to EPOCHS epochs of TRAINING_STEPS batches each, validating on
# VALIDATION_STEPS batches per epoch; `history` keeps the per-epoch metrics.
history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=TRAINING_STEPS,
    validation_data=val_dataset,
    validation_steps=VALIDATION_STEPS,
    callbacks=[checkpoint_callback, early_stop_callback],
)

In [None]:
# Replace the in-memory model with the one stored in 'model.h5', the file written
# by the ModelCheckpoint callback (save_best_only=True).
model = tf.keras.models.load_model("model.h5")

In [None]:
# Training vs. validation MSE per epoch (validation drawn dashed).
epoch_axis = history.epoch
plt.plot(epoch_axis, history.history['mse'], label="Train MSE")
plt.plot(epoch_axis, history.history['val_mse'], linestyle='--', label="Validation MSE")
plt.title("Mean Squared Error")
plt.legend()

In [None]:
# Training vs. validation MAE per epoch (validation drawn dashed).
epoch_axis = history.epoch
plt.plot(epoch_axis, history.history['mae'], label="Train MAE")
plt.plot(epoch_axis, history.history['val_mae'], linestyle='--', label="Validation MAE")
plt.title("Mean Absolute Error")
plt.legend()

In [None]:
# Loss and metrics of the reloaded model on EVALUATION_STEPS batches of training data.
model.evaluate(train_dataset, steps=EVALUATION_STEPS)

In [None]:
# Loss and metrics of the reloaded model on EVALUATION_STEPS batches of validation data.
model.evaluate(val_dataset, steps=EVALUATION_STEPS)

In [None]:
# Compare predictions with ground truth on one validation batch. Each title stacks
# the true counts (first row) over the predicted counts (second row).
for images, labels in val_dataset.take(1):
    # Back to count scale; negative predictions become 0, then round to whole atoms.
    preds = np.rint(np.clip(model.predict(images) * label_max, 0, None))
    labels = labels.numpy() * label_max

    plt.figure(figsize=(40, 20))
    for idx in range(min(32, len(images))):
        plt.subplot(4, 8, idx + 1)
        plt.imshow(images[idx])
        plt.axis("off")
        plt.title(np.array([labels[idx], preds[idx]]))

## Prediction

In [None]:
# The sample submission lists every test image_id; add the relative image path
# using the same "<c0>/<c1>/<c2>/<id>.png" layout as for the training images.
test_df = pd.read_csv(os.path.join(INPUT_PATH, "sample_submission.csv")).assign(
    ImagePath=lambda frame: frame.image_id.map(
        lambda image_id: os.path.join(image_id[0], image_id[1], image_id[2], image_id + ".png")
    )
)
test_df.head()

In [None]:
# Test pipeline: images only, unshuffled, so predictions keep the row order of test_df.
# An empty string stands in for the labels argument of parse_image; only the float32
# image is declared as output, so each dataset element is a 1-tuple (image,).
test_dataset = (
    tf.data.Dataset.from_tensor_slices(test_df.ImagePath)
    .map(
        lambda file_path: tf.py_function(parse_image, [file_path, "", "test"], [tf.float32]),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Show up to 32 test images of the first batch, titled with their predicted atom counts.
for images in test_dataset.take(1):
    # Each element is indexed as a sequence here; position 0 holds the image batch.
    batch_images = images[0]
    # Back to count scale; negative predictions become 0, then round to whole atoms.
    labels = np.rint(np.clip(model.predict(images) * label_max, 0, None))

    plt.figure(figsize=(40, 20))
    for idx in range(min(32, len(batch_images))):
        plt.subplot(4, 8, idx + 1)
        plt.imshow(batch_images[idx])
        plt.axis("off")
        plt.title(labels[idx])

In [None]:
# Predict scaled counts for the whole test set and map them back to count scale.
preds = model.predict(test_dataset, verbose=1)*label_max

In [None]:
# Round to the nearest whole number of atoms.
preds = np.rint(preds)

In [None]:
# A count cannot be negative: set negative predictions to 0 (in place).
preds[preds<0] = 0

In [None]:
# Shape of the prediction array (rows are test images, columns follow ATOMS_TO_COUNT).
preds.shape

In [None]:
# Integer atom counts with one named column per element in ATOMS_TO_COUNT.
test_labels = pd.DataFrame(preds, columns=ATOMS_TO_COUNT).astype(int)

In [None]:
# Display the predicted count table.
test_labels

In [None]:
# Display the test frame whose image_id column is joined to the predictions below.
test_df

In [None]:
def _hill_formula(counts):
    """Format ``{symbol: count}`` as a chemical formula in Hill order.

    Hill order is what the InChI formula layer uses: when carbon is present, C comes
    first, then H, then all other elements alphabetically; without carbon every
    element (including H) is sorted alphabetically. Elements with a count of 0 are
    omitted and a count of 1 is written without a number.
    """
    present = [symbol for symbol, count in counts.items() if count]
    if "C" in present:
        ordered = ["C"]
        if "H" in present:
            ordered.append("H")
        ordered += sorted(symbol for symbol in present if symbol not in ("C", "H"))
    else:
        ordered = sorted(present)
    return "".join(f"{symbol}{counts[symbol] if counts[symbol] > 1 else ''}" for symbol in ordered)


# Build one "InChI=1S/<formula>" string per test row and pair it with its image_id.
# The join is on the row index, which lines up because predictions were made in test_df order.
submission = pd.DataFrame(
    ["InChI=1S/" + _hill_formula(record) for record in test_labels.to_dict(orient="records")],
    columns=["InChI"],
).join(test_df.image_id)[['image_id', 'InChI']]

In [None]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Row and column count of the submission frame.
submission.shape

In [None]:
# Final look at the submission frame.
submission