In [None]:
import os
import numpy as np
import pandas as pd

# NOTE(review): `test_images` is not referenced in the visible part of the notebook — confirm it is still needed.
test_images = 0

# Switch the working directory to the competition dataset folder so that the
# relative paths used in later cells resolve against it, then list its contents.
os.chdir('/kaggle/input/bms-molecular-translation')
print(os.listdir())

## Carregando Datasets

In [None]:
# Load the three CSV files; the paths are relative to the current working directory.
train_df = pd.read_csv('./train_labels.csv')
sample_df = pd.read_csv('./sample_submission.csv')
eai_df = pd.read_csv('./extra_approved_InChIs.csv')

In [None]:
# Preview the first three rows of the training labels.
train_df.head(3)

In [None]:
# matplotlib and OpenCV are only imported in a later cell; import them here as
# well so this cell also runs on a fresh kernel (Restart & Run All).
import cv2
import matplotlib.pyplot as plt

# Display one training image as a quick sanity check of the dataset layout.
plt.imshow(cv2.imread('train/0/0/0/000011a64c74.png'))

In [None]:
# Preview the first three rows of the sample submission.
sample_df.head(3)

# 

In [None]:
# Preview the first three rows of the extra approved InChIs table.
eai_df.head(3)

In [None]:
# (rows, columns) of the training labels table.
train_df.shape

In [None]:
def get_path_by_filename(filename, train=True):
    """Build the relative image path for an image id.

    Images are nested three directories deep, one directory per leading
    character of the id: ``./<split>/<c0>/<c1>/<c2>/<id>.png``.

    Args:
        filename: image id; its first three characters name the sub-folders.
        train: use the ``train`` folder when truthy, otherwise ``test``.

    Returns:
        The relative path of the PNG file as a string.
    """
    template = './{}/{}/{}/{}/{}.png'
    if train:
        split_dir = 'train'
    else:
        split_dir = 'test'
    shard_dirs = (filename[0], filename[1], filename[2])
    return template.format(split_dir, *shard_dirs, filename)

# Add a `path` column with the training-image location derived from each image_id.
train_df['path'] = train_df['image_id'].apply(get_path_by_filename)
train_df.head(3)

## Visualizando os Dados

In [None]:
import matplotlib.pyplot as plt
import cv2

def plot_molecules(images):
    """Show molecule images in a near-square grid, titled with the start of each InChI.

    Args:
        images: sequence of ``(image_path, inChI)`` pairs.

    Any number of images is supported: the grid is sized so that every image
    gets a cell, and cells left over are hidden. An empty sequence draws nothing.
    """
    length = len(images)
    if length == 0:
        # Nothing to draw; also avoids dividing by a zero row count below.
        return
    rows = int(length ** .5)
    # Ceiling division so that rows * cols >= length even when the count is
    # not a perfect rectangle (e.g. 7 images -> 2 x 4 grid).
    cols = -(-length // rows)
    # squeeze=False keeps `axs` a 2-D array even for a 1 x 1 grid, so ravel()
    # always works. figsize is (width, height): width scales with columns.
    fig, axs = plt.subplots(rows, cols, figsize=(5 * cols, 3 * rows), squeeze=False)
    fig.subplots_adjust(hspace=.2, wspace=.2)
    axs = axs.ravel()
    for index, (image_path, inChI) in enumerate(images):
        title = inChI[:20]
        img = cv2.imread(image_path)
        axs[index].imshow(img)
        axs[index].set_title('{}...'.format(title))
        axs[index].set_xticklabels([])
        axs[index].set_yticklabels([])
    # Hide the grid cells that have no image.
    for unused_ax in axs[length:]:
        unused_ax.axis('off')
    plt.show()

In [None]:
def get_molecule_tuple(row):
    """Return the ``(path, InChI)`` pair stored in a row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with ``path`` and
            ``InChI`` entries.
    """
    image_path = row['path']
    inchi = row['InChI']
    return image_path, inchi

# Collect (path, InChI) pairs for the first 25 training rows and plot them.
sample_images = [
    get_molecule_tuple(train_df.iloc[position])
    for position in range(25)
]

plot_molecules(sample_images)

## Rotulando as Imagens

Os rótulos consistem em camadas e subcamadas que são separadas pelo delimitador "/" e começam com uma letra de prefixo característica.
As seis camadas com subcamadas importantes são:

1) Camada principal

* Fórmula química (sem prefixo). Esta é a única subcamada que deve ocorrer em cada InChI.
* Conexões atômicas (prefixo: "c"). Os átomos na fórmula química (exceto hidrogênios) são numerados em sequência; essa subcamada descreve quais átomos estão conectados por ligações a quais outros.
* Átomos de hidrogênio (prefixo: "h"). Descreve quantos átomos de hidrogênio estão conectados a cada um dos outros átomos.

2) Camada de carga
   
* subcamada de carga (prefixo: "q")
* subcamada de prótons (prefixo: "p" para "prótons")

3) Camada estereoquímica
   
* ligações duplas e cumulenos (prefixo: "b")
* estereoquímica tetraédrica de átomos e alenos (prefixos: "t", "m")
* tipo de informação estereoquímica (prefixo: "s")

4) Camada isotópica (prefixos: "i", "h", bem como "b", "t", "m", "s" para estereoquímica isotópica)

5) Camada H Fixo (prefixo: "f"); contém alguns ou todos os tipos de camadas acima, exceto conexões atômicas; pode terminar com a subcamada "o"; nunca incluído no padrão InChI

6) Camada reconectada (prefixo: "r"); contém todo o InChI de uma estrutura com átomos metálicos reconectados; nunca incluído no padrão InChI

In [None]:
# The chemical formula is the layer that follows the first "/" of the InChI
# string; keep it, and its character count, as separate columns.
train_df['molecule'] = train_df['InChI'].map(lambda inchi: inchi.split('/')[1])
train_df['length'] = train_df['molecule'].map(len)

train_df.head(3)

In [None]:
# Alphabet of the labels: every distinct character that occurs in a formula.
chars = {char for molecule in train_df['molecule'] for char in molecule}
chars

In [None]:
# Enumerate the alphabet in sorted order: iterating a set of strings directly
# can yield a different order in every interpreter session, which would give
# each character a different label from run to run and make a saved model
# impossible to decode later.
ordered_chars = sorted(chars)
char_to_label = {char: label for label, char in enumerate(ordered_chars)}
label_to_char = {label: char for label, char in enumerate(ordered_chars)}
# 100 is the filler value the batch generator writes into unused label slots;
# it decodes to an empty string.
label_to_char[100] = ''

In [None]:
# Show the character -> label mapping.
char_to_label

## Formatando Dados

In [None]:
from tensorflow.keras.utils import Sequence
    
    
class GenerateDataForCTCFormat(Sequence):
    """Keras ``Sequence`` that yields image/label batches laid out for a CTC loss.

    Every item is ``(inputs, dummy_targets)`` where ``inputs`` is a dict with the
    keys ``input_data``, ``input_label``, ``input_length`` and ``label_length``,
    and ``dummy_targets`` is an all-zero vector of ``batch_size`` entries.

    Args:
        df: DataFrame with a ``path`` column (image file) and a ``molecule``
            column (label text).
        char_map: mapping from a label character to its integer label.
        batch_size: number of samples per batch.
        w: width the images are resized to.
        h: height the images are resized to.
        downsample_factor: divisor applied to ``w`` when computing the
            ``input_length`` value reported for every sample.
        max_length: number of label slots per sample (width of ``input_label``).
        shuffle: reshuffle the sample order at construction and after each epoch.
    """

    # Filler written into the label slots that follow the real characters.
    PAD_LABEL = 100

    def __init__(self, df, char_map, batch_size=16, w=200, h=50, downsample_factor=4, max_length=20, shuffle=True):
        super().__init__()
        self.dataframe = df
        self.char_map = char_map
        self.batch_size = batch_size
        self.width = w
        self.height = h
        self.downsample_factor = downsample_factor
        self.max_length = max_length
        self.shuffle = shuffle
        self.indices = np.arange(len(df))
        self.on_epoch_end()

    def process_image(self, path):
        """Load one image and return it as a ``(width, height, 1)`` array scaled by 1/255.

        Raises:
            FileNotFoundError: if OpenCV cannot read ``path``.
        """
        kernel_size = (2, 2)

        img = cv2.imread(path)
        if img is None:
            # cv2.imread reports an unreadable or missing file by returning None;
            # fail here with the offending path instead of deep inside cvtColor.
            raise FileNotFoundError('Could not read image: {}'.format(path))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # NOTE(review): the structuring element is passed as a bare tuple rather
        # than an array such as np.ones(kernel_size, np.uint8) — confirm this
        # produces the intended erosion.
        img = cv2.erode(img, (kernel_size))
        img = cv2.resize(img, (self.width, self.height))
        img = img / 255  # Normalization
        img = img.T  # transpose so the width axis comes first
        return np.expand_dims(img, axis=-1)

    def __len__(self):
        """Number of complete batches; a trailing partial batch is dropped."""
        return len(self.dataframe) // self.batch_size

    def __getitem__(self, index):
        """Assemble batch number ``index``.

        Raises:
            ValueError: if a label has more characters than ``max_length``.
        """
        batch_size = self.batch_size
        max_length = self.max_length
        char_map = self.char_map

        start = index * batch_size
        batch_rows = self.indices[start:start + batch_size]

        # Pull the column arrays once instead of once per sample.
        paths = self.dataframe['path'].values
        molecules = self.dataframe['molecule'].values

        batch_images = np.ones((batch_size, self.width, self.height, 1), dtype=np.float32)
        # Start from an all-filler matrix; the real labels overwrite the front of each row.
        batch_labels = np.full((batch_size, max_length), self.PAD_LABEL, dtype=np.float32)
        input_length = np.ones((batch_size, 1), dtype=np.float32) * (self.width // self.downsample_factor - 2)
        label_length = np.zeros((batch_size, 1), dtype=np.int64)

        for i, row in enumerate(batch_rows):
            molecule = molecules[row]
            label = [char_map[letter] for letter in molecule]
            if len(label) > max_length:
                raise ValueError(
                    'Label {!r} has {} characters but max_length is {}'.format(
                        molecule, len(label), max_length
                    )
                )

            batch_images[i] = self.process_image(paths[row])
            batch_labels[i, :len(label)] = label
            # Report the true label length (without filler) so the loss does
            # not treat the filler slots as real labels.
            label_length[i] = len(label)

        batch_inputs = {
            'input_data': batch_images,
            'input_label': batch_labels,
            'input_length': input_length,
            'label_length': label_length,
        }

        return (batch_inputs, np.zeros((batch_size), dtype=np.float32))

    def on_epoch_end(self):
        """Reshuffle the sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
train_size = 150000
validation_size = 10000

# The validation slice starts where the training slice ends. Slicing
# [train_size:validation_size] (i.e. 150000:10000) selects no rows at all.
validation_end = train_size + validation_size

train = GenerateDataForCTCFormat(train_df[:train_size], char_to_label)
validation = GenerateDataForCTCFormat(train_df[train_size:validation_end], char_to_label)

## CTC

In [None]:
import keras

class CTCLayer(keras.layers.Layer):
    """Layer that computes the CTC batch cost and registers it as the model loss.

    Args:
        name: optional layer name.
        **kwargs: standard ``Layer`` keyword arguments (e.g. ``trainable``,
            ``dtype``). They are forwarded to the base class so the layer can
            be rebuilt from its config.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred, input_length, label_length):
        """Add the CTC loss for the batch to the layer and return it."""
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        return loss


## Modelando

In [None]:
import tensorflow as tf

def make_model(max_length=20):
    """Build and compile the CNN + bidirectional-LSTM network trained with a CTC loss.

    Args:
        max_length: number of label slots per sample expected on the
            ``input_label`` input. The default of 20 matches the default
            ``max_length`` of ``GenerateDataForCTCFormat``, which emits label
            rows of that width.

    Returns:
        A compiled model taking ``input_data``, ``input_label``,
        ``input_length`` and ``label_length``; its output is the loss
        produced by ``CTCLayer``.
    """
    entry = keras.layers.Input(shape=(200, 50, 1), dtype=np.float32, name='input_data')
    # The label input must be as wide as the label rows the generator yields.
    labels = keras.layers.Input(shape=[max_length], dtype=np.float32, name='input_label')
    input_length = keras.layers.Input(shape=[1], dtype=np.int64, name='input_length')
    label_length = keras.layers.Input(shape=[1], dtype=np.int64, name='label_length')

    x = keras.layers.Conv2D(
        32, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal'
    )(entry)
    x = keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    x = keras.layers.Conv2D(
        64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal'
    )(x)
    x = keras.layers.MaxPooling2D(pool_size=(2, 2))(x)
    # Two 2x2 poolings turn 200x50 into 50x12 with 64 channels; flatten the
    # last two axes into 12 * 64 = 768 features for each of the 50 steps.
    x = keras.layers.Reshape((50, 768))(x)
    x = keras.layers.Dense(64, activation='relu')(x)
    x = keras.layers.Dropout(.4)(x)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, dropout=.2)
    )(x)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(64, return_sequences=True, dropout=.25)
    )(x)
    # One output unit per character plus one extra — presumably the CTC blank class.
    x = keras.layers.Dense(
        len(chars) + 1, activation='softmax', kernel_initializer='he_normal', name='dense_output'
    )(x)
    output = CTCLayer(name='outputs')(labels, x, input_length, label_length)
    model = keras.models.Model(
        [entry, labels, input_length, label_length], output
    )
    # NOTE(review): `nesterov=True` is set without a `momentum` value — confirm
    # whether a momentum term was meant to be configured as well.
    sgd = tf.keras.optimizers.SGD(
        learning_rate=.0015,
        decay=.000001,
        nesterov=True,
        clipnorm=5,
    )
    # No loss is passed to compile: the loss is added inside CTCLayer via add_loss.
    model.compile(optimizer=sgd)
    return model
    

In [None]:
# Build the network and print its layer summary.
model = make_model()
model.summary()

In [None]:
# Stop training once `val_loss` has failed to improve for 2 consecutive epochs
# and restore the weights of the best epoch.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

In [None]:
model_fit = model.fit(
    train,
    validation_data=validation,
    steps_per_epoch=1500,
    epochs=8,
    callbacks=[es]
)

# The working directory was switched to the dataset folder under /kaggle/input,
# which is read-only on Kaggle, so a relative save path would fail after the
# whole training run. Save to the writable working directory instead.
model.save('/kaggle/working/model')

In [None]:
# Display the object returned by model.fit.
model_fit