In [None]:
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow.keras.models as M
import tensorflow.keras.layers as L
import tensorflow.keras.optimizers as O
import tensorflow.keras.losses as Loss

from tqdm import tqdm

from PIL import Image
import cv2

import matplotlib.pyplot as plt

In [None]:
# Upper bound (in MB) on the memory TensorFlow may allocate on the first GPU.
GPU_MEMORY_LIMIT_MB = 15240

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to GPU_MEMORY_LIMIT_MB (~15 GB) on the first GPU
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=GPU_MEMORY_LIMIT_MB)])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Virtual devices must be set before GPUs have been initialized
        print(e)


In [None]:
# Training hyper-parameters shared by the cells below.
BATCH_SIZE, EPOCHS = 512, 10
DIM = (100, 100)   # (height, width) every image is resized to
MAX_LENGTH = 20    # labels are right-padded to this many characters

In [None]:
# Root directory of the training images; per-image sub-paths are appended to it.
train_path = "../input/bms-molecular-translation/train/"

In [None]:
# Load the label table; later cells use its `image_id` and `InChI` columns.
labels = pd.read_csv(filepath_or_buffer="../input/bms-molecular-translation/train_labels.csv")

In [None]:
# Keep only the first '/'-separated layer of each InChI string after dropping
# its first 9 characters (presumably the "InChI=1S/" prefix -- verify against the CSV).
# Vectorised column assignment replaces the per-row loop that wrote through
# `.values[i]`: it is much faster and does not depend on `.values` being writable.
# NOTE: this cell is not idempotent -- reload `labels` before re-running it.
labels['InChI'] = labels['InChI'].str[9:].str.split('/').str[0]

# Expand every image id into its file path:
# <train_path><id[0]>/<id[1]>/<id[2]>/<id>.png
image_ids = labels['image_id']
labels['image_id'] = (
    train_path
    + image_ids.str[0] + '/'
    + image_ids.str[1] + '/'
    + image_ids.str[2] + '/'
    + image_ids + '.png'
)
labels.head()

In [None]:
# Right-pad every label with '$' up to MAX_LENGTH characters.
# Longer labels are left untouched (padding never truncates).
labels['InChI'] = labels.InChI.str.pad(MAX_LENGTH, 'right', '$')

In [None]:
# Every distinct character that occurs in any (padded) label.
characters = {symbol for text in labels.InChI.values for symbol in text}

In [None]:
# Character -> integer id. The characters are sorted first because set iteration
# order for strings can change between interpreter runs; without a fixed order
# the ids (and therefore the meaning of each output unit of the model) would not
# be reproducible across kernel restarts or comparable with saved weights.
# NOTE(review): weights trained under a different id assignment will not line up
# with this mapping -- confirm how the pretrained model's vocabulary was ordered.
tokens = {char: idx for idx, char in enumerate(sorted(characters))}

# Integer id -> character (inverse of `tokens`).
detokens = {idx: char for char, idx in tokens.items()}

# Number of distinct tokens; used as the one-hot depth and output layer width.
vocab = len(tokens)

In [None]:
# Show both lookup tables, one per line.
print(tokens, detokens, sep='\n')

In [None]:
# Replace each label by its token ids joined with ':' (e.g. "3:7:7:0").
# The new column is built in one pass and assigned back, instead of writing
# through `labels.InChI.values[i]`, which depends on `.values` being a writable view.
# NOTE: not idempotent -- re-running would encode the already-encoded strings.
labels['InChI'] = [
    ':'.join(str(tokens[char]) for char in text)
    for text in tqdm(labels['InChI'].values)
]

In [None]:
# Read the first training image as a single-channel (grayscale) array.
sample_image_path = labels.image_id.values[0]
img = cv2.imread(sample_image_path, cv2.IMREAD_GRAYSCALE)
# cv2.imread signals failure by returning None rather than raising,
# so fail here with a clear message instead of later in plt.imshow.
if img is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {sample_image_path!r}")

In [None]:
# `img` is single-channel; use a gray colormap so it is not rendered in false colour.
plt.imshow(img, cmap='gray')

In [None]:
def build_model(dim):
    """Build and compile the CNN + bidirectional-LSTM recognition model.

    Four Conv2D -> BatchNorm -> ReLU stages are followed by max-pooling, a skip
    connection from the (resized) input image, a second max-pooling, a sum over
    the channel axis, two bidirectional LSTMs and a per-step softmax over the
    `vocab` tokens (module-level global).

    Args:
        dim: (height, width) of the single-channel input images.

    Returns:
        A `tf.keras` Model compiled with Adam (learning rate 0.001) and
        categorical cross-entropy. Its output has one softmax distribution per
        row of the final feature map; for dim == (100, 100) the un-padded
        convolutions and the two 2x poolings leave 20 rows.
    """
    initializer = tf.keras.initializers.GlorotUniform()
    inp = L.Input(shape=(dim[0], dim[1], 1), name='Input')

    # Convolutional feature extractor: (filters, kernel_size) per stage.
    # Layer names are kept stable so saved weights still line up.
    X = inp
    conv_stages = [(16, 3), (32, 5), (64, 7), (64, 9)]
    for stage, (filters, kernel_size) in enumerate(conv_stages, start=1):
        X = L.Conv2D(filters, kernel_size, strides=1, name=f'Conv2D_{stage}')(X)
        X = L.BatchNormalization(name=f'norm{stage}')(X)
        X = L.Activation('relu', name=f'relu_{stage}')(X)
    X = L.MaxPooling2D(name='Max2D_1')(X)
    X = L.Dropout(0.2, name='Dropout_1')(X)

    # Skip connection: shrink the raw image to the feature-map size and add it.
    # The image has 1 channel, so this relies on Add broadcasting over channels.
    inp2 = tf.image.resize(inp, [X.shape[1], X.shape[2]])
    X = L.Add(name='Add_1')([X, inp2])
    X = L.BatchNormalization(name='norm_A1')(X)
    X = L.MaxPooling2D(name='Max2D_A1')(X)
    X = L.Dropout(0.2, name='Dropout_A1')(X)

    # Collapse the channel axis so the map becomes a (rows, columns) sequence:
    # rows act as time steps and columns as features for the LSTMs.
    X = tf.reduce_sum(X, axis=3)
    X = L.Bidirectional(L.LSTM(32, return_sequences=True, dropout=0.2))(X)
    X = L.Bidirectional(L.LSTM(32, return_sequences=True, dropout=0.2))(X)
    Out = L.Dense(vocab, activation='softmax', name='Output',
                  kernel_initializer=initializer)(X)

    model = M.Model(inputs=inp, outputs=Out)
    adam = O.Adam(learning_rate=0.001)
    model.compile(optimizer=adam, loss='categorical_crossentropy')

    return model

In [None]:
# Instantiate the network for DIM-sized inputs and print its layer table.
model = build_model(dim=DIM)
model.summary()

In [None]:
# Warm-start: copy the weights of a previously saved model into the fresh one.
# set_weights requires both models to have matching weight shapes and order.
base_model = M.load_model(
    '../input/trained-model-for-bmsmolecular/phase1_base_model.h5'
)
pretrained_weights = base_model.get_weights()
model.set_weights(pretrained_weights)

In [None]:
# Render the architecture diagram to model.png.
tf.keras.utils.plot_model(model=model, to_file='model.png')

In [None]:
def tokenize(label):
    """Map every character of `label` to its integer id from the global `tokens`.

    Raises KeyError for a character that is not in `tokens`.
    """
    return [tokens[symbol] for symbol in label]

def data_generator(image_id,labels):
    """Yield (image path, int32 token-id tensor) pairs, one per entry of `labels`.

    Each label is a ':'-separated string of token ids; it is split and parsed
    into an int32 tensor. `image_id[j]` is passed through unchanged.
    """
    for position, encoded in enumerate(labels):
        as_string = tf.cast(encoded, dtype=tf.string)
        id_strings = tf.strings.split(as_string, sep=':')
        token_ids = tf.strings.to_number(id_strings, out_type=tf.int32)
        yield image_id[position], token_ids

def preprocess_image(image_id,label):
    """Load one training example and wrap it in a single-element Dataset.

    The PNG at `image_id` is decoded to one channel, converted to float32 and
    resized to DIM (plain resize, aspect ratio not preserved); `label` is
    one-hot encoded via `preprocess_label`. Returning a Dataset makes the
    function usable as an `interleave` map function.

    NOTE(review): `preprocess_test_image` uses `resize_with_pad` instead of
    `resize` -- confirm the train/test difference is intended.
    """
    png_bytes = tf.io.read_file(image_id)
    pixels = tf.image.convert_image_dtype(
        tf.image.decode_png(png_bytes, channels=1), tf.float32
    )
    resized = tf.image.resize(pixels, [DIM[0], DIM[1]])
    one_hot_label = preprocess_label(label)
    return tf.data.Dataset.from_tensors((resized, one_hot_label))

def preprocess_label(label):
    """One-hot encode integer token ids along a new last axis of size `vocab`."""
    return tf.one_hot(label, depth=vocab, axis=-1)

def preprocess_test_image(image_id):
    """Load a PNG as a float32 single-channel image padded/resized to DIM.

    Unlike `preprocess_image`, this keeps the aspect ratio by using
    `tf.image.resize_with_pad`.
    """
    png_bytes = tf.io.read_file(image_id)
    pixels = tf.image.convert_image_dtype(
        tf.image.decode_png(png_bytes, channels=1), tf.float32
    )
    target_height, target_width = DIM[0], DIM[1]
    return tf.image.resize_with_pad(pixels, target_height, target_width)

In [None]:
# Each generated element is (scalar string path, int32 vector of MAX_LENGTH token ids).
element_signature = (
    tf.TensorSpec(shape=(), dtype=tf.string),
    tf.TensorSpec(shape=(MAX_LENGTH,), dtype=tf.int32),
)

dataset = tf.data.Dataset.from_generator(
    data_generator,
    args=[labels.image_id.values, labels.InChI.values],
    output_signature=element_signature,
)

In [None]:
# Input pipeline: load + preprocess -> cache -> fixed-size batches -> repeat -> prefetch.
# `num_parallel_calls` lets file reads / PNG decoding run in parallel; interleave
# stays deterministic by default, so element order is unchanged.
# NOTE(review): cache() keeps every preprocessed example in memory after the
# first pass -- assumes the full training set fits in RAM; TODO confirm.
train_data = (
    dataset
    .interleave(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule for `LearningRateScheduler`.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate.

    Returns:
        `lr` unchanged for the first 8 epochs; afterwards `lr * exp(-0.1)`,
        always as a plain Python float (never a tensor), so the return type is
        the same on both branches.
    """
    if epoch < 8:
        return lr
    return float(lr * np.exp(-0.1))

lrscheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

# fit() below receives no validation data, so `val_loss` is never logged: a
# checkpoint that monitors (and formats its filename with) `val_loss` would
# never write a file. Track the training loss instead.
# NOTE(review): the target directory is a Colab Drive path while the inputs use
# Kaggle-style paths -- confirm the directory exists in the run environment.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "/content/drive/MyDrive/bms-molecular-translation/phase1-base-model-v1.4-{epoch:02d}-{loss:.4f}.hdf5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min')

# `train_data` is already batched and repeats forever, so `batch_size` must not
# be passed here and `steps_per_epoch` defines the epoch length.
history = model.fit(train_data,
                    epochs=EPOCHS,
                    steps_per_epoch=len(labels)//BATCH_SIZE,
                    callbacks=[lrscheduler, checkpoint])

In [None]:
# Persist the trained model (HDF5) in the working directory.
model.save(filepath='phase1_base_model_v1.4.h5')