# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
tqdm.pandas()
import matplotlib.pyplot as plt
import cv2
import keras 
import keras.layers as L
import keras.models as M
import tensorflow as tf
from keras.utils import Sequence
import os
import skimage.io as io
import gc
from IPython.display import clear_output

# Importing Data

In [None]:
train=pd.read_csv('../input/bms-molecular-translation/train_labels.csv')

In [None]:
train.head()

# Making Required changes and adding columns

In [None]:
def _train_image_path(image_id):
    """Return the PNG path of a training image.

    The first three characters of the id are used as nested directory names.
    """
    return "../input/bms-molecular-translation/train/{}/{}/{}/{}.png".format(
        image_id[0], image_id[1], image_id[2], image_id)


train['path'] = train['image_id'].progress_apply(_train_image_path)
train.head()

In [None]:
# Load the image whose index label is 1 and display it after a 2x2 erosion.
im = cv2.imread(train['path'][1])
erosion_kernel = np.ones((2, 2))
image = cv2.erode(im, erosion_kernel)
plt.imshow(image)

In [None]:
# This notebook only predicts the first InChI layer, i.e. the text between
# the first and the second '/' of the full InChI string.
def _first_inchi_layer(inchi):
    """Return the segment between the first and second '/' of *inchi*."""
    return inchi.split('/')[1]


train['part 1'] = train['InChI'].progress_apply(_first_inchi_layer)
train.head()

In [None]:
# Character count of every 'part 1' string.
train['Length'] = train['part 1'].progress_apply(len)
train.head()

# Getting to know Part 1 column better :)

In [None]:
# Getting the highest length 
max_len=train['Length'].max()
max_len

In [None]:
# Collect every distinct character that occurs in 'part 1', in sorted order.
characters = sorted({
    symbol
    for formula in train['part 1'].values
    for symbol in formula
})
    

In [None]:
# Having a look at the characters
characters

In [None]:
# Two-way lookup between characters and their integer labels
# (a label is the character's position in the sorted `characters` list).
label_to_char = dict(enumerate(characters))
char_to_label = {symbol: code for code, symbol in label_to_char.items()}
# Extra label 100 stands for "no character" and decodes to the empty string.
label_to_char[100] = ''

# Making Custom DataGenerator For Our Model

In [None]:
# Custom DataGenerator that reads samples from the DataFrame and converts them into the inputs required by our CTC layer
class DataGenerator(Sequence):
    """Keras ``Sequence`` yielding image/label batches for the CTC model.

    Every item is a tuple ``(inputs, dummy_targets)``. ``inputs`` is a dict
    with the keys ``'input_data'``, ``'input_label'``, ``'input_length'`` and
    ``'label_length'``; ``dummy_targets`` is a zero vector with ``batch_size``
    entries.

    Args:
        dataframe: Frame with a ``'path'`` column (image file paths) and a
            ``'part 1'`` column (target text).
        char_map: Mapping from every character of the target text to its
            integer label.
        batch_size: Number of samples per batch.
        width: Width the images are resized to.
        height: Height the images are resized to.
        downsample_factor: Divisor applied to ``width`` when computing the
            value stored in ``'input_length'``.
        max_length: Number of label slots per sample; shorter labels are
            padded with ``PAD_LABEL``.
        shuffle: Whether the sample order is reshuffled when
            ``on_epoch_end`` is called.
    """

    # Label value that fills unused label slots.
    PAD_LABEL = 100

    def __init__(self, dataframe, char_map, batch_size=16, width=200, height=50,
                 downsample_factor=4, max_length=20, shuffle=True):
        self.dataframe = dataframe
        self.char_map = char_map
        self.batch_size = batch_size
        self.width = width
        self.height = height
        self.downsample_factor = downsample_factor
        self.max_length = max_length
        self.shuffle = shuffle
        self.indices = np.arange(len(dataframe))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return len(self.dataframe) // self.batch_size

    def _load_image(self, path):
        """Read one image and return it as a ``(width, height, 1)`` array scaled by 1/255.

        Raises:
            FileNotFoundError: If OpenCV cannot read ``path``.
        """
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # NOTE(review): the kernel argument is the tuple (2, 2), not an
        # np.ones((2, 2)) array; kept as is because changing it would alter
        # the preprocessing that existing weights may rely on - confirm.
        img = cv2.erode(img, ((2, 2)))
        img = cv2.resize(img, (self.width, self.height))
        img = img / 255   # Normalizing the image
        # Transpose so that the first axis is the image width.
        img = img.T
        return np.expand_dims(img, axis=-1)

    def __getitem__(self, idx):
        """Build batch number ``idx``.

        Raises:
            ValueError: If a target text has more than ``max_length`` characters.
            KeyError: If a target character is missing from ``char_map``.
        """
        curr_batch_idx = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]

        batch_images = np.ones((self.batch_size, self.width, self.height, 1), dtype=np.float32)
        batch_labels = np.ones((self.batch_size, self.max_length), dtype=np.float32)
        input_length = np.ones((self.batch_size, 1), dtype=np.float32) * (
            self.width // self.downsample_factor - 2)
        label_length = np.zeros((self.batch_size, 1), dtype=np.int64)

        # Column lookups do not change inside the loop, so do them once.
        paths = self.dataframe['path'].values
        texts = self.dataframe['part 1'].values

        for slot, row in enumerate(curr_batch_idx):
            batch_images[slot] = self._load_image(paths[row])

            label = [self.char_map[symbol] for symbol in texts[row]]
            if len(label) > self.max_length:
                raise ValueError(
                    f"Label of length {len(label)} exceeds max_length={self.max_length}")
            # Store the true length before padding so the pad value is not
            # treated as part of the target sequence.
            label_length[slot] = len(label)
            padding = [self.PAD_LABEL] * (self.max_length - len(label))
            batch_labels[slot] = label + padding

        batch_inputs = {
            'input_data': batch_images,
            'input_label': batch_labels,
            'input_length': input_length,
            'label_length': label_length,
        }
        return batch_inputs, np.zeros((self.batch_size), dtype=np.float32)

    def on_epoch_end(self):
        """Reshuffle the sample order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

In [None]:
len(train)

In [None]:
# First 150000 rows are used for training, the next 10000 for validation.
TRAIN_END = 150000
VALIDATION_END = 160000
train_datagenerator = DataGenerator(train[:TRAIN_END], char_to_label)
validation_datagenerator = DataGenerator(train[TRAIN_END:VALIDATION_END], char_to_label)

# Making CTC Layer

In [None]:
class CTCLayer(L.Layer):
    """Layer that computes the CTC batch cost and registers it as a model loss.

    Args:
        name: Optional layer name.
        **kwargs: Standard layer keyword arguments (for example ``trainable``
            or ``dtype``). They are forwarded to the base constructor so the
            layer can be re-created from its saved config.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred, input_length, label_length):
        """Compute the CTC loss, attach it with ``add_loss`` and return it."""
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # The layer's output is the loss itself, not the predictions.
        return loss

# Making the model

In [None]:
# Making the Model now
def make_model(width=200, height=50):
    """Build and compile the CNN + bidirectional-LSTM training model with a CTC loss.

    Args:
        width: Size of the first image axis of ``'input_data'``.
        height: Size of the second image axis of ``'input_data'``.

    Returns:
        A compiled Keras model with the inputs ``'input_data'``,
        ``'input_label'``, ``'input_length'`` and ``'label_length'`` whose
        output is produced by the ``CTCLayer`` named ``'outputs'``. The
        per-step class probabilities come from the layer ``'Dense_output'``.
    """
    inp = L.Input(shape=(width, height, 1), dtype=np.float32, name='input_data')
    # The label width is left unspecified so that it accepts whatever
    # `max_length` the data generator pads its labels to.
    labels = L.Input(shape=(None,), dtype=np.float32, name='input_label')
    input_length = L.Input(shape=[1], dtype=np.int64, name='input_length')
    label_length = L.Input(shape=[1], dtype=np.int64, name='label_length')

    x = L.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal')(inp)
    x = L.MaxPooling2D(pool_size=(2, 2))(x)
    x = L.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal')(x)
    x = L.MaxPooling2D(pool_size=(2, 2))(x)

    # Two 2x2 poolings shrink both image axes by a factor of 4; the second
    # axis and the 64 channels are merged into one feature vector per step.
    new_shape = ((width // 4), (height // 4) * 64)
    x = L.Reshape(new_shape)(x)
    x = L.Dense(64, activation='relu')(x)
    x = L.Dropout(0.4)(x)
    x = L.Bidirectional(L.LSTM(128, return_sequences=True, dropout=0.2))(x)
    x = L.Bidirectional(L.LSTM(64, return_sequences=True, dropout=0.25))(x)
    # One output class per known character plus one extra class.
    x = L.Dense(len(characters) + 1, activation='softmax', kernel_initializer='he_normal',
                name='Dense_output')(x)
    output = CTCLayer(name='outputs')(labels, x, input_length, label_length)
    model = M.Model([inp, labels, input_length, label_length], output)

    # Optimizer
    # NOTE(review): newer Keras optimizers may reject the `decay` argument -
    # confirm against the installed version.
    sgd = keras.optimizers.SGD(learning_rate=0.0015,
                               decay=1e-6,
                               momentum=0.9,
                               nesterov=True,
                               clipnorm=5)
    # No loss is passed here: the CTCLayer registers it through add_loss.
    model.compile(optimizer=sgd)
    return model
    
    
    

In [None]:
model=make_model()
model.summary()

# Training The Model

In [None]:
# ### Add early stopping
# es = keras.callbacks.EarlyStopping(monitor='val_loss',
#                                    patience=2,
#                                    restore_best_weights=True)

# ### Train the model
# if 'prediction_model.h5' not in os.listdir('./'):
#     history = model.fit(train_datagenerator,
#                         validation_data=validation_datagenerator,
#                         steps_per_epoch=1500,
#                         epochs=8,
#                         callbacks=[es])

# Making Predictions Model

In [None]:
# prediction_model = keras.models.Model(model.get_layer(name='input_data').input,
#                                         model.get_layer(name='Dense_output').output)


# Load Model If Not Training Again

In [None]:
prediction_model=M.load_model('../input/prediction-model-competition/prediction_model_ocr (1).h5')
prediction_model.summary()

In [None]:
label_to_char[100]=''
# A utility to decode the output of the network
def decode_batch_predictions(pred):
    """Convert raw network outputs into strings with greedy CTC decoding.

    Args:
        pred: Network output; assumed to be (batch, steps, classes) - TODO confirm.

    Returns:
        A list with one decoded string per row of ``pred``.
    """
    # The last two steps along axis 1 are dropped before decoding.
    trimmed = pred[:, :-2]
    sequence_lengths = np.ones(trimmed.shape[0]) * trimmed.shape[1]

    # Greedy search; beam search is the alternative for harder tasks.
    decoded_paths = keras.backend.ctc_decode(trimmed,
                                             input_length=sequence_lengths,
                                             greedy=True)[0]
    best_path = decoded_paths[0]

    # Map label ids back to characters, skipping ids outside the known range.
    vocab_size = len(characters)
    return [
        ''.join(label_to_char[code] for code in row if 0 <= code < vocab_size)
        for row in best_path.numpy()
    ]

In [None]:
# Compare ground truth and prediction for the first validation batch only.
for p, (batch_inputs, _) in enumerate(validation_datagenerator):
    X_data = batch_inputs['input_data']
    labels = batch_inputs['input_label']
    bs = X_data.shape[0]

    preds = prediction_model.predict(X_data)
    pred_texts = decode_batch_predictions(preds)

    # Rebuild the reference strings from the stored label ids.
    orig_texts = [
        ''.join(label_to_char[int(code)] for code in row)
        for row in labels
    ]

    for truth, guess in zip(orig_texts[:bs], pred_texts[:bs]):
        print(f'Ground truth: {truth} \t Predicted: {guess}')
    break

# The Results Are Not too similar to baseline for sure :)

In [None]:
def _test_image_path(image_id):
    """Return the PNG path of a test image.

    The first three characters of the id are used as nested directory names.
    """
    return "../input/bms-molecular-translation/test/{}/{}/{}/{}.png".format(
        image_id[0], image_id[1], image_id[2], image_id)


sample = pd.read_csv('../input/bms-molecular-translation/sample_submission.csv')
sample['path'] = sample['image_id'].progress_apply(_test_image_path)
sample.head()

# Releasing Some RAM

In [None]:
del train
gc.collect()

# Transform Function

In [None]:
def transform(path, width=200, height=50):
    """Load one image file and preprocess it for the prediction model.

    Args:
        path: Path of the image file.
        width: Width the image is resized to.
        height: Height the image is resized to.

    Returns:
        Array of shape ``(width, height, 1)`` with values scaled by 1/255.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``path``.
    """
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # NOTE(review): the kernel argument is the tuple (2, 2), not an
    # np.ones((2, 2)) array; kept as is so the preprocessing is unchanged.
    img = cv2.erode(img, ((2, 2)))
    img = cv2.resize(img, (width, height))
    img = img / 255   # Normalizing the image
    # Transpose so that the first axis is the image width.
    img = img.T
    img = np.expand_dims(img, axis=-1)
    return img

# Since there is a lot of test data, this might take some time - have a snack :)

# One Prediction :)

In [None]:
# Run the prediction model on a single training image and show the result.
sample_image_path = '../input/bms-molecular-translation/train/2/0/1/201013c95288.png'
im = transform(sample_image_path)
# A batch of one is enough: add the leading batch axis and cast to float32.
batch_images = np.expand_dims(im, axis=0).astype(np.float32)

x = prediction_model.predict(batch_images)
pred_texts = decode_batch_predictions(x)
plt.imshow(cv2.imread(sample_image_path))
print(pred_texts[0])

## You can use this code to make predictions and save it

In [None]:
# predictions=[]
# for i in range(len(sample)//500):
#     print(i*100/3232)
#     clear_output(wait=True)
#     dd=sample[i*500:(i+1)*500]
#     batch_images=np.ones((500,200,50,1),dtype=np.float32)
#     for i in range(500):
#         batch_images[i]=transform(dd['path'].values[i])
#     x=prediction_model.predict(batch_images)
#     pred_texts = decode_batch_predictions(x)
#     predictions.extend(pred_texts)
# dd=sample[3232*500:]
# pt=len(sample)%500
# batch_images=np.ones((pt,200,50,1),dtype=np.float32)
# for j,k in enumerate(dd['path'].values) :
#     im=transform(k)
#     batch_images[j]=im
# x=prediction_model.predict(batch_images)
# pred_texts = decode_batch_predictions(x)
# predictions.extend(pred_texts)
# predictions=np.array(predictions)
# np.save('Text Predicted.npy',predictions)

## Loading Predictions

In [None]:
predictions=np.load('../input/predicted-text/Text Predicted.npy')

# Saving the results

In [None]:
# Only the first layer is predicted; the remaining layers are fixed baseline text.
# Baseline answer: 'InChI=1S/' + 'C15H22N2O2/' + 'c1-1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18(19)20/' + 'h1-1,1111,1,,,,,'
inchi_prefix = 'InChI=1S/'
baseline_tail = '/c1-1-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18(19)20/' + 'h1-1,1111,1,,,,,'
label = [inchi_prefix + formula + baseline_tail for formula in predictions]

In [None]:
submission=pd.read_csv('../input/bms-molecular-translation/sample_submission.csv')

In [None]:
submission['InChI']=label

In [None]:
submission.to_csv('Submission.csv',index=False)