### This notebook covers the following:
* Basic image denoising using OpenCV
* Extraction of features using a U-Net for a random sample of 50000 images

Ref: https://www.kaggle.com/paulorzp/denoise-images

In [None]:
# Import libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import collections
import random

import cv2
import matplotlib.pyplot as plt

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from PIL import Image
from numpy import asarray
import pickle
from tqdm import tqdm_notebook


from tensorflow.keras.backend import int_shape
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Conv3D, MaxPooling2D, MaxPooling3D, UpSampling2D, UpSampling3D, Add, BatchNormalization, Input, Activation, Lambda, Concatenate

In [None]:
# Echo the installed TensorFlow version (a bare last expression is displayed as the cell output).
tf.__version__

### Read the train data labels into a dataframe

In [None]:
# Path of the competition's training-labels CSV inside the Kaggle input directory.
TRAIN_LABELS_PATH = "../input/bms-molecular-translation/train_labels.csv"
# setting the index to the image_id column
# (index_col=0 makes the first CSV column the index; later cells read the "InChI" column)
df_train_labels = pd.read_csv(TRAIN_LABELS_PATH, index_col=0)

### Visualize train images

In [None]:
# ref: https://www.kaggle.com/ihelon/molecular-translation-exploratory-data-analysis 
def convert_image_id_2_path(image_id: str) -> str:
    """Return the path of the training PNG that belongs to `image_id`.

    Training images are sharded into three nested directories named after the
    first, second and third character of the image id.
    """
    shard_dirs = (image_id[0], image_id[1], image_id[2])
    template = "../input/bms-molecular-translation/train/{}/{}/{}/{}.png"
    return template.format(*shard_dirs, image_id)

In [None]:
#ref: https://www.kaggle.com/ihelon/molecular-translation-exploratory-data-analysis
def visualize_train_batch(image_ids, labels):
    """Plot a batch of training images in a 3-column grid, titled with their labels.

    Args:
        image_ids: iterable of image ids (resolved with `convert_image_id_2_path`).
        labels: iterable of label strings, paired positionally with `image_ids`.

    The grid always has at least 3 rows (the original 3x3 layout) and grows by
    whole rows when more than 9 images are passed, instead of raising from
    `plt.subplot`. Titles are cut to 30 characters; "..." is appended only
    when a label was actually truncated.
    """
    n_cols = 3
    # Materialize the pairs so the count is known before the figure is created.
    pairs = list(zip(image_ids, labels))
    # Ceiling division without importing math; never fewer than the original 3 rows.
    n_rows = max(3, -(-len(pairs) // n_cols))
    # 4 inches per row reproduces the original (16, 12) figure for 3 rows.
    plt.figure(figsize=(16, 4 * n_rows))

    for ind, (image_id, label) in enumerate(pairs):
        plt.subplot(n_rows, n_cols, ind + 1)
        image = cv2.imread(convert_image_id_2_path(image_id))
        # OpenCV loads BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        title = label if len(label) <= 30 else f"{label[:30]}..."
        plt.title(title, fontsize=10)
        plt.axis("off")

    plt.show()

In [None]:
#ref: https://www.kaggle.com/ihelon/molecular-translation-exploratory-data-analysis
def visualize_train_image(image_id, label):
    """Show a single training image with its complete label as the title.

    The PNG for `image_id` is located through `convert_image_id_2_path`, read
    with OpenCV and converted from BGR to RGB before it is drawn.
    """
    plt.figure(figsize=(10, 8))

    source_path = convert_image_id_2_path(image_id)
    rgb_image = cv2.cvtColor(cv2.imread(source_path), cv2.COLOR_BGR2RGB)

    plt.imshow(rgb_image)
    plt.title(f"{label}", fontsize=14)
    plt.axis("off")
    plt.show()

In [None]:
def visualize_image_denoise(image_id):
    """Show the denoised (speckle-filtered) version of one training image.

    The image is read as grayscale, binarized with an inverted threshold at
    127 (so dark strokes become foreground), and every 8-connected foreground
    component with an area below 2 pixels is dropped. The result is drawn as
    black strokes on a white background.

    Args:
        image_id: id of the training image (resolved with
            `convert_image_id_2_path`).
    """
    plt.figure(figsize=(10, 8))
    image = cv2.imread(convert_image_id_2_path(image_id), cv2.IMREAD_GRAYSCALE)
    _, blackAndWhite = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV)
    _, labels, stats, _ = cv2.connectedComponentsWithStats(blackAndWhite, None, None, None, 8, cv2.CV_32S)

    # One boolean per component label: keep it when its pixel area is >= 2.
    keep = stats[:, cv2.CC_STAT_AREA] >= 2   # filter small dotted regions
    keep[0] = False                          # label 0 is the background, never foreground
    # Look the flag up per pixel in one vectorized step (replaces the per-component loop):
    # kept strokes -> 0 (black), everything else -> 255 (white).
    image = np.where(keep[labels], 0, 255).astype(np.uint8)

    # A single-channel array would otherwise be rendered with matplotlib's
    # default colormap; force a fixed grayscale range instead.
    plt.imshow(image, cmap="gray", vmin=0, vmax=255)
    plt.title(f"{image_id}", fontsize=14)
    plt.axis("off")
    plt.show()

In [None]:
# Image visualization

sample_row = df_train_labels.sample(5)
for i in range(5):
    # The frame is indexed by image_id strings, so the label is fetched
    # positionally with .iloc; a bare integer key (series[i]) relies on a
    # deprecated positional fallback.
    visualize_train_image(
        sample_row.index[i], sample_row["InChI"].iloc[i]
    )
    visualize_image_denoise(
        sample_row.index[i]
    )
    # Only the first sampled image is shown; remove the break to view all five.
    break

### Some of the statistics from train data:
*As we can see, each chemical identifier is unique*

In [None]:
# Count how often every InChI string occurs, once, and reuse the result below.
inchi_counts = df_train_labels['InChI'].value_counts()

print('Length of training-data:', len(df_train_labels))
print('Number of unique chemical identifier:', len(inchi_counts.index))
print('Max count of any chemical identifier in training data:', max(inchi_counts.values))

#### Extract image_path and caption to store as key-value pair in a dictionary

In [None]:
# Map every image path to the list of captions (InChI strings) recorded for it.
image_path_to_caption = collections.defaultdict(list)
for image_id, inchi in zip(df_train_labels.index, df_train_labels['InChI']):
    image_path_to_caption[convert_image_id_2_path(image_id)].append(inchi)

#### Sample images from the complete dataset  

In [None]:
# Size of the random training subset used for feature extraction.
NUM_TRAIN_SAMPLES = 50000

image_paths = list(image_path_to_caption.keys())
# In-place shuffle; no seed is set here, so the subset differs between runs.
random.shuffle(image_paths)
# Let us take just the first NUM_TRAIN_SAMPLES shuffled images for training now
train_image_paths = image_paths[:NUM_TRAIN_SAMPLES]
print(len(train_image_paths))

In [None]:
# Flatten the sampled path -> captions mapping into two parallel lists:
# one caption per entry, with its image path repeated alongside it.
train_captions = [
    caption
    for path in train_image_paths
    for caption in image_path_to_caption[path]
]
img_name_vector = [
    path
    for path in train_image_paths
    for _ in image_path_to_caption[path]
]

### Basic EDA

*Extract width and height pixels distribution*

In [None]:
# h_shape=[]
# w_shape=[]
# aspect_ratio=[]
# for image_path in train_image_paths:
#     image = cv2.imread(image_path)
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     h_shape.append(image.shape[0])
#     w_shape.append(image.shape[1])
#     aspect_ratio.append(1.0 * (image.shape[1] / image.shape[0]))

In [None]:
# print("Mean of height, width for a random sample of {} images is: ({}, {}) ".format(len(train_image_paths), sum(h_shape) / len(h_shape), sum(w_shape) / len(w_shape)))

### Set parameters for training and feature extraction

In [None]:
# Set some parameters
im_width = 224   # width (pixels) of the batch arrays and of the model input
im_height = 224  # height (pixels) of the batch arrays and of the model input
border = 5       # NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed
channels = 1     # images are read as grayscale, so batches carry a single channel

#### Keras custom data generator

In [None]:
def _remove_small_components(gray, min_area=2):
    """Binarize a grayscale image and drop connected components smaller than `min_area` pixels.

    Returns a uint8 array of the same height/width holding 0 for kept strokes
    and 255 for everything else.
    """
    _, inverted = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)
    _, labels, stats, _ = cv2.connectedComponentsWithStats(inverted, None, None, None, 8, cv2.CV_32S)
    keep = stats[:, cv2.CC_STAT_AREA] >= min_area   # filter small dotted regions
    keep[0] = False                                 # label 0 is the background
    # Per-pixel lookup of the keep flag; replaces a Python loop over all components.
    return np.where(keep[labels], 0, 255).astype(np.uint8)


def generator(samples, batch_size=32,shuffle_data=True,resize=224):
    """
    Yields the next training batch, forever.

    `samples` is a sequence of image file paths: [image1_filename, image2_filename, ...].
    Each image is read as grayscale, denoised (small connected components
    removed), resized to `resize` x `resize` and scaled to [0, 1].

    Args:
        samples: sequence of image paths; must not be empty.
        batch_size: maximum number of images per batch; the last batch of a
            pass is smaller when len(samples) is not a multiple of it.
        shuffle_data: reshuffle `samples` at the start of every pass when True,
            keep the given order when False.
        resize: side length in pixels of the square output images.

    Yields:
        (X, y): two float32 arrays of shape (batch, resize, resize, channels)
        with identical content — the image is both input and target.

    Raises:
        ValueError: on the first `next()` if `samples` is empty (the loop
            would otherwise spin forever without yielding).
        FileNotFoundError: if an image cannot be read.
    """
    num_samples = len(samples)
    if num_samples == 0:
        raise ValueError("generator() needs at least one sample; got an empty sequence")

    while True: # Loop forever so the generator never terminates
        if shuffle_data:
            samples = shuffle(samples)

        # Get index to start each batch: [0, batch_size, 2*batch_size, ..., max multiple of batch_size <= num_samples]
        for offset in range(0, num_samples, batch_size):
            # Get the samples you'll use in this batch
            batch_samples = samples[offset:offset+batch_size]

            # Allocate with `resize` so the array always matches the resized images.
            X_train = np.zeros((len(batch_samples), resize, resize, channels), dtype=np.float32)

            for n, batch_sample in enumerate(batch_samples):
                # Denoise, resize and normalize images
                img = cv2.imread(batch_sample, cv2.IMREAD_GRAYSCALE)
                if img is None:
                    # cv2.imread signals failure with None rather than raising.
                    raise FileNotFoundError(f"Could not read image: {batch_sample}")
                image = _remove_small_components(img)

                img = cv2.resize(image,(resize,resize))
                img = np.expand_dims(img, axis=-1)
                X_train[n] = img/255.0

            # The generator-y part: yield the next training batch.
            # The target is an independent copy of the input.
            yield X_train, X_train.copy()

In [None]:
# this will create a generator object
# De-duplicate the image paths and sort them so the list order is deterministic.
encode_train = sorted(set(img_name_vector))
train_datagen = generator(encode_train,batch_size=8)

# Smoke test: pull one batch and check that input and target shapes agree.
x,y = next(train_datagen)
print(x.shape, y.shape)

In [None]:
# Split train and valid
# The same path list is passed as features and targets, so X_* and y_* hold identical paths;
# 10% is held out for validation with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(encode_train, encode_train, test_size=0.1, random_state=42)
# Echo the resulting split sizes.
len(X_valid), len(y_valid), len(X_train), len(y_train)

### Unet model structure

In [None]:
def conv2d_block(input_tensor, n_filters, kernel_size = 3, batchnorm = True):
    """Function to add 2 convolutional layers with the parameters passed to it.

    Two identical stages are stacked: Conv2D ('same' padding, he_normal
    initializer) -> optional BatchNormalization -> ReLU. The second stage
    consumes the output of the first one.

    Args:
        input_tensor: Keras tensor the block is applied to.
        n_filters: number of filters of both convolutions.
        kernel_size: side length of the square convolution kernel.
        batchnorm: insert BatchNormalization after each convolution when True.

    Returns:
        The output tensor of the second ReLU.
    """
    x = input_tensor
    # Each stage is fed with the previous stage's output. Wiring the second
    # convolution to `input_tensor` would silently discard the first stage.
    for _ in range(2):
        x = tf.keras.layers.Conv2D(filters = n_filters, kernel_size = (kernel_size, kernel_size),
                  kernel_initializer = 'he_normal', padding = 'same')(x)
        if batchnorm:
            x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation('relu')(x)

    return x

In [None]:
def get_unet(input_img, n_filters = 16, dropout = 0.1, batchnorm = True):
    """Build the U-Net model on top of `input_img` and return it.

    Encoder: four levels with 1x, 2x, 4x and 8x `n_filters`, each one a
    `conv2d_block` followed by 2x2 max pooling and dropout. A 16x block forms
    the bottleneck. Decoder: four levels in reverse order, each one a strided
    transposed convolution, concatenation with the matching encoder output,
    dropout and a `conv2d_block`. A 1x1 sigmoid convolution produces the
    single-channel output.
    """
    layers = tf.keras.layers
    level_multipliers = (1, 2, 4, 8)

    # Contracting path: remember every block output for the skip connections.
    skips = []
    tensor = input_img
    for mult in level_multipliers:
        encoded = conv2d_block(tensor, n_filters * mult, kernel_size = 3, batchnorm = batchnorm)
        skips.append((mult, encoded))
        tensor = layers.MaxPooling2D((2, 2))(encoded)
        tensor = layers.Dropout(dropout)(tensor)

    # Bottleneck
    tensor = conv2d_block(tensor, n_filters = n_filters * 16, kernel_size = 3, batchnorm = batchnorm)

    # Expansive path: walk the stored levels from deepest to shallowest.
    for mult, encoded in reversed(skips):
        upsampled = layers.Conv2DTranspose(n_filters * mult, (3, 3), strides = (2, 2), padding = 'same')(tensor)
        merged = layers.concatenate([upsampled, encoded])
        merged = layers.Dropout(dropout)(merged)
        tensor = conv2d_block(merged, n_filters * mult, kernel_size = 3, batchnorm = batchnorm)

    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(tensor)
    return tf.keras.Model(inputs=[input_img], outputs=[outputs])

In [None]:
# Symbolic model input: single-channel images of im_height x im_width pixels.
input_img = tf.keras.Input((im_height, im_width, 1), name='img')
# U-Net with 16 base filters, 5% dropout and batch normalization in every conv block.
model = get_unet(input_img, n_filters=16, dropout=0.05, batchnorm=True)
# Binary cross-entropy is computed per pixel against the sigmoid output.
model.compile(optimizer=tf.keras.optimizers.Adam(), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Print the layer-by-layer architecture and parameter counts of the compiled model.
model.summary()

In [None]:
callbacks = [
    # Stop training once the monitored validation loss has not improved for 5 epochs.
    tf.keras.callbacks.EarlyStopping(patience=5, verbose=1),
    # Cut the learning rate 10x after 2 epochs without improvement. This patience
    # must be shorter than EarlyStopping's: with equal values training stops in
    # the very epoch the rate would be lowered, so the reduction never has an effect.
    tf.keras.callbacks.ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.00001, verbose=1),
    # Keep only the weights of the best epoch on disk.
    tf.keras.callbacks.ModelCheckpoint('model-unet.h5', verbose=1, save_best_only=True, save_weights_only=True)
]

In [None]:
# Single source of truth for the batch size: used by both generators here and
# by the steps-per-epoch computation in the training cell.
batch_size = 32

train_generator = generator(X_train, batch_size=batch_size)
valid_generator = generator(X_valid, batch_size=batch_size)

### Train the unet model

In [None]:
# Both generators are endless, so the number of batches per epoch must be given
# explicitly. Floor division means a trailing partial batch is not counted in
# the step totals.
# callbacks=callbacks wires in the early stopping, LR schedule and checkpointing
# configured in the `callbacks` list; without it that list has no effect.
model.fit(train_generator,
            steps_per_epoch=len(X_train) // batch_size,
            epochs=10,
            callbacks=callbacks,
            validation_data=valid_generator,
            validation_steps=len(X_valid) // batch_size)

#### Save the model structure and weights for reuse

In [None]:
# serialize model to JSON
# (architecture only; the weights are written separately below)
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
# These are the model's current weights, written to a different file than the
# 'model-unet.h5' path configured for the ModelCheckpoint callback.
model.save_weights("model.h5")
print("Saved model to disk")