In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print at most the first four file paths per directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names[:4]:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

In [None]:
class CONFIG:
    """Container for the notebook's paths and training settings."""

    def __init__(self):
        # Target image size used by the image loaders.
        self.img_size = (256, 256)
        # Dataset root; the training CSV sits directly beneath it.
        self.base = '../input/shopee-product-matching/'
        self.df = self.base + 'train.csv'
        # Batch size, validation fraction, seed and epoch-count settings.
        self.batch_size = 14
        self.val_split = 0.1
        self.seed = 22
        self.n_epochs = 40


# Single shared configuration instance used by the rest of the notebook.
cfg = CONFIG()

In [None]:
# Read the training CSV whose path is stored in the config.
df= pd.read_csv(cfg.df)
# Preview the first rows.
df.head()

In [None]:
def load_img(img_id):
    """Load a training image from disk, resized to ``cfg.img_size``, in RGB order.

    Args:
        img_id: File name of the image inside ``<cfg.base>train_images/``.

    Returns:
        The resized image array with channels in RGB order, so it can be
        passed directly to ``matplotlib``'s ``imshow``.

    Raises:
        FileNotFoundError: If OpenCV could not read the file. ``cv2.imread``
            signals failure by returning ``None`` rather than raising, which
            would otherwise surface as an obscure error inside ``cv2.resize``.
    """
    path = cfg.base + 'train_images/' + img_id
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    img = cv2.resize(img, cfg.img_size)
    # OpenCV decodes to BGR; matplotlib expects RGB, so swap the channel order.
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
def build_decoder(with_labels=True, target_size=cfg.img_size, ext='jpg'):
    """Create a function that reads an image file and returns a binarised tensor.

    The returned function reads the file at ``path``, decodes it to 3 channels,
    scales it to [0, 1], resizes it to ``target_size`` and thresholds it so
    every value is 1.0 where the resized value exceeds 0.9 and 0.0 elsewhere.

    Args:
        with_labels: If true, the returned function yields ``(x, x)``, i.e. the
            image serves as both input and target. Otherwise it yields ``x`` only.
        target_size: Size passed to ``tf.image.resize``.
        ext: Image format, one of ``'png'``, ``'jpg'`` or ``'jpeg'``.

    Returns:
        A callable taking a file path and returning the decoded tensor (or a
        pair of identical tensors when ``with_labels`` is true).

    Raises:
        ValueError: If ``ext`` is not a supported format. This is checked when
            the decoder is built, so a bad extension fails at the call site
            instead of later inside a dataset ``map``.
    """
    if ext == 'png':
        decode_bytes = tf.image.decode_png
    elif ext in ('jpg', 'jpeg'):
        decode_bytes = tf.image.decode_jpeg
    else:
        raise ValueError("Image extension not supported")

    def decode(path):
        file_bytes = tf.io.read_file(path)
        img = decode_bytes(file_bytes, channels=3)
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)
        # Binarise: keep only the near-white values as 1.0, everything else 0.0.
        img = tf.cast((img > 0.9), tf.float32)
        return img

    def decode_with_labels(path):
        x = decode(path)
        return x, x

    return decode_with_labels if with_labels else decode

# Default decoder used by the dataset builder: JPEG input, (image, image) pairs.
img_decoder = build_decoder(with_labels=True, target_size= cfg.img_size,  ext='jpg')

In [None]:
# TPU or GPU detection
def auto_select_accelerator():
    """Return a distribution strategy: TPU if one is detected, otherwise mirrored.

    TPU detection, cluster connection and system initialisation are attempted
    together; a ``ValueError`` from any of those steps falls back to
    ``tf.distribute.MirroredStrategy``.

    Reference:
        * https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu
        * https://www.kaggle.com/xhlulu/ranzcr-efficientnet-tpu-training

    Returns:
        The selected ``tf.distribute`` strategy object.
    """
    try:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
    except ValueError:
        # No TPU available: mirror across whatever local devices exist.
        strategy = tf.distribute.MirroredStrategy()

    replica_count = strategy.num_replicas_in_sync
    print(f"Running on {replica_count} replicas")
    return strategy

In [None]:
def Build_dataset(paths, labels= None, batch= cfg.batch_size,
                  decode_fn=img_decoder,repeat= True, shuffle= cfg.seed):
    """Build a batched, prefetched ``tf.data.Dataset`` from image paths.

    Args:
        paths: Image file paths to slice into dataset elements.
        labels: Optional labels; when given, elements are ``(path, label)``
            pairs and ``decode_fn`` must accept both.
        batch: Batch size.
        decode_fn: Function mapped over every element to load the image.
        repeat: If true, the dataset repeats indefinitely.
        shuffle: Shuffle buffer size; a falsy value disables shuffling. Note
            that the default is ``cfg.seed``, i.e. that number is used as the
            buffer size, not as a random seed.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    # Shuffle the lightweight path slices before decoding, so the buffer does
    # not hold decoded images, and before repeating, so epochs are not mixed.
    if shuffle:
        dset = dset.shuffle(shuffle)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()

    return dset.batch(batch).prefetch(AUTO)

In [None]:
DATASET_NAME = "shopee-product-matching"

# Pick the distribution strategy, then multiply the configured batch size by
# the number of replicas it reports.
strategy = auto_select_accelerator()
BATCH_SIZE = cfg.batch_size * strategy.num_replicas_in_sync

# Same value under the second name used by this notebook; displayed as cell output.
tpu_bsize = BATCH_SIZE
tpu_bsize

In [None]:
# Ask the Kaggle datasets helper for the GCS path of the dataset named above.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(DATASET_NAME)
# Display the resolved path.
GCS_DS_PATH

In [None]:
# os.listdir returns entries in arbitrary order, so sort the file names first;
# otherwise the seeded split below would not be reproducible across runs.
image_names = sorted(os.listdir(cfg.base + 'train_images/'))
img_paths = GCS_DS_PATH + '/train_images/' + pd.Series(image_names)

# Train/validation split with a fixed random state.
train_paths, valid_paths = train_test_split(
    img_paths, test_size=cfg.val_split, random_state=11)

print(train_paths.shape, valid_paths.shape)

In [None]:
# Build the tensorflow datasets.
# The training dataset must repeat: model.fit is driven by steps_per_epoch, and
# 15 epochs of (number of batches / 10) steps ask for 1.5 passes over the data,
# so a one-shot dataset runs out of batches before training finishes.
img_gen = Build_dataset(train_paths, labels= None, repeat=True, shuffle=False)

# The validation dataset stays finite so each evaluation covers it exactly once.
val_gen = Build_dataset(valid_paths, labels= None, repeat=False, shuffle=False)

In [None]:
# Fetch a single batch for inspection. Only one batch is pulled, and `_` is not
# assigned, so IPython's last-output variable keeps working.
data = next(iter(img_gen.take(1)))
# Each element is an (input, target) pair; keep the input images as a NumPy array.
images = data[0].numpy()

In [None]:
# Shape of the sampled image batch.
images.shape

In [None]:
# Show up to 12 images of the sampled batch on a 3x4 grid.
fig, axes = plt.subplots(3, 4, figsize=(15,10))
axes = axes.flatten()
# Hide the frame of every cell, including any that end up without an image.
for ax in axes:
    ax.axis('off')
for img, ax in zip(images, axes):
    # 'equal' is the documented spelling of the 1:1 aspect that `True` coerced to.
    ax.imshow(img, aspect='equal')
plt.tight_layout()
plt.show()

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.layers import Conv2D, Dropout, MaxPooling2D, Conv2DTranspose, concatenate, SeparableConv2D

In [None]:
def unet_model(input_size = (cfg.img_size[0], cfg.img_size[1], 3)):
    """Build a U-Net style encoder/decoder with a 3-channel sigmoid output.

    Architecture adapted from
    https://github.com/lhelontra/squeeze-unet/blob/master/squeezeunet.py

    The contracting path has six levels (32 up to 1024 filters). Each level is
    convolution -> dropout -> separable convolution, with 2x2 max pooling
    between levels. The expanding path mirrors it with five stride-2 transposed
    convolutions, each concatenated with the matching encoder output and
    followed by convolution -> dropout -> convolution. A final 1x1 sigmoid
    convolution produces 3 channels.

    Args:
        input_size: Shape of one input sample as (height, width, channels).
            Height and width need to be divisible by 32 so the five poolings
            and five upsamplings produce matching shapes for the skip
            concatenations.

    Returns:
        An uncompiled ``keras.Model``.
    """
    conv_args = dict(activation='relu', padding='same', kernel_initializer='he_normal')
    # SeparableConv2D names its initialisers depthwise_initializer and
    # pointwise_initializer; kernel_initializer is not one of its documented
    # arguments, so he_normal has to be passed under these names to take effect.
    sep_args = dict(activation='relu', padding='same',
                    depthwise_initializer='he_normal',
                    pointwise_initializer='he_normal')

    def conv(filters):
        """Return a 3x3 ReLU convolution layer with he_normal initialisation."""
        return Conv2D(filters, (3, 3), **conv_args)

    def sep_conv(filters):
        """Return a 3x3 ReLU separable convolution layer with he_normal initialisation."""
        return SeparableConv2D(filters, (3, 3), **sep_args)

    inp = Input(input_size)

    # Contraction path: (filters, dropout rate, factory for the first convolution).
    encoder_levels = [
        (32, 0.4, conv),
        (64, 0.3, conv),
        (128, 0.2, sep_conv),
        (256, 0.3, sep_conv),
        (512, 0.2, sep_conv),
        (1024, 0.3, sep_conv),
    ]
    level_outputs = []
    x = inp
    for depth, (filters, rate, first_conv) in enumerate(encoder_levels):
        if depth > 0:
            # Halve the spatial resolution before every level except the first.
            x = MaxPooling2D()(x)
        x = first_conv(filters)(x)
        x = Dropout(rate)(x)
        x = sep_conv(filters)(x)
        level_outputs.append(x)

    # The deepest level is the bottleneck; the remaining outputs are skip connections.
    x = level_outputs.pop()

    # Expansive path: (filters, dropout rate), consuming skips from deepest to shallowest.
    decoder_levels = [(512, 0.2), (256, 0.4), (128, 0.3), (64, 0.3), (32, 0.3)]
    for filters, rate in decoder_levels:
        x = Conv2DTranspose(filters, (2, 2), strides=(2, 2), padding='same')(x)
        x = concatenate([x, level_outputs.pop()])
        x = conv(filters)(x)
        x = Dropout(rate)(x)
        x = conv(filters)(x)

    out = Conv2D(3, (1, 1), activation='sigmoid')(x)

    return keras.Model(inputs=inp, outputs=out)
    

In [None]:
# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    model = unet_model()
    reconstruction_loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001)
    model.compile(optimizer='adam', loss=reconstruction_loss, metrics=['accuracy'])

model.summary()

In [None]:
# Callbacks
# Multiply the learning rate by 0.1 after 3 epochs without val_loss improvement.
rlr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 3, verbose = 1,
                        min_delta = 1e-4, min_lr = 1e-6, mode = 'min', cooldown=1)

# Keep only the weights with the best validation loss on disk.
ckp = ModelCheckpoint('Unet_model.h5', monitor = 'val_loss',
                      verbose = 1, save_best_only = True, mode = 'min')

# Stop after 7 epochs without improvement and restore the best weights.
es = EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, patience = 7, mode = 'min',
                   restore_best_weights = True, verbose = 1)

# One "epoch" covers a tenth of the training batches. Keras expects an integer
# step count, so use floor division throughout (true division gave a float)
# and never go below one step.
steps_per_epoch = max(1, train_paths.shape[0] // cfg.batch_size // 10)

In [None]:
# Train on the image dataset, validating after every epoch.
fit_options = dict(
    validation_data=val_gen,
    epochs=15,
    callbacks=[rlr, es, ckp],
    steps_per_epoch=steps_per_epoch,
    verbose=1,
)
history = model.fit(img_gen, **fit_options)

In [None]:
# Training and validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize=(12, 6))
loss_ax.set_xlabel("Epochs")
loss_ax.set_ylabel("Loss")
loss_ax.plot(history.history["loss"], label="Training Loss", marker='o')
loss_ax.plot(history.history["val_loss"], label="Validation Loss", marker='+')
loss_ax.grid(True)
loss_ax.legend()
plt.show()

In [None]:
# File names of the sample images to inspect.
imgs = [
    '7e63048e86bc35e6bf927f00e6746e4e.jpg',
    'bbe4ea3bb36ccf6308d6220edfb13343.jpg',
    '286db00383814bfdb2191d7a9dee6d22.jpg'
]

# One column per sample image, so the grid follows the list length;
# squeeze=False keeps `axes` an array even when there is a single image.
fig, axes = plt.subplots(1, len(imgs), figsize=(15,10), squeeze=False)
axes = axes.flatten()
for img, ax in zip(imgs, axes):
    # 'equal' is the documented spelling of the 1:1 aspect that `True` coerced to.
    ax.imshow(load_img(img), aspect='equal')
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Run the sample images through the training decoder, keeping the first
# element of each returned pair, and stack the results into one array.
sample_paths = [GCS_DS_PATH + '/train_images/' + name for name in imgs]
images = np.array([img_decoder(path)[0] for path in sample_paths])

In [None]:
# Run the trained model on the decoded sample images.
pred= model.predict(images)

In [None]:
# Show the model output for each sample, one column per prediction;
# squeeze=False keeps `axes` an array even for a single prediction.
fig, axes = plt.subplots(1, len(pred), figsize=(15,10), squeeze=False)
axes = axes.flatten()
for img, ax in zip(pred, axes):
    # 'equal' is the documented spelling of the 1:1 aspect that `True` coerced to.
    ax.imshow(img, aspect='equal')
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
# Show the predictions rescaled to 0..255. imshow only accepts that range for
# integer data (float RGB data is clipped to [0, 1], which rendered almost
# everything white), so convert to uint8 after scaling.
pred_uint8 = np.clip(pred * 255, 0, 255).astype(np.uint8)

fig, axes = plt.subplots(1, len(pred_uint8), figsize=(15,10), squeeze=False)
axes = axes.flatten()
for img, ax in zip(pred_uint8, axes):
    ax.imshow(img, aspect='equal')
    ax.axis('off')
plt.tight_layout()
plt.show()

In [None]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between two equally shaped arrays.

    Inputs are converted to float64 before subtracting, so integer images
    (e.g. uint8) do not overflow when the differences are squared, and plain
    sequences are accepted as well.

    Args:
        a: First array-like.
        b: Second array-like, broadcast-compatible with ``a``.

    Returns:
        The square root of the summed squared element-wise differences.
    """
    diff = np.asarray(a, dtype=np.float64) - np.asarray(b, dtype=np.float64)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
from scipy import spatial

def cosine_distance(a, b):
    """Return SciPy's cosine distance between two arrays, each flattened to 1-D."""
    vec_a = a.flatten()
    vec_b = b.flatten()
    return spatial.distance.cosine(vec_a, vec_b)

In [None]:
# Pairwise Euclidean distance between every unordered pair of predictions.
print('Euclidean distance between image:')
n_pred = len(pred)
index_pairs = [(i, j) for i in range(n_pred - 1) for j in range(i + 1, n_pred)]
for i, j in index_pairs:
    print(f"{i} and {j}: {euclidean_distance(pred[i], pred[j])}")

In [None]:
# Pairwise cosine distance between every unordered pair of predictions.
print('Cosine distance between:')
n_pred = len(pred)
index_pairs = [(i, j) for i in range(n_pred - 1) for j in range(i + 1, n_pred)]
for i, j in index_pairs:
    print(f"Image {i} and image {j}: {cosine_distance(pred[i], pred[j])}")