In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Only the first four files of every directory are printed.
    for listed_path in (os.path.join(dirname, name) for name in filenames[:4]):
        print(listed_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.layers import Conv2D

In [None]:
class CONFIG:
    """Container for the notebook's settings (paths, image size, training knobs)."""

    def __init__(self):
        # Target (height, width) used when resizing images.
        self.img_size = (224, 224)

        # Dataset root directory and the training metadata CSV.
        self.base = '../input/shopee-product-matching/'
        self.df = '../input/shopee-product-matching/train.csv'

        # Per-replica batch size and validation fraction of the split.
        self.batch_size = 32
        self.val_split = 0.25

        # Seed and epoch budget.
        self.seed = 22
        self.n_epochs = 40


# Single shared configuration instance used by the rest of the notebook.
cfg = CONFIG()

In [None]:
# Load the training metadata CSV from the path configured in cfg.df.
df= pd.read_csv(cfg.df)
# Preview the first rows as the cell output.
df.head()

In [None]:
def load_img(img_id):
    """Read one training image from disk with OpenCV and resize it.

    Args:
        img_id: File name of the image inside ``cfg.base + 'train_images/'``.

    Returns:
        The image as a NumPy array resized to ``cfg.img_size``, with its
        channels converted from OpenCV's BGR order to RGB so that it renders
        with the right colours in ``matplotlib``'s ``imshow``.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file (``cv2.imread``
            signals this by returning ``None`` instead of raising).
    """
    path = cfg.base + 'train_images/' + img_id
    img = cv2.imread(path)
    if img is None:
        # Fail with the offending path instead of an opaque cv2.resize error.
        raise FileNotFoundError(f"Could not read image: {path}")
    img = cv2.resize(img, cfg.img_size)
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
def build_decoder(with_labels=True, target_size=cfg.img_size, ext='jpg'):
    """Create a function that loads an image file into a float tensor.

    Args:
        with_labels: When true, the returned function yields ``(image, image)``
            (the image doubles as its own target); otherwise it yields only
            the image.
        target_size: Size passed to ``tf.image.resize``.
        ext: Image format, one of ``'png'``, ``'jpg'`` or ``'jpeg'``.

    Returns:
        A callable taking a file path and returning the decoded image
        (or the ``(image, image)`` pair, see ``with_labels``).

    Raises:
        ValueError: If ``ext`` is not a supported format. The check runs when
            the decoder is built, so a bad extension is reported immediately
            rather than when the decoder is first applied.
    """
    if ext == 'png':
        decode_bytes = tf.image.decode_png
    elif ext in ('jpg', 'jpeg'):
        decode_bytes = tf.image.decode_jpeg
    else:
        raise ValueError("Image extension not supported")

    def decode(path):
        file_bytes = tf.io.read_file(path)
        img = decode_bytes(file_bytes, channels=3)
        # Convert to float32 and divide by 255 before resizing.
        img = tf.cast(img, tf.float32) / 255.0
        img = tf.image.resize(img, target_size)
        return img

    def decode_with_labels(path):
        x = decode(path)
        return x, x

    return decode_with_labels if with_labels else decode

# Default decoder used by the dataset builder: JPEG input, (image, image) output.
img_decoder = build_decoder(with_labels=True, target_size= cfg.img_size,  ext='jpg')

In [None]:
# TPU or GPU detection
def auto_select_accelerator():
    """Return a ``tf.distribute`` strategy: TPU when one resolves, else mirrored.

    Reference:
        * https://www.kaggle.com/mgornergoogle/getting-started-with-100-flowers-on-tpu
        * https://www.kaggle.com/xhlulu/ranzcr-efficientnet-tpu-training

    Returns:
        The selected strategy object. The number of replicas in sync is
        printed before returning.
    """
    try:
        # Resolve, connect to and initialise the TPU system.
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        print("Running on TPU:", resolver.master())
    except ValueError:
        # A ValueError here is treated as "no TPU available": fall back to
        # the mirrored strategy (GPU or multi-GPU machines).
        strategy = tf.distribute.MirroredStrategy()

    replica_count = strategy.num_replicas_in_sync
    print(f"Running on {replica_count} replicas")
    return strategy

In [None]:
# Random image augmentation pipeline: rotation, horizontal flip,
# translation, zoom and contrast jitter, applied in that order.
data_augmentation = keras.Sequential([
    preprocessing.RandomRotation(factor=[-0.07, 0.07]),
    preprocessing.RandomFlip(mode='horizontal'),
    preprocessing.RandomTranslation(
        height_factor=[-0.1, 0.1],
        width_factor=[-0.1, 0.1],
    ),
    preprocessing.RandomZoom(height_factor=[0, 0.1]),
    preprocessing.RandomContrast(factor=0.05),
])

In [None]:
def augment(img, label):
    """Run one decoded image through ``data_augmentation``.

    Args:
        img: A single image tensor that can be reshaped to
            ``(height, width, 3)`` with ``(height, width) = cfg.img_size``.
        label: Ignored; the augmented image is used as its own target.

    Returns:
        ``(augmented, augmented)``: the augmented image twice, as input and
        as target.
    """
    # Take the spatial size from the config so it stays in sync with the
    # decoder instead of repeating a hard-coded 224.
    height, width = cfg.img_size

    # The augmentation layers operate on batches, so wrap the image in a
    # batch of one and unwrap it afterwards.
    batched = tf.reshape(img, [-1, height, width, 3])
    # training=True is passed explicitly so the random layers are active no
    # matter what training mode Keras would otherwise infer for this call.
    batched = data_augmentation(batched, training=True)
    augmented = tf.reshape(batched, [height, width, 3])
    return augmented, augmented

In [None]:
def Build_dataset(paths, labels= None, apply_aug=False, batch= cfg.batch_size,
                  decode_fn=img_decoder,repeat= True, shuffle= cfg.seed, seed=cfg.seed):
    """Assemble a batched, prefetched ``tf.data`` pipeline from image paths.

    Args:
        paths: Image file paths; sliced element-wise into the dataset.
        labels: Optional labels. When given, the dataset elements are
            ``(path, label)`` pairs and ``decode_fn`` must accept both.
        apply_aug: When true, ``augment`` is mapped over the decoded elements.
        batch: Batch size.
        decode_fn: Callable mapped over every element to load it.
        repeat: When true, the dataset repeats indefinitely.
        shuffle: Shuffle *buffer size*; any falsy value disables shuffling.
            NOTE: the default is ``cfg.seed``, but the value is passed to
            ``Dataset.shuffle`` as the buffer size, not as a seed.
        seed: Random seed handed to ``Dataset.shuffle`` so that shuffling is
            reproducible. Only used when ``shuffle`` is truthy.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    AUTO = tf.data.experimental.AUTOTUNE
    slices = paths if labels is None else (paths, labels)

    dset = tf.data.Dataset.from_tensor_slices(slices)
    dset = dset.map(decode_fn, num_parallel_calls=AUTO)
    if apply_aug:
        dset = dset.map(augment, num_parallel_calls=AUTO)
    if repeat:
        dset = dset.repeat()
    if shuffle:
        # The original stage order (repeat, then shuffle) is kept on purpose.
        dset = dset.shuffle(shuffle, seed=seed)
    dset = dset.batch(batch).prefetch(AUTO)

    return dset

In [None]:
DATASET_NAME = "shopee-product-matching"
strategy = auto_select_accelerator()

# Global batch size: per-replica batch size times the number of replicas.
BATCH_SIZE = cfg.batch_size * strategy.num_replicas_in_sync
tpu_bsize = BATCH_SIZE

# Display the global batch size as the cell output.
tpu_bsize

In [None]:
# Look up the Google Cloud Storage path of the dataset named DATASET_NAME.
GCS_DS_PATH = KaggleDatasets().get_gcs_path(DATASET_NAME)
# Display the resolved path as the cell output.
GCS_DS_PATH

In [None]:
# List the local training images and turn the file names into GCS paths.
# os.listdir returns names in arbitrary order, so the listing is sorted to
# make the seeded split below reproducible across sessions.
train_image_dir = os.path.join(cfg.base, 'train_images')
image_names = sorted(os.listdir(train_image_dir))
img_paths = GCS_DS_PATH + '/train_images/' + pd.Series(image_names)

# Train test split
(train_paths, valid_paths)\
    = train_test_split(img_paths, test_size=cfg.val_split, random_state=11)

print(train_paths.shape, valid_paths.shape)

In [None]:
# Build the tensorflow datasets
# img_gen1: training images without augmentation.
img_gen1 = Build_dataset(train_paths, labels= None, repeat=False, shuffle=False)
# img_gen2: the same training images passed through `augment`.
img_gen2 = Build_dataset(train_paths, labels=None, apply_aug=True, repeat=False, shuffle=False)
# Training stream: all plain batches first, followed by all augmented batches
# (neither part is shuffled or repeated).
img_gen = img_gen1.concatenate(img_gen2)

# Validation stream: un-augmented, single pass, unshuffled.
val_gen = Build_dataset(valid_paths, labels= None, repeat=False, shuffle=False)

In [None]:
# Display the training dataset object as the cell output.
img_gen

In [None]:
# Fetch the first (input, target) batch for a visual sanity check.
# Only one batch is pulled: unpacking `img_gen.take(2)` would decode a second
# batch just to discard it, overwrite `_`, and fail when the dataset holds
# fewer than two batches.
data = next(iter(img_gen))
images = data[0].numpy()

In [None]:
# Shape of the sampled input batch.
images.shape

In [None]:
# Draw up to twelve images from the sampled batch on a 3 x 4 grid.
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(15, 10))
axes = axes.ravel()
for ax, img in zip(axes, images):
    ax.imshow(img, aspect= True)
    ax.set_axis_off()
fig.tight_layout()
plt.show()

In [None]:
pip install efficientunet

In [None]:
# Candidate optimizers, both with learning rate 0.01.
# `sgd` is the one passed to model.compile below; `adam` is defined but not
# referenced anywhere else in this notebook.
sgd = keras.optimizers.SGD(learning_rate=0.01)
adam = keras.optimizers.Adam(learning_rate=0.01)

In [None]:
# Import only the factory that is used instead of a wildcard import.
from efficientunet import get_efficient_unet_b0

with strategy.scope():
    # Input shape follows the configured image size instead of a literal 224.
    input_shape = (*cfg.img_size, 3)
    base_unet = get_efficient_unet_b0(input_shape, pretrained=True, block_type='transpose', concat_input=True)

    # Replace the network's last layer with a 1x1 convolution producing three
    # sigmoid channels, attached to the output of the second-to-last layer.
    out = Conv2D(3, (1,1), activation='sigmoid')(base_unet.layers[-2].output)
    model = keras.Model(inputs=base_unet.input, outputs=out)

    # Freeze every layer that comes before the first layer whose name
    # contains 'blocks_9'; that layer and all later ones stay trainable.
    for layer in model.layers:
        if 'blocks_9' in layer.name:
            break
        layer.trainable = False

    # The optimizer is created inside the strategy scope, next to the model
    # it will update (SGD, learning rate 0.01).
    model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.01),
              loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001),
              # loss=tf.keras.losses.MeanSquaredError(),
              metrics=['accuracy'])

model.summary(line_length=120)

In [None]:
# Set the global git identity (user name and e-mail) for this session.
!git config --global user.name "kaggle-notebook"
!git config --global user.email theinnocentman1@gmail.com

In [None]:
# Clone the kaggle-shopee repository, substituting each line of the credential
# file for {} in the clone URL.
# NOTE(review): the credential becomes part of the URL on the git command
# line; it may also end up in the clone's stored remote URL - confirm and
# avoid sharing that directory or this cell's output.
!xargs -a /kaggle/input/kaggleutilities/github_credential.txt -I {} git clone https://{}@github.com/khiemledev/kaggle-shopee

In [None]:
# Load saved weights into the model from the kaggle-shopee directory
# (presumably the repository cloned in the previous cell).
model.load_weights('./kaggle-shopee/EfficientNetB0_Unet.h5')

In [None]:
#callbacks
# Divide the learning rate by 10 when val_loss has not improved for 3 epochs.
rlr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.1, patience = 3, verbose = 1, 
                                min_delta = 1e-4, min_lr = 1e-6, mode = 'min', cooldown=1)

# Keep only the weights with the lowest val_loss in 'Unet_model.h5'.
ckp = ModelCheckpoint('Unet_model.h5', monitor = 'val_loss',
                      verbose = 1, save_best_only = True, mode = 'min')

# Stop after 7 epochs without val_loss improvement and restore the best weights.
es = EarlyStopping(monitor = 'val_loss', min_delta = 1e-4, patience = 7, mode = 'min', 
                    restore_best_weights = True, verbose = 1)

# One "epoch" covers one twentieth of the full training batches. Keras expects
# an integer step count, so floor division is used (true division produced a
# float) and the result is clamped to at least one step.
steps_per_epoch = max(1, (train_paths.shape[0] // cfg.batch_size) // 20)

In [None]:
# Train on the concatenated plain + augmented stream and validate on val_gen.
# NOTE(review): epochs is hard-coded to 30 here; cfg.n_epochs (40) is not used.
# img_gen does not repeat, so the total number of steps across all epochs
# must not exceed the number of batches it holds.
history = model.fit(img_gen,                      
                    validation_data=val_gen,                                       
                    epochs=30,
                    callbacks=[rlr,es,ckp],
                    steps_per_epoch=steps_per_epoch,
                    verbose=1)

In [None]:
# Evaluate the trained model on the validation dataset.
model.evaluate(val_gen)

In [None]:
# Training versus validation loss per epoch.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.plot(history.history["loss"], label="Training Loss", marker='o')
ax.plot(history.history["val_loss"], label="Validation Loss", marker='+')
ax.grid(True)
ax.legend()
plt.show()

In [None]:
# Number of rows per label_group, largest groups first.
label_group_count = (
    df.groupby(['label_group'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
label_group_count

In [None]:
# Image file names of every row that belongs to label group 1163569239.
img_list = df[df.label_group == 1163569239].image.values

In [None]:
# Load every image of the first group and show at most four side by side.
timages = list(map(load_img, img_list))
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 10))
axes = axes.ravel()
for ax, img in zip(axes, timages):
    ax.imshow(img, aspect= True)
    ax.set_axis_off()
fig.tight_layout()
plt.show()

In [None]:
# Decode the first group's images through the dataset decoder; the decoder
# returns a pair, of which only the first item (the image) is kept.
images = np.array([
    img_decoder(GCS_DS_PATH + '/train_images/{}'.format(file_name))[0]
    for file_name in img_list
])

In [None]:
# Shape of the decoded image array for the first group.
images.shape

In [None]:
# Image file names of every row that belongs to label group 159351600.
img_list2 = df[df.label_group == 159351600].image.values

In [None]:
# Load every image of the second group and show at most four side by side.
timages = list(map(load_img, img_list2))
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(15, 10))
axes = axes.ravel()
for ax, img in zip(axes, timages):
    ax.imshow(img, aspect= True)
    ax.set_axis_off()
fig.tight_layout()
plt.show()

In [None]:
# Decode the second group's images through the dataset decoder; the decoder
# returns a pair, of which only the first item (the image) is kept.
images2 = np.array([
    img_decoder(GCS_DS_PATH + '/train_images/{}'.format(file_name))[0]
    for file_name in img_list2
])

In [None]:
# Shape of the decoded image array for the second group.
images2.shape

In [None]:
# Sub-model sharing the trained model's input but ending at the layer that is
# 48th from the end; its output is what the predictions below compare.
# NOTE(review): -48 is a positional index into model.layers - confirm it still
# points at the intended layer if the architecture changes.
new_model = keras.Model(inputs=model.input, outputs=model.layers[-48].output)
# new_model.summary()

In [None]:
# Intermediate-layer outputs for the first group's images.
y_pred= new_model.predict(images)

In [None]:
# Intermediate-layer outputs for the second group's images.
y_pred2= new_model.predict(images2)

In [None]:
from scipy import spatial
def cosine_distance(a, b):
    """Return the cosine distance between two arrays.

    Both inputs are flattened to one dimension first, so arrays of any
    (matching-size) shape can be compared.
    """
    flat_a, flat_b = a.reshape(-1), b.reshape(-1)
    return spatial.distance.cosine(flat_a, flat_b)

In [None]:
# Outer-loop set for the next histogram: the first group's outputs.
pred_group1 = y_pred

In [None]:
# Cosine distance for every ordered pair taken from `pred_group1` and `y_pred`.
# Iterating a NumPy array yields a fresh view object per row, so an identity
# test (`pred1 is pred2`) never matches and self-pairs (distance 0) would be
# counted. Self-pairs are therefore skipped by position, and only when both
# loops walk the very same array.
same_array = pred_group1 is y_pred
group_distance = [
    cosine_distance(pred1, pred2)
    for i, pred1 in enumerate(pred_group1)
    for j, pred2 in enumerate(y_pred)
    if not (same_array and i == j)
]
plt.ylim([0, 200])
_ = plt.hist(group_distance, bins=50, range=[0, 0.05])

In [None]:
# Outer-loop set for the next histogram: the second group's outputs.
pred_group1 = y_pred2

In [None]:
# Cosine distance for every ordered pair taken from `pred_group1` and `y_pred2`.
# Iterating a NumPy array yields a fresh view object per row, so an identity
# test (`pred1 is pred2`) never matches and self-pairs (distance 0) would be
# counted. Self-pairs are therefore skipped by position, and only when both
# loops walk the very same array.
same_array = pred_group1 is y_pred2
group_distance = [
    cosine_distance(pred1, pred2)
    for i, pred1 in enumerate(pred_group1)
    for j, pred2 in enumerate(y_pred2)
    if not (same_array and i == j)
]
plt.ylim([0, 200])
_ = plt.hist(group_distance, bins=50, range=[0, 0.05])

In [None]:
# Outer-loop set for the cross-group histogram: the second group's outputs,
# which the next cell compares against the first group's (`y_pred`).
pred_group1 = y_pred2

In [None]:
# Cosine distance for every pair made of one row of `pred_group1` and one row
# of `y_pred`.
# The original identity test (`pred1 is pred2`) could never match, because
# iterating a NumPy array yields a fresh view object per row. Positional
# self-pair skipping is applied only if both loops walk the very same array,
# so a genuine cross-group comparison keeps every pair.
same_array = pred_group1 is y_pred
group_distance = [
    cosine_distance(pred1, pred2)
    for i, pred1 in enumerate(pred_group1)
    for j, pred2 in enumerate(y_pred)
    if not (same_array and i == j)
]
plt.ylim([0, 200])
_ = plt.hist(group_distance, bins=50, range=[0, 0.05])