# library

In [None]:
import json
import random
random.seed(27)
from functools import partial
from collections import defaultdict
from multiprocessing import Pool

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
import cv2
import albumentations as A

In [None]:
import tensorflow as tf
# import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
from transformers import BertTokenizer

# Data

## Load csv

In [None]:
# Root directory of the competition data; file names below are appended to this prefix.
path = '/kaggle/input/shopee-product-matching/'

In [None]:
# Load the training metadata table and preview its first rows.
train_csv_file = path + 'train.csv'
train_csv = pd.read_csv(train_csv_file)
train_csv.head()

In [None]:
# Pull the columns used downstream out of the metadata table.
train_img_name, train_title, train_label = (
    train_csv[column] for column in ('image', 'title', 'label_group')
)
# Full path of every training image (directory prefix + file name).
train_img_path = '/kaggle/input/shopee-product-matching/train_images/' + train_csv['image']


In [None]:
# Sanity check: display the full path built for the first training image.
train_img_path[0]

Title property

In [None]:
# Number of whitespace-separated words in every title, computed once and summarised.
title_word_counts = train_title.map(lambda title: len(title.split()))
print("Max words: {}".format(title_word_counts.max()))
print("Min words: {}".format(title_word_counts.min()))
print("Mean words: {}".format(title_word_counts.mean()))

Since the longest title has 61 words and tokenization produces extra tokens on top of the words, every title is padded/truncated to 300 tokens to be safe.

In [None]:
# Fixed token length that every title is padded / truncated to.
text_token_dims = 300

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
title_encoding = tokenizer(
    train_title.tolist(),
    padding='max_length',
    truncation=True,
    max_length=text_token_dims,
)
# Keep only the token ids of each title as one unsigned-integer array.
train_title_token = np.array(title_encoding['input_ids'], dtype=np.uint32)

## Load image

In [None]:
#train_img = (np.array(list(map(lambda x: cv2.imread(path+'train_images/'+x), train_img_name))))

Loading every image eagerly crashes because there is not enough memory, so the data loading method is changed to lazy loading.

# Data Generator

## Triplet Data Problems

In [None]:
# Compare the number of samples with the number of distinct label groups.
# The unique count must be taken over the labels, not over the image names.
print('Number of training data = {}'.format(len(train_img_name)))
print('Number of unique label = {}'.format(len(set(train_label))))

In [None]:
def count_label(train_label):
    """Print and plot how many label groups contain a given number of images.

    Args:
        train_label: array-like of label ids, one entry per image.

    Prints a dict mapping "images per label group" -> "number of label groups
    of that size" and draws the same relation as a line plot.
    """
    # First pass: how many images belong to each label.
    _, images_per_label = np.unique(train_label, return_counts=True)
    # Second pass: how many labels share each group size.
    group_sizes, num_labels = np.unique(images_per_label, return_counts=True)
    print(dict(zip(group_sizes, num_labels)))
    # Plot against the actual group size so the x axis matches its label.
    plt.plot(group_sizes, num_labels)
    plt.title("Distribution of the number of images in each labels class")
    plt.xlabel("the number of images in each labels class")
    plt.ylabel("Count")
    plt.show()
count_label(train_label)

One problem is that most label groups contain only 2-3 images, which means the variance among positive images is much lower than among negative images. One solution is to use strongly augmented versions of those images as the positive images.

## Triplet Data 

In [None]:
class TripletDataGenerator():
    """Builds (anchor, positive, negative) image/title triplets on demand.

    Images are read from disk only when the generator is called with an anchor
    index, so the whole image set never has to be held in memory.
    """

    def __init__(self, imgs_path, title_token, labels, text_token_dims = text_token_dims, resize = None):
        """
        Args:
            imgs_path: image file paths, indexable by sample index.
            title_token: tokenized titles, indexable by sample index.
            labels: label ids, shape (num, ), indexable by sample index.
            text_token_dims: accepted for interface compatibility; not used here.
            resize: if not None, side length of the square all images are resized to.
        """
        self.imgs_path = imgs_path
        self.labels = labels
        self.resize = resize

        self.title_token = title_token

        # label -> list of [image path, title tokens] for every sample with that label
        self.Y2X = defaultdict(list)
        for i in range(len(labels)):
            self.Y2X[labels[i]].append([imgs_path[i], self.title_token[i]])
        self.Y2X_keys = list(self.Y2X.keys())

    def _augmentation_pipeline(self, img, p, crop_p):
        """Build the augmentation pipeline shared by ``augment`` and ``augment_st``.

        Args:
            img: image the pipeline will be applied to; its size fixes the crop size.
            p: probability used for every transform except the random crop.
            crop_p: probability of the random crop.
        """
        return A.Compose([
            A.Rotate(limit = (-20,20), p=p),
            A.RandomScale(scale_limit = (0.7, 0.9), p=p),
            # crop to three quarters of the original width and height
            A.RandomCrop(width=img.shape[1]*3//4, height=img.shape[0]*3//4, p = crop_p),
            A.JpegCompression(quality_lower=90, quality_upper=100, p=p),
            A.GaussNoise(var_limit=(20.0, 50.0), p=p),
            A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=p),
            A.RandomGamma(gamma_limit=(171, 200), p=p)
        ])

    def augment(self, img):
        """Apply the milder augmentation (p=0.5 per transform, p=0.7 for the crop)."""
        return self._augmentation_pipeline(img, p=0.5, crop_p=0.7)(image=img)["image"]

    def augment_st(self, img):
        """Apply the stronger augmentation (p=0.8 for every transform)."""
        return self._augmentation_pipeline(img, p=0.8, crop_p=0.8)(image=img)["image"]

    @staticmethod
    def _read_rgb(img_path):
        """Read an image from disk and reorder its channels from BGR to RGB.

        Raises:
            FileNotFoundError: if OpenCV could not read the file (``imread`` returned None).
        """
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        return img[...,[2,1,0]]

    def __call__(self, anchor_idx):
        """Create one triplet for the sample at ``anchor_idx``.

        The positive is drawn from the samples sharing the anchor's label, the
        negative from a randomly chosen different label.

        Returns:
            [anchor_img, positive_img, negative_img,
             anchor_title, Positive_title, Negative_title]

        Raises:
            ValueError: if fewer than two distinct labels exist, so no negative can be drawn.
            FileNotFoundError: if one of the images cannot be read.
        """
        if len(self.Y2X_keys) < 2:
            # Without this guard the negative-label loop below would never terminate.
            raise ValueError("At least two distinct labels are required to sample a negative")

        anchor_label = self.labels[anchor_idx]
        anchor_path = self.imgs_path[anchor_idx]

        positive_path, Positive_title = random.choice(self.Y2X[anchor_label])

        # Re-draw until the negative label differs from the anchor's label.
        negative_label = random.choice(self.Y2X_keys)
        while negative_label == anchor_label:
            negative_label = random.choice(self.Y2X_keys)
        negative_path, Negative_title = random.choice(self.Y2X[negative_label])

        anchor_img = self._read_rgb(anchor_path)
        positive_img = self._read_rgb(positive_path)
        if anchor_path == positive_path:
            # The positive is the anchor image itself: augment strongly so the pair is not identical.
            positive_img = self.augment_st(positive_img)
        else:
            positive_img = self.augment(positive_img)
        negative_img = self._read_rgb(negative_path)

        if self.resize is not None:
            dim = (self.resize,self.resize)
            anchor_img = cv2.resize(anchor_img, dim, interpolation = cv2.INTER_AREA)
            positive_img = cv2.resize(positive_img, dim, interpolation = cv2.INTER_AREA)
            negative_img = cv2.resize(negative_img, dim, interpolation = cv2.INTER_AREA)

        anchor_title = self.title_token[anchor_idx]

        return [anchor_img, positive_img, negative_img, anchor_title, Positive_title, Negative_title]

# Generator instance used below to preview a few triplets.
example_Generator = TripletDataGenerator(
    imgs_path=train_img_path,
    title_token=train_title_token,
    labels=train_label,
    text_token_dims=text_token_dims,
    resize=224,
)

In [None]:
def show_triplet(x, num = 5):
    """Plot the triplets generated for the first ``num`` anchor indices.

    Args:
        x: callable mapping an anchor index to
           [anchor_img, positive_img, negative_img, anchor_title, positive_title, negative_title].
        num: number of triplets (figure columns) to show.

    Anchors are drawn in the first row, positives in the second and negatives
    in the third; the title tokens of each image are printed as it is drawn.
    """
    plt.figure(figsize=(20,12))
    for i in range(num):
        anchor_img, positive_img, negative_img, anchor_title, Positive_title, Negative_title = x(i)
        panels = (
            ('Anchor', anchor_img, anchor_title),
            ('Positive', positive_img, Positive_title),
            ('Negative', negative_img, Negative_title),
        )
        for row, (caption, img, title_tokens) in enumerate(panels):
            # subplot indices are 1-based and run row by row
            plt.subplot(3, num, row*num + i + 1)
            plt.imshow(img)
            plt.title(caption)
            print(title_tokens)
        print('-------------------')
    plt.show()
show_triplet(example_Generator)

## Train test split

The validation set needs to be small in order to fit in memory. (Currently no validation split is made: all samples are used for training.)

In [None]:
# One integer index per training sample; no hold-out split is made here,
# so the "split" names simply alias the full index array and label column.
train_img_idx = np.arange(train_label.shape[0])
train_img_idx_split = train_img_idx
train_label_split = train_label

## Train set pipeline

In [None]:
# Dataset of anchor sample indices. It is shuffled (and reshuffled on every pass)
# so that batches are not always built from the same anchors in file order.
## If tuning, add [:10] to train_img_idx_split for a quick run
train_idx_dataset = tf.data.Dataset.from_tensor_slices(train_img_idx_split).shuffle(
    buffer_size=len(train_img_idx_split),  # full-length buffer -> uniform shuffle
    seed=27,
)


Base model |resolution
 --- | ---
EfficientNetB0 | 224
EfficientNetB1 | 240
EfficientNetB2 | 260
EfficientNetB3 | 300
EfficientNetB4 | 380
EfficientNetB5 | 456
EfficientNetB6 | 528
EfficientNetB7 | 600

ref: https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/

We are going to use B0, so the input resolution is 224.

In [None]:
image_size = 224  # side length of the square images fed to the network
batch_size = 40  ##Depends on GPU memory size

# Generator used by the training pipeline; images are resized to image_size.
TripletGenerate = TripletDataGenerator(
    train_img_path,
    train_title_token,
    train_label,
    text_token_dims=text_token_dims,
    resize=image_size,
)

In [None]:
def GetData(idx, size):
    """Turn an anchor index tensor into a dict holding one full triplet.

    Args:
        idx: scalar tensor with the anchor sample index.
        size: side length of the square images produced by ``TripletGenerate``.

    Returns:
        dict with the keys "Anchor_img", "Positive_img", "Negative_img"
        (uint8 tensors) and "Anchor_title", "Positive_title", "Negative_title"
        (uint32 tensors).
    """
    Triplet = tf.numpy_function(
        func=TripletGenerate,
        inp=[idx],
        Tout=[tf.uint8, tf.uint8, tf.uint8, tf.uint32, tf.uint32, tf.uint32],
    )
    # Restore the static shapes of every output, titles included, so downstream
    # layers do not receive tensors of unknown shape.
    for img in Triplet[:3]:
        img.set_shape((size,size,3))
    for title in Triplet[3:]:
        title.set_shape((text_token_dims,))
    return {"Anchor_img": Triplet[0], "Positive_img": Triplet[1], "Negative_img": Triplet[2], "Anchor_title": Triplet[3], "Positive_title": Triplet[4], "Negative_title": Triplet[5]}


In [None]:
# Map every anchor index to a full triplet, then batch and prefetch.
load_triplet = partial(GetData, size=image_size)
TripletDataSet_train = (
    train_idx_dataset
    .map(load_triplet, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Pull a single batch out of the training pipeline for inspection.
train_batch_iterator = iter(TripletDataSet_train)
test_batch = next(train_batch_iterator)

In [None]:
def show_batch(batch, num = 5):
    """Plot the first ``num`` triplets of a batch.

    Args:
        batch: dict with the image entries "Anchor_img", "Positive_img" and
            "Negative_img", each indexable along its first axis.
        num: maximum number of triplets (figure columns) to show; clamped to
            the number of anchors in the batch.

    Anchors are drawn in the first row, positives in the second and negatives
    in the third.
    """
    # Never index past the end of a batch that holds fewer than ``num`` samples.
    num = min(num, len(batch['Anchor_img']))
    rows = (('Anchor_img', 'Anchor'), ('Positive_img', 'Positive'), ('Negative_img', 'Negative'))
    plt.figure(figsize=(20,12))
    for i in range(num):
        for row, (key, caption) in enumerate(rows):
            # subplot indices are 1-based and run row by row
            plt.subplot(3, num, row*num + i + 1)
            plt.imshow(batch[key][i])
            plt.title(caption)
    plt.show()
show_batch(test_batch)

# SiameseNet

## Triplet Loss

In [None]:
class TripletLossLayer(layers.Layer):
    """Layer that computes the triplet loss of three embeddings and registers it.

    Args:
        alpha: margin added to (positive distance - negative distance).
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, alpha, **kwargs):
        # Initialise the base layer before attaching attributes to it.
        super(TripletLossLayer, self).__init__(**kwargs)
        self.alpha = alpha

    def triplet_loss(self, inputs):
        """Return the hinge triplet loss summed over the first (batch) axis.

        Args:
            inputs: sequence (anchor, positive, negative) of embedding tensors.
        """
        # Normalise along the same (last) axis the distances are reduced over.
        anchor, positive, negative = (tf.math.l2_normalize(emb, axis=-1) for emb in inputs)

        p_dist = tf.math.reduce_sum(tf.math.square(anchor-positive), axis=-1)
        n_dist = tf.math.reduce_sum(tf.math.square(anchor-negative), axis=-1)
        return tf.math.reduce_sum(tf.math.maximum(p_dist - n_dist + self.alpha, 0), axis=0)

    def call(self, inputs):
        """Compute the loss, add it to the layer's losses and return it."""
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

    def get_config(self):
        """Include ``alpha`` in the config so the layer can be re-created from it."""
        config = super(TripletLossLayer, self).get_config()
        config.update({"alpha": self.alpha})
        return config

## Transformer 

embedding layer

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: number of positions the position embedding table holds.
        vocab_size: number of entries in the token embedding table.
        embed_dim: size of every embedding vector.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super(TokenAndPositionEmbedding, self).__init__(**kwargs)
        # Kept so that get_config can reproduce the constructor arguments.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Positions 0 .. seq_len-1, taken from the last axis of the input.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Include the constructor arguments so the layer can be re-created from its config."""
        config = super(TokenAndPositionEmbedding, self).get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [None]:
# Stand-alone embedding layer (embedding size 50) used to try the layer on one title.
token_layer = TokenAndPositionEmbedding(
    maxlen=text_token_dims,
    vocab_size=tokenizer.vocab_size,
    embed_dim=50,
)

In [None]:
# Display the token ids of the first anchor title in the sample batch.
test_batch['Anchor_title'][0]

In [None]:
# Run the embedding layer on that title to inspect its output.
token_layer(test_batch['Anchor_title'][0])

Transformer block

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network, each with dropout,
    a residual connection and layer normalization.

    Args:
        embed_dim: size of the input/output vectors (also used as attention key_dim).
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        rate: dropout rate.
        **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so that get_config can reproduce the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` is forwarded to the dropout layers; it defaults to None so
        the layer can also be called without the argument.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Include the constructor arguments so the layer can be re-created from its config."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

## Network build

In [None]:
def model_builder(resolution, text_token_dims = text_token_dims, vocab_size = tokenizer.vocab_size, embedding_size = 128, margin = 1):
    """Build the embedding network and the triplet network that trains it.

    Args:
        resolution: side length of the square RGB input images.
        text_token_dims: number of tokens in every title input.
        vocab_size: size of the token vocabulary.
        embedding_size: size of the joint image+title embedding.
        margin: margin (alpha) of the triplet loss.

    Returns:
        (BackBone, Triplet_Net): ``BackBone`` maps [image, title] to an
        embedding; ``Triplet_Net`` applies the same ``BackBone`` to anchor,
        positive and negative inputs, outputs the triplet loss and is already
        compiled with RMSprop.
    """
    ##IMAGE
    BackBone_img = keras.applications.EfficientNetB0(
        include_top=False, weights='imagenet',
        pooling = 'avg',
        input_tensor=layers.Input((resolution,resolution,3), name = 'image')
    )
    net_image_flatten = layers.Flatten()(BackBone_img.output)

    ##TEXT
    input_text_embed_dim = 70 # Embedding size for each token
    num_heads = 3 # Number of attention heads
    out_dim = 100 # Hidden layer size in feed forward network inside transformer

    inputs = layers.Input(shape=(text_token_dims,), name = 'title')
    embedding_layer = TokenAndPositionEmbedding(text_token_dims, vocab_size, input_text_embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(input_text_embed_dim, num_heads, out_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)

    #Connect two network
    net_concate = layers.Concatenate()([net_image_flatten, x])

    output = layers.Dense(embedding_size)(net_concate)

    BackBone = keras.Model(inputs=[BackBone_img.input, inputs], outputs=output, name = 'EfficientNetB0_SimpleTransformer')

    # Triplet inputs; the names are the keys under which the data is fed.
    img_input_anchor = layers.Input((resolution,resolution,3), name='Anchor_img')
    img_input_positive = layers.Input((resolution,resolution,3), name='Positive_img')
    img_input_negative = layers.Input((resolution,resolution,3), name='Negative_img')

    title_input_anchor = layers.Input((text_token_dims,), name='Anchor_title')
    title_input_positive = layers.Input((text_token_dims,), name='Positive_title')
    title_input_negative = layers.Input((text_token_dims,), name='Negative_title')

    # The same BackBone instance (shared weights) embeds all three members of the triplet.
    anchor_embedding= BackBone([img_input_anchor, title_input_anchor])
    positive_embedding = BackBone([img_input_positive, title_input_positive])
    negative_embedding = BackBone([img_input_negative, title_input_negative])

    loss_layer = TripletLossLayer(alpha=margin, name='triplet_loss_layer')([anchor_embedding, positive_embedding, negative_embedding])
    Triplet_Net = keras.Model(inputs=[img_input_anchor, title_input_anchor, img_input_positive, title_input_positive, img_input_negative, title_input_negative], outputs=loss_layer)
    # No loss is passed to compile: the loss comes from TripletLossLayer.add_loss.
    Triplet_Net.compile(optimizer=keras.optimizers.RMSprop())
    return BackBone, Triplet_Net

Embedding_Net, Triplet_Net = model_builder(image_size)

In [None]:
# Layer-by-layer overview of the triplet training network.
Triplet_Net.summary()

In [None]:
# Draw the graph of the triplet network, annotated with tensor shapes.
keras.utils.plot_model(Triplet_Net, show_shapes=True)

In [None]:
# Layer-by-layer overview of the embedding network returned by model_builder.
Embedding_Net.summary()

In [None]:
# Draw the graph of the embedding network, annotated with tensor shapes.
keras.utils.plot_model(Embedding_Net, show_shapes=True)

# Training

In [None]:
!mkdir -p /kaggle/working/checkpoint
# Checkpoint file name pattern: epoch number and training loss are filled in on saving.
checkpoint_file_pattern = '/kaggle/working/checkpoint/{epoch:02d}-{loss:.6f}.hdf5'
# Weights-only checkpoints, written regardless of whether the loss improved.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_file_pattern,
    monitor='loss',
    save_weights_only=True,
    save_best_only=False,
)

In [None]:
# Train the triplet network on the training pipeline with the checkpoint callback attached.
History = Triplet_Net.fit(
    TripletDataSet_train,
    epochs=8,
    callbacks=[model_checkpoint_callback],
)

In [None]:
def plot_loss(history):
    """Plot the training loss stored in ``history.history['loss']`` per epoch.

    Epochs are numbered from 1 on the x axis.
    """
    train_loss = history.history['loss']
    epoch_numbers = range(1, len(train_loss) + 1)

    plt.figure(figsize=(10,5))
    plt.subplot(1, 2, 1)
    plt.plot(epoch_numbers, train_loss, label='train loss')
    plt.xlabel('Epoch')
    plt.ylabel('loss')
    plt.xticks(epoch_numbers)
    plt.legend()
    plt.grid(True)
plot_loss(History)