# Introduction

![Shopee](https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Shopee.svg/1200px-Shopee.svg.png)

## **Description of competition**

Two different images of similar wares may represent the same product or two completely different items. Retailers want to avoid misrepresentations and other issues that could come from conflating two dissimilar products. Currently, a combination of deep learning and traditional machine learning analyzes image and text information to compare similarity. But major differences in images, titles, and product descriptions prevent these methods from being entirely effective. In this competition, you’ll apply your machine learning skills to build a model that predicts which items are the same products.

## **About Shopee**

### Website : [Shopee](https://shopee.com/)
Shopee is the leading e-commerce platform in Southeast Asia and Taiwan. Customers appreciate its easy, secure, and fast online shopping experience tailored to their region. The company also provides strong payment and logistical support along with a 'Lowest Price Guaranteed' feature on thousands of Shopee's listed products.

In [None]:
######################
##installing EfficientNet models
######################
!/opt/conda/bin/python3.7 -m pip install --upgrade pip
! pip install -q efficientnet

In [None]:
########################
##importing necessary libraries
#########################
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import PIL
import efficientnet.tfkeras as efn
from kaggle_datasets import KaggleDatasets

# Checking for TPU

In [None]:
# Shorthand for tf.data's autotune constant, used by the input pipeline.
AUTO = tf.data.experimental.AUTOTUNE

# Try to locate a TPU; a ValueError from the resolver is treated as
# "no TPU available".
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Data Visualization

In [None]:
# Remote (GCS) location of the competition data, as reported by Kaggle.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
TRAIN_PATH = GCS_DS_PATH + "/train_images/"

train_df = pd.read_csv("../input/shopee-product-matching/train.csv")

# Re-encode label_group as consecutive integers, numbered in the order in
# which pandas' unique() returns the groups.
# NOTE(review): the two dict names read inverted relative to their contents:
# `label2id` maps index -> original label_group, and `id2label` maps original
# label_group -> index. The names are left unchanged so that any existing
# references keep working.
unique_groups = train_df.label_group.unique()
label2id = dict(enumerate(unique_groups))
id2label = {group: idx for idx, group in enumerate(unique_groups)}
train_df["label_group"] = train_df["label_group"].map(id2label)

# Index the rows by image file name.
train_df.index = train_df["image"]

train_df.head()

In [None]:
# Exploratory plots (image height/width histograms and unique-value counts per
# column). The code is disabled by wrapping it in a string literal, so only the
# print() at the bottom of this cell actually runs.
# NOTE(review): inside the disabled code, `num_ids` is computed from
# `label_group` although it is plotted under the "Posting Id" bar -- confirm
# whether `posting_id` was intended before re-enabling.
# NOTE(review): the disabled code opens images with PIL from TRAIN_PATH, which
# is built from a GCS path above -- confirm PIL can read that location.
'''h_ls,w_ls = [],[]
for i,img in enumerate(train_df.image):
    im = PIL.Image.open(os.path.join(TRAIN_PATH,img))
    h_ls.append(im.height)
    w_ls.append(im.width)
    
train_df["height"] = h_ls
train_df["width"]  = w_ls
num_labels = train_df.label_group.nunique()
num_ids = train_df.label_group.nunique()
num_imgs = train_df.image.nunique()
num_phashs = train_df.image_phash.nunique()
num_titles = train_df.title.nunique()


plt.figure(figsize=(18,5))
plt.subplot(1,3,1)
plt.title("Histogram of Image Height",fontweight ="bold")
plt.xlabel("Image Height")
plt.ylabel("Number of Images")
plt.bar(np.array(train_df.height.value_counts().index),train_df.height.value_counts().values)
plt.xlim(80,1100)
plt.ylim(0,2000)


plt.subplot(1,3,2)
plt.title("Histogram of Image Width",fontweight ="bold")
plt.xlabel("Image width")
plt.ylabel("Number of Images")
plt.bar(np.array(train_df.width.value_counts().index),train_df.width.value_counts().values)
plt.xlim(80,1100)
plt.ylim(0,2000)

plt.subplot(1,3,3)
plt.bar(["Label","Posting Id","Image","Image_Phash","Title"],[num_labels,num_ids,num_imgs,num_phashs,num_titles])
plt.title("Unique values",fontweight ="bold")
plt.xlabel("Columns")
plt.ylabel("Num Unique values")
plt.show()'''
print("Time Taking")

In [None]:
#------------------
##Here I have done grouping of training id's based on labels.
#------------------

# Give every row a running integer id, then attach to each row the full list
# of posting ids / running ids that share its label_group.
train_df["posting_label"] = np.arange(len(train_df))

for source_col, target_col in (("posting_id", "posting_ids"),
                               ("posting_label", "posting_labels")):
    # label_group -> array of unique values of `source_col` within that group
    tmp = train_df.groupby('label_group')[source_col].agg('unique').to_dict()
    train_df[target_col] = train_df.label_group.map(tmp).apply(list)

# Keep a single row per image file (the group lists above were computed
# before this de-duplication and are left unchanged).
train_df = train_df.drop_duplicates(subset=['image'])
train_df.head()

In [None]:
##############################
###Initializing necessary constants
##############################

# Number of distinct (re-encoded) label groups.
NUM_CLASSES = train_df['label_group'].nunique()
# Global batch size: 16 samples per replica of the distribution strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Target image size and channel count used by the input pipeline and the model.
HEIGHT, WIDTH = 512, 512
CHANNELS = 3
# Number of samples assigned to training (80%); the rest is validation.
SPLIT = int(0.8 * len(train_df))
AUTO = tf.data.experimental.AUTOTUNE
# Whole batches per epoch for training and validation respectively.
STEPS_PER_EPOCH = SPLIT // BATCH_SIZE
VALID_STEPS = (len(train_df) - SPLIT) // BATCH_SIZE
# NOTE(review): SEED is defined here but not referenced in the visible code.
SEED = 143

In [None]:
##############################
###Display Samples
##############################

# Index rows by posting id so display_img() can look postings up by id
# (the image-name index is restored after the samples are shown).
train_df.index = train_df["posting_id"]

def filepath_to_arr(filepath):
    """Load the image at `filepath`, resized to (HEIGHT, WIDTH), as an array divided by 255."""
    keras_image = tf.keras.preprocessing.image
    pil_img = keras_image.load_img(filepath, target_size=(HEIGHT, WIDTH))
    return keras_image.img_to_array(pil_img) / 255.

def display_img(training_ids):
    """Show the images of the given postings side by side in one row.

    Args:
        training_ids: iterable of posting ids. ``train_df`` must be indexed
            by posting id when this function is called.

    Posting ids that are not present in ``train_df.index`` are skipped instead
    of raising ``KeyError``: the per-row ``posting_ids`` lists are built before
    duplicate images are dropped from ``train_df``, so they can name postings
    whose rows no longer exist. Nothing is drawn if no id can be resolved.
    """
    known_ids = [_id for _id in training_ids if _id in train_df.index]
    if not known_ids:
        return

    num_imgs = len(known_ids)
    plt.figure(figsize=(5 * num_imgs, 10))
    for i, _id in enumerate(known_ids):
        # One subplot column per image (previously one extra, empty column
        # was reserved).
        plt.subplot(1, num_imgs, i + 1)

        filepath = os.path.join("../input/shopee-product-matching/train_images", train_df.loc[_id]["image"])
        plt.title("Image : " + str(i + 1))
        plt.imshow(filepath_to_arr(filepath))
        plt.axis("off")
    plt.show()

# For each of the first five rows, show the images of all postings listed in
# that row's `posting_ids`.
for j in range(5):
    display_img(train_df.iloc[j]["posting_ids"])
    

# Switch the index back to the image file name.
train_df.index = train_df["image"]

# Data Pipeline

In [None]:
#------------------
##processing image
#------------------
def process_img(filepath,label):
    """Read the JPEG at `filepath` and return (image, label).

    The image is decoded with CHANNELS channels, converted to float32 and
    resized to [HEIGHT, WIDTH]; the label is passed through unchanged.
    """
    raw_bytes = tf.io.read_file(filepath)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    resized = tf.image.resize(as_float, [HEIGHT, WIDTH])
    return resized, label


#-----------------------------------
##adding augmentations to image data
#-----------------------------------
def data_augment(image, label):
    """Apply a random chain of augmentations to `image`; `label` is returned unchanged.

    Six independent uniform draws in [0, 1) decide which transforms fire:
    flips/transpose, a 90-degree-multiple rotation, saturation / contrast /
    brightness jitter, and a central or random crop. The result is always
    resized back to [HEIGHT, WIDTH].

    NOTE(review): the pixel-level jitters are not followed by a clip, so values
    may presumably leave the range produced by the decoding step -- confirm
    whether the model is expected to tolerate that.
    """
    # One random draw per augmentation family.
    p_spatial = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_1 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_2 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_3 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_crop = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
            
    # Flips
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    # Transpose only when the draw exceeds 0.75.
    if p_spatial > .75:
        image = tf.image.transpose(image)
        
    # Rotates
    # The draw is split into four equal bands: k=3, k=2, k=1 quarter turns,
    # or (draw <= 0.25) no rotation.
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3) 
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2) 
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1) 
        
    
    # Colour jitter: each of the three is applied when its own draw is >= 0.4.
    if p_pixel_1 >= .4:
        image = tf.image.random_saturation(image, lower=.7, upper=1.3)
    if p_pixel_2 >= .4:
        image = tf.image.random_contrast(image, lower=.8, upper=1.2)
    if p_pixel_3 >= .4:
        image = tf.image.random_brightness(image, max_delta=.1)
        
    
    # Cropping: draw > 0.7 -> central crop keeping 70% / 80% / 90% of the image
    # (for draws > 0.9, > 0.8, otherwise); 0.4 < draw <= 0.7 -> random square
    # crop; draw <= 0.4 -> no crop.
    if p_crop > .7:
        if p_crop > .9:
            image = tf.image.central_crop(image, central_fraction=.7)
        elif p_crop > .8:
            image = tf.image.central_crop(image, central_fraction=.8)
        else:
            image = tf.image.central_crop(image, central_fraction=.9)
    elif p_crop > .4:
        # Square crop whose side is drawn between 80% of HEIGHT and HEIGHT.
        # NOTE(review): the side is derived from HEIGHT only, so this assumes
        # the incoming image is at least HEIGHT pixels in both dimensions.
        crop_size = tf.random.uniform([], int(HEIGHT*.8), HEIGHT, dtype=tf.int32)
        image = tf.image.random_crop(image, size=[crop_size, crop_size, CHANNELS])
    
    # Restore the fixed output size regardless of which crop (if any) ran.
    image = tf.image.resize(image, [HEIGHT,WIDTH])
    return image,label

In [None]:
#------------------------
##preparing data pipeline
#------------------------

files_ls = tf.io.gfile.glob(TRAIN_PATH + '*.jpg')

# Resolve every file's label with a single vectorised lookup instead of one
# .loc call per file. train_df is indexed by image file name at this point; a
# file without a matching row raises KeyError, as the per-file lookup did.
file_names = [path.split("/")[-1] for path in files_ls]
labels = train_df.loc[file_names, "label_group"].to_numpy(dtype=np.float64)

# Lightweight dataset of (path, label) pairs; decoding happens lazily below.
paths_ds = tf.data.Dataset.from_tensor_slices((files_ls, labels))

# Decoded, un-augmented view of every image in file order (re-used later when
# computing embeddings).
dataset = paths_ds.map(process_img, num_parallel_calls=AUTO)
# Augmented view of the full dataset. Not used for training any more; the name
# is kept so existing references to `ds` keep working.
ds = dataset.map(data_augment, num_parallel_calls=AUTO)

# Training split:
#  * shuffle the cheap (path, label) pairs over the whole split, before
#    repeat(), so every epoch is a fresh permutation;
#  * augment after the split and without cache(), so each epoch sees newly
#    sampled augmentations instead of a frozen, cached copy.
train_ds = (
    paths_ds.take(SPLIT)
    .shuffle(max(SPLIT, 1))
    .repeat()
    .map(process_img, num_parallel_calls=AUTO)
    .map(data_augment, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation split: no augmentation, so val_loss is comparable across epochs
# (it drives checkpointing, LR reduction and early stopping).
# Decoded images are not cached, to avoid holding every float image in memory.
val_ds = (
    paths_ds.skip(SPLIT)
    .map(process_img, num_parallel_calls=AUTO)
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)
print("Data Pipeline")

# Creating Model

In [None]:
class NormalizeLayer(tf.keras.layers.Layer):
    """Keras layer that L2-normalises its input along the last axis."""

    def __init__(self, **kwargs):
        # Forward the standard Layer arguments (name, dtype, trainable, ...)
        # to the base class. They are part of get_config(), so dropping them
        # here would lose them whenever the layer is rebuilt from its config
        # (e.g. by load_model).
        super().__init__(**kwargs)

    def call(self, inputs, mask=None):
        """Return `inputs` scaled to unit L2 norm along the last axis; `mask` is ignored."""
        return tf.math.l2_normalize(inputs, axis=-1)

    def get_config(self):
        """Return the base layer config (this layer adds no parameters of its own)."""
        base_config = super().get_config()
        return {**base_config}
    
# Module-level layer instance, kept so existing references to `norm_layer`
# keep working; create_model() no longer depends on it.
norm_layer = NormalizeLayer()

def create_model():
    """Build the embedding network.

    An EfficientNetB4 backbone ('noisy-student' weights, no classification
    head) is followed by global average pooling and L2 normalisation.

    Each call creates its own NormalizeLayer rather than re-using a shared
    module-level instance, so models built at different times (for example
    before and after ``tf.keras.backend.clear_session()``) do not share layer
    objects.

    Returns:
        tf.keras.Model mapping a [HEIGHT, WIDTH, CHANNELS] image to its
        normalised embedding.
    """
    pretrained = efn.EfficientNetB4(
        include_top=False,
        weights='noisy-student',
        input_shape=[HEIGHT, WIDTH, CHANNELS],
    )

    pooled = tf.keras.layers.GlobalAveragePooling2D()(pretrained.output)
    outputs = NormalizeLayer()(pooled)

    return tf.keras.Model(pretrained.input, outputs)

# NOTE(review): this model is built outside strategy.scope() and is replaced
# by a fresh one in the training cell -- confirm whether it is still needed.
model = create_model()
#model.summary()

# Compiling Model

In [None]:
############
#Custom Triplet Loss
############

def pairwise_distances(embeddings):
    """Return the matrix of pairwise Euclidean distances between the rows of `embeddings`.

    Uses ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2 on the Gram matrix. Entries
    whose squared distance is exactly zero get a tiny epsilon before the square
    root and are forced back to zero afterwards.
    """
    gram = tf.linalg.matmul(embeddings, tf.transpose(embeddings))
    # The diagonal of the Gram matrix holds each row's squared norm.
    sq_norms = tf.linalg.diag_part(gram)
    norms_as_column = tf.expand_dims(sq_norms, 1)
    norms_as_row = tf.expand_dims(sq_norms, 0)

    # Clamp small negative values produced by floating point error.
    sq_dist = tf.math.maximum(norms_as_column - 2.0 * gram + norms_as_row, 0.0)

    is_zero = tf.cast(tf.equal(sq_dist, 0.0), tf.float32)
    dist = tf.math.sqrt(sq_dist + is_zero * 1e-16)
    return dist * (1.0 - is_zero)

def get_anchor_positive_triplet_mask(labels):
    """Return a boolean matrix that is True at [a, p] when a != p and labels[a] == labels[p]."""
    same_label = tf.math.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))

    # Everything except the diagonal (an anchor is not its own positive).
    on_diagonal = tf.cast(tf.eye(tf.shape(labels)[0]), tf.bool)
    off_diagonal = tf.math.logical_not(on_diagonal)

    return tf.math.logical_and(off_diagonal, same_label)

def get_anchor_negative_triplet_mask(labels):
    """Return a boolean matrix that is True at [a, n] when labels[a] != labels[n]."""
    labels_as_row = tf.expand_dims(labels, 0)
    labels_as_column = tf.expand_dims(labels, 1)
    same_label = tf.math.equal(labels_as_row, labels_as_column)
    return tf.math.logical_not(same_label)

def get_triplet_mask(labels):
    """Return a 3-D boolean mask that is True at [i, j, k] for valid triplets.

    A triplet is valid when i, j and k are pairwise distinct indices,
    labels[i] == labels[j] and labels[i] != labels[k].
    """
    # --- index condition: i, j, k all different ---
    off_diagonal = tf.math.logical_not(tf.cast(tf.eye(tf.shape(labels)[0]), tf.bool))
    i_ne_j = tf.expand_dims(off_diagonal, 2)
    i_ne_k = tf.expand_dims(off_diagonal, 1)
    j_ne_k = tf.expand_dims(off_diagonal, 0)
    all_distinct = tf.math.logical_and(tf.math.logical_and(i_ne_j, i_ne_k), j_ne_k)

    # --- label condition: same label for (i, j), different for (i, k) ---
    same_label = tf.math.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))
    i_same_j = tf.expand_dims(same_label, 2)
    i_same_k = tf.expand_dims(same_label, 1)
    labels_ok = tf.math.logical_and(i_same_j, tf.logical_not(i_same_k))

    return tf.math.logical_and(all_distinct, labels_ok)


class TripletLossFn(tf.keras.losses.Loss):
    """Triplet loss using, per anchor, the hardest positive and hardest negative in the batch.

    For every anchor the loss is
    ``max(d(anchor, hardest positive) - d(anchor, hardest negative) + margin, 0)``
    and the per-anchor values are averaged over the batch.

    Args:
        margin: margin added to the distance difference before the hinge.
        **kwargs: forwarded to ``tf.keras.losses.Loss``.
    """

    def __init__(self, margin=1.0, **kwargs):
        super().__init__(**kwargs)
        self.margin = margin

    def call(self, y_true, y_pred):
        """Compute the loss for labels `y_true` and embeddings `y_pred`.

        `y_true` may be shaped (batch,) or (batch, 1); it is flattened to 1-D.
        """
        # Flatten rather than squeeze(axis=-1): squeeze fails on 1-D labels
        # and would collapse a batch of size one to a scalar.
        labels = tf.reshape(tf.convert_to_tensor(y_true), [-1])
        embeddings = tf.convert_to_tensor(y_pred)

        pairwise_dist = pairwise_distances(embeddings)

        # Hardest positive: the largest distance to a sample with the same
        # label (non-positive pairs are zeroed out by the mask).
        mask_anchor_positive = tf.cast(get_anchor_positive_triplet_mask(labels), tf.float32)
        anchor_positive_dist = tf.math.multiply(mask_anchor_positive, pairwise_dist)
        hardest_positive_dist = tf.math.reduce_max(anchor_positive_dist, axis=1, keepdims=True)

        # Hardest negative: the smallest distance to a sample with a different
        # label. The row maximum is added to every non-negative pair so that
        # it cannot win the minimum.
        mask_anchor_negative = tf.cast(get_anchor_negative_triplet_mask(labels), tf.float32)
        max_anchor_negative_dist = tf.math.reduce_max(pairwise_dist, axis=1, keepdims=True)
        anchor_negative_dist = pairwise_dist + max_anchor_negative_dist * (1.0 - mask_anchor_negative)
        hardest_negative_dist = tf.math.reduce_min(anchor_negative_dist, axis=1, keepdims=True)

        triplet_loss = tf.math.maximum(hardest_positive_dist - hardest_negative_dist + self.margin, 0.0)
        return tf.math.reduce_mean(triplet_loss)

    def get_config(self):
        """Return the base loss config extended with `margin`."""
        base_config = super().get_config()
        return {**base_config, "margin": self.margin}

In [None]:
import tensorflow_addons as tfa

def compile_model(model, lr=0.0001):
    """Compile `model` with Adam and the triplet loss (margin 0.7) and return it.

    Args:
        model: the Keras model to compile (compiled in place).
        lr: learning rate passed to the Adam optimizer.

    Returns:
        The same `model` object, compiled.
    """
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # No metrics are attached: the model outputs embeddings, not class scores.
    loss = TripletLossFn(0.7)

    model.compile(optimizer=optimizer, loss=loss)

    return model

# Callbacks

In [None]:
def create_callbacks():
    """Build the training callbacks, all of which watch `val_loss` (lower is better).

    Returns:
        A list ``[checkpoint, reduce-LR, early-stop]``: the checkpoint writes
        the best model to './best_model.h5', the learning rate is multiplied
        by 0.1 after 3 epochs without improvement, and training stops after 10.
    """
    watch_val_loss = dict(monitor='val_loss', mode='min')

    save_best = tf.keras.callbacks.ModelCheckpoint(
        filepath='./best_model.h5',
        save_best_only=True,
        verbose=1,
        **watch_val_loss,
    )
    lower_lr = tf.keras.callbacks.ReduceLROnPlateau(
        factor=0.1,
        patience=3,
        verbose=0,
        **watch_val_loss,
    )
    stop_early = tf.keras.callbacks.EarlyStopping(
        patience=10,
        verbose=1,
        **watch_val_loss,
    )

    return [save_best, lower_lr, stop_early]

# Training

In [None]:
EPOCHS = 50
VERBOSE = 1

# Reset Keras' global state before building the model that will be trained.
tf.keras.backend.clear_session()

with strategy.scope():
    # Build and compile under the distribution strategy.
    model = create_model()
    model = compile_model(model, lr=0.0001)

    callbacks = create_callbacks()

    # Both datasets repeat indefinitely, so the epoch length is set by the
    # explicit step counts.
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_steps=VALID_STEPS,
        epochs=EPOCHS,
        callbacks=callbacks,
        verbose=VERBOSE,
    )

# History Plotting

In [None]:
# Training-curve plots. The code is disabled by wrapping it in a string
# literal, so only the print() at the bottom of this cell actually runs.
# NOTE(review): the disabled code reads history.history['acc'] / ['val_acc'],
# but compile_model() compiles without metrics, so those keys are presumably
# absent -- confirm before re-enabling.
'''acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(len(history.history['val_loss']))
plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()'''
print("Plotting the History")

# Embeddings of training images

In [None]:
#------------------------------------------
## Embedding every image with the best checkpointed model
#------------------------------------------

with tf.device('/device:CPU:0'):
    # Batch the decoded, un-augmented images (file order is preserved).
    dataset = dataset.batch(BATCH_SIZE)

    # The checkpoint contains custom classes, which Keras needs to be told
    # about in order to rebuild the model.
    embed_model = tf.keras.models.load_model(
        "./best_model.h5",
        custom_objects={'TripletLossFn': TripletLossFn, 'NormalizeLayer': NormalizeLayer},
    )

    # The loaded model's output is the embedding itself.
    embeddings = embed_model.predict(dataset)
    np.save("./embeddings.npy", embeddings)

### Hope this notebook is helpful. If you have any doubts or suggestions feel free to comment here. 

## An upvote will be very much encouraging for me. 

# Happy kaggling❤