# Introduction
<img src = "https://upload.wikimedia.org/wikipedia/commons/thumb/f/fe/Shopee.svg/1200px-Shopee.svg.png" height = "400" width = "400">

## Description of the competition

Two different images of similar wares may represent the same product or two completely different items. Retailers want to avoid misrepresentations and other issues that could come from conflating two dissimilar products. Currently, a combination of deep learning and traditional machine learning analyzes image and text information to compare similarity. But major differences in images, titles, and product descriptions prevent these methods from being entirely effective.

## About Shopee

**Website : [Shopee](https://shopee.com/)** <br>
Shopee is the leading e-commerce platform in Southeast Asia and Taiwan. Customers appreciate its easy, secure, and fast online shopping experience tailored to their region. The company also provides strong payment and logistical support along with a 'Lowest Price Guaranteed' feature on thousands of Shopee's listed products.

## What we need to do
In this competition, you’ll apply your machine learning skills to build a model that predicts which items are the same products.

## Other Details

- Evaluation criteria : **F1 Score**
- Accelerator used for Training : **TPU**
- Technique : **Siamese model**
- Loss Function  : **Contrastive Loss**

**Note : Thanks to @tanulsingh077 for creating the dataset used to train the Siamese model. <br>
Dataset : https://www.kaggle.com/tanulsingh077/shopee-siamese-training <br>
Preparation notebook : https://www.kaggle.com/tanulsingh077/code-for-data-generation-for-siamese-training/ <br>**

In [None]:
#----------------------------
# Install the `efficientnet` package (imported later as efficientnet.tfkeras)
#----------------------------
# Upgrade pip first, then install efficientnet quietly (-q).
# NOTE(review): the interpreter path is pinned to python3.7 — confirm it matches the runtime image.
!/opt/conda/bin/python3.7 -m pip install --upgrade pip
! pip install -q efficientnet

# Import necessary Libraries

In [None]:
#-------------------------
#importing necessary libraries
#-------------------------

import tensorflow as tf
import tensorflow_addons as tfa
import efficientnet.tfkeras as efn

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt
import PIL

import os

from kaggle_datasets import KaggleDatasets

# Checking TPU access

In [None]:
# Let tf.data choose parallelism and prefetch buffer sizes on its own.
AUTO = tf.data.experimental.AUTOTUNE

# Probe for a TPU; a ValueError from the resolver is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU found: use TensorFlow's current default distribution strategy.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# GCS path of the competition dataset; training images are read from TRAIN_PATH.
GCS_DS_PATH = KaggleDatasets().get_gcs_path("shopee-product-matching")
TRAIN_PATH = GCS_DS_PATH + "/train_images/"

train_df = pd.read_csv("../input/shopee-product-matching/train.csv")

# Re-encode label_group as consecutive integers in order of first appearance.
# NOTE: despite their names, label2id maps integer -> original label_group
# and id2label maps original label_group -> integer.
unique_groups = train_df.label_group.unique()
group_ids = range(train_df.label_group.nunique())
label2id = dict(zip(group_ids, unique_groups))
id2label = dict(zip(unique_groups, group_ids))
train_df["label_group"] = train_df["label_group"].map(id2label)
train_df.index = train_df["image"]


# Attach to every row the list of posting_ids that share its label_group.
tmp = train_df.groupby('label_group').posting_id.agg('unique').to_dict()
train_df['posting_ids'] = train_df.label_group.map(tmp).apply(list)

train_df.head()

# Data Visualization

In [None]:
# Target image size in pixels and number of colour channels.
HEIGHT = WIDTH = 512
CHANNELS = 3

#--------------------
# Display samples
#--------------------

# Index rows by posting_id so images can be looked up by posting id below.
train_df.index = train_df["posting_id"]

def filepath_to_arr(filepath):
    """Load the image at `filepath`, resized to (HEIGHT, WIDTH), as an array divided by 255."""
    pil_img = tf.keras.preprocessing.image.load_img(filepath, target_size=(HEIGHT, WIDTH))
    return tf.keras.preprocessing.image.img_to_array(pil_img) / 255.

def display_img(training_ids):
    """Plot the images belonging to the given posting ids side by side in one row.

    Args:
        training_ids: sequence of posting_id values; each is looked up in
            train_df (indexed by posting_id) to find its image file name.
    """
    image_dir = "../input/shopee-product-matching/train_images"
    num_imgs = len(training_ids)

    plt.figure(figsize=(5 * num_imgs, 10))
    for position, posting_id in enumerate(training_ids, start=1):
        plt.subplot(1, num_imgs + 1, position)

        image_path = os.path.join(image_dir, train_df.loc[posting_id]["image"])
        plt.title(f"Image : {position}")
        plt.imshow(filepath_to_arr(image_path))
        plt.axis("off")
    plt.show()

# Show the product groups of five consecutive rows, starting at a random row.
x = np.random.randint(0, 30000, size=1)
first_row = x[0]
for j in range(5):
    display_img(train_df.iloc[first_row + j]["posting_ids"])

# Keep one row per image file and index by image file name from here on.
train_df = train_df.drop_duplicates(subset=['image'])
train_df.index = train_df["image"]

# Loading Siamese pairs

In [None]:
#------------------------
# Siamese pair dataframe
#------------------------

# Swap the values 0 and 1 across the frame, using 2 as a temporary placeholder
# (1 -> 2, 0 -> 1, 2 -> 0), then store the label column as float32.
siamese_df = (
    pd.read_csv("../input/shopee-siamese-training/siamese_data.csv")
    .replace(1, 2)
    .replace(0, 1)
    .replace(2, 0)
)
siamese_df["label"] = siamese_df["label"].astype("float32")
siamese_df.head()

# Functions for preprocessing

In [None]:
#------------------
# Load and resize a single image
#------------------
def process_img(filepath):
    """Read the JPEG at `filepath` and return it as a float32 tensor resized to [HEIGHT, WIDTH].

    The image is decoded with CHANNELS channels and converted with
    tf.image.convert_image_dtype before resizing.
    """
    raw_bytes = tf.io.read_file(filepath)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=CHANNELS)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, [HEIGHT, WIDTH])

#-----------------------------------
# Random augmentation of a single image
#-----------------------------------
def data_augment(image):
    """Apply a random chain of flips, rotations, colour jitter and crops to `image`.

    Each family of transforms is gated by its own uniform draw from [0, 1).
    Whatever crop was applied, the result is resized back to [HEIGHT, WIDTH].

    Args:
        image: a single image tensor (assumed height x width x CHANNELS, as
            produced by process_img — TODO confirm for other callers).

    Returns:
        The augmented image, resized to [HEIGHT, WIDTH].
    """
    # One independent uniform draw per decision below.
    p_spatial = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_rotate = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_1 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_2 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_pixel_3 = tf.random.uniform([], 0, 1.0, dtype=tf.float32)
    p_crop = tf.random.uniform([], 0, 1.0, dtype=tf.float32)

    # Flips (random horizontal and vertical); transpose when p_spatial > 0.75.
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_flip_up_down(image)
    if p_spatial > .75:
        image = tf.image.transpose(image)

    # Rotates: three, two or one quarter-turn(s) depending on which band
    # p_rotate falls in; no rotation when p_rotate <= 0.25.
    if p_rotate > .75:
        image = tf.image.rot90(image, k=3)
    elif p_rotate > .5:
        image = tf.image.rot90(image, k=2)
    elif p_rotate > .25:
        image = tf.image.rot90(image, k=1)


    # Pixel-level jitter: each of saturation, contrast and brightness is
    # applied when its own draw is >= 0.4. No clipping is done afterwards.
    if p_pixel_1 >= .4:
        image = tf.image.random_saturation(image, lower=.7, upper=1.3)
    if p_pixel_2 >= .4:
        image = tf.image.random_contrast(image, lower=.8, upper=1.2)
    if p_pixel_3 >= .4:
        image = tf.image.random_brightness(image, max_delta=.1)


    # Crops: p_crop > 0.7 picks a central crop (fraction 0.7, 0.8 or 0.9);
    # 0.4 < p_crop <= 0.7 picks a random square crop; otherwise no crop.
    if p_crop > .7:
        if p_crop > .9:
            image = tf.image.central_crop(image, central_fraction=.7)
        elif p_crop > .8:
            image = tf.image.central_crop(image, central_fraction=.8)
        else:
            image = tf.image.central_crop(image, central_fraction=.9)
    elif p_crop > .4:
        # Side length drawn from [int(HEIGHT*.8), HEIGHT).
        crop_size = tf.random.uniform([], int(HEIGHT*.8), HEIGHT, dtype=tf.int32)
        image = tf.image.random_crop(image, size=[crop_size, crop_size, CHANNELS])

    # Restore the fixed output size regardless of the crop taken.
    image = tf.image.resize(image, [HEIGHT,WIDTH])
    return image


#------------------
# Load both images of a pair into one tensor
#------------------
def process_img_pair(file_pair,label):
    """Load the two files in `file_pair` and stack them on a new trailing axis.

    Args:
        file_pair: indexable holding two image file paths.
        label: passed through unchanged.

    Returns:
        (image_pair, label), where image_pair[..., 0] and image_pair[..., 1]
        are the two images loaded with process_img.
    """
    first = process_img(file_pair[0])
    second = process_img(file_pair[1])

    # Stacking on a new last axis is the same as expand_dims(-1) on each + concat(-1).
    return tf.stack([first, second], axis=-1), label

#------------------------
# Augment both images of a pair
#------------------------
def augment_img_pair(image_pair,label):
    """Run data_augment separately on each image of the pair and re-stack them.

    Args:
        image_pair: rank-4 tensor whose last axis indexes the two images.
        label: passed through unchanged.

    Returns:
        (augmented_pair, label) with the same trailing-axis layout as the input.
    """
    # Each image gets its own independent random augmentation.
    first = data_augment(image_pair[..., 0])
    second = data_augment(image_pair[..., 1])

    return tf.stack([first, second], axis=-1), label

# Data Pipeline

In [None]:
# Global batch size: 8 samples per replica.
BATCH_SIZE = 8 * strategy.num_replicas_in_sync
# The first 80% of the pairs are used for training, the remainder for validation.
SPLIT = int(0.8*len(siamese_df))

STEPS_PER_EPOCH  = SPLIT//BATCH_SIZE
VALID_STEPS = (len(siamese_df)-SPLIT)//BATCH_SIZE
SEED = 143

# Build [path_1, path_2] pairs under TRAIN_PATH together with their labels.
x = siamese_df.image_1.to_list()
y = siamese_df.image_2.to_list()
filepairs = [[os.path.join(TRAIN_PATH,i),os.path.join(TRAIN_PATH,j)] for i,j in zip(x,y)]

labels = siamese_df.label.to_list()

# Decode/resize only at this stage. Augmentation is deliberately NOT applied
# here so that the validation split stays un-augmented.
dataset = tf.data.Dataset.from_tensor_slices((filepairs,labels))
dataset = dataset.map(process_img_pair,num_parallel_calls=AUTO)

train_ds = dataset.take(SPLIT)
val_ds = dataset.skip(SPLIT)

# Training branch: cache the decoded pairs, then augment AFTER the cache so
# every epoch sees fresh random augmentations instead of replaying the ones
# drawn during the first pass.
train_ds = (
    train_ds
    .cache()
    .repeat()
    .shuffle(BATCH_SIZE*20)
    .map(augment_img_pair, num_parallel_calls=AUTO)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation branch: deterministic, no augmentation.
val_ds = val_ds.repeat().batch(BATCH_SIZE).prefetch(AUTO)
print("Data Pipeline")

In [None]:
# Pull one validation batch to check the image tensor shape and the labels.
for batch in val_ds.take(1):
    im, label = batch
    print(im.shape, label, sep="\n")

# Create Model

In [None]:
def create_model():
    """Build the Siamese network: a shared EfficientNetB0 encoder plus a distance head.

    Both images of a pair go through the same encoder (EfficientNetB0 without
    its top, followed by global average pooling). The model output is the
    Euclidean distance between the two embeddings.

    Returns:
        tf.keras.Model taking inputs of shape (HEIGHT, WIDTH, CHANNELS, 2) per
        sample and returning one distance value per sample.
    """
    inputs = tf.keras.Input(shape=(HEIGHT, WIDTH, CHANNELS, 2))

    # The two images of a pair live on the trailing axis.
    input_a = inputs[:, :, :, :, 0]
    input_b = inputs[:, :, :, :, 1]

    pretrained = efn.EfficientNetB0(include_top=False, weights='noisy-student', input_shape=[HEIGHT, WIDTH, 3])

    pooled = tf.keras.layers.GlobalAveragePooling2D()(pretrained.output)

    # Single encoder instance, so both branches share weights.
    embed_model = tf.keras.Model(pretrained.input, pooled)

    embed_a = embed_model(input_a)
    embed_b = embed_model(input_b)

    def euclidean_distance(tensors):
        # sqrt has an infinite derivative at 0, which turns into NaN gradients
        # when both embeddings are identical; clamp the squared distance to a
        # small epsilon before taking the root.
        squared = tf.reduce_sum(tf.square(tensors[0] - tensors[1]), axis=1)
        return tf.sqrt(tf.maximum(squared, tf.keras.backend.epsilon()))

    # Kept as a single Lambda layer so the encoder remains the second-to-last layer.
    distance_layer = tf.keras.layers.Lambda(euclidean_distance)
    outputs = distance_layer([embed_a, embed_b])

    model = tf.keras.Model(inputs, outputs)

    return model

# Build the model once and print its layer-by-layer summary.
model = create_model()
model.summary()

# Compile the Model

In [None]:
def compile_model(model, lr=0.0001):
    """Compile `model` with the Adam optimizer and a contrastive loss (margin 1.0).

    Args:
        model: Keras model to compile (presumably one whose output is a
            per-pair distance, as ContrastiveLoss is applied to it directly).
        lr: learning rate handed to Adam.

    Returns:
        The same model instance, compiled in place.
    """
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    # that newer Keras versions reject.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    loss = tfa.losses.ContrastiveLoss(margin=1.0, name='loss')

    # No metrics are registered: only the loss is tracked during training.
    model.compile(optimizer=optimizer, loss=loss)

    return model

# Creating Callbacks

In [None]:
# Quantity watched by every callback, and the direction that counts as improvement.
metric = "val_loss"
mode = "min"

def create_callbacks():
    """Return the training callbacks as [ModelCheckpoint, ReduceLROnPlateau, EarlyStopping].

    All three monitor the module-level `metric` in direction `mode`:
    the checkpoint keeps only the best model in ./best_model.h5, the learning
    rate is multiplied by 0.1 after 3 epochs without improvement, and training
    stops after 10 epochs without improvement.
    """
    watch = dict(monitor=metric, mode=mode)

    return [
        tf.keras.callbacks.ModelCheckpoint(
            filepath='./best_model.h5',
            save_best_only=True,
            verbose=1,
            **watch,
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            factor=0.1,
            patience=3,
            verbose=0,
            **watch,
        ),
        tf.keras.callbacks.EarlyStopping(
            patience=10,
            verbose=1,
            **watch,
        ),
    ]

# Training

In [None]:
EPOCHS = 2
VERBOSE = 1


# Clear Keras' global state before the model is rebuilt under the strategy.
tf.keras.backend.clear_session()

with strategy.scope():

    # Resume from a previously saved model; call create_model() here instead
    # to start from the pretrained encoder.
    pretrained_path = "../input/shopee-contrastiveloss-tensorflow-tpu-training/best_model.h5"
    model = compile_model(tf.keras.models.load_model(pretrained_path), lr=0.0001)

    callbacks = create_callbacks()

    # Both datasets repeat indefinitely, so epoch lengths are given explicitly.
    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=EPOCHS,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_steps=VALID_STEPS,
        callbacks=callbacks,
        verbose=VERBOSE,
    )

# History plotting

In [None]:
# Per-epoch loss values recorded by model.fit.
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs_range = range(len(val_loss))

plt.figure(figsize=(8, 8))
# Draw the training curve first, then the validation curve, on the same axes.
for curve, curve_label in ((loss, 'Training Loss'), (val_loss, 'Validation Loss')):
    plt.plot(epochs_range, curve, label=curve_label)
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()
print("Plotting the History")

# Training embeddings

In [None]:
# Compute one embedding per training image with the trained encoder.
# Row i of the saved array corresponds to files_ls[i].
files_ls = tf.io.gfile.glob(TRAIN_PATH + '*.jpg')
ds = tf.data.Dataset.from_tensor_slices(files_ls)
ds = ds.map(process_img,num_parallel_calls=AUTO)
# No data_augment here: random flips/crops/colour jitter at inference time
# would make the stored embeddings non-deterministic and distort them.
ds = ds.batch(BATCH_SIZE).prefetch(AUTO)

with strategy.scope():
    model = tf.keras.models.load_model("./best_model.h5")
    # The second-to-last layer is used as the encoder (presumably the shared
    # embedding sub-model feeding the distance layer — verify if the
    # architecture changes).
    embed_model = tf.keras.Model(model.layers[-2].input,model.layers[-2].output)
    embeddings = embed_model.predict(ds)
np.save("./embeddings.npy",embeddings)

## Hope this notebook is helpful. If you have any doubts or suggestions feel free to comment here.
## An upvote will be very much encouraging for me.
# Happy kaggling❤