# Import Your Trustworthy Libraries

In [None]:
# We always do this
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Mainly for deep learning
import keras
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow.keras.backend as K

# Array powered-CUDA
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

# Python std. libraries
import os, threading, logging, gc, tqdm
import math

In [None]:
# Notebook-wide settings
BATCH = 64
IMAGE_SIZE = (512, 512)
N_CLASS = 11014

# Competition data root and the two image folders underneath it
DATA_PATH = "../input/shopee-product-matching/"
TRAIN_PATH, TEST_PATH = (
    DATA_PATH + folder for folder in ("train_images/", "test_images/")
)
SPLITS = 100  # number of chunks the dataset is split into

# GET_CV = True  -> work on the training set (cross-validation)
# GET_CV = False -> work on the test set (commit / submission run)
GET_CV = False

In [None]:
# Cap TensorFlow at LIMIT GB of memory on the first GPU so the rest of the
# card stays free for RAPIDS. The second message below assumes a 16 GB card.
LIMIT = 2.0
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # memory_limit is expressed in MB, hence the factor of 1024
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=1024*LIMIT)
        tf.config.experimental.set_virtual_device_configuration(
            gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        #print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Report the failure and carry on without the cap
        print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM' % (LIMIT,))
print('then RAPIDS can use %iGB GPU RAM' % (16 - LIMIT,))

In [None]:
# Host-side copies (pandas) of the two CSV files
train, test = (
    pd.read_csv(DATA_PATH + name) for name in ("train.csv", "test.csv")
)

# GPU-side copies (RAPIDS cuDF) of the same files
train_cuda, test_cuda = (
    cudf.read_csv(DATA_PATH + name) for name in ('train.csv', 'test.csv')
)

# Choose the working set: training data for CV, test data for the commit run
if GET_CV:
    df, df_cuda, MAIN_PATH = train, train_cuda, TRAIN_PATH
else:
    df, df_cuda, MAIN_PATH = test, test_cuda, TEST_PATH

train_cuda.head()

# A few sanity checks on the dataframes

In [None]:
# Print the column dtypes and non-null counts of the training dataframe
train.info()

In [None]:
# Show the first rows of the test dataframe
test.head()

In [None]:
# Print the column dtypes and non-null counts of the test dataframe
test.info()

In [None]:
# Number of distinct label groups (classes) in the training set
train["label_group"].nunique()

# Baseline Model

In [None]:
def _postings_by(key):
    """Map each distinct value of df[key] to the array of posting_ids sharing it."""
    return df.groupby(key)["posting_id"].agg("unique").to_dict()


if GET_CV:
    # Ground truth: every posting in the same label_group is a match
    target = _postings_by("label_group")
    df["target"] = df["label_group"].map(target)

# Baseline prediction: postings with an identical perceptual hash
pred_phash = _postings_by("image_phash")
df["pred_phash"] = df["image_phash"].map(pred_phash)

In [None]:
if GET_CV:
    def get_metric(col):
        """Build a row-wise F1 scorer for the prediction column `col`.

        The returned function takes one dataframe row and compares
        row[col] against row.target.
        """
        def f1_score(row):
            truth, guess = row.target, row[col]
            # posting_ids present in both the truth and the prediction
            hits = len(np.intersect1d(truth, guess))
            return 2*hits / (len(truth)+len(guess))
        return f1_score

    # axis=1 hands each row (not each column) to the scorer
    df["f1_phash"] = df.apply(get_metric("pred_phash"), axis=1)
    print("F1 score with baseline model (phash) {}".format(df["f1_phash"].mean()))

Image_phash LB score: 0.559

# Image Features


## Backbone model: EfficientNetB0

In [None]:
# Frozen EfficientNetB0 backbone used purely as a feature extractor:
# no classification head, global average pooling on top, weights read
# from a local .h5 file.
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input

weight_path = "../input/keras-pretrained-models/EfficientNetB0_NoTop_ImageNet.h5"
pre_CNN = EfficientNetB0(
    weights=weight_path,
    include_top=False,
    pooling="avg",
)

# Inference only, so no layer is trainable
pre_CNN.trainable = False

In [None]:
# Read image in dataset
def read_im(path):
    """Load one JPEG file as a float32 RGB image tensor.

    Args:
        path: Scalar string (or string tensor) with the image file path.

    Returns:
        A float32 tensor of shape IMAGE_SIZE + (3,) holding pixel values
        in the [0, 255] range.
    """
    raw = tf.io.read_file(path)
    # Force three channels. With channels=0 the decoder keeps whatever the
    # file stores, so a grayscale JPEG would yield a single channel.
    image = tf.io.decode_jpeg(raw, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    # Deliberately NOT divided by 255: the Keras EfficientNet models rescale
    # their input internally and expect raw [0, 255] pixel values.
    image = tf.cast(image, dtype=tf.float32)

    return image

# Dataset pipeline from filenames --> image arrays
def get_image_dataset(filenames):
    """Build a batched tf.data pipeline that turns file paths into images.

    Args:
        filenames: Sequence of image file paths.

    Returns:
        A tf.data.Dataset yielding batches of up to BATCH decoded images,
        as produced by read_im.
    """
    image_dataset = tf.data.Dataset.from_tensor_slices(filenames)
    image_dataset = (
        image_dataset
        .map(read_im, num_parallel_calls=tf.data.AUTOTUNE)
        # Model.predict consumes dataset elements as batches; without this
        # step it would be fed single rank-3 images instead of rank-4 batches.
        .batch(BATCH)
        .prefetch(tf.data.AUTOTUNE)
    )
    return image_dataset
    

def get_embedding(filenames):
    """Extract backbone features for every image path in `filenames`.

    The paths are cut into SPLITS chunks and each chunk is pushed through
    pre_CNN separately; the per-chunk feature arrays are concatenated in
    the original order.

    Args:
        filenames: Sequence of image file paths.

    Returns:
        A tensor with one feature row per input path.
    """
    embeds = []
    splits = np.array_split(filenames, SPLITS)
    for split in tqdm.tqdm(splits):

        # With fewer paths than SPLITS (e.g. a 3-row test set) array_split
        # produces empty chunks. Test the length rather than the truthiness
        # of the contents, which is not a reliable emptiness check for
        # string data.
        if len(split) == 0:
            continue
        dataset = get_image_dataset(split)
        features = pre_CNN.predict(dataset)
        embeds.append(features)

    return tf.concat(embeds, axis=0)

In [None]:
# Full path of every image: directory prefix + the per-row "image" file name
image_path = MAIN_PATH + df["image"]
# Run all images through the backbone to get one embedding per row
image_embedding = get_embedding(image_path)

In [None]:
def im_preds(df, feature, splits, threshold=1.):
    """Find, for every row, the postings whose image embedding is similar.

    Args:
        df: Dataframe with a `posting_id` column, in the same row order
            as `feature`.
        feature: 2-D CuPy array with one embedding per row. The caller
            passes L2-normalised rows, so dot products are cosine
            similarities.
        splits: Number of chunks the rows are processed in, which bounds
            the size of the similarity matrix held on the GPU.
        threshold: A posting is a match when its similarity is strictly
            greater than this value.
            NOTE(review): the default of 1.0 is kept for backward
            compatibility, but the cosine similarity of unit vectors
            cannot exceed 1, so it only matches through floating-point
            round-off. Confirm the intended value (the tuning cell in
            this notebook experiments with 0.9999999).

    Returns:
        A list with one NumPy array of matching posting_ids per row.
    """
    posting_ids = df.posting_id.values
    preds = []

    for fc in tqdm.tqdm(cupy.array_split(feature, splits)):

        # Dot product of unit vectors = cosine similarity;
        # one row of `dp` per row of the current chunk.
        dp = cupy.matmul(feature, fc.T).T

        # The comparison already yields a boolean mask. Copy it to the host
        # once per chunk instead of once per row.
        mask = (dp > threshold).get()
        preds.extend(posting_ids[m] for m in mask)

    return preds

In [None]:
# L2-normalise every embedding row, then copy the result to the GPU
im_norm = cupy.array(normalize(image_embedding, axis=1))
image_preds = im_preds(df, im_norm, SPLITS)

# Drop the GPU copy and run the garbage collector
# RAM: You know, this CUDA-thing made me full. I feel relaxed, now.
del im_norm
gc.collect()

In [None]:
# New column with the image-embedding predictions
df["pred_images"] = image_preds

if GET_CV:
    # Row-wise F1 of the image predictions against the ground truth
    df["f1_images"] = df.apply(get_metric("pred_images"), axis=1)
    # Print the mean: a bare expression inside an `if` block is not
    # displayed by the notebook, so the score was silently discarded.
    print("F1 score with image embeddings: {}".format(df["f1_images"].mean()))

In [None]:
# Ignore this, just tuning the threshold

# z = normalize(image_embedding, axis=1)
# z = (z @ z[:2].T).T
# x = np.where(z > 0.9999999, True, False)
# for y in x:
#     print(df[y].posting_id)
#     print()

In [None]:
# # Label for each distinct group
# labels = train["label_group"].unique()
# labels_map = {label: index for index, label in enumerate(labels)}
# train["label"] = train["label_group"].map(labels_map)
# image_labels = train["label"].values

# Text Features

In [None]:
# Binary TF-IDF over the product titles, limited to 25000 features
tfidf = TfidfVectorizer(stop_words=None, binary=True, max_features=25000)
text_embed = tfidf.fit_transform(df_cuda.title).toarray()
# asarray reuses the dense matrix when it already lives on the GPU;
# cupy.array would always allocate a second full copy of it.
text_embed = cupy.asarray(text_embed)

In [None]:
# Reuse the notebook-wide chunk count instead of repeating the literal
splits = SPLITS

def text_preds(df, feature, splits, threshold=0.7):
    """Find, for every row, the postings whose title embedding is similar.

    Args:
        df: Dataframe with a `posting_id` column, in the same row order
            as `feature`.
        feature: 2-D CuPy array with one text embedding per row.
            Presumably unit-length rows, so that dot products are cosine
            similarities; verify against the vectorizer settings.
        splits: Number of chunks the rows are processed in, which bounds
            the size of the similarity matrix held on the GPU.
        threshold: A posting is a match when its similarity is strictly
            greater than this value.

    Returns:
        A list with one NumPy array of matching posting_ids per row.
    """
    posting_ids = df.posting_id.values
    preds = []

    for fc in tqdm.tqdm(cupy.array_split(feature, splits)):

        # Dot product of unit vectors = cosine similarity;
        # one row of `dp` per row of the current chunk.
        dp = cupy.matmul(feature, fc.T).T

        # The comparison already yields a boolean mask. Copy it to the host
        # once per chunk instead of once per row.
        mask = (dp > threshold).get()
        preds.extend(posting_ids[m] for m in mask)

    return preds

# Match every title against all the others, using the module-level
# `splits` chunk count
preds = text_preds(df, text_embed, splits)

# Drop the reference to the text embedding matrix, then run the garbage
# collector
# RAM: Okay, I'm taking my vacation, good luck !
del text_embed
gc.collect()

In [None]:
# New column of text embed predictions
# New column holding the text-embedding predictions, one array of
# posting_ids per row
df["pred_text_embed"] = preds

if GET_CV:
    # Row-wise F1 of the text predictions against the ground truth,
    # then the mean over all rows
    df["f1_text"] = df.apply(get_metric("pred_text_embed"), axis=1)
    f1 = df["f1_text"].mean()
    print("f1_score with text embeddings: {}".format(f1))

# Combine Baseline, Images and Text features predictions

In [None]:
def concat_pred(row):
    """Merge the three prediction arrays of one row into a sorted, duplicate-free array."""
    sources = (row.pred_phash, row.pred_images, row.pred_text_embed)
    return np.unique(np.concatenate(sources))

# Union of the pHash, image and text predictions for every row
df["matches"] = df.apply(concat_pred, axis=1)

if GET_CV:
    # Score the working dataframe itself, like every other metric cell,
    # rather than reaching for `train` by name.
    df["f1_match"] = df.apply(get_metric("matches"), axis=1)
    # Print the mean: a bare expression inside an `if` block is not
    # displayed by the notebook, so the score was silently discarded.
    print("F1 score with combined predictions: {}".format(df["f1_match"].mean()))

# Submission

In [None]:
# One line per posting: its id and the space-separated ids of its matches
submission = pd.DataFrame({
    "posting_id": df.posting_id,
    "matches": df.matches.map(" ".join),
})
submission.to_csv("submission.csv", index=False)
submission