# <font color=red> <b><center> Shopee - Price Match Guarantee </center></b></font>
<font><center>![](https://storage.googleapis.com/kaggle-competitions/kaggle/24286/logos/thumb76_76.png?t=2020-11-20-21-03-50)

# 0. Preparation:

In [None]:
import numpy as np 
import pandas as pd 
import cudf,cuml,cupy
#from cuml.feature_extraction.text import CountVectorizer
#from cuml.cluster import KMeans
#from cuml.manifold import TSNE
import matplotlib.pyplot as plt 
import os 
import tensorflow as tf 
import cv2 as cv 
from PIL import Image
import seaborn as sns
import plotly.express as px
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import warnings
from tqdm import tqdm
import gc
warnings.filterwarnings("ignore")
from tensorflow.keras.utils import Sequence
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from sklearn.metrics.pairwise import cosine_distances
from collections import Counter
import nltk

In [None]:
# Load the training metadata table and remember the directory that holds
# the training images (file names come from the table's `image` column).
training_csv =pd.read_csv("../input/shopee-product-matching/train.csv")
training_img = "../input/shopee-product-matching/train_images"

In [None]:
# Load the test metadata table and remember the directory that holds
# the test images.
test_csv = pd.read_csv("../input/shopee-product-matching/test.csv")
test_img = "../input/shopee-product-matching/test_images"

In [None]:
# Take a first look at the training data (first rows of the table).
training_csv.head()

# 1. EDA:

In [None]:
# Count how many postings share each label_group and plot the 100 largest groups.
# Both columns are named explicitly (rename_axis + reset_index(name=...)) so the
# frame has the same layout whether value_counts() names its result after the
# source column (older pandas) or "count" (pandas >= 2.0).
ser = (
    training_csv["label_group"]
    .value_counts()
    .rename_axis("label_group")
    .reset_index(name="Occurences")
)
# Cast the group ids to strings so the plot treats them as categories.
ser["label_group"] = ser["label_group"].astype("str")
fig = px.bar(ser[:100], x="label_group", y="Occurences", color="Occurences",
             title="TOP 100 most duplicated label_group ")
fig.update_layout(title={"x": 0.475, "y": 0.9, "xanchor": "center", "yanchor": "top"})
fig.show()

In [None]:
# Ground truth per row: the array of posting_ids that share the row's
# label_group (the row's own posting_id is part of that array).
lab = training_csv.groupby("label_group")["posting_id"].unique()
training_csv["target"] = training_csv["label_group"].map(lab)

In [None]:
def display_related_products(feature_value,df=training_csv,feature="label_group",title=""):
    """Plot every product image whose ``feature`` column equals ``feature_value``.

    Images are read from the ``training_img`` directory and laid out on a grid;
    each subplot is captioned with the product title. Nothing is drawn when no
    row matches.

      @ args :
      feature_value(int) : value compared (with ``==``) against ``df[feature]``.
      df(DataFrame) : frame holding ``image``, ``title`` and ``feature`` columns
                      (default = training_csv)
      feature(str) : the name of the column used to group items in the same collection.
      title (str) : the title displayed above the whole figure.

    """
    related_photos = df.loc[df[feature] == feature_value, ["image", "title"]]
    l = len(related_photos)
    if l == 0:
        # plt.subplots() cannot build a grid with zero rows.
        return

    # Column count: the first value in 2..7 that divides the number of images
    # evenly, otherwise 5 (the last row is then only partially filled).
    k = next((j for j in range(2, 8) if l % j == 0), 5)
    nb_l = -(-l // k)  # ceiling division

    # squeeze=False always returns a 2-D axes array, so single-row and
    # multi-row grids are indexed the same way.
    fig, ax = plt.subplots(nb_l, k, figsize=(k * 10, nb_l * 10), squeeze=False)
    caption_size = 10 if nb_l == 1 else 12
    pairs = zip(related_photos["image"], related_photos["title"])
    for i, (image_name, caption) in enumerate(pairs):
        chemin = os.path.join(training_img, image_name)
        # Close the file handle as soon as the pixels are copied out.
        with Image.open(chemin) as image:
            pixels = np.array(image)
        cell = ax[i // k, i % k]
        cell.imshow(pixels)
        cell.set_title(caption, fontsize=caption_size, fontweight="bold")

    # `fontsize` and `size` are aliases of the same text property; passing both
    # is ambiguous (and rejected by newer Matplotlib), so only one is given.
    # 32 is the value the trailing `size=32` argument used to request.
    plt.suptitle(title, fontsize=32, color="red", fontweight="bold")
    plt.show()

In [None]:
# `ser` is sorted by descending count, so its first row is the label_group
# with the most postings; show all of its images.
most_label_group = int(ser["label_group"].iloc[0])
display_related_products(
    most_label_group,
    title="RELATED IMAGES FOR THE MOST DUPLICATED LABEL_GROUP ITEMS",
)

In [None]:
# The last row of `ser` is a label_group with the fewest postings; show its images.
smallest_label_group = int(ser["label_group"].iloc[-1])
display_related_products(
    smallest_label_group,
    title="SMALLEST DUPLICATED LABEL_GROUP ITEMS",
)

We can notice from the images above:
- Many related items have the same photo and approximately the same title.
- Many items show different images of the same product, but with approximately the same title.
- Some images are identical or very similar, but have different titles.

==> This leads us to conclude that both the photos and the titles should be used to determine duplicated products.

# 2. Modelling

In [None]:
def clean(title):
    """Normalise a product title for bag-of-words matching.

    The title is lower-cased, ``&`` is spelled out as ``and``, and every run of
    characters that are not alphanumeric (punctuation, symbols, underscores,
    whitespace) is replaced by a single space.

    @ params :
    title(str) : the title text that the function will clean up.

    @ returns :
    title(str) : cleaned title. A leading or trailing run of separators is
                 kept as one space (the result is not stripped).

    """
    title = title.lower()
    # Spell out "&" before it is swallowed by the separator pattern below.
    title = title.replace("&", "and")
    # `\W` alone leaves "_" in place because the underscore counts as a word
    # character, so it is listed explicitly to cover all punctuation.
    return re.sub(r"[\W_]+", " ", title)

In [None]:
# Default to cross-validation mode: predict on the training set, which carries
# labels, so every stage can be scored.
# NOTE: `test` is the same object as `training_csv` (no copy), so every column
# added to `test` below also appears on `training_csv`.
test_cdf = cudf.read_csv("../input/shopee-product-matching/train.csv")
test = training_csv
test["cleaned_title"] = test["title"].map(clean)
test_cdf["cleaned_title"] = test["cleaned_title"]
#test_cdf = cudf.concat([test_cdf,test_cdf,test_cdf[:],axis=0,ignore_index=False)
#test = pd.concat([test,test,test[:4000]],axis=0,ignore_index=False)
# NOTE: despite its name, `submission` is True while working on the training
# set (the `if submission` cells compute CV scores) and False once the real
# test set is in use.
submission = True 
images = training_img
# When test.csv has more than 3 rows, switch `test`, `test_cdf` and `images`
# to the test set (presumably the full hidden test set replaces a 3-row
# sample at scoring time - confirm) and turn CV scoring off.
if len(test_csv) > 3 :
   test_cdf = cudf.read_csv("../input/shopee-product-matching/test.csv")
   test = pd.read_csv("../input/shopee-product-matching/test.csv")
   test["cleaned_title"] = test["title"].map(clean)
   test_cdf["cleaned_title"] = test["cleaned_title"]
   images = test_img
   submission = False

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the predictions stored in column ``col``.

    The returned callable takes a row exposing ``target`` and ``col`` and
    returns ``2 * |target & pred| / (|target| + |pred|)``.
    """
    def _row_f1(row):
        expected = row.target
        predicted = row[col]
        shared = np.intersect1d(expected, predicted)
        return 2 * len(shared) / (len(expected) + len(predicted))

    return _row_f1

In [None]:
def sub_matches(row):
    """Return the row's TF-IDF matches as one space-separated string.

    Reads ``row.pred_tfidf``, the column in which the TF-IDF neighbour
    predictions are stored (no ``pred_tf`` column is ever created).
    """
    return " ".join(row.pred_tfidf)

## 2.2 tf_idf :

In [None]:
# Flatten every cleaned title into one list of whitespace-separated tokens;
# `words` is the resulting vocabulary.
corpus = [
    token
    for tx in test["cleaned_title"].values
    for token in tx.lower().split()
]
words = set(corpus)

In [None]:
# Vocabulary size: number of distinct tokens across all cleaned titles.
nuniques_words = len(words)

In [None]:
# Token frequencies over the whole corpus (used to pick the most common words).
counter = Counter(corpus)

In [None]:
# Candidate thresholds ("seuil"): fractions of the vocabulary whose most
# frequent words are treated as stop words in the grid search.
seuil = [0.01,0.025,0.05,0.1,0.2 ,0.4 ,0.6 ]

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

# Grid search over (stop-word fraction, cosine-distance cut-off) for the TF-IDF
# title matcher. Only runs when ground truth is available (`submission` True).
if submission:
    bseuil = 0   # best stop-word fraction found so far
    bscore = 0   # best mean F1 found so far
    bd = 0       # distance cut-off that produced `bscore`

    chunk = 4 * 1024
    cls = len(test) // chunk
    cls += int((len(test) % chunk) != 0)
    d = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6]

    for sl in tqdm(seuil):
        # A separate name keeps the `seuil` list intact, so the cell can be re-run.
        n_stop = int(sl * nuniques_words)
        # Works for n_stop == 0 as well (empty list instead of an IndexError).
        stop_words = [word for word, _ in counter.most_common(n_stop)]
        sw = set(stop_words)
        sw.update(nltk.corpus.stopwords.words("english"))
        # Use the full stop-word set (frequent words + English stop words),
        # i.e. the same configuration as the final vectorizer.
        tf_idf = TfidfVectorizer(stop_words=sw, max_features=25000, binary=True)
        embedding = tf_idf.fit_transform(test_cdf["cleaned_title"]).toarray()
        tf_distance = NearestNeighbors(n_neighbors=50, metric="cosine")
        tf_distance.fit(embedding)

        # The neighbour query does not depend on the distance cut-off, so run
        # it once per chunk and reuse the result for every cut-off below.
        neighbours = []
        for i in range(cls):
            a = i * chunk
            b = min((i + 1) * chunk, len(test))
            distances, indices = tf_distance.kneighbors(embedding[a:b,])
            neighbours.append((cupy.asnumpy(distances), cupy.asnumpy(indices)))

        best_d = 0
        best_score = 0
        for di in d:
            prediction = []
            for distances, indices in neighbours:
                for distance, neighbour_ids in zip(distances, indices):
                    ind = neighbour_ids[distance < di]
                    prediction.append(test.iloc[ind].posting_id.values)
            test["pred_tfidf"] = prediction
            test["f5"] = test.apply(getMetric("pred_tfidf"), axis=1)
            sc = test.f5.mean()
            if sc > best_score:
                best_score = sc
                best_d = di

        if best_score > bscore:
            bseuil = sl
            bscore = best_score
            bd = best_d
            
            

In [None]:
# Report the best grid-search configuration (only computed in CV mode).
if submission:
    report = (
        ('CV score for tf_idf embedding text = ', bscore),
        ("best threshold to use to define our stops words= ", bseuil),
        ("best distance to use to define similarity = ", bd),
    )
    for message, value in report:
        print(message, value)

In [None]:
# Final TF-IDF model: the most frequent 1% of the vocabulary plus the NLTK
# English stop words are ignored.
# Building the tuple with a generator also works when the 1% rounds down to
# zero words (the zip(*...)[0] idiom raises IndexError on an empty list).
n_frequent = int(0.01 * nuniques_words)
stop_words = tuple(word for word, _ in counter.most_common(n_frequent))
sw = set(stop_words)
sw.update(nltk.corpus.stopwords.words("english"))
tf_idf = TfidfVectorizer(stop_words=sw, max_features=25000, binary=True)
embedding = tf_idf.fit_transform(test_cdf["cleaned_title"]).toarray()

In [None]:
# Index the TF-IDF title embeddings for 50-nearest-neighbour lookups under
# cosine distance.
kn = NearestNeighbors(n_neighbors=50,metric="cosine")
kn.fit(embedding)

In [None]:
# For every title, keep the neighbours closer than 0.45 (cosine distance) and
# record their posting_ids. Rows are queried in chunks of 4096.
prediction = []
chunk = 4 * 1024
cls = -(-len(test) // chunk)  # number of chunks (ceiling division)
for i in tqdm(range(cls)):
    a = i * chunk
    b = min((i + 1) * chunk, len(test))
    distances, indices = kn.kneighbors(embedding[a:b,])
    for j in range(b - a):
        close = np.where(distances[j, :] < 0.45)[0]
        matched = cupy.asnumpy(indices[j, close])
        prediction.append(test.iloc[matched].posting_id.values)
    

In [None]:
# Store the title-based matches; in CV mode also report their mean F1.
test["pred_tfidf"] = prediction
if submission:
    score_row = getMetric("pred_tfidf")
    test["f5"] = test.apply(score_row, axis=1)
    print('CV score for tf embedding text =', test["f5"].mean())

## EfficientNetB0 image embeddings:

In [None]:
# Cap the GPU memory TensorFlow may take on the first GPU so the remainder
# stays available to RAPIDS (cudf / cuml).
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus :
    try :
       # memory_limit is 1024*LIMIT (presumably megabytes, i.e. LIMIT GB as the
       # message below says - confirm against the TensorFlow docs).
       tf.config.experimental.set_virtual_device_configuration(gpus[0],\
                                                           [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
       logical_gpus = tf.config.experimental.list_logical_devices("GPU")
    
    except RuntimeError as e :
       # The error is only printed; execution continues without the cap.
       print(e)
# NOTE: both messages are printed even when no GPU was found or the
# configuration failed. The "16" assumes a 16 GB GPU - TODO confirm.
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that yields batches of resized RGB product images.

    Each item is a float32 array of shape
    ``(batch, img_size, img_size, 3)``; no labels are produced, so the
    generator is meant for ``model.predict``.

    Args:
        df: DataFrame with an ``image`` column of file names.
        img_size: side length (pixels) every image is resized to.
        path: directory containing the image files (default: ``images``).
        batch_size: number of images per batch; the last batch may be smaller.
    """

    def __init__(self,df,img_size=224,path = images,batch_size = 32):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.path = path
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.df))

    def __len__(self) :
        """Return the number of batches (ceiling of rows / batch_size)."""
        cl = len(self.df) // self.batch_size
        cl += int((len(self.df) % self.batch_size) !=0)
        return cl

    def __getitem__(self,index):
        """Return the ``index``-th batch of images."""
        indices = self.indexes[index * self.batch_size :(index + 1) * self.batch_size]
        X = self.__data_generation(indices)
        return X

    def __data_generation(self,indices) :
        """Load, convert and resize the images at positional ``indices``.

        Raises:
            FileNotFoundError: if an image file is missing or cannot be decoded.
        """
        batch = np.zeros((len(indices),self.img_size,self.img_size,3),dtype = "float32")
        for i, image_name in enumerate(self.df["image"].iloc[indices]):
            image_path = os.path.join(self.path, image_name)
            img = cv.imread(image_path)
            if img is None:
                # cv.imread signals failure by returning None instead of raising.
                raise FileNotFoundError(f"Could not read image: {image_path}")
            # OpenCV decodes to BGR; the ImageNet-pretrained network expects RGB.
            img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
            img = cv.resize(img,(self.img_size,self.img_size))
            batch[i,] = img
        return batch
            
        

In [None]:
# Build EfficientNetB0 without its classification head (include_top=False),
# with average pooling, loading weights from a local file.
WGT = "../input/effnetb0/efficientnetb0_notop.h5"
model = EfficientNetB0(weights=WGT,input_shape=None,include_top = False,pooling="avg")


In [None]:
# Embed all images with the CNN, 4096 rows at a time, then stack the
# per-chunk outputs into a single array and release the network.
chunk = 1024 * 4
cls = -(-len(test) // chunk)  # number of chunks (ceiling division)
image_embedding = []
for i in tqdm(range(cls)):
    a = i * chunk
    b = min((i + 1) * chunk, len(test))
    data = DataGenerator(test.iloc[a:b])
    emb = model.predict(data, use_multiprocessing=True, workers=4)
    image_embedding.append(emb)

del model
image_embedding = np.concatenate(image_embedding, axis=0)
gc.collect()

In [None]:
#from numpy.linalg.linalg import norm
#Norm = norm(image_embedding,axis=1)

In [None]:
#Normed_embedding = image_embedding/Norm.reshape(-1,1)

In [None]:
# `model` is reused here for the neighbour index over the image embeddings
# (the CNN bound to this name was deleted above): 50 nearest neighbours,
# cosine distance.
model = NearestNeighbors(n_neighbors=50,metric="cosine")
model.fit(image_embedding)

In [None]:
# For every image, keep the neighbours closer than 0.2 (cosine distance) and
# record their posting_ids. Rows are queried in chunks of 4096.
chunk = 4 * 1024
cl = -(-len(test) // chunk)  # number of chunks (ceiling division)
pred_img = []
for i in tqdm(range(cl)):
    a = i * chunk
    b = min(len(test), (i + 1) * chunk)
    distances, indices = model.kneighbors(image_embedding[a:b,])
    for j in range(b - a):
        near = np.where(distances[j, :] < 0.2)[0]
        IND = indices[j, near]
        pred_img.append(test.iloc[IND].posting_id.values)
test["pred_img"] = pred_img

In [None]:
# In CV mode, report the mean F1 of the image-embedding matches.
if submission:
    score_row = getMetric("pred_img")
    test["f2"] = test.apply(score_row, axis=1)
    print('CV score for tf embedding image =', test["f2"].mean())

## image_phash related images:

In [None]:
# Postings with an identical perceptual hash are matches of each other:
# map every row to the posting_ids that share its image_phash.
image_phash = test.groupby("image_phash")["posting_id"].unique()
test["pred_phash"] = test["image_phash"].map(image_phash)

In [None]:
def combine(row):
    """Merge the image, title and phash matches of a row.

    Returns the sorted array of distinct posting_ids found in
    ``row.pred_img``, ``row.pred_tfidf`` and ``row.pred_phash``.
    """
    sources = (row.pred_img, row.pred_tfidf, row.pred_phash)
    merged = np.concatenate(sources)
    return np.unique(merged)
def combine_matches(row):
    """Return the posting_ids in ``row.pred`` as one space-separated string."""
    separator = " "
    return separator.join(row.pred)

In [None]:
# In CV mode, report the mean F1 of the perceptual-hash matches.
if submission:
    score_row = getMetric("pred_phash")
    test["f3"] = test.apply(score_row, axis=1)
    print('CV score for tf image phash related image =', test["f3"].mean())

In [None]:
# Union of the three matchers per row; in CV mode report its mean F1, then
# format the matches as the space-separated string the submission file uses.
test["pred"] = test.apply(combine, axis=1)
if submission:
    score_row = getMetric("pred")
    test["f"] = test.apply(score_row, axis=1)
    print('CV score for baseline =', test["f"].mean())
test["matches"] = test.apply(combine_matches, axis=1)

In [None]:
# Write the submission file (posting_id and matches columns, no index), then
# read it back and show the first rows as a sanity check.
test[["posting_id","matches"]].to_csv("submission.csv",index = False)
sub = pd.read_csv('submission.csv')
sub.head()

# <font color=red> We will be back soon, please upvote if you like it! </font>