In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from os.path import join
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Determine the path of the current document

In [None]:
# IPython shell escape: print the kernel's current working directory.
!pwd

In [None]:
import cupy
import cudf
import os
from os.path import join
# Walk the working directory tree and print the full path of every file found.
for folder, _, files_here in os.walk('/kaggle/working'):
    for fname in files_here:
        print(join(folder, fname))

In [None]:
import matplotlib.pyplot as plt
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
import logging
logging.basicConfig()
import struct
# use keras backend (K) to force channels-last ordering
import tensorflow.keras.applications.resnet50 as resnet
from tensorflow.keras.applications import EfficientNetB0
import gc

In [None]:
import cv2, matplotlib.pyplot as plt
import skimage
# Root folder of the competition dataset; the image sub-folders are joined onto it later.
default_dir = '../input/shopee-product-matching'

## compute_test

* The flag `compute_test` selects which data this notebook processes. With `compute_test = False` (the value set in the cell below) the notebook loads `train.csv` and computes the CV score. With `compute_test = True` it loads `test.csv` instead and computes the matches within the test dataset. A committed notebook only sees a 3-row sample `test.csv`; when the notebook is submitted to Kaggle, `test.csv` is the full file of about 70,000 rows (longer than 3 rows), which is the case `compute_test = True` is meant for. Note that the cell below sets the flag by hand; it does not check the length of `test.csv` automatically.

In [None]:
# Manual switch: True loads test.csv (submission mode), False loads train.csv (CV mode).
compute_test = False
train_data = pd.read_csv(
    '/kaggle/input/shopee-product-matching/test.csv' if compute_test
    else '/kaggle/input/shopee-product-matching/train.csv'
)

In [None]:
import tensorflow as tf
from cuml.feature_extraction.text import TfidfVectorizer
from cuml import PCA
from cuml.neighbors import NearestNeighbors

### Explore the relevant information of the training data, including the dimensions and data types of the training data

In [None]:
# Preview the first rows and the (rows, columns) shape of the loaded frame.
print(train_data.head())
print(train_data.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line to the output.
train_data.info()

### Explore the number of `posting_id` values per image, and inspect a few samples

In [None]:
# Count the non-null entries of every column per image file, then show the
# image files that are referenced by the most postings.
count_stats = train_data.groupby('image').count().reset_index()
count_stats.sort_values('posting_id', ascending=False).head()

In [None]:
# Show every posting that references one specific image file.
train_data.loc[train_data['image'] == '0cca4afba97e106abd0843ce72881ca4.jpg']

# Image Embedding

## Generate batch data

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of resized images for inference.

    Every batch is a float32 array of shape ``(n, img_size, img_size, 3)``
    holding the images exactly as OpenCV decodes them (BGR channel order,
    unscaled pixel values). No labels are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image`` column containing image file names.
    img_size : int
        Width and height every image is resized to.
    batch_size : int
        Number of images per batch (the last batch may be smaller).
    path : str
        Prefix that is string-concatenated with each file name, so it must
        end with a path separator.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=''):
        # Let the Keras base class set up its own state (required by newer Keras versions).
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        # Positional row numbers; sliced per batch in __getitem__.
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as an image array."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given positional row numbers.

        Raises
        ------
        FileNotFoundError
            If an image file is missing or cannot be decoded (``cv2.imread``
            signals both cases by returning ``None``).
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        for i, file_name in enumerate(self.df.iloc[indexes]['image']):
            file_path = self.path + file_name
            img = cv2.imread(file_path)
            if img is None:
                # Fail with the offending path instead of an opaque cv2.resize assertion.
                raise FileNotFoundError('Image is missing or unreadable: ' + file_path)
            X[i,] = cv2.resize(img, (self.img_size, self.img_size))  # /128.0 - 1.0
        return X

### different image model
* Here we use transfer learning and explore several pretrained image models: EfficientNetB0, VGG16 and ResNet50. VGG16 and ResNet50 did not perform well, while EfficientNetB0 gave better results, so we finally use the EfficientNetB0 model.

In [None]:
# Backbone selector: 'VGG', 'ResNet50', or any other value for EfficientNetB0.
choose_model = 'efficientnetb0'

# Every backbone is built without its classification head (include_top=False)
# and with average pooling on top (pooling='avg').
if choose_model == 'VGG':
    model = VGG16(include_top=False, pooling='avg', weights='imagenet')
elif choose_model == 'ResNet50':
    model = resnet.ResNet50(include_top=False, pooling='avg', weights='imagenet')
else:
    # EfficientNetB0 weights are read from a local file rather than by name.
    WGT = '../input/efficientnetb0/efficientnetb0_notop.h5'
    model = EfficientNetB0(include_top=False, pooling='avg', weights=WGT, input_shape=None)

### Select the image folder according to the `compute_test` flag

In [None]:
# Point BASE at the image folder that matches the CSV chosen by compute_test.
BASE = join(default_dir, 'test_images/' if compute_test else 'train_images/')

In [None]:
embeds = []
CHUNK = 1024 * 4  # rows handed to one model.predict() call

print('Computing image embeddings...')
# Number of chunks, rounded up so a final partial chunk is still processed.
CTS = (len(train_data) + CHUNK - 1) // CHUNK
for j in range(CTS):

    a = j * CHUNK
    b = min((j+1) * CHUNK, len(train_data))
    print('chunk', a, 'to', b)

    # One generator per chunk; it feeds 512x512 images in batches of 8.
    test_gen = DataGenerator(train_data.iloc[a:b], img_size=512, batch_size=8, path=BASE)
    image_embeddings = model.predict(test_gen, verbose=1, use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)
# Stack the per-chunk results into one (rows, features) matrix.
image_embeddings = np.concatenate(embeds)

# The network is no longer needed once every chunk has been embedded.
del model
# Save the embedding matrix as a CSV file.
# NOTE(review): the file name says "vgg" although choose_model decides which
# backbone produced the embeddings — confirm the name is intentional.
np.savetxt('image_embeddings_vgg.csv', image_embeddings, delimiter=',')
print('image embeddings shape',image_embeddings.shape)

## check Similar Images
Again, we will now ignore the ground truth and try to find similar items in train data using only the item's image. First we will extract image embeddings using EffNetB0. We will then compare image embeddings with RAPIDS cuML KNN to find images that are similar.

In [None]:
# Replace the in-memory embeddings with the matrix stored in an attached dataset CSV.
image_embeddings = np.loadtxt('/kaggle/input/img-embed/image_embeddings.csv', delimiter=',')
print('image embeddings shape is', image_embeddings.shape)

In [None]:
# Fit a nearest-neighbour index on the image embeddings and query every row
# against it; `distances` and `indices` feed the inspection plots further down.
KNN = 30  # neighbours returned per row
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)
distances, indices = model.kneighbors(image_embeddings)

In [None]:
# Image folder used as displayDF's default `path`; set to the train images here
# regardless of compute_test.
BASE = join(default_dir, 'train_images/')
def displayDF(train, random=False, COLS=6, ROWS=4, path=BASE):
    """Plot a ROWS x COLS grid of images with their titles.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame read positionally: column 1 must hold the image file name and
        column 3 the title.
    random : bool
        If True every grid cell shows a randomly drawn row; otherwise grid
        cell ``(k, j)`` shows row ``COLS*k + j``.
    COLS, ROWS : int
        Grid dimensions; one matplotlib figure is created per grid row.
    path : str
        Prefix that is string-concatenated with each image file name.

    Raises
    ------
    FileNotFoundError
        If an image file is missing or cannot be decoded.
    """
    for k in range(ROWS):
        plt.figure(figsize=(20,5))
        for j in range(COLS):
            row = np.random.randint(0, len(train)) if random else COLS*k + j
            name = train.iloc[row, 1]
            title = train.iloc[row, 3]
            # Wrap the title by inserting a line break after characters 20, 40, ...
            title_with_return = ''.join(
                ch + '\n' if i != 0 and i % 20 == 0 else ch
                for i, ch in enumerate(title)
            )
            img = cv2.imread(path + name)
            if img is None:
                # cv2.imread returns None instead of raising; report the path explicitly.
                raise FileNotFoundError('Image is missing or unreadable: ' + path + name)

            # OpenCV decodes to BGR, matplotlib expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            plt.subplot(1, COLS, j+1)
            plt.title(title_with_return)
            plt.axis('off')
            plt.imshow(img)
        plt.show()

In [None]:
# For a few sample rows, plot the sorted neighbour distances and show the
# images of the 8 nearest neighbours.
for k in range(100,105):
    row_distances = cupy.asnumpy(distances[k,])
    plt.figure(figsize=(20,3))
    # Size the x-axis from the data so the plot follows whatever KNN was used.
    plt.plot(np.arange(len(row_distances)), row_distances, 'o-')
    plt.title('Image Distance From Train Row %i to Other Train Rows'%k,size=16)
    plt.ylabel('Distance to Train Row %i'%k,size=14)
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,size=14)
    plt.show()

    # Neighbour indices are row positions in the fitted matrix, so select
    # positionally (.iloc) rather than by index label (.loc).
    cluster = train_data.iloc[cupy.asnumpy(indices[k,:8])]
    displayDF(cluster, random=False, ROWS=2, COLS=4)

### We tried different cluster values; the plots above show that the image embeddings work well

### select NearestNeighbors
Please note! As stated on the competition's evaluation page:<br>
* *Group sizes were capped at 50, so there is no benefit to predict more than 50 matches.*<br>
* As the code below shows, if the data has only 3 rows we select 2 nearest neighbors; otherwise we set the number of nearest neighbors to 100.

In [None]:
from cuml.neighbors import NearestNeighbors

# Ask for 100 neighbours per query, except for a 3-row frame where only 2 are requested.
KNN = 2 if len(train_data) == 3 else 100
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

In [None]:
preds = []
CHUNK = 1024 * 4
n_rows = len(image_embeddings)

print('Finding similar images...')
# Ceil division: one extra chunk when the rows do not divide evenly.
CTS = n_rows // CHUNK + (1 if n_rows % CHUNK != 0 else 0)
for j in range(CTS):

    a = j * CHUNK
    b = min(a + CHUNK, n_rows)
    print('chunk', a, 'to', b)
    distances, indices = model.kneighbors(image_embeddings[a:b])

    # Keep, for every query row, the neighbours closer than 6.0 and store
    # their posting_ids as that row's image-based prediction.
    for row_dist, row_idx in zip(distances, indices):
        close = np.where(row_dist < 6.0)[0]
        preds.append(train_data.iloc[row_idx[close]].posting_id.values)

# Free the index and the embedding matrix before the next stage.
del model, distances, indices, image_embeddings # embeds
_ = gc.collect()
train_data['preds2'] = preds
train_data.head()

# Title Embedding

In [None]:
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
# Report the installed cuML version.
print('RAPIDS',cuml.__version__)

### We use TFIDF to extract text features, we set `max_features=30000` to ensure that the model does not lose too much text information

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

print('Computing text embeddings...')
# Binary TF-IDF over the titles, limited to 30000 features, without stop-word removal.
model = TfidfVectorizer(stop_words=None, binary=True, max_features=30000)
titles_gpu = cudf.Series(train_data['title'].tolist())
# fit_transform returns a sparse matrix; toarray() makes it dense for the matmul / KNN steps.
text_embeddings = model.fit_transform(titles_gpu).toarray()

print('text embeddings shape',text_embeddings.shape)

### Because the text embeddings are a CuPy (GPU) array that `np.savetxt` cannot store directly, we need to convert them to a NumPy array (with `.get()`) before saving

In [None]:
# text_embeddings=text_embeddings.get()
# text_embeddings.shape
# np.savetxt('text_embeddings.csv', text_embeddings, delimiter=',')

### check title similar

In [None]:
# Fit a nearest-neighbour index on the title embeddings and query every title
# against it; `distances` and `indices` feed the inspection plots in the next cell.
KNN = 50  # neighbours returned per title
model = NearestNeighbors(n_neighbors=KNN)
model.fit(text_embeddings)
distances, indices = model.kneighbors(text_embeddings)

In [None]:
# For the first few rows, plot the sorted neighbour distances and print the
# titles / label groups of the 10 nearest neighbours.
for k in range(3):
    row_distances = cupy.asnumpy(distances[k,])
    plt.figure(figsize=(20,3))
    # Size the x-axis from the data so the plot follows whatever KNN was used.
    plt.plot(np.arange(len(row_distances)), row_distances, 'o-')
    plt.title('Text Distance From Train Row %i to Other Train Rows'%k,size=16)
    plt.ylabel('Distance to Train Row %i'%k,size=14)
    plt.xlabel('Index Sorted by Distance to Train Row %i'%k,size=14)
    plt.show()

    # Neighbour indices are row positions in the fitted matrix, so select
    # positionally (.iloc) rather than by index label (.loc).
    print( train_data.iloc[cupy.asnumpy(indices[k,:10])][['title','label_group']] )

### We tried different cluster values; the output above shows that the title (text) embeddings also work well

###  

In [None]:
# Ask for 100 neighbours per query (previously 50), except for a 3-row frame
# where only 2 are requested.
KNN = 2 if len(train_data) == 3 else 100
model = NearestNeighbors(n_neighbors=KNN)
model.fit(text_embeddings)

In [None]:
# True: match titles by thresholding the embedding dot products.
# False: match titles by thresholding the KNN distances of the fitted model.
COSINE_SIMILARITY=True
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks, rounded up so a final partial chunk is still processed.
CTS = len(train_data) // CHUNK
if len(train_data) % CHUNK != 0: CTS += 1
for j in range(CTS):

    a = j * CHUNK
    b = (j+1) * CHUNK
    b = min(b, len(train_data))
    print('chunk', a, 'to', b)

    if COSINE_SIMILARITY:
        # COSINE SIMILARITY DISTANCE
        # Row k of `cts` holds the dot products of chunk row k with every title.
        cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

        for k in range(b-a):
            IDX = cupy.where(cts[k, ] > 0.7)[0]
            # pandas needs host (NumPy) indices, hence the explicit cupy.asnumpy.
            o = train_data.iloc[cupy.asnumpy(IDX)].posting_id.values
            preds.append(o)

    else:
        # KNN
        distances, indices = model.kneighbors(text_embeddings[a:b,])

        for k in range(b-a):
            # Threshold the *distances* (not the neighbour ids), then translate
            # the surviving neighbour slots into row positions via `indices`.
            # NOTE(review): 6.0 is the threshold used for the image embeddings;
            # confirm it is meaningful for the text-embedding distances.
            IDX = cupy.where(cupy.asarray(distances[k, ]) < 6.0)[0]
            IDS = cupy.asarray(indices[k, ])[IDX]
            # CuPy arrays are not implicitly convertible to NumPy, so convert explicitly.
            o = train_data.iloc[cupy.asnumpy(IDS)].posting_id.values
            preds.append(o)

# Free the vectorizer / index and the dense embedding matrix.
del model, text_embeddings
_ = gc.collect()
train_data['preds'] = preds
train_data.head()

## Use Phash Feature

We will predict all items with the same phash as duplicates:<br>

### Here we choose several images of one label_group

### Calculate the difference between two hash values of the images

In [None]:
def campHash(hash1, hash2):
    """Count the positions at which two hash strings differ.

    Returns -1 when the hashes have different lengths, because they cannot
    be compared position by position in that case.
    """
    # Hashes of different length cannot be compared.
    if len(hash1) != len(hash2):
        return -1
    # Same length: tally every position whose characters differ.
    return sum(1 for left, right in zip(hash1, hash2) if left != right)
# print(campHash('e925873ed09cd08f','e9b5833e929e909c'))
# print(campHash('e925873ed09cd08f','ea97861c926a71e3'))
# print(campHash('e9b5833e929e909c','ea97861c926a71e3'))

In [None]:
# Map every image_phash to the unique posting_ids that share it, then attach
# that array to each row as the phash-based prediction.
phash_to_ids = train_data.groupby('image_phash')['posting_id'].agg('unique').to_dict()
train_data['preds3'] = train_data['image_phash'].map(phash_to_ids)
train_data.head()

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer that compares ``row[col]`` with ``row.target``.

    The returned function computes ``2 * |target ∩ prediction| /
    (|target| + |prediction|)`` for a single row.
    """
    def f1score(row):
        truth = row.target
        guess = row[col]
        # Number of ids present in both the ground truth and the prediction.
        overlap = np.intersect1d(truth, guess).size
        return (2 * overlap) / (len(truth) + len(guess))
    return f1score

In [None]:
def combine_for_sub(row):
    """Merge the text, image and phash predictions of one row for submission.

    Returns the sorted, de-duplicated ids from ``preds``, ``preds2`` and
    ``preds3`` joined by single spaces.
    """
    sources = (row.preds, row.preds2, row.preds3)
    merged = np.unique(np.concatenate(sources))
    return ' '.join(merged)

def combine_for_cv(row):
    """Merge the image and phash predictions of one row for CV scoring.

    Returns the sorted, de-duplicated ids from ``preds2`` and ``preds3`` as an
    array. The text-based ``preds`` column is not part of this merge.
    """
    candidates = np.concatenate((row.preds2, row.preds3))
    return np.unique(candidates)

In [None]:
# Always build the space-separated `matches` column: the submission cell writes
# it in either mode, so it must exist in CV mode as well (otherwise selecting
# the column raises a KeyError).
train_data['matches'] = train_data.apply(combine_for_sub,axis=1)
if not compute_test:
    # CV mode: the ground truth of a row is every posting_id in its label_group.
    tmp = train_data.groupby('label_group').posting_id.agg('unique').to_dict()
    train_data['target'] = train_data.label_group.map(tmp)
    # Score the combined image + phash predictions with the row-wise F1 metric.
    train_data['oof'] = train_data.apply(combine_for_cv,axis=1)
    train_data['f1'] = train_data.apply(getMetric('oof'),axis=1)
    print('CV Score =', train_data.f1.mean())


# Write Submission CSV

In this notebook the submission file below looks odd because it contains train information. When we submit this notebook, `test.csv` will be longer than 3 rows; with `compute_test` set to `True` the notebook then computes the matches on the real test dataset, and our submission CSV for the LB will be correct.

In [None]:
# Write the two submission columns to submission.csv without the index.
submission = train_data[['posting_id', 'matches']]
submission.to_csv('submission.csv', index=False)