# Load libraries

In [None]:
import numpy as np, pandas as pd, gc
import cv2, matplotlib.pyplot as plt
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

In [None]:
# Cap TensorFlow's GPU memory at LIMIT gigabytes so the rest of the card
# stays available to RAPIDS. The second message below assumes 16 GB in total.
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # memory_limit is given as 1024 * LIMIT.
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=LIMIT * 1024
        )
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as err:
        # Configuration failures are reported but do not stop the notebook.
        print(err)
print('We will restrict TensorFlow to max %iGB GPU RAM' % LIMIT)
print('then RAPIDS can use %iGB GPU RAM' % (16 - LIMIT))

# Load training data

In [None]:
# Read the training metadata CSV into a pandas DataFrame and preview its first rows.
train = pd.read_csv('../input/shopee-product-matching/train.csv')
train.head()

# Display random training data

In [None]:
def displayTrainData(train, random=False, COLS=6, ROWS=4, path='../input/shopee-product-matching/train_images/'):
    """Plot up to ROWS x COLS images from `train`, one figure per row.

    Args:
        train: DataFrame read positionally: column 1 is used as the image
            file name and column 3 as the title.
        random: If true, every cell shows a randomly drawn row of `train`;
            otherwise rows are shown in order, starting at position 0.
        COLS: Number of images per figure.
        ROWS: Number of figures to draw.
        path: Directory prefix prepended to each image file name.

    In sequential mode, positions past the end of `train` are skipped, so a
    frame with fewer than ROWS * COLS rows is shown in full rather than
    raising an IndexError. An empty frame draws nothing.
    """
    n_items = len(train)
    if n_items == 0:
        return

    for k in range(ROWS):
        # Sequential mode: stop before opening a figure with nothing in it.
        if not random and COLS*k >= n_items:
            break
        plt.figure(figsize=(20,5))
        for j in range(COLS):
            if random:
                row = np.random.randint(0, n_items)
            else:
                row = COLS*k + j
                if row >= n_items:
                    break
            name = train.iloc[row,1]
            title = str(train.iloc[row,3])
            # Break the title after every 20th character so it fits above the image.
            title_with_return = ''.join(
                ch + '\n' if i != 0 and i % 20 == 0 else ch
                for i, ch in enumerate(title)
            )
            img = cv2.imread(path+name)
            plt.subplot(1,COLS,j+1)
            plt.title(title_with_return)
            plt.axis('off')
            # cv2.imread yields None when the file cannot be read; leave the
            # cell empty (title only) instead of crashing the whole grid.
            if img is not None:
                # OpenCV loads BGR, matplotlib expects RGB.
                plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.show()

displayTrainData(train,random=True)

# Display training data grouped by label_group
Show the first 3 groups.

In [None]:
# Number of rows per label_group, as returned by value_counts().
groups = train.label_group.value_counts()

def displayTrainDataBLG(groups, COLS=6, ROWS=4, groups_count=3):
    """Display the images of the first `groups_count` label groups.

    Args:
        groups: Series whose index holds label_group values (here the result
            of `value_counts()`); groups are shown in index order.
        COLS: Images per figure, forwarded to `displayTrainData`.
        ROWS: Figures per group, forwarded to `displayTrainData`.
        groups_count: How many groups to show. Values larger than the number
            of available groups are clamped instead of raising IndexError.

    Rows are selected from the notebook-level `train` DataFrame.
    """
    # Slicing clamps to the available groups; max() keeps a negative count
    # meaning "show nothing", as it did with range().
    for label in groups.index[:max(groups_count, 0)]:
        print('label_group: ',label)

        top = train.loc[train.label_group==label]
        displayTrainData(top, random=False, ROWS=ROWS, COLS=COLS)

displayTrainDataBLG(groups)

# Number of items per group (descending)
Show the 10 largest groups.

In [None]:
# Bar chart of the first ten entries of `groups` (label as text, size as height).
top_labels = groups.index.values[:10].astype('str')
top_sizes = groups.values[:10]

plt.figure(figsize=(20,5))
plt.bar(top_labels, top_sizes)
plt.xlabel('Label Group',size=14)
plt.ylabel('Number of items',size=14)
plt.title('Top 10 groups',size=16)
plt.xticks(rotation = 45)
plt.show()

# Load test
Since the public test data contains only 3 posts, we use the training data for local testing.

In [None]:
test = pd.read_csv('../input/shopee-product-matching/test.csv')

# When submitting (the loaded test set has more than 3 rows) the training data
# is not used; otherwise fall back to it.
USE_TRAIN_DATA = len(test) <= 3

In [None]:
train = pd.read_csv('../input/shopee-product-matching/train.csv')

# Expected result: map every label_group to the unique posting_ids it contains,
# then attach that array to each row of the group as `target`.
tmp = (
    train.groupby('label_group')['posting_id']
    .unique()
    .to_dict()
)
train['target'] = train['label_group'].map(tmp)
train.head()

In [None]:
# Pick the frame to predict on: `test` is the pandas copy, `test_gf` the cuDF copy.
if not USE_TRAIN_DATA:
    # Real test data
    test = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_gf = cudf.read_csv('../input/shopee-product-matching/test.csv')
    print('SUBMITTED')
else:
    # Not submitting: the training CSV stands in for the test set.
    # (The printed message is Vietnamese for "using the training data for
    # testing when not submitting".)
    test = pd.read_csv('../input/shopee-product-matching/train.csv')
    test_gf = cudf.DataFrame(test)
    print('Dùng traindata để test khi chưa submit' )
test_gf.head()

# Use image_phash
Every post matches itself, so grouping by image_phash guarantees that each post's own ID is included, together with any posts that share the same image_phash.

In [None]:
# For every image_phash collect the unique posting_ids that carry it, then give
# each row the array belonging to its own hash.
tmp = (
    test.groupby('image_phash')['posting_id']
    .unique()
    .to_dict()
)
test['use_image_phash'] = test['image_phash'].map(tmp)
test.head()

# Use title
Extract text embeddings with TfidfVectorizer. To find similar titles, use cosine similarity instead of KNN.

In [None]:
# Fit a TF-IDF vectorizer on the post titles (English stop words removed,
# binary=True, vocabulary capped at 25,000 features) and convert the result
# to a dense array in a single chained call, so no reference to the
# intermediate matrix is kept.
# NOTE(review): presumably a CuPy array on the GPU, since TfidfVectorizer is
# imported from cuml — confirm.
print('Computing text embeddings...')
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
text_embeddings = model.fit_transform(test_gf.title).toarray()

In [None]:
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
# Number of chunks needed to cover every row of `test` (ceiling division).
CTS = (len(test) + CHUNK - 1) // CHUNK
for j in range(CTS):

    a = j*CHUNK
    b = min(a + CHUNK, len(test))

    # Dot products of the chunk's embeddings with every embedding; after the
    # transpose, row k belongs to post a+k and each column to one post.
    # NOTE(review): this is a cosine similarity only if the TF-IDF rows are
    # L2-normalised — confirm the vectorizer's default.
    cts = cupy.matmul(text_embeddings, text_embeddings[a:b].T).T

    for k in range(b - a):
        # Positions of all posts whose similarity to post a+k exceeds 0.7.
        matched = cupy.where(cts[k,] > 0.7)[0]
        preds.append(test.iloc[cupy.asnumpy(matched)].posting_id.values)

# The vectorizer and embeddings are no longer needed; drop them.
del model, text_embeddings
_ = gc.collect()

In [None]:
# Store the per-post title matches collected in `preds` as a new column and preview.
test['use_title'] = preds
test.head()

# Use image
Use KNN to find similar images.

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that yields batches of resized images listed in a DataFrame.

    Only input arrays are produced (no labels), which suits `model.predict`.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=''):
        """Store the configuration.

        Args:
            df: DataFrame with an `image` column holding image file names.
            img_size: Images are resized to (img_size, img_size).
            batch_size: Maximum number of images per batch.
            path: Directory prefix prepended to every file name.
        """
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number `index` as a float32 array."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given row positions.

        Returns:
            float32 array of shape (len(indexes), img_size, img_size, 3).

        Raises:
            FileNotFoundError: if an image cannot be read from disk.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        image_names = self.df['image'].iloc[indexes]
        for i, name in enumerate(image_names):
            img = cv2.imread(self.path + name)
            if img is None:
                # cv2.imread signals failure by returning None; fail with the
                # offending path instead of a cryptic cv2.resize error.
                raise FileNotFoundError(f'Could not read image: {self.path + name}')
            # NOTE(review): pixels stay in OpenCV's BGR order and 0-255 range.
            # Left unchanged because downstream distance thresholds depend on
            # the resulting embeddings — confirm whether RGB is wanted.
            X[i,] = cv2.resize(img, (self.img_size, self.img_size))
        return X

In [None]:
# Image directory matching the frame held in `test`.
BASE = '../input/shopee-product-matching/test_images/'
if USE_TRAIN_DATA: BASE = '../input/shopee-product-matching/train_images/'

# EfficientNetB0 without the classification head; pooling='avg' makes the
# model output one pooled feature vector per image.
WGT = '../input/effnetb0/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4

print('Computing image embeddings...')
# Number of chunks needed to cover every row of `test` (ceiling division).
CTS = len(test)//CHUNK
if len(test)%CHUNK!=0: CTS += 1
for j in range(CTS):

    a = j*CHUNK
    b = min((j+1)*CHUNK, len(test))

    # Embed one chunk of rows at a time and collect the per-chunk results.
    test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path=BASE)
    image_embeddings = model.predict(test_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

# Drop the network before stacking the chunks into one array.
del model
_ = gc.collect()
image_embeddings = np.concatenate(embeds)

In [None]:
# Neighbours to retrieve per image: 60 by default, 2 for the 3-row public
# test set.
KNN = 60
if len(test)==3: KNN = 2
# Never ask for more neighbours than there are rows, so other small inputs
# (4..59 rows) are also handled.
KNN = min(KNN, len(test))
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

In [None]:
preds = []
CHUNK = 1024*4

print('Finding similar images...')
# Number of chunks needed to cover every embedding (ceiling division).
CTS = (len(image_embeddings) + CHUNK - 1) // CHUNK
for j in range(CTS):

    a = j*CHUNK
    b = min(a + CHUNK, len(image_embeddings))
    distances, indices = model.kneighbors(image_embeddings[a:b,])

    # One result row per query image in the chunk.
    for row_distances, row_indices in zip(distances, indices):
        # Keep only the neighbours closer than 6.0 and look up their posting_ids.
        near = np.where(row_distances < 6.0)[0]
        preds.append(test.iloc[row_indices[near]].posting_id.values)

# Drop the neighbour model and the embedding arrays now that matching is done.
del model, distances, indices, image_embeddings, embeds
_ = gc.collect()

In [None]:
# Store the per-post image matches collected in `preds` as a new column and preview.
test['use_image'] = preds
test.head()

# Submission CSV
Create submission.csv

In [None]:
def combine_for_sub(row):
    """Merge a row's three match lists into one space-separated string.

    Reads `use_image`, `use_title` and `use_image_phash` from `row`,
    concatenates them, removes duplicates with `np.unique` (which also sorts
    the values) and joins the result with single spaces.
    """
    match_sources = (row.use_image, row.use_title, row.use_image_phash)
    merged = np.concatenate(match_sources)
    distinct_ids = np.unique(merged)
    return ' '.join(distinct_ids)

# Build the space-separated `matches` string for every post (row-wise apply).
test['matches'] = test.apply(combine_for_sub,axis=1)

In [None]:
# Write the two submission columns without the index, then read the file back
# to preview what was saved.
test[['posting_id','matches']].to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()