Thanks to: https://www.kaggle.com/cdeotte/rapids-cuml-tfidfvectorizer-and-knn

In [None]:
import cudf, cuml, cupy
import cv2
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import textwrap

In [None]:
# !pip install git+https://github.com/jmcarpenter2/swifter.git

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0

# Cap TensorFlow at LIMIT GB on the first GPU so that the rest of the GPU
# memory stays free for the RAPIDS cuML nearest-neighbour search.
LIMIT = 12
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # 1024 * LIMIT converts the cap from GB to the MB value passed as memory_limit.
        memory_cap = tf.config.experimental.VirtualDeviceConfiguration(
            memory_limit=1024 * LIMIT
        )
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [memory_cap])
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Best effort: report the problem and carry on with TensorFlow's defaults.
        print(err)
# The second message assumes a 16 GB GPU (16 - LIMIT is what is left over).
print('Restrict TensorFlow to max %iGB GPU RAM' % LIMIT)
print('so RAPIDS can use %iGB GPU RAM' % (16 - LIMIT))

In [None]:
# Select which competition split to process; the image folder and the CSV
# path below are both derived from this value.
phase = 'train'
# phase = 'test'
# Folder prefix that is prepended to each image file name when loading images.
BASE = f'../input/shopee-product-matching/{phase}_images/'
# The same CSV is loaded twice: once with pandas and once with cuDF
# (the cuDF copy feeds the cuML TF-IDF vectorizer).
train = pd.read_csv(f"../input/shopee-product-matching/{phase}.csv")
train_gf = cudf.read_csv(f"../input/shopee-product-matching/{phase}.csv")
# Local weights file passed to EfficientNetB0 as `weights`
# (presumably the "no top" ImageNet weights, judging by the file name).
WGT = '../input/effnetb0/efficientnetb0_notop.h5'

In [None]:
# Number of neighbours requested from every nearest-neighbour search.
# Use up to 50, but never more than there are rows: the original special
# case only covered a 3-row file, so any other file with fewer than 50 rows
# would ask for more neighbours than exist.
KNN = min(50, train.shape[0])

In [None]:
# Display the neighbour count that will actually be used.
KNN

In [None]:
# Preview the first rows of the loaded CSV.
train.head()

In [None]:
# samples = train.sample(9)                    
# fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(16, 20))
# count=0
# for row in ax:
#     for col in row:
#         col.imshow(plt.imread('../input/shopee-product-matching/test_images/'+samples.iloc[count]['image']))
#         col.set_title('\n'.join(textwrap.wrap(samples.iloc[count]['title'], 35)))
#         count += 1
# plt.show()

### Similarity in title only

In [None]:
# Vectorize the product titles with TF-IDF (English stop words removed,
# binary term counts) and convert the result to a dense array via .toarray().
# NOTE(review): the dense array has one column per vocabulary term, so it can
# be large - confirm it fits in GPU memory for the full dataset.
model = TfidfVectorizer(stop_words='english', binary=True)
text_embeddings = model.fit_transform(train_gf.title).toarray()

In [None]:
# Nearest-neighbour search over the title embeddings, KNN neighbours per row.
model = NearestNeighbors(n_neighbors=KNN)

# Fit the index on all title embeddings (unsupervised; no labels involved).
model.fit(text_embeddings)

# Query with the same rows that were fitted: `distances` receives the
# neighbour distances and `text_indices` the matching row positions.
# Because the query set is the fitted set, each row is normally among its
# own neighbours.
distances, text_indices = model.kneighbors(text_embeddings)

In [None]:
# # Let us check the result
# for i in range(5):
#     print(train_gf.iloc[text_indices[i, 0:5]][['title']])

### Similarity in Images

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of resized images for inference.

    Every batch is a float32 array of shape ``(n, img_size, img_size, 3)``
    holding unscaled pixel values in RGB channel order. No labels are
    produced, so the generator is meant for ``model.predict``.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=BASE):
        """Store the frame and batching parameters.

        Args:
            df: DataFrame with an ``image`` column of file names.
            img_size: Width and height every image is resized to.
            batch_size: Number of images per batch (the last may be smaller).
            path: Prefix prepended to each file name to build the image path.
        """
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches per epoch, counting a partial last batch."""
        # Ceiling division without floats.
        return -(-len(self.df) // self.batch_size)

    def __getitem__(self, index):
        """Return batch number ``index`` as an image array."""
        start = index * self.batch_size
        indexes = self.indexes[start:start + self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given row positions.

        Args:
            indexes: Integer row positions into ``self.df``.

        Returns:
            float32 array of shape ``(len(indexes), img_size, img_size, 3)``.

        Raises:
            OSError: If an image file is missing or cannot be decoded.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        file_names = self.df.iloc[indexes]['image']
        for i, file_name in enumerate(file_names):
            img_path = self.path + file_name
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None instead of raising.
                raise OSError(f'Could not read image: {img_path}')
            # OpenCV decodes to BGR; the pretrained network expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Pixel values are left unscaled on purpose (no /128.0 - 1.0).
            X[i] = cv2.resize(img, (self.img_size, self.img_size))
        return X

#### Get embeddings for each image (for comparing)

In [None]:
# EfficientNetB0 without its classification head (include_top=False) and with
# average pooling, loaded from the local weights file WGT; used here as a
# feature extractor.
model = EfficientNetB0(weights=WGT, include_top=False, pooling='avg', input_shape=None)
# Stream the images in batches of 128 (DataGenerator resizes them to its default img_size).
train_gen = DataGenerator(train, batch_size=128)
# Run the network over every batch to obtain the image embeddings.
image_embeddings = model.predict(train_gen,verbose=1)
# Display the embedding matrix shape as the cell output.
image_embeddings.shape

In [None]:
# Nearest-neighbour search over the image embeddings, KNN neighbours per row.
model = NearestNeighbors(n_neighbors=KNN)
# Fit the index on all image embeddings (unsupervised; no labels involved).
model.fit(image_embeddings)
# Query with the same rows that were fitted: `distances` receives the
# neighbour distances (overwriting the text distances) and `image_indices`
# the matching row positions.
distances, image_indices = model.kneighbors(image_embeddings)

In [None]:
# indices_df = pd.DataFrame(image_indices)
# samples = indices_df.sample(9)                    
# fig, ax = plt.subplots(nrows=9, ncols=4, figsize=(26, 80))
# line=0
# for row in ax:
#     column=0
#     for col in row:
#         item = train.iloc[samples.iloc[line:line+1, column:column+1].values[0][0]]
#         col.imshow(plt.imread('../input/shopee-product-matching/train_images/'+item['image']))
#         col.set_title('\n'.join(textwrap.wrap(item['title'], 35)))
#         column += 1
#     line += 1
# plt.show()

In [None]:
# Start the submission frame with the posting ids, keeping train's index and row order.
submission = pd.DataFrame({'posting_id': train['posting_id']})

In [None]:
# Put image and text neighbour positions side by side, so every row lists the
# candidates from both searches.
# NOTE(review): .get() presumably copies a CuPy array to host memory - confirm
# that text_indices is a CuPy array and image_indices a NumPy array.
indices = np.hstack((image_indices, text_indices.get()))

In [None]:
def get_top_text(x, len=50):
    """Return the posting ids for one row of neighbour positions as a string.

    Args:
        x: 1-D array of integer row positions into ``train``; duplicates
            are allowed and are removed.
        len: Maximum number of posting ids to return. The name shadows the
            builtin and is kept only for backward compatibility.

    Returns:
        The ``posting_id`` values of the de-duplicated positions, joined by
        single spaces.
    """
    # np.unique also sorts, so the ids come out in ascending row position,
    # not nearest-first. Slicing clamps to the number of unique positions and
    # honours the caller's ``len`` (the old size check was hard-coded to 50).
    positions = np.unique(x)[:len]
    return ' '.join(train.iloc[positions]['posting_id'].values)


In [None]:
# Build one space-separated match string per row with a plain comprehension.
# np.apply_along_axis sizes its output buffer from the first result, so with
# string results every later, longer string was silently truncated to the
# length of the first row's string.
l = [get_top_text(row) for row in indices]

In [None]:
# Attach the per-row match strings as the `matches` column.
submission['matches'] = l
# submission['matches'] = matchFunction(indices)

In [None]:
# Display the submission frame for a visual check.
submission

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
submission.to_csv('submission.csv', index=False)