# Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2, matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
print('TF',tf.__version__)
#Text Color
from termcolor import colored
from wordcloud import WordCloud, STOPWORDS

In [None]:
# RESTRICT TENSORFLOW TO `LIMIT` GB OF GPU RAM
# SO THAT WE HAVE GPU RAM FOR RAPIDS CUML KNN
# NOTE(review): this header previously said 12GB, but LIMIT is 1 and the
# value handed to TensorFlow is 1024*LIMIT (presumably megabytes, i.e. 1GB
# -- confirm against the TF docs).
LIMIT = 1
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Only the first physical GPU is capped; any additional GPUs are untouched.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Best effort: the error is printed and the notebook continues without
    # the cap (presumably raised when the GPU is already initialised -- confirm).
    print(e)
# These messages are printed even when no GPU was found or the cap failed.
print('Restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# NOTE(review): 16 is hard-coded here; assumes a 16GB GPU -- TODO confirm.
print('so RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer for the predictions stored in column ``col``.

    The returned function takes one DataFrame row and compares the
    collection in ``row[col]`` against the ground truth in ``row.target``,
    treating both as *sets* of ids:

        F1 = 2 * |target ∩ predicted| / (|target| + |predicted|)

    Both sides are de-duplicated before counting. The intersection was
    already computed on unique values, so counting repeated ids in the
    denominator deflated the score whenever a prediction list contained
    the same id more than once.

    Args:
        col: Name of the column holding the predicted ids.

    Returns:
        A function ``f1score(row) -> float`` suitable for
        ``DataFrame.apply(..., axis=1)``. It returns 0.0 when either side
        is empty instead of dividing by zero.
    """
    def f1score(row):
        target = np.unique(row.target)
        predicted = np.unique(row[col])
        # No overlap is possible if either side is empty; this also avoids
        # a ZeroDivisionError when both are empty.
        if len(target) == 0 or len(predicted) == 0:
            return 0.0
        n = len(np.intersect1d(target, predicted, assume_unique=True))
        return 2 * n / (len(target) + len(predicted))
    return f1score

## Compute RAPIDS Model CV and Infer Submission

In [None]:
# Load both competition CSVs, then decide whether this run computes a CV score.
train = pd.read_csv('../input/shopee-product-matching/train.csv')
test = pd.read_csv('../input/shopee-product-matching/test.csv')

# CV is computed only when test.csv has at most 3 rows (presumably the small
# public sample file -- confirm); a larger test set switches it off.
COMPUTE_CV = len(test) <= 3
if COMPUTE_CV:
    print('this submission notebook will compute CV score but commit notebook will not')

In [None]:
import cudf

# Pick the frame that every later cell treats as `test`.
if COMPUTE_CV:
    # CV mode: score against train.csv, with its first 1500 rows appended a
    # second time (so those posting_ids appear twice in `test`).
    train_rows = pd.read_csv('../input/shopee-product-matching/train.csv')
    test = pd.concat([train_rows, train_rows[:1500]])
else:
    test = pd.read_csv('../input/shopee-product-matching/test.csv')

# GPU copy of the same frame for the cuDF/cuML text pipeline.
test_gf = cudf.DataFrame(test)
print('Test shape is', test_gf.shape)
if COMPUTE_CV:
    print('Using train as test to compute CV (since commit notebook). Shape is', test_gf.shape)
test_gf.head()

### Use Image Embeddings

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of resized images for inference.

    Only inputs are produced (no labels), so instances are intended for
    ``model.predict``. Images are read with OpenCV from ``path + row.image``
    and resized to ``img_size x img_size``.

    Args:
        df: DataFrame with an ``image`` column of file names.
        img_size: Side length, in pixels, of the square output images.
        batch_size: Maximum number of images per batch; the final batch
            may be smaller.
        path: Prefix prepended (by plain string concatenation) to every
            file name.
    """

    def __init__(self, df, img_size=256, batch_size=32, path=''):
        # Let the Keras base class set up any state of its own.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        # Positional row numbers; batches are contiguous slices of this.
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return (len(self.df) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return batch number ``index`` as a float32 array of images."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def __data_generation(self, indexes):
        """Load and resize the images at the given positional row numbers.

        Returns:
            float32 array of shape ``(len(indexes), img_size, img_size, 3)``.

        Raises:
            FileNotFoundError: If an image cannot be read. ``cv2.imread``
                signals that by returning ``None``, which would otherwise
                only surface as an opaque error inside ``cv2.resize``.
        """
        X = np.zeros((len(indexes), self.img_size, self.img_size, 3), dtype='float32')
        size = (self.img_size, self.img_size)
        # Only the file name is needed, so iterate that column directly
        # instead of materialising every row with iterrows().
        for i, name in enumerate(self.df['image'].iloc[indexes]):
            img_path = self.path + name
            img = cv2.imread(img_path)
            if img is None:
                raise FileNotFoundError(f'Could not read image: {img_path}')
            # Pixels are stored unscaled (an earlier "/128.0 - 1.0" rescale
            # was commented out here).
            # NOTE(review): cv2.imread presumably returns BGR channel order;
            # left unchanged because the downstream distance threshold was
            # presumably tuned on these embeddings -- confirm.
            X[i] = cv2.resize(img, size)
        return X

In [None]:
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB3, EfficientNetB5,EfficientNetB7
import gc
TRAIN_IMGS = '../input/shopee-product-matching/train_images/'
TEST_IMGS  = '../input/shopee-product-matching/test_images/'

# Backbone used to embed the images; must be a key of _WEIGHT_FILES below.
MODEL = EfficientNetB0

# In CV mode the "test" rows come from train.csv, so read the train images.
BASE = TRAIN_IMGS if COMPUTE_CV else TEST_IMGS

# Local "no top" ImageNet weight file for each supported backbone.
_WEIGHT_DIR = '../input/tfkerasefficientnetimagenetnotop/'
_WEIGHT_FILES = {
    EfficientNetB0: 'efficientnetb0_notop.h5',
    EfficientNetB3: 'efficientnetb3_notop.h5',
    EfficientNetB5: 'efficientnetb5_notop.h5',
    EfficientNetB7: 'efficientnetb7_notop.h5',
}

# Fail here with a clear message rather than leaving `model` undefined and
# hitting a NameError in a later cell.
if MODEL not in _WEIGHT_FILES:
    raise ValueError('Unsupported MODEL %r; expected one of EfficientNetB0/B3/B5/B7' % (MODEL,))

WGT = _WEIGHT_DIR + _WEIGHT_FILES[MODEL]
# include_top=False with pooling='avg' is requested so the network is used
# as a feature extractor rather than a classifier.
model = MODEL(weights=WGT, include_top=False, pooling='avg', input_shape=None)
       

In [None]:
# Embed the images chunk by chunk and collect the per-chunk embedding
# arrays in `embeds`.
embeds = []
CHUNK = 1024 * 4

print('Computing image embeddings...')
# Number of chunks = ceil(len(test) / CHUNK).
CTS = (len(test) + CHUNK - 1) // CHUNK
print(CTS)

for a in range(0, len(test), CHUNK):
    b = min(a + CHUNK, len(test))
    print('chunk', a, 'to', b)

    # A fresh generator per chunk, reading images from BASE.
    test_gen = DataGenerator(test.iloc[a:b], batch_size=32, path=BASE)
    image_embeddings = model.predict(test_gen, verbose=1, use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

In [None]:
# Drop the reference to the CNN and force a garbage-collection pass before
# building the full embedding matrix.
del model
_ = gc.collect()
# Stack the per-chunk outputs along the first axis (np.concatenate default).
image_embeddings = np.concatenate(embeds)
print('image embeddings shape',image_embeddings.shape)


#### Retrieval method 1: use KNN

In [None]:
from cuml.neighbors import NearestNeighbors

# Fit a nearest-neighbour index over the image embeddings. 50 neighbours are
# requested, except for a 3-row test set, where only 2 are.
KNN = 2 if len(test) == 3 else 50
model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)

In [None]:
# For every row, collect the posting_ids of the neighbours whose distance is
# below the threshold. The query is run in chunks of CHUNK rows.
preds = []
CHUNK = 1024*4

print('Finding similar images...')
n_rows = len(image_embeddings)
for a in range(0, n_rows, CHUNK):
    b = min(a + CHUNK, n_rows)
    print('chunk', a, 'to', b)

    distances, indices = model.kneighbors(image_embeddings[a:b, ])
    # One (distances, neighbour indices) pair per query row in this chunk.
    for dist_row, idx_row in zip(distances, indices):
        close = np.where(dist_row < 6.0)[0] # 6.0, 3.6
        # idx_row holds positional row numbers into `test`.
        preds.append(test.iloc[idx_row[close]].posting_id.values)

# Drop the references to the KNN model and the embeddings, then collect.
del model, distances, indices, image_embeddings, embeds
_ = gc.collect()

In [None]:
# Store the image-based matches (one array of posting_ids per row) in
# column `preds2`; `preds` is in the same row order as `test`.
test['preds2'] = preds
test.head()

### Use Text Embeddings

In [None]:
from cuml.feature_extraction.text import TfidfVectorizer

# Build binary TF-IDF features from the product titles on the GPU copy of
# the data (test_gf), keeping at most 25,000 vocabulary terms.
print('Computing text embeddings...')
model = TfidfVectorizer(stop_words='english', binary=True, max_features=25_000)
# .toarray() densifies the result; the matrix has one row per title.
# NOTE(review): a dense rows x 25,000 matrix can be large -- confirm it fits
# in GPU memory for the full test set.
text_embeddings = model.fit_transform(test_gf.title).toarray()
print('text embeddings shape',text_embeddings.shape)

In [None]:
import cupy
# For every title, collect the posting_ids of all titles whose similarity
# score exceeds 0.7. Scores are computed one chunk of rows at a time.
preds = []
CHUNK = 1024*4

print('Finding similar titles...')
n_titles = len(test)
for a in range(0, n_titles, CHUNK):
    b = min(a + CHUNK, n_titles)
    print('chunk',a,'to',b)

    # COSINE SIMILARITY DISTANCE
    # Row k of `cts` holds the scores of title a+k against every title.
    # (A dot product is a cosine only if the TF-IDF rows are unit-norm --
    # presumably the vectorizer's default; confirm.)
    cts = cupy.matmul( text_embeddings, text_embeddings[a:b].T).T

    # Matching positions are copied to the host so they can index `test`.
    preds.extend(
        test.iloc[cupy.asnumpy(cupy.where(cts[k,]>0.7)[0])].posting_id.values
        for k in range(b-a)
    )

# Drop the references to the vectorizer and the embedding matrix, then collect.
del model, text_embeddings
_ = gc.collect()

In [None]:
# Store the title-based matches (one array of posting_ids per row) in
# column `preds`; the list is in the same row order as `test`.
test['preds'] = preds
test.head()

### Use Phash Feature

In [None]:
# Rows sharing an image_phash are matched with each other: map every phash
# to the unique posting_ids that carry it.
phash_groups = test.groupby('image_phash')['posting_id'].unique()
test['preds3'] = test['image_phash'].map(phash_groups.to_dict())
test.head()

### Compute CV Score

In [None]:
def combine_for_sub(row):
    """Merge a row's three prediction lists into one submission string.

    Takes the union of ``row.preds``, ``row.preds2`` and ``row.preds3``
    and returns its unique ids, sorted and joined by single spaces.
    """
    text_and_image = np.union1d(row.preds, row.preds2)
    merged = np.union1d(text_and_image, row.preds3)
    return ' '.join(merged)

def combine_for_cv(row):
    """Merge a row's three prediction lists into one array for CV scoring.

    Returns the sorted unique ids found in ``row.preds``, ``row.preds2``
    and ``row.preds3`` as a NumPy array.
    """
    stacked = np.hstack((row.preds, row.preds2, row.preds3))
    return np.unique(stacked)

In [None]:
if COMPUTE_CV:
    # Ground truth: every posting_id that shares the row's label_group.
    label_groups = test.groupby('label_group')['posting_id'].unique().to_dict()
    test['target'] = test['label_group'].map(label_groups)

    # Score the union of all three predictors, then each one on its own.
    test['oof'] = test.apply(combine_for_cv, axis=1)
    test['f1'] = test.apply(getMetric('oof'), axis=1)
    print('CV Score =', test.f1.mean() )
    per_source = (
        ("CV for image :", 'preds2'),
        ("CV for text  :", 'preds'),
        ("CV for phash :", 'preds3'),
    )
    for label, column in per_source:
        print(label, round(test.apply(getMetric(column), axis=1).mean(), 3))

# The submission column is built whether or not CV was computed.
test['matches'] = test.apply(combine_for_sub, axis=1)

## Submission

In [None]:
# Write the two submission columns to submission.csv (without the index),
# then read the file back and show its first rows.
test[['posting_id','matches']].to_csv('submission.csv',index=False)
sub = pd.read_csv('submission.csv')
sub.head()