In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
  #  for filename in filenames:
   #     print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import cv2
import matplotlib.pyplot as plt
import cudf, cuml, cupy

from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.applications import EfficientNetB3

import gc   # garbage collect

### Restrict TensorFlow to 1GB of GPU RAM so that we have 15GB of GPU RAM for RAPIDS

In [None]:
LIMIT = 1   # multiplied by 1024 below to form the memory_limit passed to TensorFlow

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Expose the first physical GPU to TensorFlow as a single virtual
        # device with a fixed memory limit.
        capped_device = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [capped_device])

        logical_gpus = tf.config.experimental.list_logical_devices('GPU')

    except RuntimeError as e:
        # Print the error instead of aborting the notebook.
        print(e)

In [None]:
# Read the test split first: its row count selects the mode for the rest of the notebook.
test = pd.read_csv('../input/shopee-product-matching/test.csv')

# CV mode stays on unless test.csv holds more than 3 rows.
COMPUTE_CV = not (len(test) > 3)

if COMPUTE_CV:
    print('This submission notebook will compute CV score, but commit notebook will not')

In [None]:
train = pd.read_csv('../input/shopee-product-matching/train.csv')

# Ground truth per row: the unique posting ids that share the row's label_group.
temp = train.groupby('label_group')['posting_id'].unique().to_dict()
train['target'] = train['label_group'].map(temp)

train.head()

In [None]:
def getMetric(col):
    """Build a row-wise F1 scorer comparing ``row[col]`` against ``row['target']``.

    Parameters
    ----------
    col : str
        Key of the predicted collection in each row.

    Returns
    -------
    callable
        ``f1score(row) -> float`` computing
        ``2 * n_common / (len(row['target']) + len(row[col]))`` where
        ``n_common`` is the number of values returned by ``np.intersect1d``.
        When both collections are empty the score is ``0.0`` rather than a
        ``ZeroDivisionError``.
    """
    def f1score(row):
        target, predicted = row['target'], row[col]
        total = len(target) + len(predicted)
        if total == 0:
            # Nothing expected and nothing predicted: avoid 0/0, which would
            # otherwise abort an entire DataFrame.apply pass.
            return 0.0
        num_intersect = len(np.intersect1d(target, predicted))
        return 2 * num_intersect / total
    return f1score

## Compute Baseline CV score

In [None]:
# Baseline prediction: every posting id that shares this row's image_phash.
temp = train.groupby('image_phash')['posting_id'].unique().to_dict()
train['oof'] = train['image_phash'].map(temp)

In [None]:
# Score the 'oof' column against 'target' row by row, then report the mean.
train['f1'] = train.apply(getMetric('oof'), axis=1)

baseline_f1 = train['f1'].mean()
print('Baseline CV score : {}'.format(baseline_f1))

## Compute RAPIDS Model CV

In [None]:
# Build both a pandas frame (`test`) and a cuDF frame (`test_cdf`) of the rows to match.
if not COMPUTE_CV:
    test = pd.read_csv('../input/shopee-product-matching/test.csv')
    test_cdf = cudf.read_csv('../input/shopee-product-matching/test.csv')
    print('Submission is On\n')
else:
    # CV mode: the training file stands in for the test set.
    test = pd.read_csv('../input/shopee-product-matching/train.csv')
    test_cdf = cudf.DataFrame(test)
    print('Commit is On, i.e using train as test\n')

print('Shape : {}'.format(test_cdf.shape))

In [None]:
# Preview the first rows of the cuDF frame built in the previous cell.
test_cdf.head()

## Using Image Embeddings

In [None]:
# Image directory matching the frame held in `test`: train images in CV mode, test images otherwise.
base = (
    '../input/shopee-product-matching/train_images/'
    if COMPUTE_CV
    else '../input/shopee-product-matching/test_images/'
)


In [None]:
# model = EfficientNetB3(weights = 'imagenet', include_top = False, pooling = 'avg', input_shape = None)

In [None]:
# model.save_weights('modelweights.h5')

In [None]:
# Build the headless, average-pooled EfficientNetB3 without weights;
# they are loaded from a local file in the next cell.
model = EfficientNetB3(
    weights=None,
    include_top=False,
    pooling='avg',
    input_shape=None,
)

In [None]:
# Load weights into the model from the .h5 file in the attached input dataset.
model.load_weights('../input/efficientnetb3-imagenet-weights/modelweights(1).h5')

### Custom DataGenerator for generating image batches

In [None]:
import math

class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields batches of resized images read from disk.

    Each batch is a float32 array of shape ``(n, img_size, img_size, 3)`` where
    ``n`` is ``batch_size`` (smaller for the final batch). For every row of
    ``df`` the file ``path + row.image`` is read with ``cv2.imread`` and resized
    with ``cv2.resize``. Only inputs are returned (no labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image`` column of file names relative to ``path``.
    img_size : int
        Side length, in pixels, of the square output images.
    batch_size : int
        Number of images per batch.
    path : str
        Prefix prepended verbatim to each file name (include the trailing separator).

    Raises
    ------
    OSError
        From ``__getitem__`` when ``cv2.imread`` cannot read an image.
    """

    def __init__(self, df, img_size = 256, batch_size = 32, path = ''):
        # Harmless on older Keras; newer Keras expects Sequence subclasses to call it.
        super().__init__()
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        self.indices = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches per epoch."""
        return math.ceil(len(self.df) / self.batch_size)

    def __getitem__(self, index):
        """Return batch number ``index`` as an ``(n, img_size, img_size, 3)`` array."""
        start = index * self.batch_size
        # Slicing clamps at the end of the array, so the last batch is simply shorter.
        batch_rows = self.df.iloc[self.indices[start : start + self.batch_size]]

        # float32 holds 8-bit pixel values exactly and halves the buffer size
        # compared with NumPy's float64 default.
        X = np.zeros((len(batch_rows), self.img_size, self.img_size, 3), dtype = np.float32)

        for i, file_name in enumerate(batch_rows['image']):
            img_path = self.path + file_name
            # NOTE(review): cv2.imread yields BGR channel order, while ImageNet-pretrained
            # Keras models are normally fed RGB. Left unchanged because converting would
            # alter the embeddings that downstream thresholds rely on - confirm.
            img = cv2.imread(img_path)
            if img is None:
                # imread signals failure by returning None; fail with the file name
                # instead of an opaque cv2.resize assertion.
                raise OSError('cv2.imread could not read image: {}'.format(img_path))
            X[i] = cv2.resize(img, (self.img_size, self.img_size))

        return X
            
        



#### To prevent memory errors, we will compute image embeddings in chunks 

In [None]:
chunk_size = 1024*4
embeds = []

# Embed the images one slice of rows at a time and stitch the results together afterwards.
for low in range(0, len(test), chunk_size):
    high = min(low + chunk_size, len(test))

    print('chunk : {} - {}'.format(low, high))

    test_gen = DataGenerator(test.iloc[low : high], path = base)
    chunk_embeddings = model.predict(test_gen, verbose = 1, use_multiprocessing = True, workers = 4)
    embeds.append(chunk_embeddings)

image_embeddings = np.concatenate(embeds)

print('image embeddings shape : ',image_embeddings.shape)


In [None]:
# Drop the notebook's reference to the Keras model, then run a garbage-collection pass.
# NOTE(review): this only releases Python-side objects; whether TensorFlow hands back
# its GPU memory afterwards is not shown here - confirm.
del model

_ = gc.collect()  

#### We will find similar images with RAPIDS cuML KNN in chunks

In [None]:
# Neighbours retrieved per image: 50, or 2 when `test` holds exactly 3 rows.
KNN = 2 if len(test) == 3 else 50

model = NearestNeighbors(n_neighbors=KNN)
model.fit(image_embeddings)


In [None]:
chunk_size = 1024*4
preds = []

# Query the fitted neighbour model one slice of embeddings at a time.
for low in range(0, len(image_embeddings), chunk_size):
    high = min(low + chunk_size, len(image_embeddings))

    print('chunk : {} - {}'.format(low, high))

    distances, indices = model.kneighbors(image_embeddings[low:high])

    for row_distances, row_indices in zip(distances, indices):
        # Keep only the neighbours whose distance is below 6.0.
        close_enough = np.where(row_distances < 6.0)[0]
        neighbour_rows = row_indices[close_enough]
        preds.append(test.iloc[neighbour_rows].posting_id.values)

del model, distances, indices, embeds, image_embeddings
_ = gc.collect()

In [None]:
# Store the image-neighbour matches (one array of posting ids per row) in column 'pred2'.
test['pred2'] = preds
test.head()

## Use Text Embeddings

In [None]:
# Binary TF-IDF over the titles, capped at 25000 features, densified for the matmul below.
model = TfidfVectorizer(
    stop_words='english',
    binary=True,
    max_features=25000,
)

text_embeddings = (
    model
    .fit_transform(test_cdf['title'])
    .toarray()
)

print('text embeddings shape : ',text_embeddings.shape)

In [None]:
chunk_size = 1024*4
preds = []

# Compare titles slice by slice so only one block of pairwise scores exists at a time.
for low in range(0, len(test), chunk_size):
    high = min(low + chunk_size, len(test))

    print('chunk : {} - {}'.format(low, high))

    # Dot product of every title vector with the vectors of this slice,
    # transposed so that row k belongs to title low + k.
    distances = cupy.matmul(text_embeddings, text_embeddings[low:high].T).T

    for k in range(high - low):
        # Row positions of all titles scoring above 0.7 against title low + k,
        # copied to host memory so pandas can index with them.
        matched_rows = cupy.asnumpy(cupy.where(distances[k] > 0.7)[0])
        preds.append(test.iloc[matched_rows].posting_id.values)

del model, text_embeddings
_ = gc.collect()

In [None]:
# Store the title-similarity matches (one array of posting ids per row) in column 'preds'.
test['preds'] = preds
test.head()

### Using phash feature

In [None]:
# Third signal: all posting ids that share the row's image_phash.
temp = test.groupby('image_phash')['posting_id'].unique().to_dict()
test['preds3'] = test['image_phash'].map(temp)

test.head()

### Compute CV score

In [None]:
def combine_for_sub(row):
    """Merge ``row.preds``, ``row.pred2`` and ``row.preds3`` into one space-separated string.

    The three collections are concatenated, reduced with ``np.unique``
    (sorted, duplicates removed) and joined with single spaces.
    """
    merged = np.concatenate((row.preds, row.pred2, row.preds3))
    unique_ids = np.unique(merged)
    return " ".join(unique_ids)

def combine_for_cv(row):
    """Merge ``row.preds``, ``row.pred2`` and ``row.preds3`` into one array of unique values.

    The three collections are concatenated and passed through ``np.unique``,
    which returns them sorted with duplicates removed.
    """
    merged = np.concatenate((row.preds, row.pred2, row.preds3))
    return np.unique(merged)

In [None]:
if COMPUTE_CV:
    # Ground truth per row: the unique posting ids sharing the row's label_group.
    temp = test.groupby('label_group')['posting_id'].unique().to_dict()
    test['target'] = test['label_group'].map(temp)

    # Combined prediction per row, scored against the ground truth.
    test['oof'] = test.apply(combine_for_cv, axis=1)
    test['f1'] = test.apply(getMetric('oof'), axis=1)
    print('CV score : ',test['f1'].mean())

# The space-separated match string is built in both modes.
test['matches'] = test.apply(combine_for_sub, axis=1)

In [None]:
# Write the two submission columns, then read the file back to preview what was saved.
submission_columns = ['posting_id', 'matches']
test[submission_columns].to_csv('submission.csv', index=False)

sub = pd.read_csv('submission.csv')

sub.head()