In [None]:
# part 1
import numpy as np
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt

# additional for part 2
from cuml.neighbors import NearestNeighbors
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
import cv2
import gc

# Credits to cdeotte ([from this kernel](https://www.kaggle.com/cdeotte/part-2-rapids-tfidfvectorizer-cv-0-700)):
* Restricting tensorflow GPU
* Commit/submit flags
* DataGenerator class
* Weights dataset
* Embedding code
* NearestNeighbors code

In [None]:
# Cap TensorFlow's share of the first GPU so that the remaining GPU memory
# stays available to RAPIDS (cuML), which is used for the neighbour search.
LIMIT = 1  # GB of GPU RAM handed to TensorFlow
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # memory_limit is derived from LIMIT (GB) as 1024*LIMIT
    tf_memory_cap = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)
    tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf_memory_cap])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
  except RuntimeError as e:
    # Best effort: report the problem and carry on with TensorFlow's defaults.
    print(e)
print('We will restrict TensorFlow to max %iGB GPU RAM'%LIMIT)
# The 16 below assumes a GPU with 16GB of RAM in total.
print('then RAPIDS can use %iGB GPU RAM'%(16-LIMIT))

# Part 1 - choose cutoff for what's considered an item having a "close" image match

## Read in data from other sources

In [None]:
# Training metadata plus the image-distance pairs produced by another notebook.
train_csv_path = "../input/shopee-product-matching/train.csv"
distances_csv_path = "../input/shoppee-efficientnetb0-distances/EfficientNetB0_distances.csv"

train_df = pd.read_csv(train_csv_path)
two_way_close_img_distances_df = pd.read_csv(distances_csv_path)

# Preview the first rows of the distance table.
two_way_close_img_distances_df.head(5)

In [None]:
# Per posting: distance to its closest image among all listed pairs.
closest_distances = (
    two_way_close_img_distances_df
    .groupby("posting_id")["img_distance"]
    .min()
)

# Per posting: distance to its closest image among pairs that are NOT flagged
# as a true match (rows where `true_match` is false).
non_true_pairs_df = two_way_close_img_distances_df.query("~true_match")
closest_non_true_distances = (
    non_true_pairs_df
    .groupby("posting_id")["img_distance"]
    .min()
)

## Choose cutoff through manual exploration

In [None]:
# The two histograms below are the reference distributions.
# The test set will be probed to see whether it follows the blue distribution,
# the orange one, or something in between; that serves as a proxy for how much
# the test products overlap with the training set.

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

# Left panel: closest match of any kind.
closest_distances.hist(ax=ax1, bins=10)
ax1.set_title("distribution of items' closest image distances")

# Right panel: closest match from a different product group, y-axis capped.
closest_non_true_distances.hist(color="orange", ax=ax2, bins=20)
ax2.set_title("distribution of items' closest image distances from different product-groups")
ax2.set_ylim([0,12500])

# Trailing no-op so the cell does not echo a return value.
pass

In [None]:
# Manually try different cutoffs.
# We want a cutoff for which a high percentage of training items have a match,
# but where that percentage changes a lot depending on whether true
# (same-product) matches are included or not.
cutoff = .10
total = len(train_df)

# Number of items whose closest match (any group / different group only)
# lies within the cutoff; computed once and reused below.
n_close_any = (closest_distances<cutoff).sum()
n_close_other_group = (closest_non_true_distances<cutoff).sum()

print("Using cutoff of:", cutoff)
print(f"A. percent of items with matches within {cutoff} cosine distance: {n_close_any / total: .1%}")
print(f"B. percent of items with different-group matches within {cutoff} cosine distance:{n_close_other_group / total: .1%}")
# The value printed is A divided by B, so the label says A/B (it used to say B/A).
print(f"C. Ratio of A/B: {n_close_any / n_close_other_group: .1f}")

In [None]:
# Final cutoff chosen from the exploration above.
#
# A cosine distance of 0.1 leaves a good share of training items with a
# "close" match, and that share differs a lot depending on whether
# same-product matches are counted.
CUTOFF = 0.1

# Part 2 - calculate the percent of items in the test set with a "close" match in the training set

## load test_df

In [None]:
test_df = pd.read_csv("../input/shopee-product-matching/test.csv")

# A test.csv with exactly 3 rows is treated as "no real test set available".
HAVE_TEST = len(test_df) != 3

# Without a real test set, stand in the first 5000 training rows instead.
if not HAVE_TEST:
    test_df = train_df.iloc[:5000]

## create test_df embeddings

In [None]:
# Image folder matching whichever rows ended up in test_df
# (real test images, or training images when training rows stand in).
BASE = (
    '../input/shopee-product-matching/test_images/'
    if HAVE_TEST
    else '../input/shopee-product-matching/train_images/'
)

In [None]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding batches of resized images listed in a DataFrame.

    Each batch is a float32 array of shape ``(n, img_size, img_size, 3)`` and
    carries no labels. Pixel values are stored exactly as returned by
    ``cv2.resize``; no rescaling or channel reordering is applied here.

    Parameters
    ----------
    df : DataFrame with an ``image`` column holding file names relative to ``path``.
    img_size : side length (pixels) every image is resized to.
    batch_size : maximum number of images per batch (the last batch may be smaller).
    path : prefix concatenated in front of each file name (include the trailing slash).
    """

    def __init__(self, df, img_size=256, batch_size=32, path=''):
        self.df = df
        self.img_size = img_size
        self.batch_size = batch_size
        self.path = path
        # Positional row numbers into df; batches are contiguous slices of this.
        self.indexes = np.arange( len(self.df) )

    def __len__(self):
        """Return the number of batches per epoch (ceil of rows / batch_size)."""
        full_batches, leftover = divmod(len(self.df), self.batch_size)
        return full_batches + int(leftover != 0)

    def __getitem__(self, index):
        """Return the image batch at position ``index``."""
        batch_rows = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(batch_rows)

    def __data_generation(self, indexes):
        """Load and resize the images at positional rows ``indexes``.

        Raises
        ------
        OSError
            If an image file cannot be read. ``cv2.imread`` signals this by
            returning ``None``; failing here names the offending file instead
            of surfacing as an opaque error inside ``cv2.resize``.
        """
        X = np.zeros((len(indexes),self.img_size,self.img_size,3),dtype='float32')
        target_size = (self.img_size, self.img_size)
        for i, image_name in enumerate(self.df.iloc[indexes]['image']):
            image_path = self.path + image_name
            img = cv2.imread(image_path)
            if img is None:
                raise OSError(f"cv2.imread could not read image: {image_path!r}")
            # NOTE(review): images keep cv2's channel order and raw pixel range;
            # keep this consistent with however the train embeddings were made.
            X[i,] = cv2.resize(img, target_size)
        return X

In [None]:
# Headless EfficientNetB0 with average pooling: one embedding vector per image.
WGT = '../input/effnetb0/efficientnetb0_notop.h5'
model = EfficientNetB0(weights=WGT,include_top=False, pooling='avg', input_shape=None)

embeds = []
CHUNK = 1024*4  # rows of test_df embedded per predict() call

print('Computing image embeddings...')
n_rows = len(test_df)
NUM_CHUNKS = -(-n_rows // CHUNK)  # ceil division

for i in range(NUM_CHUNKS):

    start = i * CHUNK
    stop = min(start + CHUNK, n_rows)
    print('chunk',start,'to',stop)

    # One generator per chunk, feeding the model in batches of 32 images.
    chunk_gen = DataGenerator(test_df.iloc[start:stop], batch_size=32, path=BASE)
    image_embeddings = model.predict(chunk_gen,verbose=1,use_multiprocessing=True, workers=4)
    embeds.append(image_embeddings)

# Drop the network before the neighbour search; it is not needed after this.
del model
_ = gc.collect()

test_image_embeddings = np.concatenate(embeds)
print('image embeddings shape',test_image_embeddings.shape)

## load train embeddings, to quickly make the nearest neighbor model

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load this file from a
# dataset you trust.
with open("../input/shoppee-create-distance-sets-df/train_img_embeddings.pkl", "rb") as f:
    train_image_embeddings = pkl.load(f)

# With a real test set the single nearest training image is what we want.
# Otherwise the "test" rows come from train, so each row's first neighbour in
# train is itself and the second neighbour is requested as well.
n_neighbors = 1 if HAVE_TEST else 2

model = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
model.fit(train_image_embeddings)

## Find Neighbors

In [None]:
CHUNK = 1024*4  # query embeddings sent to kneighbors() per call

print('Finding similar images...')
n_queries = len(test_image_embeddings)
NUM_CHUNKS = -(-n_queries // CHUNK)  # ceil division

# One (rows_in_chunk, n_neighbors) distance array per chunk.
all_distances = []

for j in range(NUM_CHUNKS):

    a = j * CHUNK
    b = min(a + CHUNK, n_queries)
    print('chunk',a,'to',b)

    # Only the distances are kept; the neighbour indices are not used.
    distances, indices = model.kneighbors(test_image_embeddings[a:b,])
    distances = np.round(distances,4)
    all_distances.append(distances)

In [None]:
# Stack the per-chunk arrays and keep only the last neighbour column
# (index n_neighbors-1): the nearest neighbour when n_neighbors is 1, or the
# second nearest when the first one is the item itself.
# This rebinds all_distances from a list to a 1-D array, so the cell must be
# run exactly once after the neighbour search.
all_distances = np.concatenate(all_distances)[:, n_neighbors-1]

In [None]:
# Fraction of test items whose selected neighbour distance is below CUTOFF.
n_close_matches = (all_distances < CUTOFF).sum()
percent = n_close_matches / len(test_df)