In [None]:
import os
# Set in its own cell ahead of the cell that imports TensorFlow.
# NOTE(review): presumably TensorFlow reads TF_XLA_FLAGS when it is first
# imported, so this cell must run first — confirm before reordering cells.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [None]:
import tensorflow as tf
import numpy as np
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import pandas as pd
import math

In [None]:
# --- Competition tables -----------------------------------------------------
train_df = pd.read_csv('../input/shopee-product-matching/train.csv')
test_df = pd.read_csv('../input/shopee-product-matching/test.csv')

# --- Image directories ------------------------------------------------------
train_images = '../input/shopee-product-matching/train_images'
test_images = '../input/shopee-product-matching/test_images'

# --- Sizes ------------------------------------------------------------------
len_data_train, len_data_test = len(train_df), len(test_df)
BATCH_SIZE = 32
IMG_SIZE = 299

# Round up so that a final, partially filled batch is still counted.
TRAIN_BATCHES, TEST_BATCHES = (
    math.ceil(row_count / BATCH_SIZE)
    for row_count in (len_data_train, len_data_test)
)

In [None]:
def myzip(s,t):
    """Pair up the elements of two indexable sequences position by position.

    Args:
        s: Sequence that determines how many pairs are produced.
        t: Sequence indexed in step with ``s``; it must be at least as long
            as ``s``, otherwise an ``IndexError`` is raised.

    Returns:
        A list of ``(s[i], t[i])`` tuples, one per position in ``s``.
    """
    pairs = []
    for position in range(len(s)):
        pairs.append((s[position], t[position]))
    return pairs

In [None]:
# Encode the raw label_group ids as dense class indices 0..no_classes-1.
# NOTE: run once per freshly loaded train_df. Re-running after the column has
# been encoded rebuilds reverse_dict from the encoded values, so the original
# ids can no longer be recovered from it.
labels = sorted(set(train_df.label_group.tolist()))
no_classes = len(labels)

# Position in the sorted list is the class index.
mapped = list(range(no_classes))
zipper = myzip(labels, mapped)
reverse = myzip(mapped, labels)
label_dict = dict(zipper)        # raw label_group id -> class index
reverse_dict = dict(reverse)     # class index -> raw label_group id

# Vectorised replacement: avoids writing to the column row by row while it is
# being iterated, and does not rely on the frame having a 0..n-1 index.
train_df['label_group'] = train_df.label_group.map(label_dict)

In [None]:
# Turn bare image file names into full paths, using the directory constants
# from the configuration cell instead of repeating the literals.
train_prefix = train_images + '/'
test_prefix = test_images + '/'

# The startswith guard keeps the cell re-runnable: names that already carry
# the prefix are left alone instead of being prefixed a second time.
train_df['image'] = train_df.image.map(
    lambda name: name if name.startswith(train_prefix) else train_prefix + name
)
test_df['image'] = test_df.image.map(
    lambda name: name if name.startswith(test_prefix) else test_prefix + name
)

In [None]:
# Pull the columns used downstream out of the frames as NumPy arrays.
class_ids = train_df['label_group'].to_numpy()
xs_train_image = train_df['image'].to_numpy()
xs_test_image = test_df['image'].to_numpy()

In [None]:
# Alias (not a copy) of the training image path array; the feature extraction
# and neighbour lookup cells below index into it by row.
filenames = xs_train_image

In [None]:
import random,cv2
# ORB settings: ORB_F is passed to setMaxFeatures, and ORB_F * F_L is the
# length every flattened descriptor vector is padded or truncated to.
ORB_F = 384
F_L = 32

# Number of images to process, and batches needed (rounded up).
num_images = len(filenames)
BATCHES = math.ceil(num_images / BATCH_SIZE)

In [None]:
def get_xs(xs_batch,batch,xs):
    """Write a fixed-length ORB descriptor vector for each image of one batch.

    Every image is read with ``cv2.imread(path, 0)``, ORB keypoints are
    detected and described, and the descriptors are flattened and then
    zero-padded or truncated to exactly ``ORB_F * F_L`` values. The vector is
    stored in row ``BATCH_SIZE * batch + index`` of ``xs``.

    Images that cannot be read, or that yield no descriptors, have their path
    printed and get an all-zero vector.

    Args:
        xs_batch: Iterable of image file paths belonging to this batch.
        batch: Zero-based batch number, used to compute the destination rows.
        xs: Pre-allocated 2-D array with ``ORB_F * F_L`` columns; filled in
            place.

    Returns:
        None. Results are written into ``xs``.
    """
    vector_len = ORB_F * F_L
    orb = cv2.ORB_create()
    orb.setMaxFeatures(ORB_F)
    for index, path in enumerate(xs_batch):
        img = cv2.imread(path, 0)
        descriptors = None
        if img is not None:
            keypoints = orb.detect(img, None)
            _, descriptors = orb.compute(img, keypoints)

        # Start from zeros so that padding is implicit.
        vector = np.zeros(vector_len)
        if descriptors is None:
            # Explicit check instead of a bare ``except``: only the
            # "no descriptors" case is tolerated, real errors still surface.
            print(path)
        else:
            flat = descriptors.flatten()
            used = min(len(flat), vector_len)
            vector[:used] = flat[:used]
        xs[BATCH_SIZE * batch + index] = vector

In [None]:
def get_images(data_len,batch_type,filenames):
    """Build the ORB feature matrix for a list of image files, batch by batch.

    Args:
        data_len: Number of rows to allocate, one per image.
        batch_type: Number of batches of ``BATCH_SIZE`` images to process.
        filenames: Indexable sequence of image paths; batch ``b`` covers
            ``filenames[b * BATCH_SIZE : min((b + 1) * BATCH_SIZE, data_len)]``.

    Returns:
        Array of shape ``(data_len, ORB_F * F_L)`` filled by ``get_xs``.
    """
    # Local import: ``tqdm.tqdm_notebook`` is deprecated in favour of
    # ``tqdm.notebook.tqdm``, which is the same progress bar without the
    # deprecation warning.
    from tqdm.notebook import tqdm as notebook_tqdm

    # zeros rather than empty: rows that no batch reaches (batch_type too
    # small for data_len) stay well-defined instead of holding stale memory.
    xs = np.zeros([data_len, ORB_F * F_L])
    for batch in notebook_tqdm(range(batch_type)):
        start = batch * BATCH_SIZE
        stop = min(start + BATCH_SIZE, data_len)
        get_xs(filenames[start:stop], batch, xs)
    return xs

In [None]:
# Extract ORB features for every training image (one row per file).
feature_list = get_images(
    data_len=num_images,
    batch_type=BATCHES,
    filenames=filenames,
)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Perform PCA over the features
num_feature_dimensions = 110      # Set the number of features
# Fixed seed: for a matrix this large scikit-learn may select its randomized
# SVD solver, which draws random numbers. Pinning random_state keeps the
# compressed features (and the neighbours found from them) reproducible.
PCA_RANDOM_STATE = 0
pca = PCA(n_components=num_feature_dimensions, random_state=PCA_RANDOM_STATE)
pca.fit(feature_list)
feature_list_compressed = pca.transform(feature_list)

In [None]:
def plot_images(similar_image_paths, distances):
    """Draw up to 25 images on a 5x5 grid, each captioned with its distance.

    Args:
        similar_image_paths: Image file paths. The first entry is captioned
            as the original, every later one as a near match.
        distances: Indexable of distances, aligned position by position with
            ``similar_image_paths``.
    """
    plt.figure(figsize=(20,20))
    for slot, img_path in enumerate(similar_image_paths[:25], start=1):
        rank = slot - 1
        plt.subplot(5, 5, slot)
        # Hide ticks and grid so that only the picture and captions remain.
        plt.xticks([])
        plt.yticks([])
        plt.grid(False)
        plt.imshow(mpimg.imread(img_path))
        # Keep just the last two path components for the y-axis caption.
        short_path = "/".join(img_path.split("/")[-2:])
        if rank:
            caption = f'Near match: {distances[rank]:.5f}'
        else:
            caption = f'Original: self d {distances[rank]:.1f}'
        plt.xlabel(caption)
        plt.ylabel(f'{short_path}')

In [None]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline      
# Brute-force Euclidean 5-nearest-neighbour index over the PCA features.
neighbors = NearestNeighbors(
    n_neighbors=5,
    algorithm='brute',
    metric='euclidean',
)
neighbors.fit(feature_list_compressed)

In [None]:
# Show six randomly chosen query images together with their nearest matches.
for _ in range(6):
    # randrange excludes the upper bound. randint(0, num_images) can return
    # num_images itself, which is one past the last valid row.
    random_image_index = random.randrange(num_images)
    distances, indices = neighbors.kneighbors([feature_list_compressed[random_image_index]])
    # don't take the first closest image as it will be the same image
    neighbor_rows = indices[0][1:]
    similar_image_paths = [filenames[random_image_index]] + [filenames[row] for row in neighbor_rows]
    plot_images(similar_image_paths, distances[0])