In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

'''
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
'''
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read in the train.csv file and set up the file paths for all training images.

In [None]:
import tensorflow as tf

# Target (height, width) every image is resized to before it is fed to the model.
# Other sizes that were tried: (512, 512) and (224, 224).
IMAGE_SIZE = (299, 299)
AUTO = tf.data.experimental.AUTOTUNE

# For efficiency only the first N_SAMPLES rows are used here.
# For any serious work you want to use all data: set N_SAMPLES = None in your forks.
N_SAMPLES = 1001

# Number of image paths echoed below as a quick sanity check.
N_PREVIEW = 10

df = pd.read_csv('../input/shopee-product-matching/train.csv')
if N_SAMPLES is not None:
    df = df.iloc[:N_SAMPLES]

# Series of full file paths, aligned row-by-row with df.
image_paths = '../input/shopee-product-matching/train_images/' + df['image']

# Iterate positionally so the preview works for any index and for
# data sets that contain fewer than N_PREVIEW rows.
for i, path in enumerate(image_paths.head(N_PREVIEW)):
    print(i, path)



Define a few utility functions; the latter two are copied from https://www.kaggle.com/muhammad4hmed/b3-tfidf-knn-boom-p

In [None]:
def get_labels(indices, df):
    """Return the ``label_group`` values of the rows at the given positions.

    Parameters
    ----------
    indices : iterable of int
        Positional (``iloc``-style) row numbers into ``df``.
    df : pandas.DataFrame
        Frame that contains a ``label_group`` column.

    Returns
    -------
    list
        One label per entry of ``indices``, in the same order.
    """
    # Select the column once and index it positionally in a single call,
    # instead of materialising a full row Series for every index.
    return df['label_group'].iloc[list(indices)].tolist()

    
# Turn raw JPEG bytes into a resized float image tensor
def decode_image(image_data):
    """Decode JPEG bytes to a 3-channel image resized to ``IMAGE_SIZE``.

    The returned tensor is float32 with pixel values divided by 255.
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    resized = tf.image.resize(pixels, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Load one image file and hand its bytes to decode_image
def read_image(image):
    """Read the file at path ``image`` and return the decoded image tensor."""
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes)



Load a pretrained model, from the Keras Model "zoo".

In [None]:
# Pretrained Xception used as a fixed feature extractor:
# the classification head is dropped and the final feature map is
# max-pooled, so each image yields a single embedding vector.
model = tf.keras.applications.Xception(
    include_top=False,
    weights="imagenet",
    # Derive the input shape from IMAGE_SIZE so that the resize in
    # decode_image() and the model input can never get out of sync.
    input_shape=(*IMAGE_SIZE, 3),
    pooling='max',
)

model

Now compute all embeddings; this takes a while. Using mini-batches instead of single images would speed this up.

In [None]:
# One embedding vector per image, in the same order as image_paths.
all_embeddings = list()

for i, path in enumerate(image_paths):

    # Lightweight progress report.
    if i % 1000 == 0:
        print(i, len(all_embeddings))

    # read_image() returns pixels already divided by 255, i.e. in [0, 1].
    # xception.preprocess_input expects raw 0-255 pixel values, so restore
    # that range first; otherwise the image is normalised twice and the
    # network only ever sees values squeezed close to -1.
    img0 = read_image(path).numpy()
    images = img0[np.newaxis, :, :, :] * 255.0
    images = tf.keras.applications.xception.preprocess_input(images)

    # verbose=0 avoids printing one progress bar per image.
    embedding = model.predict(images, verbose=0)
    all_embeddings.append(embedding[0])

len(all_embeddings)

Compute k-Nearest Neighbours

In [None]:
from sklearn.neighbors import NearestNeighbors

# Stack the per-image embedding vectors into a single array, one row per image.
X = np.array(all_embeddings)

# Build a ball-tree index over the embeddings, then query it with the very
# same points to obtain each image's 51 nearest neighbours.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=51)
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)

# Quick look at the first three rows: neighbour positions, their labels, distances.
print(indices[:3])
preview_labels = [get_labels(row, df) for row in indices[:3]]
print(preview_labels)
print(distances[:3])



How many examples do we have for each label?

In [None]:
# True label of every example, in row order.
Y = df['label_group'].to_numpy()

# label_group -> array of the distinct posting ids that carry that label.
tmp = df.groupby('label_group')['posting_id'].unique().to_dict()

# For every example, the number of postings that share its label.
group_sizes = {label: len(ids) for label, ids in tmp.items()}
num_pos = np.array([group_sizes[label] for label in Y])

num_pos[:10]

Compute F1 for each example: *x* holds the labels of the k nearest neighbours of each point, *y* is the correct label for each point, and *num_pos* is the number of examples that share each point's label.

In [None]:
def f1(x, y, num_pos):
    """Per-example F1 score of retrieved labels against the true label.

    Parameters
    ----------
    x : 2-D array
        Row ``r`` holds the labels retrieved for example ``r``.
    y : 1-D array
        True label of each example.
    num_pos : 1-D array
        Number of positives for each example (the recall denominator).

    Returns
    -------
    1-D float array with one F1 value per example.
    """
    # Compare every retrieved label with the true label of its row.
    matches = (x == y[:, np.newaxis])
    hits = np.sum(matches, axis=1)
    # Rows without a single match are counted as one hit (fudge for kNN ties).
    hits = np.where(hits == 0, 1, hits)

    n_retrieved = x.shape[1]
    precision = hits / n_retrieved
    recall = hits / num_pos
    half_score = precision * recall / (precision + recall)
    return 2.0 * half_score



Compute the average F1 for each different number of neighbours: here we go from 1 up to 51.

In [None]:
# f1() compares neighbour *labels* with the true labels, so translate the
# neighbour row positions returned by kneighbors() into their labels first.
# (Passing the embedding matrix X here would never match any label.)
neighbour_labels = Y[indices]

# Evaluate every neighbourhood size from 1 up to the number of neighbours queried.
for i in range(1, neighbour_labels.shape[1] + 1):
    f = f1(neighbour_labels[:, :i], Y, num_pos)
    avg = f.mean()
    print('F1', i, avg)

print('DONE')